From 4202f4fec07a0fc76a255bc5c07de85a5f89e584 Mon Sep 17 00:00:00 2001 From: David Son Date: Thu, 6 Jun 2024 16:42:48 +0000 Subject: [PATCH] Add support for idmapped layers Signed-off-by: David Son --- Makefile | 5 +- config/config.toml | 1 + config/fs.go | 1 + fs/fs.go | 108 +++- fs/fs_test.go | 11 +- fs/layer/layer.go | 7 +- fs/layer/node.go | 40 +- fs/layer/node_test.go | 3 +- fs/layer/util_test.go | 3 +- fs/source/source.go | 3 + go.mod | 2 +- idtools/idmap.go | 67 ++ integration/config/ctr.patch | 1115 ++++++++++++++++++++++++++++++++++ integration/run_test.go | 130 ++++ service/service.go | 3 + snapshot/snapshot.go | 133 +++- snapshot/snapshot_test.go | 17 + 17 files changed, 1597 insertions(+), 52 deletions(-) create mode 100644 integration/config/ctr.patch diff --git a/Makefile b/Makefile index 7c0f7c0fe..3803510a4 100644 --- a/Makefile +++ b/Makefile @@ -186,9 +186,10 @@ ctr-with-idmapping: $(OUTDIR)/ctr-with-idmapping $(OUTDIR)/ctr-with-idmapping: # Use a custom fork for testing ID-mapping as containerd doesn't fully support this yet. - git clone https://github.com/sondavidb/containerd.git tempfolder + git clone https://github.com/containerd/containerd.git tempfolder cd tempfolder && \ - git checkout multi-uid-gid && \ + git checkout v1.7.22 && \ + git apply $(SOCI_SNAPSHOTTER_PROJECT_ROOT)/integration/config/ctr.patch && \ make bin/ctr && \ cp bin/ctr $(OUTDIR)/ctr-with-idmapping && \ cd ../ diff --git a/config/config.toml b/config/config.toml index 21385f5d0..3e75d284e 100644 --- a/config/config.toml +++ b/config/config.toml @@ -16,6 +16,7 @@ filesystem_cache_type="" resolve_result_entry=0 debug=false allow_no_verification=true +allow_idmap=true # disable_verification=false # Causes TestRunWithDefaultConfig to break, but # fine to use in /etc/soci-snapshotter-grpc-config.toml diff --git a/config/fs.go b/config/fs.go index 4088d9b6e..4d3af996e 100644 --- a/config/fs.go +++ b/config/fs.go @@ -56,6 +56,7 @@ type FSConfig struct { NoPrometheus bool `toml:"no_prometheus"` MountTimeoutSec int64 `toml:"mount_timeout_sec"` FuseMetricsEmitWaitDurationSec int64 `toml:"fuse_metrics_emit_wait_duration_sec"` + AllowIDMap bool `toml:"allow_idmap" default:"true"` RetryableHTTPClientConfig `toml:"http"` BlobConfig `toml:"blob"` diff --git a/fs/fs.go b/fs/fs.go index 3f3dbbb58..8c430eee3 100644 --- a/fs/fs.go +++ b/fs/fs.go @@ -45,9 +45,13 @@ package fs import ( "context" "fmt" + "io" golog "log" "net/http" + "os" "os/exec" + "path/filepath" + "strings" "sync" "syscall" "time" @@ -59,6 +63,7 @@ import ( layermetrics "github.com/awslabs/soci-snapshotter/fs/metrics/layer" "github.com/awslabs/soci-snapshotter/fs/remote" "github.com/awslabs/soci-snapshotter/fs/source" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/snapshot" "github.com/awslabs/soci-snapshotter/soci" @@ -67,6 +72,7 @@ import ( ctdsnapshotters "github.com/containerd/containerd/pkg/snapshotters" "github.com/containerd/containerd/reference" "github.com/containerd/containerd/remotes/docker" + "github.com/containerd/errdefs" "github.com/containerd/log" metrics "github.com/docker/go-metrics" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -455,6 +461,58 @@ func (fs *filesystem) getSociContext(ctx context.Context, imageRef, indexDigest, return c, err } +func getIDMappedMountpoint(mountpoint, activeLayerKey string) string { + d := filepath.Dir(mountpoint) + return filepath.Join(fmt.Sprintf("%s_%s", d, activeLayerKey), "fs") +} + +func (fs *filesystem) 
IDMapMount(ctx context.Context, mountpoint, activeLayerKey string, idmapper idtools.IDMap) (string, error) { + newMountpoint := getIDMappedMountpoint(mountpoint, activeLayerKey) + logger := log.G(ctx).WithField("mountpoint", newMountpoint) + + logger.Debug("creating remote id-mapped mount") + if err := os.Mkdir(filepath.Dir(newMountpoint), 0700); err != nil { + return "", err + } + if err := os.Mkdir(newMountpoint, 0755); err != nil { + return "", err + } + + fs.layerMu.Lock() + l := fs.layer[mountpoint] + if l == nil { + fs.layerMu.Unlock() + logger.Error("failed to create remote id-mapped mount") + return "", errdefs.ErrNotFound + } + fs.layer[newMountpoint] = l + fs.layerMu.Unlock() + node, err := l.RootNode(0, idmapper) + if err != nil { + return "", err + } + + fuseLogger := log.L. + WithField("mountpoint", mountpoint). + WriterLevel(logrus.TraceLevel) + + return newMountpoint, fs.setupFuseServer(ctx, newMountpoint, node, l, fuseLogger, nil) +} + +func (fs *filesystem) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerKey string, idmapper idtools.IDMap) (string, error) { + newMountpoint := getIDMappedMountpoint(mountpoint, activeLayerKey) + logger := log.G(ctx).WithField("mountpoint", newMountpoint) + + logger.Debug("creating local id-mapped mount") + if err := idtools.RemapDir(ctx, mountpoint, newMountpoint, idmapper); err != nil { + logger.WithError(err).Error("failed to create local mount") + return "", err + } + + logger.Debug("successfully created local mountpoint") + return newMountpoint, nil +} + func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[string]string) (retErr error) { // Setting the start time to measure the Mount operation duration. start := time.Now() @@ -560,7 +618,7 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s } }() - node, err := l.RootNode(0) + node, err := l.RootNode(0, idtools.IDMap{}) if err != nil { log.G(ctx).WithError(err).Warnf("Failed to get root node") retErr = fmt.Errorf("failed to get root node: %w", err) @@ -577,6 +635,16 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s fs.layerMu.Unlock() fs.metricsController.Add(mountpoint, l) + // Pass in a logger to go-fuse with the layer digest + // The go-fuse logs are useful for tracing exactly what's happening at the fuse level. + fuseLogger := log.L. + WithField("layerDigest", labels[ctdsnapshotters.TargetLayerDigestLabel]). + WriterLevel(logrus.TraceLevel) + + return fs.setupFuseServer(ctx, mountpoint, node, l, fuseLogger, c) +} + +func (fs *filesystem) setupFuseServer(ctx context.Context, mountpoint string, node fusefs.InodeEmbedder, l layer.Layer, logger *io.PipeWriter, c *sociContext) error { // mount the node to the specified mountpoint // TODO: bind mount the state directory as a read-only fs on snapshotter's side rawFS := fusefs.NewNodeFS(node, &fusefs.Options{ @@ -585,40 +653,37 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s NegativeTimeout: &fs.negativeTimeout, NullPermissions: true, }) - // Pass in a logger to go-fuse with the layer digest - // The go-fuse logs are useful for tracing exactly what's happening at the fuse level. - logger := log.L. - WithField("layerDigest", labels[ctdsnapshotters.TargetLayerDigestLabel]). 
- WriterLevel(logrus.TraceLevel) mountOpts := &fuse.MountOptions{ AllowOther: true, // allow users other than root&mounter to access fs FsName: "soci", // name this filesystem as "soci" Debug: fs.debug, Logger: golog.New(logger, "", 0), DisableXAttrs: l.DisableXAttrs(), + Options: []string{"default_permissions", "ro"}, } if _, err := exec.LookPath(fusermountBin); err == nil { mountOpts.Options = []string{"suid"} // option for fusermount; allow setuid inside container } else { - log.G(ctx).WithError(err).Infof("%s not installed; trying direct mount", fusermountBin) + log.G(ctx).WithField("binary", fusermountBin).WithError(err).Info("fuse binary not installed; trying direct mount") mountOpts.DirectMount = true } server, err := fuse.NewServer(rawFS, mountpoint, mountOpts) if err != nil { - log.G(ctx).WithError(err).Debug("failed to make filesystem server") - retErr = err - return + log.G(ctx).WithError(err).Error("failed to make filesystem server") + return err } go server.Serve() - // Send a signal to the background fetcher that a new image is being mounted - // and to pause all background fetches. - c.bgFetchPauseOnce.Do(func() { - if fs.bgFetcher != nil { - fs.bgFetcher.Pause() - } - }) + if c != nil { + // Send a signal to the background fetcher that a new image is being mounted + // and to pause all background fetches. + c.bgFetchPauseOnce.Do(func() { + if fs.bgFetcher != nil { + fs.bgFetcher.Pause() + } + }) + } return server.WaitMount() } @@ -681,6 +746,11 @@ func (fs *filesystem) check(ctx context.Context, l layer.Layer, labels map[strin return rErr } +func isIdMappedDir(mountpoint string) bool { + dirName := filepath.Base(mountpoint) + return len(strings.Split(dirName, "_")) > 1 +} + func (fs *filesystem) Unmount(ctx context.Context, mountpoint string) error { fs.layerMu.Lock() l, ok := fs.layer[mountpoint] @@ -688,7 +758,9 @@ func (fs *filesystem) Unmount(ctx context.Context, mountpoint string) error { fs.layerMu.Unlock() return fmt.Errorf("specified path %q isn't a mountpoint", mountpoint) } - delete(fs.layer, mountpoint) // unregisters the corresponding layer + if !isIdMappedDir(mountpoint) { + delete(fs.layer, mountpoint) // unregisters the corresponding layer + } l.Done() fs.layerMu.Unlock() fs.metricsController.Remove(mountpoint) diff --git a/fs/fs_test.go b/fs/fs_test.go index 2591a270a..119ec9c2c 100644 --- a/fs/fs_test.go +++ b/fs/fs_test.go @@ -46,6 +46,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/layer" "github.com/awslabs/soci-snapshotter/fs/remote" "github.com/awslabs/soci-snapshotter/fs/source" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/containerd/containerd/reference" "github.com/containerd/containerd/remotes/docker" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -83,10 +84,12 @@ func (l *breakableLayer) Info() layer.Info { Size: 1, } } -func (l *breakableLayer) DisableXAttrs() bool { return false } -func (l *breakableLayer) RootNode(uint32) (fusefs.InodeEmbedder, error) { return nil, nil } -func (l *breakableLayer) Verify(tocDigest digest.Digest) error { return nil } -func (l *breakableLayer) SkipVerify() {} +func (l *breakableLayer) DisableXAttrs() bool { return false } +func (l *breakableLayer) RootNode(uint32, idtools.IDMap) (fusefs.InodeEmbedder, error) { + return nil, nil +} +func (l *breakableLayer) Verify(tocDigest digest.Digest) error { return nil } +func (l *breakableLayer) SkipVerify() {} func (l *breakableLayer) ReadAt([]byte, int64, ...remote.Option) (int, error) { return 0, fmt.Errorf("fail") } diff --git a/fs/layer/layer.go 
b/fs/layer/layer.go index 5d802fee9..d5ef17fe9 100644 --- a/fs/layer/layer.go +++ b/fs/layer/layer.go @@ -58,6 +58,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/remote" spanmanager "github.com/awslabs/soci-snapshotter/fs/span-manager" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/soci" "github.com/awslabs/soci-snapshotter/util/lrucache" @@ -86,7 +87,7 @@ type Layer interface { Info() Info // RootNode returns the root node of this layer. - RootNode(baseInode uint32) (fusefs.InodeEmbedder, error) + RootNode(baseInode uint32, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) // Check checks if the layer is still connectable. Check() error @@ -456,11 +457,11 @@ func (l *layerRef) Done() { l.done() } -func (l *layer) RootNode(baseInode uint32) (fusefs.InodeEmbedder, error) { +func (l *layer) RootNode(baseInode uint32, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) { if l.isClosed() { return nil, fmt.Errorf("layer is already closed") } - return newNode(l.desc.Digest, l.r, l.blob, baseInode, l.resolver.overlayOpaqueType, l.resolver.config.LogFuseOperations, l.fuseOperationCounter) + return newNode(l.desc.Digest, l.r, l.blob, baseInode, l.resolver.overlayOpaqueType, l.resolver.config.LogFuseOperations, l.fuseOperationCounter, idMapper) } func (l *layer) ReadAt(p []byte, offset int64, opts ...remote.Option) (int, error) { diff --git a/fs/layer/node.go b/fs/layer/node.go index 6459c1a48..630ecd9f9 100644 --- a/fs/layer/node.go +++ b/fs/layer/node.go @@ -56,6 +56,7 @@ import ( commonmetrics "github.com/awslabs/soci-snapshotter/fs/metrics/common" "github.com/awslabs/soci-snapshotter/fs/reader" "github.com/awslabs/soci-snapshotter/fs/remote" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/containerd/log" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -189,7 +190,7 @@ func (f *FuseOperationCounter) Run(ctx context.Context) { // logFSOperations may cause sensitive information to be emitted to logs // e.g. filenames and paths within an image -func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseInode uint32, opaque OverlayOpaqueType, logFSOperations bool, opCounter *FuseOperationCounter) (fusefs.InodeEmbedder, error) { +func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseInode uint32, opaque OverlayOpaqueType, logFSOperations bool, opCounter *FuseOperationCounter, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) { rootID := r.Metadata().RootID() rootAttr, err := r.Metadata().GetAttr(rootID) if err != nil { @@ -210,9 +211,10 @@ func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseIno } ffs.s = ffs.newState(layerDgst, blob) return &node{ - id: rootID, - attr: rootAttr, - fs: ffs, + id: rootID, + attr: rootAttr, + fs: ffs, + idMapper: idMapper, }, nil } @@ -272,9 +274,10 @@ func (fs *fs) inodeOfID(id uint32) (uint64, error) { // node is a filesystem inode abstraction. 
type node struct { fusefs.Inode - fs *fs - id uint32 - attr metadata.Attr + fs *fs + id uint32 + attr metadata.Attr + idMapper idtools.IDMap ents []fuse.DirEntry entsCached bool @@ -407,14 +410,14 @@ func (n *node) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fu n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: %v", fuseOpLookup, err)) return nil, syscall.EIO } - entryToAttr(ino, tn.attr, &out.Attr) + n.entryToAttr(ino, tn.attr, &out.Attr) case *whiteout: ino, err := n.fs.inodeOfID(tn.id) if err != nil { n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: %v", fuseOpLookup, err)) return nil, syscall.EIO } - entryToAttr(ino, tn.attr, &out.Attr) + n.entryToAttr(ino, tn.attr, &out.Attr) default: n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: unknown node type detected", fuseOpLookup)) return nil, syscall.EIO @@ -463,10 +466,11 @@ func (n *node) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fu return nil, syscall.EIO } return n.NewInode(ctx, &node{ - id: id, - fs: n.fs, - attr: ce, - }, entryToAttr(ino, ce, &out.Attr)), 0 + id: id, + fs: n.fs, + attr: ce, + idMapper: n.idMapper, + }, n.entryToAttr(ino, ce, &out.Attr)), 0 } var _ = (fusefs.NodeOpener)((*node)(nil)) @@ -495,7 +499,7 @@ func (n *node) Getattr(ctx context.Context, f fusefs.FileHandle, out *fuse.AttrO n.fs.reportFailure(fuseOpGetattr, fmt.Errorf("%s: %v", fuseOpGetattr, err)) return syscall.EIO } - entryToAttr(ino, n.attr, &out.Attr) + n.entryToAttr(ino, n.attr, &out.Attr) return 0 } @@ -594,7 +598,7 @@ func (f *file) Getattr(ctx context.Context, out *fuse.AttrOut) syscall.Errno { f.n.fs.reportFailure(fuseOpFileGetattr, fmt.Errorf("%s: %v", fuseOpFileGetattr, err)) return syscall.EIO } - entryToAttr(ino, f.n.attr, &out.Attr) + f.n.entryToAttr(ino, f.n.attr, &out.Attr) return 0 } @@ -797,7 +801,7 @@ func (sf *statFile) updateStatUnlocked() ([]byte, error) { } // entryToAttr converts metadata.Attr to go-fuse's Attr. -func entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr { +func (n *node) entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr { out.Ino = ino out.Size = uint64(e.Size) if e.Mode&os.ModeSymlink != 0 { @@ -808,7 +812,9 @@ func entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr mtime := e.ModTime out.SetTimes(nil, &mtime, nil) out.Mode = fileModeToSystemMode(e.Mode) - out.Owner = fuse.Owner{Uid: uint32(e.UID), Gid: uint32(e.GID)} + // e.UID and e.GID are non-negative ints, so the uint32 casts are safe; an empty IDMap maps IDs to themselves, leaving non-id-mapped mounts unchanged. + mappedID, _ := n.idMapper.ToHost(idtools.User{Uid: uint32(e.UID), Gid: uint32(e.GID)}) + out.Owner = fuse.Owner{Uid: mappedID.Uid, Gid: mappedID.Gid} out.Rdev = uint32(unix.Mkdev(uint32(e.DevMajor), uint32(e.DevMinor))) out.Nlink = uint32(e.NumLink) if out.Nlink == 0 { diff --git a/fs/layer/node_test.go b/fs/layer/node_test.go index 432b3b66e..9f12e6810 100644 --- a/fs/layer/node_test.go +++ b/fs/layer/node_test.go @@ -50,7 +50,8 @@ func TestEntryToAttr(t *testing.T) { tc := tc t.Run(tc.name, func(t *testing.T) { var actual fuse.Attr - entryToAttr(0, tc.attr, &actual) + var n node + n.entryToAttr(0, tc.attr, &actual) tc.expected.Mtime = actual.Mtime if actual != tc.expected { t.Fatalf("unexpected fuse attr. 
actual %v expected %v", actual, tc.expected) diff --git a/fs/layer/util_test.go b/fs/layer/util_test.go index 2fec52d89..dcb24420f 100644 --- a/fs/layer/util_test.go +++ b/fs/layer/util_test.go @@ -56,6 +56,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/reader" "github.com/awslabs/soci-snapshotter/fs/remote" spanmanager "github.com/awslabs/soci-snapshotter/fs/span-manager" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/util/testutil" "github.com/awslabs/soci-snapshotter/ztoc" @@ -360,7 +361,7 @@ func hasSize(name string, size int) check { } func getRootNode(t *testing.T, r reader.Reader, opaque OverlayOpaqueType) *node { - rootNode, err := newNode(testStateLayerDigest, &testReader{r}, &testBlobState{10, 5}, 100, opaque, false, nil) + rootNode, err := newNode(testStateLayerDigest, &testReader{r}, &testBlobState{10, 5}, 100, opaque, false, nil, idtools.IDMap{}) if err != nil { t.Fatalf("failed to get root node: %v", err) } diff --git a/fs/source/source.go b/fs/source/source.go index ce394d840..9da0b36b2 100644 --- a/fs/source/source.go +++ b/fs/source/source.go @@ -84,6 +84,9 @@ const ( // TargetSociIndexDigestLabel is a label which contains the digest of the soci index. TargetSociIndexDigestLabel = "containerd.io/snapshot/remote/soci.index.digest" + + // HasSociIndexDigest is a label that tells if the layer was pulled with a SOCI index. + HasSociIndexDigest = "containerd.io/snapshot/remote/has.soci.index.digest" ) // RegistryHosts is copied from [github.com/awslabs/soci-snapshotter/service/resolver.RegistryHosts] diff --git a/go.mod b/go.mod index 520b1de5b..74d0df687 100644 --- a/go.mod +++ b/go.mod @@ -27,6 +27,7 @@ require ( github.com/prometheus/client_golang v1.20.5 github.com/rs/xid v1.6.0 github.com/sirupsen/logrus v1.9.3 + github.com/stretchr/testify v1.9.0 go.etcd.io/bbolt v1.3.11 golang.org/x/crypto v0.28.0 golang.org/x/sync v0.8.0 @@ -90,7 +91,6 @@ require ( github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/spf13/pflag v1.0.5 // indirect - github.com/stretchr/testify v1.9.0 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 // indirect go.opentelemetry.io/otel v1.21.0 // indirect diff --git a/idtools/idmap.go b/idtools/idmap.go index d5d677285..9a89a588e 100644 --- a/idtools/idmap.go +++ b/idtools/idmap.go @@ -37,10 +37,18 @@ package idtools import ( + "context" + "encoding/json" "errors" "fmt" + "os" + "os/exec" + "path/filepath" "strings" + "syscall" + "github.com/containerd/containerd/mount" + "github.com/containerd/containerd/snapshots" "github.com/opencontainers/runtime-spec/specs-go" ) @@ -64,6 +72,21 @@ type IDMap struct { GidMap []specs.LinuxIDMapping `json:"GidMap"` } +func LoadIDMap(id string, labels map[string]string) (IDMap, error) { + var idmap IDMap + uidmapJSON, okUID := labels[snapshots.LabelSnapshotUIDMapping] + gidmapJSON, okGID := labels[snapshots.LabelSnapshotGIDMapping] + if okUID && okGID { + if err := json.Unmarshal([]byte(uidmapJSON), &idmap.UidMap); err != nil { + return IDMap{}, err + } + if err := json.Unmarshal([]byte(gidmapJSON), &idmap.GidMap); err != nil { + return IDMap{}, err + } + } + return idmap, nil +} + // ToHost returns the host user ID pair for the container ID pair. 
func (i IDMap) ToHost(pair User) (User, error) { var ( @@ -167,3 +190,47 @@ func safeSum(x, y uint32) (uint32, error) { } return z, nil } + +func RemapDir(ctx context.Context, originalMountpoint, newMountpoint string, idMap IDMap) error { + idmappedSnapshotBase := filepath.Dir(newMountpoint) + if err := os.Mkdir(idmappedSnapshotBase, 0755); err != nil { + return err + } + + if err := exec.Command("cp", "-a", originalMountpoint, idmappedSnapshotBase).Run(); err != nil { + return err + } + return filepath.Walk(newMountpoint, chown(idMap)) +} + +func RemapRoot(ctx context.Context, root string, idMap IDMap) error { + return filepath.Walk(root, chown(idMap)) +} + +func RemapRootFS(ctx context.Context, mounts []mount.Mount, idmap IDMap) error { + return mount.WithTempMount(ctx, mounts, func(root string) error { + return filepath.Walk(root, chown(idmap)) + }) +} + +func chown(idMap IDMap) filepath.WalkFunc { + return func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + stat := info.Sys().(*syscall.Stat_t) + h, cerr := idMap.ToHost(User{Uid: stat.Uid, Gid: stat.Gid}) + if cerr != nil { + return cerr + } + // be sure the lchown the path as to not de-reference the symlink to a host file + if cerr = os.Lchown(path, int(h.Uid), int(h.Gid)); cerr != nil { + return cerr + } + // we must retain special permissions such as setuid, setgid and sticky bits + if mode := info.Mode(); mode&os.ModeSymlink == 0 && mode&(os.ModeSetuid|os.ModeSetgid|os.ModeSticky) != 0 { + return os.Chmod(path, mode) + } + return nil + } +} diff --git a/integration/config/ctr.patch b/integration/config/ctr.patch new file mode 100644 index 000000000..13a3f5984 --- /dev/null +++ b/integration/config/ctr.patch @@ -0,0 +1,1115 @@ +diff --git a/cmd/ctr/commands/run/run_unix.go b/cmd/ctr/commands/run/run_unix.go +index b3e5d0e2d..a69eb9343 100644 +--- a/cmd/ctr/commands/run/run_unix.go ++++ b/cmd/ctr/commands/run/run_unix.go +@@ -34,6 +34,7 @@ import ( + "github.com/containerd/containerd/contrib/nvidia" + "github.com/containerd/containerd/contrib/seccomp" + "github.com/containerd/containerd/oci" ++ "github.com/containerd/containerd/pkg/idtools" + runtimeoptions "github.com/containerd/containerd/pkg/runtimeoptions/v1" + "github.com/containerd/containerd/runtime/v2/runc/options" + "github.com/containerd/containerd/snapshots" +@@ -56,13 +57,9 @@ var platformRunFlags = []cli.Flag{ + Name: "runc-systemd-cgroup", + Usage: "Start runc with systemd cgroup manager", + }, +- cli.StringFlag{ +- Name: "uidmap", +- Usage: "Run inside a user namespace with the specified UID mapping range; specified with the format `container-uid:host-uid:length`", +- }, +- cli.StringFlag{ +- Name: "gidmap", +- Usage: "Run inside a user namespace with the specified GID mapping range; specified with the format `container-gid:host-gid:length`", ++ &cli.StringFlag{ ++ Name: "userns-remap", ++ Usage: "Run inside a user namespace with the specified user", + }, + cli.BoolFlag{ + Name: "remap-labels", +@@ -162,25 +159,22 @@ func NewContainer(ctx gocontext.Context, client *containerd.Client, context *cli + containerd.WithImageConfigLabels(image), + containerd.WithAdditionalContainerLabels(labels), + containerd.WithSnapshotter(snapshotter)) +- if uidmap, gidmap := context.String("uidmap"), context.String("gidmap"); uidmap != "" && gidmap != "" { +- uidMap, err := parseIDMapping(uidmap) +- if err != nil { +- return nil, err +- } +- gidMap, err := parseIDMapping(gidmap) ++ ++ userns := context.String("userns-remap") ++ if userns != "" { ++ 
idmap, err := idtools.LoadIdentityMapping(userns) + if err != nil { + return nil, err + } +- opts = append(opts, +- oci.WithUserNamespace([]specs.LinuxIDMapping{uidMap}, []specs.LinuxIDMapping{gidMap})) ++ uidSpecs, gidSpecs := idmap.ToSpec() ++ opts = append(opts, oci.WithUserNamespace(uidSpecs, gidSpecs)) + // use snapshotter opts or the remapped snapshot support to shift the filesystem + // currently the only snapshotter known to support the labels is fuse-overlayfs: + // https://github.com/AkihiroSuda/containerd-fuse-overlayfs + if context.Bool("remap-labels") { +- cOpts = append(cOpts, containerd.WithNewSnapshot(id, image, +- containerd.WithRemapperLabels(0, uidMap.HostID, 0, gidMap.HostID, uidMap.Size))) ++ cOpts = append(cOpts, containerd.WithNewSnapshot(id, image, containerd.WithMultiRemapperLabels(idmap))) + } else { +- cOpts = append(cOpts, containerd.WithRemappedSnapshot(id, image, uidMap.HostID, gidMap.HostID)) ++ cOpts = append(cOpts, containerd.WithMultiRemappedSnapshot(id, image, idmap)) + } + } else { + // Even when "read-only" is set, we don't use KindView snapshot here. (#1495) +@@ -434,6 +428,7 @@ func getRuntimeOptions(context *cli.Context) (interface{}, error) { + return nil, nil + } + ++//lint:ignore U1000 Ignore unused function + func parseIDMapping(mapping string) (specs.LinuxIDMapping, error) { + // We expect 3 parts, but limit to 4 to allow detection of invalid values. + parts := strings.SplitN(mapping, ":", 4) +diff --git a/container_opts_unix.go b/container_opts_unix.go +index e0e8bad88..68c471b89 100644 +--- a/container_opts_unix.go ++++ b/container_opts_unix.go +@@ -27,6 +27,7 @@ import ( + + "github.com/containerd/containerd/containers" + "github.com/containerd/containerd/mount" ++ "github.com/containerd/containerd/pkg/idtools" + "github.com/containerd/errdefs" + "github.com/opencontainers/image-spec/identity" + ) +@@ -34,25 +35,63 @@ import ( + // WithRemappedSnapshot creates a new snapshot and remaps the uid/gid for the + // filesystem to be used by a container with user namespaces + func WithRemappedSnapshot(id string, i Image, uid, gid uint32) NewContainerOpts { +- return withRemappedSnapshotBase(id, i, uid, gid, false) ++ idmap := idtools.IdentityMapping{ ++ UIDMaps: []idtools.IDMap{ ++ { ++ ContainerID: 0, ++ HostID: int(uid), ++ Size: 1, ++ }, ++ }, ++ GIDMaps: []idtools.IDMap{ ++ { ++ ContainerID: 0, ++ HostID: int(gid), ++ Size: 1, ++ }, ++ }, ++ } ++ return withRemappedSnapshotBase(id, i, idmap, false) ++} ++func WithMultiRemappedSnapshot(id string, i Image, idmap idtools.IdentityMapping) NewContainerOpts { ++ return withRemappedSnapshotBase(id, i, idmap, false) + } + + // WithRemappedSnapshotView is similar to WithRemappedSnapshot but rootfs is mounted as read-only. 
+ func WithRemappedSnapshotView(id string, i Image, uid, gid uint32) NewContainerOpts { +- return withRemappedSnapshotBase(id, i, uid, gid, true) ++ idmap := idtools.IdentityMapping{ ++ UIDMaps: []idtools.IDMap{ ++ { ++ ContainerID: 0, ++ HostID: int(uid), ++ Size: 1, ++ }, ++ }, ++ GIDMaps: []idtools.IDMap{ ++ { ++ ContainerID: 0, ++ HostID: int(gid), ++ Size: 1, ++ }, ++ }, ++ } ++ return withRemappedSnapshotBase(id, i, idmap, true) ++} ++func WithMultiRemappedSnapshotView(id string, i Image, idmap idtools.IdentityMapping) NewContainerOpts { ++ return withRemappedSnapshotBase(id, i, idmap, true) + } + +-func withRemappedSnapshotBase(id string, i Image, uid, gid uint32, readonly bool) NewContainerOpts { ++func withRemappedSnapshotBase(id string, i Image, idmap idtools.IdentityMapping, readonly bool) NewContainerOpts { + return func(ctx context.Context, client *Client, c *containers.Container) error { + diffIDs, err := i.(*image).i.RootFS(ctx, client.ContentStore(), client.platform) + if err != nil { + return err + } + +- var ( +- parent = identity.ChainID(diffIDs).String() +- usernsID = fmt.Sprintf("%s-%d-%d", parent, uid, gid) +- ) ++ parent := identity.ChainID(diffIDs).String() ++ rootMap := idmap.RootPair() ++ usernsID := fmt.Sprintf("%s-%d-%d", parent, rootMap.UID, rootMap.GID) ++ + c.Snapshotter, err = client.resolveSnapshotterName(ctx, c.Snapshotter) + if err != nil { + return err +@@ -74,7 +113,7 @@ func withRemappedSnapshotBase(id string, i Image, uid, gid uint32, readonly bool + if err != nil { + return err + } +- if err := remapRootFS(ctx, mounts, uid, gid); err != nil { ++ if err := remapRootFS(ctx, mounts, idmap); err != nil { + snapshotter.Remove(ctx, usernsID) + return err + } +@@ -95,22 +134,23 @@ func withRemappedSnapshotBase(id string, i Image, uid, gid uint32, readonly bool + } + } + +-func remapRootFS(ctx context.Context, mounts []mount.Mount, uid, gid uint32) error { ++func remapRootFS(ctx context.Context, mounts []mount.Mount, idmap idtools.IdentityMapping) error { + return mount.WithTempMount(ctx, mounts, func(root string) error { +- return filepath.Walk(root, incrementFS(root, uid, gid)) ++ return filepath.Walk(root, chown(root, idmap)) + }) + } + +-func incrementFS(root string, uidInc, gidInc uint32) filepath.WalkFunc { ++func chown(root string, idmap idtools.IdentityMapping) filepath.WalkFunc { + return func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } +- var ( +- stat = info.Sys().(*syscall.Stat_t) +- u, g = int(stat.Uid + uidInc), int(stat.Gid + gidInc) +- ) ++ stat := info.Sys().(*syscall.Stat_t) ++ h, cerr := idmap.ToHost(idtools.Identity{UID: int(stat.Uid), GID: int(stat.Gid)}) ++ if cerr != nil { ++ return cerr ++ } + // be sure the lchown the path as to not de-reference the symlink to a host file +- return os.Lchown(path, u, g) ++ return os.Lchown(path, h.UID, h.GID) + } + } +diff --git a/pkg/idtools/idtools.go b/pkg/idtools/idtools.go +new file mode 100644 +index 000000000..79d9d0957 +--- /dev/null ++++ b/pkg/idtools/idtools.go +@@ -0,0 +1,254 @@ ++//nolint:unused ++package idtools ++ ++import ( ++ "bufio" ++ "fmt" ++ "os" ++ "strconv" ++ "strings" ++ ++ "github.com/opencontainers/runtime-spec/specs-go" ++) ++ ++// IDMap contains a single entry for user namespace range remapping. An array ++// of IDMap entries represents the structure that will be provided to the Linux ++// kernel for creating a user namespace. 
++type IDMap struct { ++ ContainerID int `json:"containerID"` ++ HostID int `json:"hostID"` ++ Size int `json:"size"` ++} ++ ++type subIDRange struct { ++ Start int ++ Length int ++} ++ ++type ranges []subIDRange ++ ++func (e ranges) Len() int { return len(e) } ++func (e ranges) Swap(i, j int) { e[i], e[j] = e[j], e[i] } ++func (e ranges) Less(i, j int) bool { return e[i].Start < e[j].Start } ++ ++const ( ++ subuidFileName = "/etc/subuid" ++ subgidFileName = "/etc/subgid" ++) ++ ++// MkdirAllAndChown creates a directory (include any along the path) and then modifies ++// ownership to the requested uid/gid. If the directory already exists, this ++// function will still change ownership and permissions. ++func MkdirAllAndChown(path string, mode os.FileMode, owner Identity) error { ++ return mkdirAs(path, mode, owner, true, true) ++} ++ ++// MkdirAndChown creates a directory and then modifies ownership to the requested uid/gid. ++// If the directory already exists, this function still changes ownership and permissions. ++// Note that unlike os.Mkdir(), this function does not return IsExist error ++// in case path already exists. ++func MkdirAndChown(path string, mode os.FileMode, owner Identity) error { ++ return mkdirAs(path, mode, owner, false, true) ++} ++ ++// MkdirAllAndChownNew creates a directory (include any along the path) and then modifies ++// ownership ONLY of newly created directories to the requested uid/gid. If the ++// directories along the path exist, no change of ownership or permissions will be performed ++func MkdirAllAndChownNew(path string, mode os.FileMode, owner Identity) error { ++ return mkdirAs(path, mode, owner, true, false) ++} ++ ++// GetRootUIDGID retrieves the remapped root uid/gid pair from the set of maps. ++// If the maps are empty, then the root uid/gid will default to "real" 0/0 ++func GetRootUIDGID(uidMap, gidMap []IDMap) (int, int, error) { ++ uid, err := toHost(0, uidMap) ++ if err != nil { ++ return -1, -1, err ++ } ++ gid, err := toHost(0, gidMap) ++ if err != nil { ++ return -1, -1, err ++ } ++ return uid, gid, nil ++} ++ ++// toContainer takes an id mapping, and uses it to translate a ++// host ID to the remapped ID. If no map is provided, then the translation ++// assumes a 1-to-1 mapping and returns the passed in id ++func toContainer(hostID int, idMap []IDMap) (int, error) { ++ if idMap == nil { ++ return hostID, nil ++ } ++ for _, m := range idMap { ++ if (hostID >= m.HostID) && (hostID <= (m.HostID + m.Size - 1)) { ++ contID := m.ContainerID + (hostID - m.HostID) ++ return contID, nil ++ } ++ } ++ return -1, fmt.Errorf("Host ID %d cannot be mapped to a container ID", hostID) ++} ++ ++// toHost takes an id mapping and a remapped ID, and translates the ++// ID to the mapped host ID. If no map is provided, then the translation ++// assumes a 1-to-1 mapping and returns the passed in id # ++func toHost(contID int, idMap []IDMap) (int, error) { ++ if idMap == nil { ++ return contID, nil ++ } ++ for _, m := range idMap { ++ if (contID >= m.ContainerID) && (contID <= (m.ContainerID + m.Size - 1)) { ++ hostID := m.HostID + (contID - m.ContainerID) ++ return hostID, nil ++ } ++ } ++ return -1, fmt.Errorf("Container ID %d cannot be mapped to a host ID", contID) ++} ++ ++// Identity is either a UID and GID pair or a SID (but not both) ++type Identity struct { ++ UID int ++ GID int ++ SID string ++} ++ ++// Chown changes the numeric uid and gid of the named file to id.UID and id.GID. 
++func (id Identity) Chown(name string) error { ++ return os.Chown(name, id.UID, id.GID) ++} ++ ++// IdentityMapping contains a mappings of UIDs and GIDs. ++// The zero value represents an empty mapping. ++type IdentityMapping struct { ++ UIDMaps []IDMap `json:"UIDMaps"` ++ GIDMaps []IDMap `json:"GIDMaps"` ++} ++ ++// RootPair returns a uid and gid pair for the root user. The error is ignored ++// because a root user always exists, and the defaults are correct when the uid ++// and gid maps are empty. ++func (i IdentityMapping) RootPair() Identity { ++ uid, gid, _ := GetRootUIDGID(i.UIDMaps, i.GIDMaps) ++ return Identity{UID: uid, GID: gid} ++} ++ ++// ToHost returns the host UID and GID for the container uid, gid. ++// Remapping is only performed if the ids aren't already the remapped root ids ++func (i IdentityMapping) ToHost(pair Identity) (Identity, error) { ++ var err error ++ target := i.RootPair() ++ ++ if pair.UID != target.UID { ++ target.UID, err = toHost(pair.UID, i.UIDMaps) ++ if err != nil { ++ return target, err ++ } ++ } ++ ++ if pair.GID != target.GID { ++ target.GID, err = toHost(pair.GID, i.GIDMaps) ++ } ++ return target, err ++} ++ ++// ToContainer returns the container UID and GID for the host uid and gid ++func (i IdentityMapping) ToContainer(pair Identity) (int, int, error) { ++ uid, err := toContainer(pair.UID, i.UIDMaps) ++ if err != nil { ++ return -1, -1, err ++ } ++ gid, err := toContainer(pair.GID, i.GIDMaps) ++ return uid, gid, err ++} ++ ++// Empty returns true if there are no id mappings ++func (i IdentityMapping) Empty() bool { ++ return len(i.UIDMaps) == 0 && len(i.GIDMaps) == 0 ++} ++ ++func (i IdentityMapping) ToSpec() (uidSpecs []specs.LinuxIDMapping, gidSpecs []specs.LinuxIDMapping) { ++ for _, m := range i.UIDMaps { ++ uidSpecs = append(uidSpecs, ++ specs.LinuxIDMapping{ ++ ContainerID: uint32(m.ContainerID), ++ HostID: uint32(m.HostID), ++ Size: uint32(m.Size), ++ }, ++ ) ++ } ++ for _, m := range i.GIDMaps { ++ gidSpecs = append(gidSpecs, ++ specs.LinuxIDMapping{ ++ ContainerID: uint32(m.ContainerID), ++ HostID: uint32(m.HostID), ++ Size: uint32(m.Size), ++ }, ++ ) ++ } ++ return ++} ++ ++func createIDMap(subidRanges ranges) []IDMap { ++ idMap := []IDMap{} ++ ++ containerID := 0 ++ for _, idrange := range subidRanges { ++ idMap = append(idMap, IDMap{ ++ ContainerID: containerID, ++ HostID: idrange.Start, ++ Size: idrange.Length, ++ }) ++ containerID = containerID + idrange.Length ++ } ++ return idMap ++} ++ ++func parseSubuid(username string) (ranges, error) { ++ return parseSubidFile(subuidFileName, username) ++} ++ ++func parseSubgid(username string) (ranges, error) { ++ return parseSubidFile(subgidFileName, username) ++} ++ ++// parseSubidFile will read the appropriate file (/etc/subuid or /etc/subgid) ++// and return all found ranges for a specified username. 
If the special value ++// "ALL" is supplied for username, then all ranges in the file will be returned ++func parseSubidFile(path, username string) (ranges, error) { ++ var rangeList ranges ++ ++ subidFile, err := os.Open(path) ++ if err != nil { ++ return rangeList, err ++ } ++ defer subidFile.Close() ++ ++ s := bufio.NewScanner(subidFile) ++ for s.Scan() { ++ text := strings.TrimSpace(s.Text()) ++ if text == "" || strings.HasPrefix(text, "#") { ++ continue ++ } ++ parts := strings.Split(text, ":") ++ if len(parts) != 3 { ++ return rangeList, fmt.Errorf("Cannot parse subuid/gid information: Format not correct for %s file", path) ++ } ++ if parts[0] == username || username == "ALL" { ++ startid, err := strconv.Atoi(parts[1]) ++ if err != nil { ++ return rangeList, fmt.Errorf("String to int conversion failed during subuid/gid parsing of %s: %v", path, err) ++ } ++ length, err := strconv.Atoi(parts[2]) ++ if err != nil { ++ return rangeList, fmt.Errorf("String to int conversion failed during subuid/gid parsing of %s: %v", path, err) ++ } ++ rangeList = append(rangeList, subIDRange{startid, length}) ++ } ++ } ++ ++ return rangeList, s.Err() ++} ++ ++// CurrentIdentity returns the identity of the current process ++func CurrentIdentity() Identity { ++ return Identity{UID: os.Getuid(), GID: os.Getegid()} ++} +diff --git a/pkg/idtools/idtools_unix.go b/pkg/idtools/idtools_unix.go +new file mode 100644 +index 000000000..794bf7119 +--- /dev/null ++++ b/pkg/idtools/idtools_unix.go +@@ -0,0 +1,285 @@ ++//go:build !windows ++// +build !windows ++ ++package idtools ++ ++import ( ++ "bytes" ++ "fmt" ++ "io" ++ "os" ++ "os/exec" ++ "path/filepath" ++ "strconv" ++ "sync" ++ "syscall" ++ ++ "github.com/moby/sys/user" ++) ++ ++var ( ++ entOnce sync.Once ++ getentCmd string ++) ++ ++func mkdirAs(path string, mode os.FileMode, owner Identity, mkAll, chownExisting bool) error { ++ path, err := filepath.Abs(path) ++ if err != nil { ++ return err ++ } ++ ++ stat, err := os.Stat(path) ++ if err == nil { ++ if !stat.IsDir() { ++ return &os.PathError{Op: "mkdir", Path: path, Err: syscall.ENOTDIR} ++ } ++ if !chownExisting { ++ return nil ++ } ++ ++ // short-circuit -- we were called with an existing directory and chown was requested ++ return setPermissions(path, mode, owner, stat) ++ } ++ ++ // make an array containing the original path asked for, plus (for mkAll == true) ++ // all path components leading up to the complete path that don't exist before we MkdirAll ++ // so that we can chown all of them properly at the end. 
If chownExisting is false, we won't ++ // chown the full directory path if it exists ++ var paths []string ++ if os.IsNotExist(err) { ++ paths = []string{path} ++ } ++ ++ if mkAll { ++ // walk back to "/" looking for directories which do not exist ++ // and add them to the paths array for chown after creation ++ dirPath := path ++ for { ++ dirPath = filepath.Dir(dirPath) ++ if dirPath == "/" { ++ break ++ } ++ if _, err = os.Stat(dirPath); err != nil && os.IsNotExist(err) { ++ paths = append(paths, dirPath) ++ } ++ } ++ if err = os.MkdirAll(path, mode); err != nil { ++ return err ++ } ++ } else if err = os.Mkdir(path, mode); err != nil { ++ return err ++ } ++ // even if it existed, we will chown the requested path + any subpaths that ++ // didn't exist when we called MkdirAll ++ for _, pathComponent := range paths { ++ if err = setPermissions(pathComponent, mode, owner, nil); err != nil { ++ return err ++ } ++ } ++ return nil ++} ++ ++// LookupUser uses traditional local system files lookup (from libcontainer/user) on a username, ++// followed by a call to `getent` for supporting host configured non-files passwd and group dbs ++func LookupUser(name string) (user.User, error) { ++ // first try a local system files lookup using existing capabilities ++ usr, err := user.LookupUser(name) ++ if err == nil { ++ return usr, nil ++ } ++ // local files lookup failed; attempt to call `getent` to query configured passwd dbs ++ usr, err = getentUser(name) ++ if err != nil { ++ return user.User{}, err ++ } ++ return usr, nil ++} ++ ++// LookupUID uses traditional local system files lookup (from libcontainer/user) on a uid, ++// followed by a call to `getent` for supporting host configured non-files passwd and group dbs ++func LookupUID(uid int) (user.User, error) { ++ // first try a local system files lookup using existing capabilities ++ usr, err := user.LookupUid(uid) ++ if err == nil { ++ return usr, nil ++ } ++ // local files lookup failed; attempt to call `getent` to query configured passwd dbs ++ return getentUser(strconv.Itoa(uid)) ++} ++ ++func getentUser(name string) (user.User, error) { ++ reader, err := callGetent("passwd", name) ++ if err != nil { ++ return user.User{}, err ++ } ++ users, err := user.ParsePasswd(reader) ++ if err != nil { ++ return user.User{}, err ++ } ++ if len(users) == 0 { ++ return user.User{}, fmt.Errorf("getent failed to find passwd entry for %q", name) ++ } ++ return users[0], nil ++} ++ ++// LookupGroup uses traditional local system files lookup (from libcontainer/user) on a group name, ++// followed by a call to `getent` for supporting host configured non-files passwd and group dbs ++func LookupGroup(name string) (user.Group, error) { ++ // first try a local system files lookup using existing capabilities ++ group, err := user.LookupGroup(name) ++ if err == nil { ++ return group, nil ++ } ++ // local files lookup failed; attempt to call `getent` to query configured group dbs ++ return getentGroup(name) ++} ++ ++// LookupGID uses traditional local system files lookup (from libcontainer/user) on a group ID, ++// followed by a call to `getent` for supporting host configured non-files passwd and group dbs ++func LookupGID(gid int) (user.Group, error) { ++ // first try a local system files lookup using existing capabilities ++ group, err := user.LookupGid(gid) ++ if err == nil { ++ return group, nil ++ } ++ // local files lookup failed; attempt to call `getent` to query configured group dbs ++ return getentGroup(strconv.Itoa(gid)) ++} ++ ++func getentGroup(name string) 
(user.Group, error) { ++ reader, err := callGetent("group", name) ++ if err != nil { ++ return user.Group{}, err ++ } ++ groups, err := user.ParseGroup(reader) ++ if err != nil { ++ return user.Group{}, err ++ } ++ if len(groups) == 0 { ++ return user.Group{}, fmt.Errorf("getent failed to find groups entry for %q", name) ++ } ++ return groups[0], nil ++} ++ ++func callGetent(database, key string) (io.Reader, error) { ++ entOnce.Do(func() { getentCmd, _ = resolveBinary("getent") }) ++ // if no `getent` command on host, can't do anything else ++ if getentCmd == "" { ++ return nil, fmt.Errorf("unable to find getent command") ++ } ++ command := exec.Command(getentCmd, database, key) ++ // we run getent within container filesystem, but without /dev so /dev/null is not available for exec to mock stdin ++ command.Stdin = io.NopCloser(bytes.NewReader(nil)) ++ out, err := command.CombinedOutput() ++ if err != nil { ++ exitCode, errC := getExitCode(err) ++ if errC != nil { ++ return nil, err ++ } ++ switch exitCode { ++ case 1: ++ return nil, fmt.Errorf("getent reported invalid parameters/database unknown") ++ case 2: ++ return nil, fmt.Errorf("getent unable to find entry %q in %s database", key, database) ++ case 3: ++ return nil, fmt.Errorf("getent database doesn't support enumeration") ++ default: ++ return nil, err ++ } ++ } ++ return bytes.NewReader(out), nil ++} ++ ++// getExitCode returns the ExitStatus of the specified error if its type is ++// exec.ExitError, returns 0 and an error otherwise. ++func getExitCode(err error) (int, error) { ++ exitCode := 0 ++ if exiterr, ok := err.(*exec.ExitError); ok { ++ if procExit, ok := exiterr.Sys().(syscall.WaitStatus); ok { ++ return procExit.ExitStatus(), nil ++ } ++ } ++ return exitCode, fmt.Errorf("failed to get exit code") ++} ++ ++// setPermissions performs a chown/chmod only if the uid/gid don't match what's requested ++// Normally a Chown is a no-op if uid/gid match, but in some cases this can still cause an error, e.g. if the ++// dir is on an NFS share, so don't call chown unless we absolutely must. ++// Likewise for setting permissions. 
++func setPermissions(p string, mode os.FileMode, owner Identity, stat os.FileInfo) error { ++ if stat == nil { ++ var err error ++ stat, err = os.Stat(p) ++ if err != nil { ++ return err ++ } ++ } ++ if stat.Mode().Perm() != mode.Perm() { ++ if err := os.Chmod(p, mode.Perm()); err != nil { ++ return err ++ } ++ } ++ ssi := stat.Sys().(*syscall.Stat_t) ++ if ssi.Uid == uint32(owner.UID) && ssi.Gid == uint32(owner.GID) { ++ return nil ++ } ++ return os.Chown(p, owner.UID, owner.GID) ++} ++ ++// LoadIdentityMapping takes a requested username and ++// using the data from /etc/sub{uid,gid} ranges, creates the ++// proper uid and gid remapping ranges for that user/group pair ++func LoadIdentityMapping(name string) (IdentityMapping, error) { ++ usr, err := LookupUser(name) ++ if err != nil { ++ return IdentityMapping{}, fmt.Errorf("could not get user for username %s: %v", name, err) ++ } ++ ++ subuidRanges, err := lookupSubUIDRanges(usr) ++ if err != nil { ++ return IdentityMapping{}, err ++ } ++ subgidRanges, err := lookupSubGIDRanges(usr) ++ if err != nil { ++ return IdentityMapping{}, err ++ } ++ ++ return IdentityMapping{ ++ UIDMaps: subuidRanges, ++ GIDMaps: subgidRanges, ++ }, nil ++} ++ ++func lookupSubUIDRanges(usr user.User) ([]IDMap, error) { ++ rangeList, err := parseSubuid(strconv.Itoa(usr.Uid)) ++ if err != nil { ++ return nil, err ++ } ++ if len(rangeList) == 0 { ++ rangeList, err = parseSubuid(usr.Name) ++ if err != nil { ++ return nil, err ++ } ++ } ++ if len(rangeList) == 0 { ++ return nil, fmt.Errorf("no subuid ranges found for user %q", usr.Name) ++ } ++ return createIDMap(rangeList), nil ++} ++ ++func lookupSubGIDRanges(usr user.User) ([]IDMap, error) { ++ rangeList, err := parseSubgid(strconv.Itoa(usr.Uid)) ++ if err != nil { ++ return nil, err ++ } ++ if len(rangeList) == 0 { ++ rangeList, err = parseSubgid(usr.Name) ++ if err != nil { ++ return nil, err ++ } ++ } ++ if len(rangeList) == 0 { ++ return nil, fmt.Errorf("no subgid ranges found for user %q", usr.Name) ++ } ++ return createIDMap(rangeList), nil ++} +diff --git a/pkg/idtools/idtools_windows.go b/pkg/idtools/idtools_windows.go +new file mode 100644 +index 000000000..5cf69deba +--- /dev/null ++++ b/pkg/idtools/idtools_windows.go +@@ -0,0 +1,23 @@ ++package idtools ++ ++import ( ++ "errors" ++ "os" ++) ++ ++const ( ++ SeTakeOwnershipPrivilege = "SeTakeOwnershipPrivilege" ++) ++ ++const ( ++ ContainerAdministratorSidString = "S-1-5-93-2-1" ++ ContainerUserSidString = "S-1-5-93-2-2" ++) ++ ++// This is currently a wrapper around MkdirAll, however, since currently ++// permissions aren't set through this path, the identity isn't utilized. ++// Ownership is handled elsewhere, but in the future could be support here ++// too. 
++func mkdirAs(_ string, _ os.FileMode, _ Identity, _, _ bool) error { ++ return errors.New("Not implemented") ++} +diff --git a/pkg/idtools/usergroupadd_linux.go b/pkg/idtools/usergroupadd_linux.go +new file mode 100644 +index 000000000..36647563e +--- /dev/null ++++ b/pkg/idtools/usergroupadd_linux.go +@@ -0,0 +1,166 @@ ++package idtools ++ ++import ( ++ "fmt" ++ "os/exec" ++ "regexp" ++ "sort" ++ "strconv" ++ "strings" ++ "sync" ++) ++ ++// add a user and/or group to Linux /etc/passwd, /etc/group using standard ++// Linux distribution commands: ++// adduser --system --shell /bin/false --disabled-login --disabled-password --no-create-home --group ++// useradd -r -s /bin/false ++ ++var ( ++ once sync.Once ++ userCommand string ++ idOutRegexp = regexp.MustCompile(`uid=([0-9]+).*gid=([0-9]+)`) ++) ++ ++const ( ++ // default length for a UID/GID subordinate range ++ defaultRangeLen = 65536 ++ defaultRangeStart = 100000 ++) ++ ++// AddNamespaceRangesUser takes a username and uses the standard system ++// utility to create a system user/group pair used to hold the ++// /etc/sub{uid,gid} ranges which will be used for user namespace ++// mapping ranges in containers. ++func AddNamespaceRangesUser(name string) (int, int, error) { ++ if err := addUser(name); err != nil { ++ return -1, -1, fmt.Errorf("error adding user %q: %v", name, err) ++ } ++ ++ // Query the system for the created uid and gid pair ++ out, err := exec.Command("id", name).CombinedOutput() ++ if err != nil { ++ return -1, -1, fmt.Errorf("error trying to find uid/gid for new user %q: %v", name, err) ++ } ++ matches := idOutRegexp.FindStringSubmatch(strings.TrimSpace(string(out))) ++ if len(matches) != 3 { ++ return -1, -1, fmt.Errorf("can't find uid, gid from `id` output: %q", string(out)) ++ } ++ uid, err := strconv.Atoi(matches[1]) ++ if err != nil { ++ return -1, -1, fmt.Errorf("can't convert found uid (%s) to int: %v", matches[1], err) ++ } ++ gid, err := strconv.Atoi(matches[2]) ++ if err != nil { ++ return -1, -1, fmt.Errorf("Can't convert found gid (%s) to int: %v", matches[2], err) ++ } ++ ++ // Now we need to create the subuid/subgid ranges for our new user/group (system users ++ // do not get auto-created ranges in subuid/subgid) ++ ++ if err := createSubordinateRanges(name); err != nil { ++ return -1, -1, fmt.Errorf("couldn't create subordinate ID ranges: %v", err) ++ } ++ return uid, gid, nil ++} ++ ++func addUser(name string) error { ++ once.Do(func() { ++ // set up which commands are used for adding users/groups dependent on distro ++ if _, err := resolveBinary("adduser"); err == nil { ++ userCommand = "adduser" ++ } else if _, err := resolveBinary("useradd"); err == nil { ++ userCommand = "useradd" ++ } ++ }) ++ var args []string ++ switch userCommand { ++ case "adduser": ++ args = []string{"--system", "--shell", "/bin/false", "--no-create-home", "--disabled-login", "--disabled-password", "--group", name} ++ case "useradd": ++ args = []string{"-r", "-s", "/bin/false", name} ++ default: ++ return fmt.Errorf("cannot add user; no useradd/adduser binary found") ++ } ++ ++ if out, err := exec.Command(userCommand, args...).CombinedOutput(); err != nil { ++ return fmt.Errorf("failed to add user with error: %v; output: %q", err, string(out)) ++ } ++ return nil ++} ++ ++func createSubordinateRanges(name string) error { ++ // first, we should verify that ranges weren't automatically created ++ // by the distro tooling ++ ranges, err := parseSubuid(name) ++ if err != nil { ++ return fmt.Errorf("error while looking for subuid 
ranges for user %q: %v", name, err) ++ } ++ if len(ranges) == 0 { ++ // no UID ranges; let's create one ++ startID, err := findNextUIDRange() ++ if err != nil { ++ return fmt.Errorf("can't find available subuid range: %v", err) ++ } ++ idRange := fmt.Sprintf("%d-%d", startID, startID+defaultRangeLen-1) ++ out, err := exec.Command("usermod", "-v", idRange, name).CombinedOutput() ++ if err != nil { ++ return fmt.Errorf("unable to add subuid range to user: %q; output: %s, err: %v", name, out, err) ++ } ++ } ++ ++ ranges, err = parseSubgid(name) ++ if err != nil { ++ return fmt.Errorf("error while looking for subgid ranges for user %q: %v", name, err) ++ } ++ if len(ranges) == 0 { ++ // no GID ranges; let's create one ++ startID, err := findNextGIDRange() ++ if err != nil { ++ return fmt.Errorf("can't find available subgid range: %v", err) ++ } ++ idRange := fmt.Sprintf("%d-%d", startID, startID+defaultRangeLen-1) ++ out, err := exec.Command("usermod", "-w", idRange, name).CombinedOutput() ++ if err != nil { ++ return fmt.Errorf("unable to add subgid range to user: %q; output: %s, err: %v", name, out, err) ++ } ++ } ++ return nil ++} ++ ++func findNextUIDRange() (int, error) { ++ ranges, err := parseSubuid("ALL") ++ if err != nil { ++ return -1, fmt.Errorf("couldn't parse all ranges in /etc/subuid file: %v", err) ++ } ++ sort.Sort(ranges) ++ return findNextRangeStart(ranges) ++} ++ ++func findNextGIDRange() (int, error) { ++ ranges, err := parseSubgid("ALL") ++ if err != nil { ++ return -1, fmt.Errorf("couldn't parse all ranges in /etc/subgid file: %v", err) ++ } ++ sort.Sort(ranges) ++ return findNextRangeStart(ranges) ++} ++ ++func findNextRangeStart(rangeList ranges) (int, error) { ++ startID := defaultRangeStart ++ for _, arange := range rangeList { ++ if wouldOverlap(arange, startID) { ++ startID = arange.Start + arange.Length ++ } ++ } ++ return startID, nil ++} ++ ++func wouldOverlap(arange subIDRange, ID int) bool { ++ low := ID ++ high := ID + defaultRangeLen ++ if (low >= arange.Start && low <= arange.Start+arange.Length) || ++ (high <= arange.Start+arange.Length && high >= arange.Start) { ++ return true ++ } ++ return false ++} +diff --git a/pkg/idtools/usergroupadd_unsupported.go b/pkg/idtools/usergroupadd_unsupported.go +new file mode 100644 +index 000000000..6b3c5fce3 +--- /dev/null ++++ b/pkg/idtools/usergroupadd_unsupported.go +@@ -0,0 +1,13 @@ ++//go:build !linux ++// +build !linux ++ ++package idtools ++ ++import "fmt" ++ ++// AddNamespaceRangesUser takes a name and finds an unused uid, gid pair ++// and calls the appropriate helper function to add the group and then ++// the user to the group in /etc/group and /etc/passwd respectively. 
++func AddNamespaceRangesUser(name string) (int, int, error) { ++ return -1, -1, fmt.Errorf("No support for adding users or groups on this OS") ++} +diff --git a/pkg/idtools/utils_unix.go b/pkg/idtools/utils_unix.go +new file mode 100644 +index 000000000..744079501 +--- /dev/null ++++ b/pkg/idtools/utils_unix.go +@@ -0,0 +1,27 @@ ++//go:build !windows ++// +build !windows ++ ++package idtools ++ ++import ( ++ "fmt" ++ "os/exec" ++ "path/filepath" ++) ++ ++func resolveBinary(binname string) (string, error) { ++ binaryPath, err := exec.LookPath(binname) ++ if err != nil { ++ return "", err ++ } ++ resolvedPath, err := filepath.EvalSymlinks(binaryPath) ++ if err != nil { ++ return "", err ++ } ++ // only return no error if the final resolved binary basename ++ // matches what was searched for ++ if filepath.Base(resolvedPath) == binname { ++ return resolvedPath, nil ++ } ++ return "", fmt.Errorf("Binary %q does not resolve to a binary of that name in $PATH (%q)", binname, resolvedPath) ++} +diff --git a/snapshots/snapshotter.go b/snapshots/snapshotter.go +index 5fa5aa530..b8fd5d81b 100644 +--- a/snapshots/snapshotter.go ++++ b/snapshots/snapshotter.go +@@ -38,6 +38,8 @@ const ( + LabelSnapshotUIDMapping = "containerd.io/snapshot/uidmapping" + // LabelSnapshotGIDMapping is the label used for GID mappings + LabelSnapshotGIDMapping = "containerd.io/snapshot/gidmapping" ++ // LabelSnapshotUserNSMapping is the label used for user ns mappings ++ LabelSnapshotUserNSMapping = "containerd.io/snapshot/usernsmapping" + ) + + // Kind identifies the kind of snapshot. +diff --git a/snapshotter_opts_unix.go b/snapshotter_opts_unix.go +index 4739e192f..a98359e64 100644 +--- a/snapshotter_opts_unix.go ++++ b/snapshotter_opts_unix.go +@@ -20,8 +20,10 @@ package containerd + + import ( + "context" ++ "encoding/json" + "fmt" + ++ "github.com/containerd/containerd/pkg/idtools" + "github.com/containerd/containerd/snapshots" + ) + +@@ -38,73 +40,25 @@ func WithRemapperLabels(ctrUID, hostUID, ctrGID, hostGID, length uint32) snapsho + snapshots.LabelSnapshotGIDMapping: fmt.Sprintf("%d:%d:%d", ctrGID, hostGID, length)}) + } + +-func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName string, snapshotter snapshots.Snapshotter, parent string, opts ...snapshots.Opt) (string, error) { +- capabs, err := client.GetSnapshotterCapabilities(ctx, snapshotterName) ++func WithMultiRemapperLabels(idmap idtools.IdentityMapping) snapshots.Opt { ++ uidMap, err := json.Marshal(idmap.UIDMaps) + if err != nil { +- return "", err +- } +- +- for _, capab := range capabs { +- if capab == capabRemapIDs { +- // Snapshotter supports ID remapping, we don't need to do anything. 
+- return parent, nil +- } +- } +- +- var local snapshots.Info +- for _, opt := range opts { +- opt(&local) +- } +- +- needsRemap := false +- var uidMap, gidMap string +- +- if value, ok := local.Labels[snapshots.LabelSnapshotUIDMapping]; ok { +- needsRemap = true +- uidMap = value +- } +- if value, ok := local.Labels[snapshots.LabelSnapshotGIDMapping]; ok { +- needsRemap = true +- gidMap = value ++ return snapshots.WithLabels(map[string]string{}) + } + +- if !needsRemap { +- return parent, nil +- } +- +- var ctrUID, hostUID, length uint32 +- _, err = fmt.Sscanf(uidMap, "%d:%d:%d", &ctrUID, &hostUID, &length) ++ gidMap, err := json.Marshal(idmap.GIDMaps) + if err != nil { +- return "", fmt.Errorf("uidMap unparsable: %w", err) ++ return snapshots.WithLabels(map[string]string{}) + } + +- var ctrGID, hostGID, lengthGID uint32 +- _, err = fmt.Sscanf(gidMap, "%d:%d:%d", &ctrGID, &hostGID, &lengthGID) +- if err != nil { +- return "", fmt.Errorf("gidMap unparsable: %w", err) +- } +- +- if ctrUID != 0 || ctrGID != 0 { +- return "", fmt.Errorf("Container UID/GID of 0 only supported currently (%d/%d)", ctrUID, ctrGID) +- } ++ return snapshots.WithLabels(map[string]string{ ++ snapshots.LabelSnapshotUIDMapping: string(uidMap), ++ snapshots.LabelSnapshotGIDMapping: string(gidMap), ++ }) + +- // TODO(dgl): length isn't taken into account for the intermediate snapshot id. +- usernsID := fmt.Sprintf("%s-%d-%d", parent, hostUID, hostGID) +- if _, err := snapshotter.Stat(ctx, usernsID); err == nil { +- return usernsID, nil +- } +- mounts, err := snapshotter.Prepare(ctx, usernsID+"-remap", parent) +- if err != nil { +- return "", err +- } +- // TODO(dgl): length isn't taken into account here yet either. +- if err := remapRootFS(ctx, mounts, hostUID, hostGID); err != nil { +- snapshotter.Remove(ctx, usernsID+"-remap") +- return "", err +- } +- if err := snapshotter.Commit(ctx, usernsID, usernsID+"-remap"); err != nil { +- return "", err +- } ++} + +- return usernsID, nil ++func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName string, snapshotter snapshots.Snapshotter, parent string, opts ...snapshots.Opt) (string, error) { ++ // Snapshotter supports ID remapping, we don't need to do anything. ++ return parent, nil + } + \ No newline at end of file diff --git a/integration/run_test.go b/integration/run_test.go index 260eb7a56..8361491d2 100644 --- a/integration/run_test.go +++ b/integration/run_test.go @@ -37,6 +37,7 @@ import ( "bytes" "fmt" "os" + "path/filepath" "regexp" "strconv" "strings" @@ -520,3 +521,132 @@ func TestRunInNamespace(t *testing.T) { } } } + +func TestRunWithIdMap(t *testing.T) { + tests := []struct { + name string + imageName string + indexBuilderFn func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string + remapUser string + remapGroup string + remapUID string + remapGid string + checkLocation string + expectedOwner string + }{ + { + name: "with only FUSE layers", + imageName: rabbitmqImage, + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + opts = append(opts, withMinLayerSize(0)) + return buildIndex(sh, src, opts...) + }, + remapUser: "dummy-user", + remapGroup: "dummy-group", + remapUID: "123456", + remapGid: "123456", + checkLocation: "usr", + expectedOwner: "123456", + }, + { + name: "with mixed layers", + imageName: rabbitmqImage, + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + return buildIndex(sh, src, opts...) 
+ }, + remapUser: "dummy-user", + remapGroup: "dummy-group", + remapUID: "123456", + remapGid: "123456", + checkLocation: "usr", + expectedOwner: "123456", + }, + { + name: "with no SOCI index", + imageName: rabbitmqImage, + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + return "" + }, + remapUser: "dummy-user", + remapGroup: "dummy-group", + remapUID: "123456", + remapGid: "123456", + checkLocation: "usr", + expectedOwner: "123456", + }, + } + + baseSnapshotDir := "/var/lib/soci-snapshotter-grpc/snapshotter/snapshots" + baseRuntimeDir := "/run/containerd/io.containerd.runtime.v2.task/default" + testContainerName := "testidmap" + uidPath := "/etc/subuid" + gidPath := "/etc/subgid" + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + regConfig := newRegistryConfig() + sh, done := newShellWithRegistry(t, regConfig) + defer done() + + sh.X("groupadd", "-g", tt.remapGid, tt.remapGroup) + sh.X("useradd", "-u", tt.remapUID, "-g", tt.remapGid, "-m", tt.remapUser) + + subUIDFile := fmt.Sprintf("%s:%s:%s", tt.remapUser, tt.remapUID, "1000") + subGidFile := fmt.Sprintf("%s:%s:%s", tt.remapUser, tt.remapGid, "1000") + sh.Pipe(nil, shell.C("echo", subUIDFile), shell.C("tee", uidPath)) + sh.Pipe(nil, shell.C("echo", subGidFile), shell.C("tee", gidPath)) + + rebootContainerd(t, sh, "", getSnapshotterConfigToml(t, false)) + imageInfo := dockerhub(tt.imageName) + sh.X("nerdctl", "pull", "-q", tt.imageName) + + filenames, err := sh.OLog("ls", baseSnapshotDir) + if err != nil { + t.Fatalf("error listing files in %s", baseSnapshotDir) + } + + // Copy image, remove blobs, and re-pull with SOCI + copyImage(sh, dockerhub(tt.imageName), regConfig.mirror(tt.imageName)) + indexDigest := tt.indexBuilderFn(sh, regConfig.mirror(tt.imageName)) + if indexDigest != "" { + sh.X("soci", "push", "--user", regConfig.creds(), regConfig.mirror(tt.imageName).ref) + } + sh.X("rm", "-rf", filepath.Join(store.DefaultSociContentStorePath, "blobs", "sha256")) + + pullCmd := imagePullCmd + if indexDigest != "" { + pullCmd = append(pullCmd, "--soci-index-digest", indexDigest) + } + sh.X(append(pullCmd, regConfig.mirror(tt.imageName).ref)...) 
+ sh.X("ctr-with-idmapping", "run", "-d",
+ "--remap-labels",
+ "--userns-remap", tt.remapUser,
+ "--snapshotter", "soci",
+ imageInfo.ref, testContainerName, "sleep", "infinity",
+ )
+
+ newFilenames, err := sh.OLog("ls", baseSnapshotDir)
+ if err != nil {
+ t.Fatalf("error listing files in %s", baseSnapshotDir)
+ }
+
+ if len(filenames) == len(newFilenames) {
+ t.Fatalf("error: id-mapping failed")
+ }
+
+ fullCheckPath := filepath.Join(baseRuntimeDir, testContainerName, "rootfs", tt.checkLocation)
+ stat, err := sh.OLog("stat", fullCheckPath)
+ if err != nil {
+ t.Fatalf("error running stat on %s", fullCheckPath)
+ }
+
+ strStat := string(stat)
+ t.Log(strStat)
+ matchUID := fmt.Sprintf("Uid: (%s", tt.expectedOwner)
+ if !strings.Contains(strStat, matchUID) {
+ t.Fatalf("error: file %s did not have uid %s", tt.checkLocation, tt.expectedOwner)
+ }
+ })
+ }
+}
diff --git a/service/service.go b/service/service.go
index 167168e09..044828310 100644
--- a/service/service.go
+++ b/service/service.go
@@ -119,6 +119,9 @@ func NewSociSnapshotterService(ctx context.Context, root string, serviceCfg *con
 if serviceCfg.SnapshotterConfig.AllowInvalidMountsOnRestart {
 snOpts = append(snOpts, snbase.AllowInvalidMountsOnRestart)
 }
+ if serviceCfg.FSConfig.AllowIDMap {
+ snOpts = append(snOpts, snbase.AllowIDMap)
+ }
 
 snapshotter, err = snbase.NewSnapshotter(ctx, snapshotterRoot(root), fs, snOpts...)
 if err != nil {
diff --git a/snapshot/snapshot.go b/snapshot/snapshot.go
index f770b5b5b..d0e52ad95 100644
--- a/snapshot/snapshot.go
+++ b/snapshot/snapshot.go
@@ -44,6 +44,7 @@ import (
 
 commonmetrics "github.com/awslabs/soci-snapshotter/fs/metrics/common"
 "github.com/awslabs/soci-snapshotter/fs/source"
+ "github.com/awslabs/soci-snapshotter/idtools"
 "github.com/containerd/containerd/mount"
 ctdsnapshotters "github.com/containerd/containerd/pkg/snapshotters"
 "github.com/containerd/containerd/snapshots"
@@ -105,6 +106,8 @@ type FileSystem interface {
 Check(ctx context.Context, mountpoint string, labels map[string]string) error
 Unmount(ctx context.Context, mountpoint string) error
 MountLocal(ctx context.Context, mountpoint string, labels map[string]string, mounts []mount.Mount) error
+ IDMapMount(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error)
+ IDMapMountLocal(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error)
 }
 
 // SnapshotterConfig is used to configure the remote snapshotter instance
@@ -113,6 +116,7 @@ type SnapshotterConfig struct {
 // minLayerSize skips remote mounting of smaller layers
 minLayerSize int64
 allowInvalidMountsOnRestart bool
+ allowIDMap bool
 }
 
 // Opt is an option to configure the remote snapshotter
@@ -140,6 +144,11 @@ func AllowInvalidMountsOnRestart(config *SnapshotterConfig) error {
 return nil
 }
 
+func AllowIDMap(config *SnapshotterConfig) error {
+ config.allowIDMap = true
+ return nil
+}
+
 type snapshotter struct {
 root string
 ms *storage.MetaStore
@@ -150,6 +159,8 @@ type snapshotter struct {
 userxattr bool // whether to enable "userxattr" mount option
 minLayerSize int64 // minimum layer size for remote mounting
 allowInvalidMountsOnRestart bool
+ allowIDMap bool
+ idmapped map[string]interface{}
 }
 
 // NewSnapshotter returns a Snapshotter which can use unpacked remote layers
@@ -200,6 +211,11 @@ func NewSnapshotter(ctx context.Context, root string, targetFs FileSystem, opts 
 userxattr: userxattr,
 minLayerSize: config.minLayerSize,
 allowInvalidMountsOnRestart: 
config.allowInvalidMountsOnRestart, + allowIDMap: config.allowIDMap, + } + + if o.allowIDMap { + o.idmapped = make(map[string]interface{}) } if err := o.restoreRemoteSnapshot(ctx); err != nil { @@ -285,6 +301,51 @@ func (o *snapshotter) Usage(ctx context.Context, key string) (snapshots.Usage, e return usage, nil } +func (o *snapshotter) setupIDMap(ctx context.Context, s storage.Snapshot, parent string, labels map[string]string) error { + // load id-map if appropriate labels are present. + idmap, err := idtools.LoadIDMap(s.ID, labels) + if err != nil { + log.G(ctx).WithError(err).Error("failed to load id-map") + return err + } + + if !idmap.Empty() { + if !o.allowIDMap { + return errors.New("id-map labels found but config does not allow id-mapping") + } + parentSnapshot, err := o.Stat(ctx, parent) + if err != nil { + log.G(ctx).WithError(err).Error("failed to stat parent snapshot") + return err + } + + // If there is no SOCI index, you can safely mount from the root without copying over every single layer + if _, ok := parentSnapshot.Labels[source.HasSociIndexDigest]; !ok { + // Fallback to overlay + log.G(ctx).Debug("no SOCI index found, remapping from root") + mounts, err := o.mounts(ctx, s, parent) + if err != nil { + return err + } + + err = idtools.RemapRootFS(ctx, mounts, idmap) + if err != nil { + return err + } + } else { + o.idmapped[s.ID] = struct{}{} + err = o.createIDMapMounts(ctx, s, idmap) + if err != nil { + log.G(ctx).WithError(err).Error("failed to create id-mapped mounts") + return err + } + } + + log.G(ctx).Debug("id-mapping successful") + } + return nil +} + func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) { log.G(ctx).WithField("key", key).WithField("parent", parent).Debug("prepare") s, err := o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts) @@ -302,7 +363,13 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s } target, ok := base.Labels[targetSnapshotLabel] + // !ok means we are in an active snapshot if !ok { + // Setup id-mapped mounts if config allows. + // Any error here needs to stop the container from starting. + if err := o.setupIDMap(ctx, s, parent, base.Labels); err != nil { + return nil, err + } return o.mounts(ctx, s, parent) } @@ -319,7 +386,8 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s if !o.skipRemoteSnapshotPrepare(lCtx, base.Labels) { err := o.prepareRemoteSnapshot(lCtx, key, base.Labels) if err == nil { - base.Labels[remoteLabel] = remoteLabelVal // Mark this snapshot as remote + base.Labels[remoteLabel] = remoteLabelVal // Mark this snapshot as remote + base.Labels[source.HasSociIndexDigest] = "true" // Mark that this snapshot was loaded with a SOCI index err := o.commit(ctx, true, target, key, append(opts, snapshots.WithLabels(base.Labels))...) if err == nil || errdefs.IsAlreadyExists(err) { // count also AlreadyExists as "success" @@ -361,6 +429,7 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s log.G(ctx).WithField("layerDigest", base.Labels[ctdsnapshotters.TargetLayerDigestLabel]).Info("preparing snapshot as local snapshot") err = o.prepareLocalSnapshot(lCtx, key, base.Labels, mounts) if err == nil { + base.Labels[source.HasSociIndexDigest] = "true" // Mark that this snapshot was loaded with a SOCI index err := o.commit(ctx, false, target, key, append(opts, snapshots.WithLabels(base.Labels))...) 
if err == nil || errdefs.IsAlreadyExists(err) {
 // count also AlreadyExists as "success"
@@ -578,7 +647,18 @@ func (o *snapshotter) getCleanupDirectories(ctx context.Context, t storage.Trans
 cleanup := []string{}
 for _, d := range dirs {
 if !cleanupCommitted {
- if _, ok := ids[d]; ok {
+ // If the directory name is just a number (e.g. '2'),
+ // check that name against the set of active ids.
+ // If the directory name contains an underscore (e.g. '1_2'),
+ // check the suffix ('2') to determine whether the
+ // directory must be cleaned up.
+ cleanupID := d
+ temp := strings.Split(d, "_")
+ if len(temp) > 1 {
+ cleanupID = temp[1]
+ }
+
+ if _, ok := ids[cleanupID]; ok {
 continue
 }
 }
@@ -757,15 +837,16 @@ func (o *snapshotter) mounts(ctx context.Context, s storage.Snapshot, checkKey s
 }, nil
 }
 
- parentPaths := make([]string, len(s.ParentIDs))
- for i := range s.ParentIDs {
- parentPaths[i] = o.upperPath(s.ParentIDs[i])
+ parentPaths, err := o.getParentPaths(s)
+ if err != nil {
+ return nil, err
 }
 options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(parentPaths, ":")))
 
 if o.userxattr {
 options = append(options, "userxattr")
 }
+
 return []mount.Mount{
 {
 Type: "overlay",
@@ -773,7 +854,49 @@ func (o *snapshotter) mounts(ctx context.Context, s storage.Snapshot, checkKey s
 Options: options,
 },
 }, nil
+}
+
+func (o *snapshotter) getParentPaths(s storage.Snapshot) ([]string, error) {
+ parentPaths := make([]string, len(s.ParentIDs))
+
+ for i, id := range s.ParentIDs {
+ if _, ok := o.idmapped[s.ID]; ok {
+ id = fmt.Sprintf("%s_%s", id, s.ID)
+ }
+ parentPaths[i] = o.upperPath(id)
+ }
+
+ return parentPaths, nil
+}
+
+func (o *snapshotter) createIDMapMounts(ctx context.Context, s storage.Snapshot, idmap idtools.IDMap) error {
+ log.G(ctx).Debug("mapping ids")
+
+ for _, id := range s.ParentIDs {
+ err := o.createIDMapMount(ctx, o.upperPath(id), s.ID, idmap)
+ if err != nil {
+ return err
+ }
+ }
+ return idtools.RemapRoot(ctx, o.upperPath(s.ID), idmap)
+}
+
+func (o *snapshotter) createIDMapMount(ctx context.Context, path, id string, idmap idtools.IDMap) error {
+ // id is s.ID from the caller, the shortest unique identifier for each
+ // new container, so append it to the end of the new mountpoint
+ _, err := o.fs.IDMapMount(ctx, path, id, idmap)
+ if errdefs.IsNotFound(err) {
+ // Remote mount failed; fall back to a local id-mapped mount.
+
+ // Clean up the dirty snapshot folder. TODO: consider returning a cleanup func instead.
+ dirtyDir := fmt.Sprintf("%s_%s", filepath.Dir(path), id) + if err := os.RemoveAll(dirtyDir); err != nil { + return err + } + _, err = o.fs.IDMapMountLocal(ctx, path, id, idmap) + } + return err } // upperPath produces a file path like "{snapshotter.root}/snapshots/{id}/fs" diff --git a/snapshot/snapshot_test.go b/snapshot/snapshot_test.go index cc798a607..e03c23ab6 100644 --- a/snapshot/snapshot_test.go +++ b/snapshot/snapshot_test.go @@ -41,6 +41,7 @@ import ( "syscall" "testing" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/testutil" "github.com/containerd/containerd/snapshots" @@ -417,6 +418,14 @@ func (fs *bindFs) MountLocal(ctx context.Context, mountpoint string, labels map[ return nil } +func (fs *bindFs) IDMapMount(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) { + return mountpoint, nil +} + +func (fs *bindFs) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) { + return mountpoint, nil +} + func dummyFileSystem() FileSystem { return &dummyFs{} } type dummyFs struct{} @@ -437,6 +446,14 @@ func (fs *dummyFs) MountLocal(ctx context.Context, mountpoint string, labels map return fmt.Errorf("dummy") } +func (fs *dummyFs) IDMapMount(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) { + return "", fmt.Errorf("dummy") +} + +func (fs *dummyFs) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) { + return "", fmt.Errorf("dummy") +} + // ============================================================================= // Tests backword-comaptibility of overlayfs snapshotter.
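
As a quick reference for how the mappings travel from client to snapshotter: the patched ctr's WithMultiRemapperLabels (ctr.patch above) JSON-encodes the uid/gid mapping slices into the containerd.io/snapshot/uidmapping and containerd.io/snapshot/gidmapping labels, which the snapshotter side then loads back from the snapshot labels (idtools.LoadIDMap in setupIDMap). The stdlib-only sketch below shows roughly what such label values look like; the idMapping struct, its JSON field names, and the 0 -> 123456 / length 1000 values mirror the integration test but are illustrative assumptions rather than part of this change.

    // Sketch: hand-building multi-mapping snapshot labels (assumed JSON shape).
    package main

    import (
        "encoding/json"
        "fmt"
    )

    // idMapping is an assumed shape for one uid/gid range; the patched
    // containerd idtools package may use different field names.
    type idMapping struct {
        ContainerID uint32 `json:"container_id"`
        HostID      uint32 `json:"host_id"`
        Size        uint32 `json:"size"`
    }

    func main() {
        // Map container root (0) to host uid/gid 123456 with a range of 1000,
        // matching the /etc/subuid and /etc/subgid entries written by the test.
        uids := []idMapping{{ContainerID: 0, HostID: 123456, Size: 1000}}
        gids := []idMapping{{ContainerID: 0, HostID: 123456, Size: 1000}}

        uidJSON, err := json.Marshal(uids)
        if err != nil {
            panic(err)
        }
        gidJSON, err := json.Marshal(gids)
        if err != nil {
            panic(err)
        }

        // These are the labels the snapshotter inspects when deciding whether
        // to build id-mapped mounts for the parent layers.
        labels := map[string]string{
            "containerd.io/snapshot/uidmapping": string(uidJSON),
            "containerd.io/snapshot/gidmapping": string(gidJSON),
        }
        fmt.Println(labels)
    }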