From 204614b403fd33cd63d92e82c2f901e7c5ad92cb Mon Sep 17 00:00:00 2001 From: David Son Date: Thu, 6 Jun 2024 16:42:48 +0000 Subject: [PATCH] Add support for idmapped layers Signed-off-by: David Son --- Dockerfile | 2 +- config/config.toml | 1 + config/fs.go | 1 + fs/fs.go | 109 ++++++++++++++++++++++----- fs/fs_test.go | 11 ++- fs/layer/layer.go | 7 +- fs/layer/node.go | 40 +++++----- fs/layer/node_test.go | 3 +- fs/layer/util_test.go | 3 +- fs/source/source.go | 3 + go.mod | 2 +- idtools/idmap.go | 66 +++++++++++++++++ integration/run_test.go | 150 ++++++++++++++++++++++++++++++++++++++ integration/util_test.go | 1 + service/service.go | 3 + snapshot/snapshot.go | 133 +++++++++++++++++++++++++++++++-- snapshot/snapshot_test.go | 17 +++++ 17 files changed, 501 insertions(+), 51 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6eae87e04..a6e0fb944 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CONTAINERD_VERSION=1.6.34 +ARG CONTAINERD_VERSION=1.7.23 ARG RUNC_VERSION=1.1.12 ARG NERDCTL_VERSION=1.7.1 diff --git a/config/config.toml b/config/config.toml index 21385f5d0..3e75d284e 100644 --- a/config/config.toml +++ b/config/config.toml @@ -16,6 +16,7 @@ filesystem_cache_type="" resolve_result_entry=0 debug=false allow_no_verification=true +allow_idmap=true # disable_verification=false # Causes TestRunWithDefaultConfig to break, but # fine to use in /etc/soci-snapshotter-grpc-config.toml diff --git a/config/fs.go b/config/fs.go index 4088d9b6e..4d3af996e 100644 --- a/config/fs.go +++ b/config/fs.go @@ -56,6 +56,7 @@ type FSConfig struct { NoPrometheus bool `toml:"no_prometheus"` MountTimeoutSec int64 `toml:"mount_timeout_sec"` FuseMetricsEmitWaitDurationSec int64 `toml:"fuse_metrics_emit_wait_duration_sec"` + AllowIDMap bool `toml:"allow_idmap" default:"true"` RetryableHTTPClientConfig `toml:"http"` BlobConfig `toml:"blob"` diff --git a/fs/fs.go b/fs/fs.go index 3f3dbbb58..11cbe677a 100644 --- a/fs/fs.go +++ b/fs/fs.go @@ -45,9 +45,13 @@ package fs import ( "context" "fmt" + "io" golog "log" "net/http" + "os" "os/exec" + "path/filepath" + "strings" "sync" "syscall" "time" @@ -59,6 +63,7 @@ import ( layermetrics "github.com/awslabs/soci-snapshotter/fs/metrics/layer" "github.com/awslabs/soci-snapshotter/fs/remote" "github.com/awslabs/soci-snapshotter/fs/source" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/snapshot" "github.com/awslabs/soci-snapshotter/soci" @@ -67,6 +72,7 @@ import ( ctdsnapshotters "github.com/containerd/containerd/pkg/snapshotters" "github.com/containerd/containerd/reference" "github.com/containerd/containerd/remotes/docker" + "github.com/containerd/errdefs" "github.com/containerd/log" metrics "github.com/docker/go-metrics" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -455,6 +461,58 @@ func (fs *filesystem) getSociContext(ctx context.Context, imageRef, indexDigest, return c, err } +func getIDMappedMountpoint(mountpoint, activeLayerKey string) string { + d := filepath.Dir(mountpoint) + return filepath.Join(fmt.Sprintf("%s_%s", d, activeLayerKey), "fs") +} + +func (fs *filesystem) IDMapMount(ctx context.Context, mountpoint, activeLayerKey string, idmapper idtools.IDMap) (string, error) { + newMountpoint := getIDMappedMountpoint(mountpoint, activeLayerKey) + logger := log.G(ctx).WithField("mountpoint", newMountpoint) + + logger.Debug("creating remote id-mapped mount") + if err := os.Mkdir(filepath.Dir(newMountpoint), 0700); err != nil { + return "", err + } + if err := os.Mkdir(newMountpoint, 0755); err != nil { + return "", err + } + + fs.layerMu.Lock() + l := fs.layer[mountpoint] + if l == nil { + fs.layerMu.Unlock() + logger.Error("failed to create remote id-mapped mount") + return "", errdefs.ErrNotFound + } + fs.layer[newMountpoint] = l + fs.layerMu.Unlock() + node, err := l.RootNode(0, idmapper) + if err != nil { + return "", err + } + + fuseLogger := log.L. + WithField("mountpoint", mountpoint). + WriterLevel(logrus.TraceLevel) + + return newMountpoint, fs.setupFuseServer(ctx, newMountpoint, node, l, fuseLogger, nil) +} + +func (fs *filesystem) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerKey string, idmapper idtools.IDMap) (string, error) { + newMountpoint := getIDMappedMountpoint(mountpoint, activeLayerKey) + logger := log.G(ctx).WithField("mountpoint", newMountpoint) + + logger.Debug("creating local id-mapped mount") + if err := idtools.RemapDir(ctx, mountpoint, newMountpoint, idmapper); err != nil { + logger.WithError(err).Error("failed to create local mount") + return "", err + } + + logger.Debug("successfully created local mountpoint") + return newMountpoint, nil +} + func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[string]string) (retErr error) { // Setting the start time to measure the Mount operation duration. start := time.Now() @@ -560,7 +618,7 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s } }() - node, err := l.RootNode(0) + node, err := l.RootNode(0, idtools.IDMap{}) if err != nil { log.G(ctx).WithError(err).Warnf("Failed to get root node") retErr = fmt.Errorf("failed to get root node: %w", err) @@ -577,6 +635,17 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s fs.layerMu.Unlock() fs.metricsController.Add(mountpoint, l) + // Pass in a logger to go-fuse with the layer digest + // The go-fuse logs are useful for tracing exactly what's happening at the fuse level. + fuseLogger := log.L. + WithField("layerDigest", labels[ctdsnapshotters.TargetLayerDigestLabel]). + WriterLevel(logrus.TraceLevel) + + retErr = fs.setupFuseServer(ctx, mountpoint, node, l, fuseLogger, c) + return +} + +func (fs *filesystem) setupFuseServer(ctx context.Context, mountpoint string, node fusefs.InodeEmbedder, l layer.Layer, logger *io.PipeWriter, c *sociContext) error { // mount the node to the specified mountpoint // TODO: bind mount the state directory as a read-only fs on snapshotter's side rawFS := fusefs.NewNodeFS(node, &fusefs.Options{ @@ -585,40 +654,37 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s NegativeTimeout: &fs.negativeTimeout, NullPermissions: true, }) - // Pass in a logger to go-fuse with the layer digest - // The go-fuse logs are useful for tracing exactly what's happening at the fuse level. - logger := log.L. - WithField("layerDigest", labels[ctdsnapshotters.TargetLayerDigestLabel]). - WriterLevel(logrus.TraceLevel) mountOpts := &fuse.MountOptions{ AllowOther: true, // allow users other than root&mounter to access fs FsName: "soci", // name this filesystem as "soci" Debug: fs.debug, Logger: golog.New(logger, "", 0), DisableXAttrs: l.DisableXAttrs(), + // Options: []string{"default_permissions", "ro"}, } if _, err := exec.LookPath(fusermountBin); err == nil { mountOpts.Options = []string{"suid"} // option for fusermount; allow setuid inside container } else { - log.G(ctx).WithError(err).Infof("%s not installed; trying direct mount", fusermountBin) + log.G(ctx).WithField("binary", fusermountBin).WithError(err).Info("fuse binary not installed; trying direct mount") mountOpts.DirectMount = true } server, err := fuse.NewServer(rawFS, mountpoint, mountOpts) if err != nil { - log.G(ctx).WithError(err).Debug("failed to make filesystem server") - retErr = err - return + log.G(ctx).WithError(err).Error("failed to make filesystem server") + return err } go server.Serve() - // Send a signal to the background fetcher that a new image is being mounted - // and to pause all background fetches. - c.bgFetchPauseOnce.Do(func() { - if fs.bgFetcher != nil { - fs.bgFetcher.Pause() - } - }) + if c != nil { + // Send a signal to the background fetcher that a new image is being mounted + // and to pause all background fetches. + c.bgFetchPauseOnce.Do(func() { + if fs.bgFetcher != nil { + fs.bgFetcher.Pause() + } + }) + } return server.WaitMount() } @@ -681,6 +747,11 @@ func (fs *filesystem) check(ctx context.Context, l layer.Layer, labels map[strin return rErr } +func isIDMappedDir(mountpoint string) bool { + dirName := filepath.Base(mountpoint) + return len(strings.Split(dirName, "_")) > 1 +} + func (fs *filesystem) Unmount(ctx context.Context, mountpoint string) error { fs.layerMu.Lock() l, ok := fs.layer[mountpoint] @@ -688,7 +759,9 @@ func (fs *filesystem) Unmount(ctx context.Context, mountpoint string) error { fs.layerMu.Unlock() return fmt.Errorf("specified path %q isn't a mountpoint", mountpoint) } - delete(fs.layer, mountpoint) // unregisters the corresponding layer + if !isIDMappedDir(mountpoint) { + delete(fs.layer, mountpoint) // unregisters the corresponding layer + } l.Done() fs.layerMu.Unlock() fs.metricsController.Remove(mountpoint) diff --git a/fs/fs_test.go b/fs/fs_test.go index 2591a270a..119ec9c2c 100644 --- a/fs/fs_test.go +++ b/fs/fs_test.go @@ -46,6 +46,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/layer" "github.com/awslabs/soci-snapshotter/fs/remote" "github.com/awslabs/soci-snapshotter/fs/source" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/containerd/containerd/reference" "github.com/containerd/containerd/remotes/docker" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -83,10 +84,12 @@ func (l *breakableLayer) Info() layer.Info { Size: 1, } } -func (l *breakableLayer) DisableXAttrs() bool { return false } -func (l *breakableLayer) RootNode(uint32) (fusefs.InodeEmbedder, error) { return nil, nil } -func (l *breakableLayer) Verify(tocDigest digest.Digest) error { return nil } -func (l *breakableLayer) SkipVerify() {} +func (l *breakableLayer) DisableXAttrs() bool { return false } +func (l *breakableLayer) RootNode(uint32, idtools.IDMap) (fusefs.InodeEmbedder, error) { + return nil, nil +} +func (l *breakableLayer) Verify(tocDigest digest.Digest) error { return nil } +func (l *breakableLayer) SkipVerify() {} func (l *breakableLayer) ReadAt([]byte, int64, ...remote.Option) (int, error) { return 0, fmt.Errorf("fail") } diff --git a/fs/layer/layer.go b/fs/layer/layer.go index 5d802fee9..d5ef17fe9 100644 --- a/fs/layer/layer.go +++ b/fs/layer/layer.go @@ -58,6 +58,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/remote" spanmanager "github.com/awslabs/soci-snapshotter/fs/span-manager" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/soci" "github.com/awslabs/soci-snapshotter/util/lrucache" @@ -86,7 +87,7 @@ type Layer interface { Info() Info // RootNode returns the root node of this layer. - RootNode(baseInode uint32) (fusefs.InodeEmbedder, error) + RootNode(baseInode uint32, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) // Check checks if the layer is still connectable. Check() error @@ -456,11 +457,11 @@ func (l *layerRef) Done() { l.done() } -func (l *layer) RootNode(baseInode uint32) (fusefs.InodeEmbedder, error) { +func (l *layer) RootNode(baseInode uint32, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) { if l.isClosed() { return nil, fmt.Errorf("layer is already closed") } - return newNode(l.desc.Digest, l.r, l.blob, baseInode, l.resolver.overlayOpaqueType, l.resolver.config.LogFuseOperations, l.fuseOperationCounter) + return newNode(l.desc.Digest, l.r, l.blob, baseInode, l.resolver.overlayOpaqueType, l.resolver.config.LogFuseOperations, l.fuseOperationCounter, idMapper) } func (l *layer) ReadAt(p []byte, offset int64, opts ...remote.Option) (int, error) { diff --git a/fs/layer/node.go b/fs/layer/node.go index 6459c1a48..630ecd9f9 100644 --- a/fs/layer/node.go +++ b/fs/layer/node.go @@ -56,6 +56,7 @@ import ( commonmetrics "github.com/awslabs/soci-snapshotter/fs/metrics/common" "github.com/awslabs/soci-snapshotter/fs/reader" "github.com/awslabs/soci-snapshotter/fs/remote" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/containerd/log" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -189,7 +190,7 @@ func (f *FuseOperationCounter) Run(ctx context.Context) { // logFSOperations may cause sensitive information to be emitted to logs // e.g. filenames and paths within an image -func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseInode uint32, opaque OverlayOpaqueType, logFSOperations bool, opCounter *FuseOperationCounter) (fusefs.InodeEmbedder, error) { +func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseInode uint32, opaque OverlayOpaqueType, logFSOperations bool, opCounter *FuseOperationCounter, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) { rootID := r.Metadata().RootID() rootAttr, err := r.Metadata().GetAttr(rootID) if err != nil { @@ -210,9 +211,10 @@ func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseIno } ffs.s = ffs.newState(layerDgst, blob) return &node{ - id: rootID, - attr: rootAttr, - fs: ffs, + id: rootID, + attr: rootAttr, + fs: ffs, + idMapper: idMapper, }, nil } @@ -272,9 +274,10 @@ func (fs *fs) inodeOfID(id uint32) (uint64, error) { // node is a filesystem inode abstraction. type node struct { fusefs.Inode - fs *fs - id uint32 - attr metadata.Attr + fs *fs + id uint32 + attr metadata.Attr + idMapper idtools.IDMap ents []fuse.DirEntry entsCached bool @@ -407,14 +410,14 @@ func (n *node) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fu n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: %v", fuseOpLookup, err)) return nil, syscall.EIO } - entryToAttr(ino, tn.attr, &out.Attr) + n.entryToAttr(ino, tn.attr, &out.Attr) case *whiteout: ino, err := n.fs.inodeOfID(tn.id) if err != nil { n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: %v", fuseOpLookup, err)) return nil, syscall.EIO } - entryToAttr(ino, tn.attr, &out.Attr) + n.entryToAttr(ino, tn.attr, &out.Attr) default: n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: unknown node type detected", fuseOpLookup)) return nil, syscall.EIO @@ -463,10 +466,11 @@ func (n *node) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fu return nil, syscall.EIO } return n.NewInode(ctx, &node{ - id: id, - fs: n.fs, - attr: ce, - }, entryToAttr(ino, ce, &out.Attr)), 0 + id: id, + fs: n.fs, + attr: ce, + idMapper: n.idMapper, + }, n.entryToAttr(ino, ce, &out.Attr)), 0 } var _ = (fusefs.NodeOpener)((*node)(nil)) @@ -495,7 +499,7 @@ func (n *node) Getattr(ctx context.Context, f fusefs.FileHandle, out *fuse.AttrO n.fs.reportFailure(fuseOpGetattr, fmt.Errorf("%s: %v", fuseOpGetattr, err)) return syscall.EIO } - entryToAttr(ino, n.attr, &out.Attr) + n.entryToAttr(ino, n.attr, &out.Attr) return 0 } @@ -594,7 +598,7 @@ func (f *file) Getattr(ctx context.Context, out *fuse.AttrOut) syscall.Errno { f.n.fs.reportFailure(fuseOpFileGetattr, fmt.Errorf("%s: %v", fuseOpFileGetattr, err)) return syscall.EIO } - entryToAttr(ino, f.n.attr, &out.Attr) + f.n.entryToAttr(ino, f.n.attr, &out.Attr) return 0 } @@ -797,7 +801,7 @@ func (sf *statFile) updateStatUnlocked() ([]byte, error) { } // entryToAttr converts metadata.Attr to go-fuse's Attr. -func entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr { +func (n *node) entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr { out.Ino = ino out.Size = uint64(e.Size) if e.Mode&os.ModeSymlink != 0 { @@ -808,7 +812,9 @@ func entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr mtime := e.ModTime out.SetTimes(nil, &mtime, nil) out.Mode = fileModeToSystemMode(e.Mode) - out.Owner = fuse.Owner{Uid: uint32(e.UID), Gid: uint32(e.GID)} + // Potentially dangerous casting int -> uint32? But probably fine. + mappedID, _ := n.idMapper.ToHost(idtools.User{Uid: uint32(e.UID), Gid: uint32(e.GID)}) + out.Owner = fuse.Owner{Uid: mappedID.Uid, Gid: mappedID.Gid} out.Rdev = uint32(unix.Mkdev(uint32(e.DevMajor), uint32(e.DevMinor))) out.Nlink = uint32(e.NumLink) if out.Nlink == 0 { diff --git a/fs/layer/node_test.go b/fs/layer/node_test.go index 432b3b66e..9f12e6810 100644 --- a/fs/layer/node_test.go +++ b/fs/layer/node_test.go @@ -50,7 +50,8 @@ func TestEntryToAttr(t *testing.T) { tc := tc t.Run(tc.name, func(t *testing.T) { var actual fuse.Attr - entryToAttr(0, tc.attr, &actual) + var n node + n.entryToAttr(0, tc.attr, &actual) tc.expected.Mtime = actual.Mtime if actual != tc.expected { t.Fatalf("unexpected fuse attr. actual %v expected %v", actual, tc.expected) diff --git a/fs/layer/util_test.go b/fs/layer/util_test.go index 2fec52d89..dcb24420f 100644 --- a/fs/layer/util_test.go +++ b/fs/layer/util_test.go @@ -56,6 +56,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/reader" "github.com/awslabs/soci-snapshotter/fs/remote" spanmanager "github.com/awslabs/soci-snapshotter/fs/span-manager" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/util/testutil" "github.com/awslabs/soci-snapshotter/ztoc" @@ -360,7 +361,7 @@ func hasSize(name string, size int) check { } func getRootNode(t *testing.T, r reader.Reader, opaque OverlayOpaqueType) *node { - rootNode, err := newNode(testStateLayerDigest, &testReader{r}, &testBlobState{10, 5}, 100, opaque, false, nil) + rootNode, err := newNode(testStateLayerDigest, &testReader{r}, &testBlobState{10, 5}, 100, opaque, false, nil, idtools.IDMap{}) if err != nil { t.Fatalf("failed to get root node: %v", err) } diff --git a/fs/source/source.go b/fs/source/source.go index ce394d840..9da0b36b2 100644 --- a/fs/source/source.go +++ b/fs/source/source.go @@ -84,6 +84,9 @@ const ( // TargetSociIndexDigestLabel is a label which contains the digest of the soci index. TargetSociIndexDigestLabel = "containerd.io/snapshot/remote/soci.index.digest" + + // HasSociIndexDigest is a label that tells if the layer was pulled with a SOCI index. + HasSociIndexDigest = "containerd.io/snapshot/remote/has.soci.index.digest" ) // RegistryHosts is copied from [github.com/awslabs/soci-snapshotter/service/resolver.RegistryHosts] diff --git a/go.mod b/go.mod index 520b1de5b..74d0df687 100644 --- a/go.mod +++ b/go.mod @@ -27,6 +27,7 @@ require ( github.com/prometheus/client_golang v1.20.5 github.com/rs/xid v1.6.0 github.com/sirupsen/logrus v1.9.3 + github.com/stretchr/testify v1.9.0 go.etcd.io/bbolt v1.3.11 golang.org/x/crypto v0.28.0 golang.org/x/sync v0.8.0 @@ -90,7 +91,6 @@ require ( github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/spf13/pflag v1.0.5 // indirect - github.com/stretchr/testify v1.9.0 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 // indirect go.opentelemetry.io/otel v1.21.0 // indirect diff --git a/idtools/idmap.go b/idtools/idmap.go index d5d677285..cce9e73eb 100644 --- a/idtools/idmap.go +++ b/idtools/idmap.go @@ -37,10 +37,17 @@ package idtools import ( + "context" "errors" "fmt" + "os" + "os/exec" + "path/filepath" "strings" + "syscall" + "github.com/containerd/containerd/mount" + "github.com/containerd/containerd/snapshots" "github.com/opencontainers/runtime-spec/specs-go" ) @@ -64,6 +71,18 @@ type IDMap struct { GidMap []specs.LinuxIDMapping `json:"GidMap"` } +func LoadIDMap(id string, labels map[string]string) (IDMap, error) { + var idmap IDMap + uidmapJSON, okUID := labels[snapshots.LabelSnapshotUIDMapping] + gidmapJSON, okGID := labels[snapshots.LabelSnapshotGIDMapping] + if okUID && okGID { + if err := idmap.Unmarshal(uidmapJSON, gidmapJSON); err != nil { + return IDMap{}, err + } + } + return idmap, nil +} + // ToHost returns the host user ID pair for the container ID pair. func (i IDMap) ToHost(pair User) (User, error) { var ( @@ -167,3 +186,50 @@ func safeSum(x, y uint32) (uint32, error) { } return z, nil } + +func RemapDir(ctx context.Context, originalMountpoint, newMountpoint string, idMap IDMap) error { + idmappedSnapshotBase := filepath.Dir(newMountpoint) + if err := os.Mkdir(idmappedSnapshotBase, 0755); err != nil { + return err + } + + // opt for '-pR' instead of '-a' to not copy hardlinks. + // (Copying hardlinks results in issues with chown as + // it will attempt to chown twice on the same inode) + if err := exec.Command("cp", "-pR", originalMountpoint, idmappedSnapshotBase).Run(); err != nil { + return err + } + return filepath.Walk(newMountpoint, chown(idMap)) +} + +func RemapRoot(ctx context.Context, root string, idMap IDMap) error { + return filepath.Walk(root, chown(idMap)) +} + +func RemapRootFS(ctx context.Context, mounts []mount.Mount, idmap IDMap) error { + return mount.WithTempMount(ctx, mounts, func(root string) error { + return filepath.Walk(root, chown(idmap)) + }) +} + +func chown(idMap IDMap) filepath.WalkFunc { + return func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + stat := info.Sys().(*syscall.Stat_t) + h, cerr := idMap.ToHost(User{Uid: stat.Uid, Gid: stat.Gid}) + if cerr != nil { + return cerr + } + // be sure the lchown the path as to not de-reference the symlink to a host file + if cerr = os.Lchown(path, int(h.Uid), int(h.Gid)); cerr != nil { + return cerr + } + // we must retain special permissions such as setuid, setgid and sticky bits + if mode := info.Mode(); mode&os.ModeSymlink == 0 && mode&(os.ModeSetuid|os.ModeSetgid|os.ModeSticky) != 0 { + return os.Chmod(path, mode) + } + return nil + } +} diff --git a/integration/run_test.go b/integration/run_test.go index 260eb7a56..9c140e77a 100644 --- a/integration/run_test.go +++ b/integration/run_test.go @@ -37,6 +37,7 @@ import ( "bytes" "fmt" "os" + "path/filepath" "regexp" "strconv" "strings" @@ -520,3 +521,152 @@ func TestRunInNamespace(t *testing.T) { } } } + +func TestRunWithIdMap(t *testing.T) { + type checker struct { + path string + expectedUID string + expectedGID string + } + + imageName := rabbitmqImage + + baseSnapshotDir := "/var/lib/soci-snapshotter-grpc/snapshotter/snapshots" + baseRuntimeDir := "/run/containerd/io.containerd.runtime.v2.task/default" + + uidPath := "/etc/subuid" + gidPath := "/etc/subgid" + + dummyuser := "dummy-user" + dummygroup := "dummy-group" + + modes := []struct { + name string + indexBuilderFn func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string + }{ + { + name: "with only FUSE layers", + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + opts = append(opts, withMinLayerSize(0)) + return buildIndex(sh, src, opts...) + }, + }, + { + name: "with mixed layers", + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + return buildIndex(sh, src, opts...) + }, + }, + { + name: "with no SOCI index", + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + return "" + }, + }, + } + + tests := []struct { + name string + subUIDContents string + subGIDContents string + checkFiles []checker + }{ + { + name: "with one set of substitutions", + subUIDContents: fmt.Sprintf("%s:12345:1001", dummyuser), + subGIDContents: fmt.Sprintf("%s:12345:1001", dummyuser), + checkFiles: []checker{ + { + path: "/usr/bin/sh", + expectedUID: "12345", + expectedGID: "12345", + }, + }, + }, + { + name: "with multiple substitutions", + subUIDContents: fmt.Sprintf("%s:12345:1000\n%s:22222:1", dummyuser, dummyuser), + subGIDContents: fmt.Sprintf("%s:12345:1000\n%s:22222:1", dummyuser, dummyuser), + checkFiles: []checker{ + { + path: "/usr/bin/sh", + expectedUID: "12345", + expectedGID: "12345", + }, + }, + }, + } + + for _, mode := range modes { + mode := mode + for _, tt := range tests { + tt := tt + t.Run(tt.name+" "+mode.name, func(t *testing.T) { + regConfig := newRegistryConfig() + sh, done := newShellWithRegistry(t, regConfig) + defer done() + + sh.X("groupadd", "-g", "12345", dummygroup) + sh.X("useradd", "-u", "12345", "-g", "12345", "-m", dummyuser) + + sh.Pipe(nil, shell.C("echo", tt.subUIDContents), shell.C("tee", uidPath)) + sh.Pipe(nil, shell.C("echo", tt.subGIDContents), shell.C("tee", gidPath)) + + rebootContainerd(t, sh, getContainerdConfigToml(t, false), getSnapshotterConfigToml(t, false)) + imageInfo := dockerhub(imageName) + sh.X("nerdctl", "pull", "-q", imageName) + + filenames, err := sh.OLog("ls", baseSnapshotDir) + if err != nil { + t.Fatalf("error listing files in %s", baseSnapshotDir) + } + + // Copy image, remove blobs, and re-pull with SOCI + copyImage(sh, dockerhub(imageName), regConfig.mirror(imageName)) + indexDigest := mode.indexBuilderFn(sh, regConfig.mirror(imageName)) + if indexDigest != "" { + sh.X("soci", "push", "--user", regConfig.creds(), regConfig.mirror(imageName).ref) + } + sh.X("rm", "-rf", filepath.Join(store.DefaultSociContentStorePath, "blobs", "sha256")) + + pullCmd := imagePullCmd + if indexDigest != "" { + pullCmd = append(pullCmd, "--soci-index-digest", indexDigest) + } + sh.X(append(pullCmd, regConfig.mirror(imageName).ref)...) + // time.Sleep(999999999999999999) + containerID := strings.TrimSpace(string(sh.O("nerdctl-with-idmapping", "run", "-d", + "--net", "none", + "--pull", "never", + "--userns", dummyuser, + "--snapshotter", "soci", + imageInfo.ref, "sleep", "infinity", + ))) + + newFilenames, err := sh.OLog("ls", baseSnapshotDir) + if err != nil { + t.Fatalf("error listing files in %s", baseSnapshotDir) + } + + if len(filenames) == len(newFilenames) { + t.Fatalf("error: id-mapping failed") + } + + for _, check := range tt.checkFiles { + fullCheckPath := filepath.Join(baseRuntimeDir, containerID, "rootfs", check.path) + stat, err := sh.OLog("stat", fullCheckPath) + if err != nil { + t.Fatalf("error stat files in %s", fullCheckPath) + } + + strStat := string(stat) + t.Log(strStat) + matchUID := fmt.Sprintf("Uid: (%s", check.expectedUID) + if !strings.Contains(strStat, matchUID) { + t.Fatalf("error: file %s did not have uid %s", check.path, check.expectedUID) + } + } + }) + } + } +} diff --git a/integration/util_test.go b/integration/util_test.go index 60c3f823f..7c3f38475 100644 --- a/integration/util_test.go +++ b/integration/util_test.go @@ -106,6 +106,7 @@ const proxySnapshotterConfig = ` [proxy_plugins.soci] type = "snapshot" address = "/run/soci-snapshotter-grpc/soci-snapshotter-grpc.sock" + capabilities = ["multi-remap-ids", "remap-ids"] ` const containerdConfigTemplate = ` diff --git a/service/service.go b/service/service.go index 167168e09..044828310 100644 --- a/service/service.go +++ b/service/service.go @@ -119,6 +119,9 @@ func NewSociSnapshotterService(ctx context.Context, root string, serviceCfg *con if serviceCfg.SnapshotterConfig.AllowInvalidMountsOnRestart { snOpts = append(snOpts, snbase.AllowInvalidMountsOnRestart) } + if serviceCfg.FSConfig.AllowIDMap { + snOpts = append(snOpts, snbase.AllowIDMap) + } snapshotter, err = snbase.NewSnapshotter(ctx, snapshotterRoot(root), fs, snOpts...) if err != nil { diff --git a/snapshot/snapshot.go b/snapshot/snapshot.go index f770b5b5b..d0e52ad95 100644 --- a/snapshot/snapshot.go +++ b/snapshot/snapshot.go @@ -44,6 +44,7 @@ import ( commonmetrics "github.com/awslabs/soci-snapshotter/fs/metrics/common" "github.com/awslabs/soci-snapshotter/fs/source" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/containerd/containerd/mount" ctdsnapshotters "github.com/containerd/containerd/pkg/snapshotters" "github.com/containerd/containerd/snapshots" @@ -105,6 +106,8 @@ type FileSystem interface { Check(ctx context.Context, mountpoint string, labels map[string]string) error Unmount(ctx context.Context, mountpoint string) error MountLocal(ctx context.Context, mountpoint string, labels map[string]string, mounts []mount.Mount) error + IDMapMount(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) + IDMapMountLocal(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) } // SnapshotterConfig is used to configure the remote snapshotter instance @@ -113,6 +116,7 @@ type SnapshotterConfig struct { // minLayerSize skips remote mounting of smaller layers minLayerSize int64 allowInvalidMountsOnRestart bool + allowIDMap bool } // Opt is an option to configure the remote snapshotter @@ -140,6 +144,11 @@ func AllowInvalidMountsOnRestart(config *SnapshotterConfig) error { return nil } +func AllowIDMap(config *SnapshotterConfig) error { + config.allowIDMap = true + return nil +} + type snapshotter struct { root string ms *storage.MetaStore @@ -150,6 +159,8 @@ type snapshotter struct { userxattr bool // whether to enable "userxattr" mount option minLayerSize int64 // minimum layer size for remote mounting allowInvalidMountsOnRestart bool + allowIDMap bool + idmapped map[string]interface{} } // NewSnapshotter returns a Snapshotter which can use unpacked remote layers @@ -200,6 +211,11 @@ func NewSnapshotter(ctx context.Context, root string, targetFs FileSystem, opts userxattr: userxattr, minLayerSize: config.minLayerSize, allowInvalidMountsOnRestart: config.allowInvalidMountsOnRestart, + allowIDMap: config.allowIDMap, + } + + if o.allowIDMap { + o.idmapped = make(map[string]interface{}) } if err := o.restoreRemoteSnapshot(ctx); err != nil { @@ -285,6 +301,51 @@ func (o *snapshotter) Usage(ctx context.Context, key string) (snapshots.Usage, e return usage, nil } +func (o *snapshotter) setupIDMap(ctx context.Context, s storage.Snapshot, parent string, labels map[string]string) error { + // load id-map if appropriate labels are present. + idmap, err := idtools.LoadIDMap(s.ID, labels) + if err != nil { + log.G(ctx).WithError(err).Error("failed to load id-map") + return err + } + + if !idmap.Empty() { + if !o.allowIDMap { + return errors.New("id-map labels found but config does not allow id-mapping") + } + parentSnapshot, err := o.Stat(ctx, parent) + if err != nil { + log.G(ctx).WithError(err).Error("failed to stat parent snapshot") + return err + } + + // If there is no SOCI index, you can safely mount from the root without copying over every single layer + if _, ok := parentSnapshot.Labels[source.HasSociIndexDigest]; !ok { + // Fallback to overlay + log.G(ctx).Debug("no SOCI index found, remapping from root") + mounts, err := o.mounts(ctx, s, parent) + if err != nil { + return err + } + + err = idtools.RemapRootFS(ctx, mounts, idmap) + if err != nil { + return err + } + } else { + o.idmapped[s.ID] = struct{}{} + err = o.createIDMapMounts(ctx, s, idmap) + if err != nil { + log.G(ctx).WithError(err).Error("failed to create id-mapped mounts") + return err + } + } + + log.G(ctx).Debug("id-mapping successful") + } + return nil +} + func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) { log.G(ctx).WithField("key", key).WithField("parent", parent).Debug("prepare") s, err := o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts) @@ -302,7 +363,13 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s } target, ok := base.Labels[targetSnapshotLabel] + // !ok means we are in an active snapshot if !ok { + // Setup id-mapped mounts if config allows. + // Any error here needs to stop the container from starting. + if err := o.setupIDMap(ctx, s, parent, base.Labels); err != nil { + return nil, err + } return o.mounts(ctx, s, parent) } @@ -319,7 +386,8 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s if !o.skipRemoteSnapshotPrepare(lCtx, base.Labels) { err := o.prepareRemoteSnapshot(lCtx, key, base.Labels) if err == nil { - base.Labels[remoteLabel] = remoteLabelVal // Mark this snapshot as remote + base.Labels[remoteLabel] = remoteLabelVal // Mark this snapshot as remote + base.Labels[source.HasSociIndexDigest] = "true" // Mark that this snapshot was loaded with a SOCI index err := o.commit(ctx, true, target, key, append(opts, snapshots.WithLabels(base.Labels))...) if err == nil || errdefs.IsAlreadyExists(err) { // count also AlreadyExists as "success" @@ -361,6 +429,7 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s log.G(ctx).WithField("layerDigest", base.Labels[ctdsnapshotters.TargetLayerDigestLabel]).Info("preparing snapshot as local snapshot") err = o.prepareLocalSnapshot(lCtx, key, base.Labels, mounts) if err == nil { + base.Labels[source.HasSociIndexDigest] = "true" // Mark that this snapshot was loaded with a SOCI index err := o.commit(ctx, false, target, key, append(opts, snapshots.WithLabels(base.Labels))...) if err == nil || errdefs.IsAlreadyExists(err) { // count also AlreadyExists as "success" @@ -578,7 +647,18 @@ func (o *snapshotter) getCleanupDirectories(ctx context.Context, t storage.Trans cleanup := []string{} for _, d := range dirs { if !cleanupCommitted { - if _, ok := ids[d]; ok { + // If the directory name is just a number (e.g '2'), + // we want to check if the dir name (2) must be cleaned + // If the directory has an underscore (e.g. '1_2'), + // we want to check the suffix (2) to determine if + // the directory must be cleaned + cleanupID := d + temp := strings.Split(d, "_") + if len(temp) > 1 { + cleanupID = temp[1] + } + + if _, ok := ids[cleanupID]; ok { continue } } @@ -757,15 +837,16 @@ func (o *snapshotter) mounts(ctx context.Context, s storage.Snapshot, checkKey s }, nil } - parentPaths := make([]string, len(s.ParentIDs)) - for i := range s.ParentIDs { - parentPaths[i] = o.upperPath(s.ParentIDs[i]) + parentPaths, err := o.getParentPaths(s) + if err != nil { + return nil, err } options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(parentPaths, ":"))) if o.userxattr { options = append(options, "userxattr") } + return []mount.Mount{ { Type: "overlay", @@ -773,7 +854,49 @@ func (o *snapshotter) mounts(ctx context.Context, s storage.Snapshot, checkKey s Options: options, }, }, nil +} + +func (o *snapshotter) getParentPaths(s storage.Snapshot) ([]string, error) { + parentPaths := make([]string, len(s.ParentIDs)) + + for i, id := range s.ParentIDs { + if _, ok := o.idmapped[s.ID]; ok { + id = fmt.Sprintf("%s_%s", id, s.ID) + } + parentPaths[i] = o.upperPath(id) + } + + return parentPaths, nil +} + +func (o *snapshotter) createIDMapMounts(ctx context.Context, s storage.Snapshot, idmap idtools.IDMap) error { + log.G(ctx).Debug("mapping ids") + + for _, id := range s.ParentIDs { + err := o.createIDMapMount(ctx, o.upperPath(id), s.ID, idmap) + if err != nil { + return err + } + } + return idtools.RemapRoot(ctx, o.upperPath(s.ID), idmap) +} + +func (o *snapshotter) createIDMapMount(ctx context.Context, path, id string, idmap idtools.IDMap) error { + // s.ID is the shortest unique identifier for each new container, + // so append it to the end of the new mountpoint + _, err := o.fs.IDMapMount(ctx, path, id, idmap) + if errdefs.IsNotFound(err) { + // Remote mount failed, attempt to create a local id-mapped mount + + // Cleanup dirty snapshot folder — perhaps we can have a return cleanup func? + dirtyDir := fmt.Sprintf("%s_%s", filepath.Dir(path), id) + if err := os.RemoveAll(dirtyDir); err != nil { + return err + } + _, err = o.fs.IDMapMountLocal(ctx, path, id, idmap) + } + return err } // upperPath produces a file path like "{snapshotter.root}/snapshots/{id}/fs" diff --git a/snapshot/snapshot_test.go b/snapshot/snapshot_test.go index cc798a607..e03c23ab6 100644 --- a/snapshot/snapshot_test.go +++ b/snapshot/snapshot_test.go @@ -41,6 +41,7 @@ import ( "syscall" "testing" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/testutil" "github.com/containerd/containerd/snapshots" @@ -417,6 +418,14 @@ func (fs *bindFs) MountLocal(ctx context.Context, mountpoint string, labels map[ return nil } +func (fs *bindFs) IDMapMount(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) { + return mountpoint, nil +} + +func (fs *bindFs) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) { + return mountpoint, nil +} + func dummyFileSystem() FileSystem { return &dummyFs{} } type dummyFs struct{} @@ -437,6 +446,14 @@ func (fs *dummyFs) MountLocal(ctx context.Context, mountpoint string, labels map return fmt.Errorf("dummy") } +func (fs *dummyFs) IDMapMount(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) { + return "", fmt.Errorf("dummy") +} + +func (fs *dummyFs) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerKey string, idmap idtools.IDMap) (string, error) { + return "", fmt.Errorf("dummy") +} + // ============================================================================= // Tests backword-comaptibility of overlayfs snapshotter.