diff --git a/deps.bzl b/deps.bzl index baa28b06f43..6ff5d5707bc 100644 --- a/deps.bzl +++ b/deps.bzl @@ -6541,10 +6541,11 @@ def install_static_dependencies(workspace_name = "buildbuddy"): sha256 = "a836972b8a7c34970fb9ecc44768ece172f184c5f7e2972c80033fcdcf8c1870", urls = ["https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-arm64"], ) + # BALLOON: Uses our current Kernel config (intended for v5.15) with CONFIG_BALLOON_COMPACTION=y http_file( name = "org_kernel_git_linux_kernel-vmlinux", - sha256 = "3fd19c602f2b11969ad563d4d4855c9147cf13c34238537c1e434097a11aa6b7", - urls = ["https://storage.googleapis.com/buildbuddy-tools/binaries/linux/vmlinux-v5.15-3fd19c602f2b11969ad563d4d4855c9147cf13c34238537c1e434097a11aa6b7"], + sha256 = "7ce5fd0a6cb1c6864111419cbe4917ce49f69163d3f08cc9a2aed050e4d40612", + urls = ["https://storage.googleapis.com/buildbuddy-tools/binaries/linux/vmlinux-x86_64-v6.2-7ce5fd0a6cb1c6864111419cbe4917ce49f69163d3f08cc9a2aed050e4d40612"], executable = True, ) http_file( diff --git a/enterprise/server/remote_execution/commandutil/commandutil.go b/enterprise/server/remote_execution/commandutil/commandutil.go index dd0fd90a95f..c1066949398 100644 --- a/enterprise/server/remote_execution/commandutil/commandutil.go +++ b/enterprise/server/remote_execution/commandutil/commandutil.go @@ -37,7 +37,7 @@ var ( // by SIGKILL and may be retried. ErrSIGKILL = status.UnavailableErrorf("command was terminated by SIGKILL, likely due to executor shutdown or OOM") - DebugStreamCommandOutputs = flag.Bool("debug_stream_command_outputs", false, "If true, stream command outputs to the terminal. Intended for debugging purposes only and should not be used in production.") + DebugStreamCommandOutputs = flag.Bool("debug_stream_command_outputs", true, "If true, stream command outputs to the terminal. Intended for debugging purposes only and should not be used in production.") ) var ( diff --git a/enterprise/server/remote_execution/containers/firecracker/firecracker.go b/enterprise/server/remote_execution/containers/firecracker/firecracker.go index 16674c354d2..33a95845222 100644 --- a/enterprise/server/remote_execution/containers/firecracker/firecracker.go +++ b/enterprise/server/remote_execution/containers/firecracker/firecracker.go @@ -847,7 +847,7 @@ func (c *FirecrackerContainer) pauseVM(ctx context.Context) error { return nil } -func (c *FirecrackerContainer) saveSnapshot(ctx context.Context, snapshotDetails *snapshotDetails) error { +func (c *FirecrackerContainer) saveSnapshot(ctx context.Context, snapshotDetails *snapshotDetails, removedAddresses map[int64]string) (*snaploader.CacheCowStats, error) { ctx, span := tracing.StartSpan(ctx) defer span.End() @@ -862,7 +862,7 @@ func (c *FirecrackerContainer) saveSnapshot(ctx context.Context, snapshotDetails if snapshotDetails.snapshotType == diffSnapshotType { mergeStart := time.Now() if err := MergeDiffSnapshot(ctx, baseMemSnapshotPath, c.memoryStore, memSnapshotPath, mergeDiffSnapshotConcurrency, mergeDiffSnapshotBlockSize); err != nil { - return status.UnknownErrorf("merge diff snapshot failed: %s", err) + return nil, status.UnknownErrorf("merge diff snapshot failed: %s", err) } log.CtxDebugf(ctx, "VMM merge diff snapshot took %s", time.Since(mergeStart)) // Use the merged memory snapshot. 
@@ -876,7 +876,7 @@ func (c *FirecrackerContainer) saveSnapshot(ctx context.Context, snapshotDetails memChunkDir := filepath.Join(c.getChroot(), memoryChunkDirName) memoryStore, err := c.convertToCOW(ctx, memSnapshotPath, memChunkDir) if err != nil { - return status.WrapError(err, "convert memory snapshot to COWStore") + return nil, status.WrapError(err, "convert memory snapshot to COWStore") } c.memoryStore = memoryStore } @@ -911,14 +911,16 @@ func (c *FirecrackerContainer) saveSnapshot(ctx context.Context, snapshotDetails } else { opts.MemSnapshotPath = memSnapshotPath } + opts.RemovedAddresses = removedAddresses snaploaderStart := time.Now() - if err := c.loader.CacheSnapshot(ctx, c.snapshotKeySet.GetWriteKey(), opts); err != nil { - return status.WrapError(err, "add snapshot to cache") + memoryStats, err := c.loader.CacheSnapshot(ctx, c.snapshotKeySet.GetWriteKey(), opts) + if err != nil { + return nil, status.WrapError(err, "add snapshot to cache") } log.CtxDebugf(ctx, "snaploader.CacheSnapshot took %s", time.Since(snaploaderStart)) - return nil + return memoryStats, nil } func (c *FirecrackerContainer) getVMMetadata() *fcpb.VMMetadata { @@ -1065,7 +1067,7 @@ func (c *FirecrackerContainer) LoadSnapshot(ctx context.Context) error { return status.WrapError(err, "failed to init virtual block devices") } - if err := c.setupUFFDHandler(ctx); err != nil { + if err := c.setupUFFDHandler(ctx, snap.GetRemovedAddresses()); err != nil { return err } @@ -1706,7 +1708,7 @@ func (c *FirecrackerContainer) setupNetworking(ctx context.Context) error { return nil } -func (c *FirecrackerContainer) setupUFFDHandler(ctx context.Context) error { +func (c *FirecrackerContainer) setupUFFDHandler(ctx context.Context, removedAddresses map[int64]string) error { if c.memoryStore == nil { // No memory file to serve over UFFD; do nothing. return nil @@ -1714,7 +1716,7 @@ func (c *FirecrackerContainer) setupUFFDHandler(ctx context.Context) error { if c.uffdHandler != nil { return status.InternalErrorf("uffd handler is already running") } - h, err := uffd.NewHandler() + h, err := uffd.NewHandler(removedAddresses) if err != nil { return status.WrapError(err, "create uffd handler") } @@ -1964,8 +1966,14 @@ func (c *FirecrackerContainer) create(ctx context.Context) error { return status.WrapError(err, "failed to init VBD mounts") } + // TODO: Add support for free page reporting, and we will not have to manually + // control the balloon + balloon := fcclient.NewCreateBalloonHandler(1, true, 5) machineOpts := []fcclient.Opt{ fcclient.WithLogger(getLogrusLogger()), + func(m *fcclient.Machine) { + m.Handlers.FcInit = m.Handlers.FcInit.AppendAfter(fcclient.CreateMachineHandlerName, balloon) + }, } m, err := fcclient.NewMachine(vmCtx, *fcCfg, machineOpts...) @@ -2081,6 +2089,7 @@ func (c *FirecrackerContainer) dialVMExecServer(ctx context.Context) (*grpc.Clie // Intentionally not returning DeadlineExceededError here since it // is not a Bazel-retryable error, but this particular timeout // should be retryable. 
+ log.Warningf("VM logs are %s", string(c.vmLog.Tail())) return nil, status.UnavailableErrorf("failed to connect to VM: %s", err) } return nil, status.UnavailableErrorf("failed to connect to VM: %s", err) @@ -2284,6 +2293,19 @@ func (c *FirecrackerContainer) Exec(ctx context.Context, cmd *repb.Command, stdi return result } +func (c *FirecrackerContainer) UpdateBalloon(ctx context.Context, sizeInMB int64) error { + if c.uffdHandler != nil { + log.CtxInfof(ctx, "Updating balloon VM: %dMB", sizeInMB) + err := c.machine.UpdateBalloon(ctx, sizeInMB) + if err != nil { + return err + } + // TODO: Do we need the sleep? + time.Sleep(10 * time.Second) + } + return nil +} + func (c *FirecrackerContainer) Signal(ctx context.Context, sig syscall.Signal) error { // TODO: forward the signal as a message on any currently running vmexec // stream. @@ -2497,7 +2519,7 @@ func (c *FirecrackerContainer) Pause(ctx context.Context) error { defer span.End() start := time.Now() - err := c.pause(ctx) + _, err := c.pause(ctx) pauseTime := time.Since(start) log.CtxDebugf(ctx, "Pause took %s", pauseTime) @@ -2506,7 +2528,11 @@ func (c *FirecrackerContainer) Pause(ctx context.Context) error { return err } -func (c *FirecrackerContainer) pause(ctx context.Context) error { +func (c *FirecrackerContainer) PauseWithMemoryStats(ctx context.Context) (*snaploader.CacheCowStats, error) { + return c.pause(ctx) +} + +func (c *FirecrackerContainer) pause(ctx context.Context) (*snaploader.CacheCowStats, error) { ctx, cancel := c.monitorVMContext(ctx) defer cancel() @@ -2514,30 +2540,32 @@ func (c *FirecrackerContainer) pause(ctx context.Context) error { snapDetails, err := c.snapshotDetails(ctx) if err != nil { - return err + return nil, err } if err = c.pauseVM(ctx); err != nil { - return err + return nil, err } // If an older snapshot is present -- nuke it since we're writing a new one. if err = c.cleanupOldSnapshots(snapDetails); err != nil { - return err + return nil, err } if err = c.createSnapshot(ctx, snapDetails); err != nil { - return err + return nil, err } // Stop the VM, UFFD page fault handler, and VBD servers to ensure nothing // is modifying the snapshot files as we save them if err := c.stopMachine(ctx); err != nil { - return err + return nil, err } + var removedAddresses map[int64]string if c.uffdHandler != nil { + removedAddresses = c.uffdHandler.RemovedAddresses() if err := c.uffdHandler.Stop(); err != nil { - return status.WrapError(err, "stop uffd handler") + return nil, status.WrapError(err, "stop uffd handler") } c.uffdHandler = nil } @@ -2546,19 +2574,20 @@ func (c *FirecrackerContainer) pause(ctx context.Context) error { // Don't log errors here because it may succeed the second try, especially // as we are extending the context for that cleanup. 
 	if err := c.unmountAllVBDs(ctx, false /*logErrors*/); err != nil {
-		return status.WrapError(err, "unmount vbds")
+		return nil, status.WrapError(err, "unmount vbds")
 	}
 
-	if err = c.saveSnapshot(ctx, snapDetails); err != nil {
-		return err
+	memoryStats, err := c.saveSnapshot(ctx, snapDetails, removedAddresses)
+	if err != nil {
+		return nil, err
 	}
 
 	// Finish cleaning up VM resources
 	if err = c.Remove(ctx); err != nil {
-		return err
+		return nil, err
 	}
 
-	return nil
+	return memoryStats, nil
 }
 
 type snapshotDetails struct {
diff --git a/enterprise/server/remote_execution/containers/firecracker/firecracker_test.go b/enterprise/server/remote_execution/containers/firecracker/firecracker_test.go
index b2fbb9ee315..675a14fe3f9 100644
--- a/enterprise/server/remote_execution/containers/firecracker/firecracker_test.go
+++ b/enterprise/server/remote_execution/containers/firecracker/firecracker_test.go
@@ -397,6 +397,244 @@ func TestFirecrackerLifecycle(t *testing.T) {
 	assertCommandResult(t, expectedResult, res)
 }
 
+// This test dirties a large amount of guest memory by compiling and running a
+// C program that writes to a big allocation.
+func TestBalloon_CScript(t *testing.T) {
+	ctx := context.Background()
+	env := getTestEnv(ctx, t, envOpts{})
+	env.SetAuthenticator(testauth.NewTestAuthenticator(testauth.TestUsers("US1", "GR1")))
+	rootDir := testfs.MakeTempDir(t)
+	workDir := testfs.MakeDirAll(t, rootDir, "work")
+
+	cfg := getExecutorConfig(t)
+	memorySize := int64(500)
+	opts := firecracker.ContainerOpts{
+		ContainerImage:         ubuntuImage,
+		ActionWorkingDirectory: workDir,
+		VMConfiguration: &fcpb.VMConfiguration{
+			NumCpus:            1,
+			MemSizeMb:          memorySize,
+			EnableNetworking:   true,
+			ScratchDiskSizeMb:  1000,
+			KernelVersion:      cfg.KernelVersion,
+			FirecrackerVersion: cfg.FirecrackerVersion,
+			GuestApiVersion:    cfg.GuestAPIVersion,
+		},
+		ExecutorConfig: cfg,
+	}
+	task := &repb.ExecutionTask{
+		Command: &repb.Command{
+			// Note: platform must match in order to share snapshots
+			Platform: &repb.Platform{Properties: []*repb.Platform_Property{
+				{Name: "recycle-runner", Value: "true"},
+			}},
+			Arguments: []string{"./buildbuddy_ci_runner"},
+		},
+	}
+
+	c, err := firecracker.NewContainer(ctx, env, task, opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if err := container.PullImageIfNecessary(ctx, env, c, oci.Credentials{}, opts.ContainerImage); err != nil {
+		t.Fatalf("unable to pull image: %s", err)
+	}
+
+	if err := c.Create(ctx, opts.ActionWorkingDirectory); err != nil {
+		t.Fatalf("unable to Create container: %s", err)
+	}
+	t.Cleanup(func() {
+		if err := c.Remove(ctx); err != nil {
+			t.Fatal(err)
+		}
+	})
+
+	cmd := &repb.Command{
+		// Install gcc if needed, then compile and run a program that mmaps
+		// 300MB of anonymous memory, fills every byte with a fixed value, and
+		// verifies the contents. This dirties a large amount of guest memory
+		// so we can measure how much data the balloon lets us skip saving.
+		Arguments: []string{"sh", "-c", `
+if ! command -v gcc > /dev/null; then
+	apt update
+	apt install -y gcc
+fi
+
+cat <<EOF > fillmem.c
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#define MB (1024 * 1024)
+
+// Allocate 'mb_count' bytes of memory and set the value to 'value'
+int fillmem(int mb_count, int value) {
+	int i;
+	char *ptr = NULL;
+	size_t size = mb_count * MB * sizeof(char);
+	do {
+		ptr = mmap(
+			NULL,
+			size,
+			PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE,
+			-1,
+			0
+		);
+	} while (ptr == MAP_FAILED);
+	memset(ptr, value, size);
+
+	// Iterate through allocated memory and make sure every byte equals the intended value
+	for (size_t i = 0; i < size; i++) {
+		if (ptr[i] != value) {
+			printf("Byte %zu is not %d! It is %d\n", i, value, ptr[i]);
+			return 1;
+		}
+	}
+
+	printf("Success!");
+
+	return 0;
+}
+int main(int argc, char *const argv[]) {
+	if (argc != 3) {
+		printf("Usage: ./fillmem mb_count value\n");
+		return -1;
+	}
+
+	int mb_count = atoi(argv[1]);
+	int value = atoi(argv[2]);
+	return fillmem(mb_count, value);
+}
+EOF
+
+gcc -o fillmem fillmem.c
+./fillmem 300 1
+	`},
+	}
+
+	res := c.Exec(ctx, cmd, nil /*=stdio*/)
+	require.NoError(t, res.Error)
+	err = c.Pause(ctx)
+	require.NoError(t, err)
+
+	// Try pause, unpause, exec several times.
+	for i := 1; i <= 5; i++ {
+		if err := c.Unpause(ctx); err != nil {
+			t.Fatalf("unable to unpause container: %s", err)
+		}
+
+		res := c.Exec(ctx, cmd, nil /*=stdio*/)
+		require.NoError(t, res.Error)
+
+		err = c.UpdateBalloon(ctx, int64(float64(memorySize)*0.7))
+		require.NoError(t, err)
+		err = c.UpdateBalloon(ctx, 0)
+		require.NoError(t, err)
+
+		memoryStats, err := c.PauseWithMemoryStats(ctx)
+		require.NoError(t, err)
+		log.Warningf("Memory stats are %v", memoryStats)
+		require.Greater(t, memoryStats.CleanedMB, int64(0))
+		require.Less(t, memoryStats.NeedToSaveMB, memoryStats.DirtyMB)
+	}
+}
+
+func TestBalloon_BazelBuild(t *testing.T) {
+	ctx := context.Background()
+	env := getTestEnv(ctx, t, envOpts{cacheSize: 20_000_000_000})
+	env.SetAuthenticator(testauth.NewTestAuthenticator(testauth.TestUsers("US1", "GR1")))
+	rootDir := testfs.MakeTempDir(t)
+	workDir := testfs.MakeDirAll(t, rootDir, "work")
+
+	cfg := getExecutorConfig(t)
+	opts := firecracker.ContainerOpts{
+		ContainerImage:         platform.Ubuntu20_04WorkflowsImage,
+		ActionWorkingDirectory: workDir,
+		VMConfiguration: &fcpb.VMConfiguration{
+			// NOTE: Getting mounting errors when increasing the num CPUs on GCP VM
+			NumCpus:            2,
+			MemSizeMb:          4000,
+			EnableNetworking:   true,
+			ScratchDiskSizeMb:  5000,
+			KernelVersion:      cfg.KernelVersion,
+			FirecrackerVersion: cfg.FirecrackerVersion,
+			GuestApiVersion:    cfg.GuestAPIVersion,
+		},
+		ExecutorConfig: cfg,
+	}
+	task := &repb.ExecutionTask{
+		Command: &repb.Command{
+			// Note: platform must match in order to share snapshots
+			Platform: &repb.Platform{Properties: []*repb.Platform_Property{
+				{Name: "recycle-runner", Value: "true"},
+			}},
+			Arguments: []string{"./buildbuddy_ci_runner"},
+		},
+	}
+
+	c, err := firecracker.NewContainer(ctx, env, task, opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if err := container.PullImageIfNecessary(ctx, env, c, oci.Credentials{}, opts.ContainerImage); err != nil {
+		t.Fatalf("unable to pull image: %s", err)
+	}
+
+	if err := c.Create(ctx, opts.ActionWorkingDirectory); err != nil {
+		t.Fatalf("unable to Create container: %s", err)
+	}
+	t.Cleanup(func() {
+		if err := c.Remove(ctx); err != nil {
+			t.Fatal(err)
+		}
+	})
+
+	cmd := &repb.Command{
+		Arguments: []string{"sh", "-c", `
+	cd ~
+	if [ -d buildbuddy ]; then
+
echo "Directory exists." + else + git clone https://github.com/buildbuddy-io/buildbuddy --filter=blob:none + fi + cd buildbuddy + # See https://github.com/bazelbuild/bazelisk/issues/220 + echo "USE_BAZEL_VERSION=7.4.0" > .bazeliskrc + bazelisk build //server/util/status:status + `}, + } + + res := c.Exec(ctx, cmd, nil /*=stdio*/) + require.NoError(t, res.Error) + err = c.Pause(ctx) + require.NoError(t, err) + + // Try pause, unpause, exec several times. + for i := 1; i <= 5; i++ { + if err := c.Unpause(ctx); err != nil { + t.Fatalf("unable to unpause container: %s", err) + } + + res := c.Exec(ctx, cmd, nil /*=stdio*/) + require.NoError(t, res.Error) + + err = c.UpdateBalloon(ctx, 200) + require.NoError(t, err) + err = c.UpdateBalloon(ctx, 0) + require.NoError(t, err) + + memoryStats, err := c.PauseWithMemoryStats(ctx) + require.NoError(t, err) + log.Warningf("Memory stats are %v", memoryStats) + require.Greater(t, memoryStats.CleanedMB, int64(0)) + require.Less(t, memoryStats.NeedToSaveMB, memoryStats.DirtyMB) + } +} + func TestFirecrackerSnapshotAndResume(t *testing.T) { // Test for both small and large memory sizes for _, memorySize := range []int64{minMemSizeMB, 4000} { @@ -500,6 +738,11 @@ func TestFirecrackerSnapshotAndResume(t *testing.T) { assert.Equal(t, fmt.Sprintf("/workspace/count: %d\n/root/count: %d\n", countBefore+1, i), string(res.Stdout)) require.NotContains(t, string(res.AuxiliaryLogs["vm_log_tail.txt"]), "is not a multiple of sector size") cpuMillisObservations = append(cpuMillisObservations, float64(res.UsageStats.GetCpuNanos())/1e6) + + err = c.UpdateBalloon(ctx, 100) + require.NoError(t, err) + err = c.UpdateBalloon(ctx, 0) + require.NoError(t, err) } // CPU usage should be reported per-task rather than accumulated over diff --git a/enterprise/server/remote_execution/copy_on_write/copy_on_write.go b/enterprise/server/remote_execution/copy_on_write/copy_on_write.go index 6057d0c3d45..3224ac83c04 100644 --- a/enterprise/server/remote_execution/copy_on_write/copy_on_write.go +++ b/enterprise/server/remote_execution/copy_on_write/copy_on_write.go @@ -143,6 +143,10 @@ type COWStore struct { // usageLock protects chunkOperationToUsageSummary usageLock sync.Mutex chunkOperationToUsageSummary map[string]usageSummary + + // Cleaned pages in the store. Used to track theoretical memory savings because + // logic to save partial chunks hasn't been implemented yet. + cleanedPages map[int64]struct{} } // NewCOWStore creates a COWStore from the given chunks. The chunks should be @@ -185,6 +189,7 @@ func NewCOWStore(ctx context.Context, env environment.Env, name string, chunks [ eagerFetchEg: &errgroup.Group{}, quitChan: make(chan struct{}), chunkOperationToUsageSummary: make(map[string]usageSummary, 0), + cleanedPages: make(map[int64]struct{}, 0), } s.eagerFetchEg.Go(func() error { @@ -251,6 +256,10 @@ func (c *COWStore) GetPageAddress(offset uintptr, write bool) (uintptr, error) { c.storeLock.RLock() chunk := c.chunks[chunkStartOffset] + + // If we're mapping the page, don't consider it cleaned + delete(c.cleanedPages, int64(offset)) + c.storeLock.RUnlock() if chunk == nil { // No data (yet); map into our static zero-filled buf. Note that this @@ -481,6 +490,31 @@ func (s *COWStore) Dirty(chunkOffset int64) bool { return s.dirty[chunkOffset] } +// Mark that a page has been removed, so the corresponding data will not be used. +// This is only used for tracking. In the full implementation, we actually need to +// remove these bytes from the store before caching it. 
+func (s *COWStore) CleanChunk(chunkOffset int64) { + s.storeLock.Lock() + defer s.storeLock.Unlock() + // BALLOON: Only valid if chunk size = 1 page + //s.dirty[chunkOffset] = false + s.cleanedPages[chunkOffset] = struct{}{} +} + +func (s *COWStore) UseCleanedChunk(chunkOffset int64) { + s.storeLock.Lock() + defer s.storeLock.Unlock() + delete(s.cleanedPages, chunkOffset) +} + +// This represents the amount of dirtied data that we don't need to save, because the balloon +// has expanded into that space +func (s *COWStore) NumCleanPages() int { + cleanedMBs := (len(s.cleanedPages) * os.Getpagesize()) / (1024 * 1024) + log.Warningf("Total cleaned pages: %d, %dMB", len(s.cleanedPages), cleanedMBs) + return len(s.cleanedPages) +} + // UnmapChunk unmaps the chunk containing the input offset func (s *COWStore) UnmapChunk(offset int64) error { chunkStartOffset := s.ChunkStartOffset(offset) diff --git a/enterprise/server/remote_execution/snaploader/snaploader.go b/enterprise/server/remote_execution/snaploader/snaploader.go index 914fc275bed..e8d91f55c54 100644 --- a/enterprise/server/remote_execution/snaploader/snaploader.go +++ b/enterprise/server/remote_execution/snaploader/snaploader.go @@ -2,6 +2,7 @@ package snaploader import ( "context" + "encoding/json" "fmt" "os" "path/filepath" @@ -298,6 +299,10 @@ func (s *Snapshot) GetChunkedFiles() []*fcpb.ChunkedFile { return s.manifest.GetChunkedFiles() } +func (s *Snapshot) GetRemovedAddresses() map[int64]string { + return s.manifest.GetRemovedAddresses() +} + // CacheSnapshotOptions contains any assets or configuration to be associated // with a stored snapshot. // @@ -320,6 +325,10 @@ type CacheSnapshotOptions struct { // Labeled map of chunked artifacts backed by copy_on_write.COWStore storage. ChunkedFiles map[string]*copy_on_write.COWStore + // For memory snapshots, addresses that were removed and should be zeroed + // the next time they are page faulted. 
+ RemovedAddresses map[int64]string + // Whether the snapshot is from a recycled VM Recycled bool @@ -487,18 +496,26 @@ func (l *FileCacheLoader) actionResultToManifest(ctx context.Context, remoteInst } var vmMetadata *fcpb.VMMetadata - if len(snapMetadata) == 2 { + if len(snapMetadata) >= 2 { vmMetadata = &fcpb.VMMetadata{} if err := snapMetadata[1].UnmarshalTo(vmMetadata); err != nil { return nil, status.WrapErrorf(err, "unmarshall vm metadata") } } + var removedAddresses map[int64]string + if len(snapMetadata) >= 3 { + err := json.Unmarshal(snapMetadata[2].GetValue(), &removedAddresses) + if err != nil { + return nil, err + } + } manifest := &fcpb.SnapshotManifest{ - VmMetadata: vmMetadata, - VmConfiguration: vmConfig, - Files: []*repb.FileNode{}, - ChunkedFiles: []*fcpb.ChunkedFile{}, + VmMetadata: vmMetadata, + VmConfiguration: vmConfig, + Files: []*repb.FileNode{}, + ChunkedFiles: []*fcpb.ChunkedFile{}, + RemovedAddresses: removedAddresses, } for _, fileMetadata := range snapshotActionResult.OutputFiles { @@ -598,14 +615,14 @@ func (l *FileCacheLoader) UnpackSnapshot(ctx context.Context, snapshot *Snapshot return unpacked, nil } -func (l *FileCacheLoader) CacheSnapshot(ctx context.Context, key *fcpb.SnapshotKey, opts *CacheSnapshotOptions) error { +func (l *FileCacheLoader) CacheSnapshot(ctx context.Context, key *fcpb.SnapshotKey, opts *CacheSnapshotOptions) (*CacheCowStats, error) { vmConfig, err := anypb.New(opts.VMConfiguration) if err != nil { - return err + return nil, err } vmMetadata, err := anypb.New(opts.VMMetadata) if err != nil { - return err + return nil, err } ar := &repb.ActionResult{ ExecutionMetadata: &repb.ExecutedActionMetadata{ @@ -614,6 +631,17 @@ func (l *FileCacheLoader) CacheSnapshot(ctx context.Context, key *fcpb.SnapshotK OutputFiles: []*repb.OutputFile{}, OutputDirectories: []*repb.OutputDirectory{}, } + // For memory snapshots, addresses that were removed and should be zeroed + // the next time they are page faulted, even across snapshots. + if len(opts.RemovedAddresses) > 0 { + jsonData, err := json.Marshal(opts.RemovedAddresses) + if err != nil { + return nil, err + } + ar.ExecutionMetadata.AuxiliaryMetadata = append(ar.ExecutionMetadata.AuxiliaryMetadata, &anypb.Any{ + Value: jsonData, + }) + } eg, egCtx := errgroup.WithContext(ctx) @@ -658,6 +686,8 @@ func (l *FileCacheLoader) CacheSnapshot(ctx context.Context, key *fcpb.SnapshotK return snaputil.Cache(ctx, l.env.GetFileCache(), l.env.GetByteStreamClient(), opts.Remote, d, key.InstanceName, filePath) }) } + + var memoryStats *CacheCowStats for name, cow := range opts.ChunkedFiles { name, cow := name, cow dir := &repb.OutputDirectory{ @@ -667,23 +697,26 @@ func (l *FileCacheLoader) CacheSnapshot(ctx context.Context, key *fcpb.SnapshotK ar.OutputDirectories = append(ar.OutputDirectories, dir) eg.Go(func() error { ctx := egCtx - treeDigest, err := l.cacheCOW(ctx, name, key.InstanceName, cow, opts) + treeDigest, stats, err := l.cacheCOW(ctx, name, key.InstanceName, cow, opts) if err != nil { return status.WrapErrorf(err, "cache %q COW", name) } + if name == "memory" { + memoryStats = stats + } dir.TreeDigest = treeDigest return nil }) } if err := eg.Wait(); err != nil { - return err + return nil, err } // Write the ActionResult to the cache only after we've successfully // uploaded all snapshot related artifacts. We'll retrieve this later in // order to unpack the snapshot. 
- return l.cacheActionResult(ctx, key, ar, opts) + return memoryStats, l.cacheActionResult(ctx, key, ar, opts) } func (l *FileCacheLoader) cacheActionResult(ctx context.Context, key *fcpb.SnapshotKey, ar *repb.ActionResult, opts *CacheSnapshotOptions) error { @@ -817,9 +850,21 @@ func (l *FileCacheLoader) unpackCOW(ctx context.Context, file *fcpb.ChunkedFile, return cow, nil } +type CacheCowStats struct { + Name string + // Total number of MB that were written to + DirtyMB int64 + // Total number of MB that were removed by the balloon + CleanedMB int64 + // DirtyMB - CleanedMB + // Because saving partial chunks hasn't been implemented yet, use this as a + // proxy for how much data we'd be saving if we moved all removed pages + NeedToSaveMB int64 +} + // cacheCOW represents a COWStore as an action result tree and saves the store // to the cache. Returns the digest of the tree -func (l *FileCacheLoader) cacheCOW(ctx context.Context, name string, remoteInstanceName string, cow *copy_on_write.COWStore, cacheOpts *CacheSnapshotOptions) (*repb.Digest, error) { +func (l *FileCacheLoader) cacheCOW(ctx context.Context, name string, remoteInstanceName string, cow *copy_on_write.COWStore, cacheOpts *CacheSnapshotOptions) (*repb.Digest, *CacheCowStats, error) { var dirtyBytes, dirtyChunkCount int64 start := time.Now() defer func() { @@ -828,7 +873,7 @@ func (l *FileCacheLoader) cacheCOW(ctx context.Context, name string, remoteInsta size, err := cow.SizeBytes() if err != nil { - return nil, err + return nil, nil, err } tree := &repb.Tree{ @@ -916,20 +961,20 @@ func (l *FileCacheLoader) cacheCOW(ctx context.Context, name string, remoteInsta } if err := eg.Wait(); err != nil { - return nil, status.WrapError(err, "cache chunks") + return nil, nil, status.WrapError(err, "cache chunks") } // Save ActionCache Tree to the cache treeDigest, err := digest.ComputeForMessage(tree, repb.DigestFunction_BLAKE3) if err != nil { - return nil, err + return nil, nil, err } treeBytes, err := proto.Marshal(tree) if err != nil { - return nil, err + return nil, nil, err } if err := snaputil.CacheBytes(ctx, l.env.GetFileCache(), l.env.GetByteStreamClient(), cacheOpts.Remote, treeDigest, remoteInstanceName, treeBytes); err != nil { - return nil, err + return nil, nil, err } metrics.COWSnapshotDirtyChunkRatio.With(prometheus.Labels{ @@ -946,7 +991,15 @@ func (l *FileCacheLoader) cacheCOW(ctx context.Context, name string, remoteInsta }).Observe(float64(count) / float64(len(chunks))) } - return treeDigest, nil + dirtyMB := dirtyBytes / (1024 * 1024) + cleanedMB := int64(cow.NumCleanPages() * os.Getpagesize() / (1024 * 1024)) + stats := CacheCowStats{ + Name: name, + DirtyMB: dirtyMB, + CleanedMB: cleanedMB, + NeedToSaveMB: dirtyMB - cleanedMB, + } + return treeDigest, &stats, nil } type SnapshotService struct { @@ -1051,7 +1104,7 @@ func UnpackContainerImage(ctx context.Context, l *FileCacheLoader, instanceName, Recycled: false, Remote: remoteEnabled, } - if err := l.CacheSnapshot(ctx, key.GetBranchKey(), opts); err != nil { + if _, err := l.CacheSnapshot(ctx, key.GetBranchKey(), opts); err != nil { return nil, status.WrapError(err, "cache containerfs snapshot") } log.CtxDebugf(ctx, "Converted containerfs to COW in %s", time.Since(start)) diff --git a/enterprise/server/remote_execution/snaputil/snaputil.go b/enterprise/server/remote_execution/snaputil/snaputil.go index c2da089f813..c7abbdfe90b 100644 --- a/enterprise/server/remote_execution/snaputil/snaputil.go +++ b/enterprise/server/remote_execution/snaputil/snaputil.go @@ 
-69,21 +69,22 @@ func (s ChunkSource) String() string { } func GetArtifact(ctx context.Context, localCache interfaces.FileCache, bsClient bytestream.ByteStreamClient, remoteEnabled bool, d *repb.Digest, instanceName string, outputPath string) (ChunkSource, error) { - node := &repb.FileNode{Digest: d} - fetchedLocally := localCache.FastLinkFile(ctx, node, outputPath) - if fetchedLocally { - return ChunkSourceLocalFilecache, nil - } - - if !*EnableRemoteSnapshotSharing || !remoteEnabled { - return 0, status.UnavailableErrorf("snapshot artifact with digest %v not found in local cache", d) - } - - if *VerboseLogging { - start := time.Now() - log.CtxDebugf(ctx, "Fetching snapshot artifact: instance=%q file=%s hash=%s", instanceName, StripChroot(outputPath), d.GetHash()) - defer func() { log.CtxDebugf(ctx, "Fetched remote snapshot artifact in %s", time.Since(start)) }() - } + // BALLOON: Disable local caching to resolve file cache hard link limits + //node := &repb.FileNode{Digest: d} + //fetchedLocally := localCache.FastLinkFile(ctx, node, outputPath) + //if fetchedLocally { + // return ChunkSourceLocalFilecache, nil + //} + // + //if !*EnableRemoteSnapshotSharing || !remoteEnabled { + // return 0, status.UnavailableErrorf("snapshot artifact with digest %v not found in local cache", d) + //} + // + //if *VerboseLogging { + // start := time.Now() + // log.CtxDebugf(ctx, "Fetching snapshot artifact: instance=%q file=%s hash=%s", instanceName, StripChroot(outputPath), d.GetHash()) + // defer func() { log.CtxDebugf(ctx, "Fetched remote snapshot artifact in %s", time.Since(start)) }() + //} // Fetch from remote cache f, err := os.Create(outputPath) @@ -98,9 +99,9 @@ func GetArtifact(ctx context.Context, localCache interfaces.FileCache, bsClient } // Save to local cache so next time fetching won't require a remote get - if err := cacheLocally(ctx, localCache, d, outputPath); err != nil { - log.Warningf("saving %s to local filecache failed: %s", outputPath, err) - } + //if err := cacheLocally(ctx, localCache, d, outputPath); err != nil { + // log.Warningf("saving %s to local filecache failed: %s", outputPath, err) + //} return ChunkSourceRemoteCache, nil } @@ -127,11 +128,12 @@ func GetBytes(ctx context.Context, localCache interfaces.FileCache, bsClient byt // Cache saves a file written to `path` to the local cache, and the remote cache // if remote snapshot sharing is enabled func Cache(ctx context.Context, localCache interfaces.FileCache, bsClient bytestream.ByteStreamClient, remoteEnabled bool, d *repb.Digest, remoteInstanceName string, path string) error { - localCacheErr := cacheLocally(ctx, localCache, d, path) - if !*EnableRemoteSnapshotSharing || *RemoteSnapshotReadonly || !remoteEnabled { - return localCacheErr - } - + // BALLOON: Disable local caching to resolve file cache hard link limits + //localCacheErr := cacheLocally(ctx, localCache, d, path) + //if !*EnableRemoteSnapshotSharing || *RemoteSnapshotReadonly || !remoteEnabled { + // return localCacheErr + //} + // if *VerboseLogging { start := time.Now() log.CtxDebugf(ctx, "Uploading snapshot artifact: instance=%q file=%s hash=%s", remoteInstanceName, StripChroot(path), d.GetHash()) diff --git a/enterprise/server/remote_execution/uffd/BUILD b/enterprise/server/remote_execution/uffd/BUILD index c0f8935e7d3..451590e8764 100644 --- a/enterprise/server/remote_execution/uffd/BUILD +++ b/enterprise/server/remote_execution/uffd/BUILD @@ -9,17 +9,14 @@ go_library( "@platforms//os:linux", ], visibility = ["//visibility:public"], - deps = select({ 
- "@io_bazel_rules_go//go/platform:linux": [ - "//enterprise/server/remote_execution/copy_on_write", - "//server/metrics", - "//server/util/log", - "//server/util/status", - "@com_github_prometheus_client_golang//prometheus", - "@org_golang_x_sys//unix", - ], - "//conditions:default": [], - }), + deps = [ + "//enterprise/server/remote_execution/copy_on_write", + "//server/metrics", + "//server/util/log", + "//server/util/status", + "@com_github_prometheus_client_golang//prometheus", + "@org_golang_x_sys//unix", + ], ) package(default_visibility = ["//enterprise:__subpackages__"]) diff --git a/enterprise/server/remote_execution/uffd/uffd.go b/enterprise/server/remote_execution/uffd/uffd.go index d7de22984b7..9f4c03d0c4a 100644 --- a/enterprise/server/remote_execution/uffd/uffd.go +++ b/enterprise/server/remote_execution/uffd/uffd.go @@ -1,10 +1,9 @@ -//go:build linux && !android - package uffd import ( "context" "encoding/json" + "flag" "net" "os" "syscall" @@ -25,26 +24,28 @@ import ( */ import "C" +var verboseLogging = flag.Bool("executor.verbose_uffd_logging", false, "Enables verbose UFFD logs") + // UFFD macros - see README for more info const UFFDIO_COPY = 0xc028aa03 +const UFFDIO_ZEROPAGE = 0xc020aa04 -// uffdMsg is a notification from the userfaultfd object about a change in the virtual memory layout of the -// faulting process -// It's defined in the linux header file userfaultfd.h -type uffdMsg struct { - Event uint8 - - Reserved1 uint8 - Reserved2 uint16 - Reserved3 uint32 +type Pagefault struct { + Flags uint64 + Address uint64 + Ptid uint32 +} - PageFault struct { - Flags uint64 - Address uint64 - Ptid uint32 - } +type Remove struct { + Start uint64 + End uint64 } +// uffdMsg is a notification from the userfaultfd object about a change in the +// virtual memory layout of the faulting process +// It's defined in the linux header file userfaultfd.h +type UffdMsg = C.struct_uffd_msg + // uffdioCopy contains input/output data to the UFFDIO_COPY syscall // It's defined in the linux header file userfaultfd.h type uffdioCopy struct { @@ -55,6 +56,19 @@ type uffdioCopy struct { Copy int64 // After the syscall has completed, contains the number of bytes copied, or a negative errno to indicate failure } +// uffdioZeropage contains input and output data to the UFFDIO_ZEROPAGE syscall. +// It's defined in the linux header file userfaultfd.h. +type uffdIoZeropage struct { + Range uffdIoRange + Mode uint64 + // Number of bytes zeroed + Zeropage int64 +} +type uffdIoRange struct { + Start uint64 + Len uint64 +} + // GuestRegionUFFDMapping represents the mapping between a VM memory address to the offset in the corresponding // memory snapshot file. // @@ -119,11 +133,20 @@ type Handler struct { mappedPageFaults map[int64][]PageFaultData pageFaultTotalDuration time.Duration + + // Addresses that have been removed in the guest. + // The next time there is a page fault on this address, we can copy in all 0s + // with UFFDIO_ZEROPAGE. 
+ removedAddresses map[int64]string } -func NewHandler() (*Handler, error) { +func NewHandler(removedAddresses map[int64]string) (*Handler, error) { + if removedAddresses == nil { + removedAddresses = make(map[int64]string) + } return &Handler{ mappedPageFaults: map[int64][]PageFaultData{}, + removedAddresses: removedAddresses, }, nil } @@ -276,8 +299,8 @@ func (h *Handler) handle(ctx context.Context, memoryStore *copy_on_write.COWStor return nil } - // Receive a page fault notification - guestFaultingAddr, err := readFaultingAddress(uffd) + // Receive a UFFD notification + pageFaultEvent, removeEvent, err := readEvent(uffd) if err != nil { if err == unix.EAGAIN { // Try again code @@ -286,89 +309,146 @@ func (h *Handler) handle(ctx context.Context, memoryStore *copy_on_write.COWStor return status.InternalErrorf("read event from uffd failed with errno(%d)", err) } - mapping, err := guestMemoryAddrToMapping(uintptr(guestFaultingAddr), mappings) - if err != nil { - return err - } - - guestPageAddr := pageStartAddress(guestFaultingAddr, pageSize) - if guestPageAddr < mapping.BaseHostVirtAddr { - // Make sure we only try to map addresses that fall within the valid - // guest memory ranges - guestPageAddr = mapping.BaseHostVirtAddr - } - - // Find the memory data in the store that should be used to handle the page fault - faultStoreOffset := guestMemoryAddrToStoreOffset(guestPageAddr, *mapping) - - // To reduce the number of UFFD round trips, try to copy the entire - // chunk containing the faulting address - hostAddr, copySize, err := memoryStore.GetChunkStartAddressAndSize(uintptr(faultStoreOffset), false /*=write*/) - if err != nil { - return status.WrapError(err, "get backing page address") - } - - relOffset := memoryStore.GetRelativeOffsetFromChunkStart(faultStoreOffset) - chunkStartOffset := faultStoreOffset - relOffset - destAddr := guestPageAddr - relOffset - - // If copying the entire chunk would map data falling outside the valid - // guest memory range, only copy the valid parts of the chunk - // - // Check for data below the valid memory range - var invalidBytesAtChunkStart uintptr - if destAddr < mapping.BaseHostVirtAddr { - invalidBytesAtChunkStart = mapping.BaseHostVirtAddr - destAddr - destAddr = mapping.BaseHostVirtAddr - hostAddr += invalidBytesAtChunkStart - copySize -= int64(invalidBytesAtChunkStart) - } - // Check for data above the valid memory range - mappingEndAddr := mapping.BaseHostVirtAddr + mapping.Size - copyEndAddr := destAddr + uintptr(copySize) - var invalidBytesAtChunkEnd int64 - if copyEndAddr > mappingEndAddr { - invalidBytesAtChunkEnd = int64(copyEndAddr - mappingEndAddr) - copySize -= invalidBytesAtChunkEnd - } - - // Should never map a partial page at the end of the file, but just - // warn in this case for now. 
- if remainder := storeLength - int64(faultStoreOffset); remainder < int64(pageSize) { - log.CtxWarningf(ctx, "uffdio_copy range extends past store length") + if removeEvent != nil { + if *verboseLogging { + log.Warningf("Remove event start is %v, end is %v", removeEvent.Start, removeEvent.End) + } + for removedAddr := int64(removeEvent.Start); removedAddr < int64(removeEvent.End); removedAddr += int64(os.Getpagesize()) { + h.removedAddresses[removedAddr] = "" + + mapping, err := guestMemoryAddrToMapping(uintptr(removedAddr), mappings) + if err != nil { + return err + } + guestPageAddr := pageStartAddress(uint64(removedAddr), pageSize) + faultStoreOffset := guestMemoryAddrToStoreOffset(guestPageAddr, *mapping) + memoryStore.CleanChunk(int64(faultStoreOffset)) + + // TODO: Should we write all 0s to the backing store here? Just + // to be safe? + } } - // Store debug data about page fault request - debugData := PageFaultData{ - // Original data sent from firecracker VM - GuestFaultingAddr: int64(guestFaultingAddr), - GuestRegionMapping: mapping, - - DestAddr: int64(destAddr), - ChunkStartOffset: int64(chunkStartOffset), - ChunkStartAddr: int64(hostAddr), - CopySize: copySize, - InvalidBytesAtChunkStart: int64(invalidBytesAtChunkStart), - InvalidBytesAtChunkEnd: invalidBytesAtChunkEnd, - } - h.mappedPageFaults[int64(chunkStartOffset)] = append(h.mappedPageFaults[int64(chunkStartOffset)], debugData) + if pageFaultEvent != nil { + // The memory location the VM tried to access that triggered the page fault + guestFaultingAddr := pageFaultEvent.Address - _, err = h.resolvePageFault(uffd, uint64(destAddr), uint64(hostAddr), uint64(copySize)) - if err != nil { - mappedRangesForChunk := h.mappedPageFaults[int64(chunkStartOffset)] - mappedRangesStr := "" - for _, r := range mappedRangesForChunk { - mappedRangesStr += r.String() + // Firecracker sends us mappings from certain address ranges in the guest + // to the corresponding memory in the snapshot file. + mapping, err := guestMemoryAddrToMapping(uintptr(guestFaultingAddr), mappings) + if err != nil { + return err } - log.CtxWarningf(ctx, "Failed to resolve page fault %s due to err %s\nMapped ranges for chunk: %s", debugData.String(), err, mappedRangesStr) - return err - } + // Clean up address. Make sure it's aligned to the page size and valid. 
+ guestPageAddr := pageStartAddress(guestFaultingAddr, pageSize) + if guestPageAddr < mapping.BaseHostVirtAddr { + // Make sure we only try to map addresses that fall within the valid + // guest memory ranges + guestPageAddr = mapping.BaseHostVirtAddr + } - // After memory has been copied to the VM, unmap the chunk to save memory - // usage on the executor - if err := memoryStore.UnmapChunk(int64(chunkStartOffset)); err != nil { - return err + if _, removed := h.removedAddresses[int64(guestFaultingAddr)]; removed { + zeroIO := uffdIoZeropage{ + Range: uffdIoRange{ + Start: guestFaultingAddr, + Len: uint64(pageSize), + }, + } + if *verboseLogging { + log.Warningf("Zeroing %v", guestFaultingAddr) + } + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uffd, UFFDIO_ZEROPAGE, uintptr(unsafe.Pointer(&zeroIO))) + if errno != 0 { + log.Warningf("UFFDIO_ZEROPAGE failed with errno(%d)", errno) + return status.InternalErrorf("UFFDIO_ZEROPAGE failed with errno(%d)", errno) + } + + // Remove address from our internal tracking of removed pages + delete(h.removedAddresses, int64(guestFaultingAddr)) + + faultStoreOffset := guestMemoryAddrToStoreOffset(guestPageAddr, *mapping) + memoryStore.UseCleanedChunk(int64(faultStoreOffset)) + } else { + // Find the memory data in the store that should be used to handle the page fault + faultStoreOffset := guestMemoryAddrToStoreOffset(guestPageAddr, *mapping) + + // BALLOON: Only page fault in one page at a time + hostAddr, err := memoryStore.GetPageAddress(faultStoreOffset, false) + if err != nil { + return status.WrapError(err, "get backing page address") + } + copySize := int64(os.Getpagesize()) + // BALLOON: Comment out logic to copy multiple pages + // To reduce the number of UFFD round trips, try to copy the entire + // chunk containing the faulting address + //hostAddr, copySize, err := memoryStore.GetChunkStartAddressAndSize(uintptr(faultStoreOffset), false /*=write*/) + + relOffset := memoryStore.GetRelativeOffsetFromChunkStart(faultStoreOffset) + chunkStartOffset := faultStoreOffset - relOffset + // BALLOON: Comment out logic to copy multiple pages + //destAddr := guestPageAddr - relOffset + destAddr := guestPageAddr + + // If copying the entire chunk would map data falling outside the valid + // guest memory range, only copy the valid parts of the chunk + // + // Check for data below the valid memory range + var invalidBytesAtChunkStart uintptr + if destAddr < mapping.BaseHostVirtAddr { + invalidBytesAtChunkStart = mapping.BaseHostVirtAddr - destAddr + destAddr = mapping.BaseHostVirtAddr + hostAddr += invalidBytesAtChunkStart + copySize -= int64(invalidBytesAtChunkStart) + } + // Check for data above the valid memory range + mappingEndAddr := mapping.BaseHostVirtAddr + mapping.Size + copyEndAddr := destAddr + uintptr(copySize) + var invalidBytesAtChunkEnd int64 + if copyEndAddr > mappingEndAddr { + invalidBytesAtChunkEnd = int64(copyEndAddr - mappingEndAddr) + copySize -= invalidBytesAtChunkEnd + } + + // Should never map a partial page at the end of the file, but just + // warn in this case for now. 
+ if remainder := storeLength - int64(faultStoreOffset); remainder < int64(pageSize) { + log.CtxWarningf(ctx, "uffdio_copy range extends past store length") + } + + // Store debug data about page fault request + debugData := PageFaultData{ + // Original data sent from firecracker VM + GuestFaultingAddr: int64(guestFaultingAddr), + GuestRegionMapping: mapping, + + DestAddr: int64(destAddr), + ChunkStartOffset: int64(chunkStartOffset), + ChunkStartAddr: int64(hostAddr), + CopySize: copySize, + InvalidBytesAtChunkStart: int64(invalidBytesAtChunkStart), + InvalidBytesAtChunkEnd: invalidBytesAtChunkEnd, + } + h.mappedPageFaults[int64(chunkStartOffset)] = append(h.mappedPageFaults[int64(chunkStartOffset)], debugData) + + _, err = h.resolvePageFault(uffd, uint64(destAddr), uint64(hostAddr), uint64(copySize)) + if err != nil { + mappedRangesForChunk := h.mappedPageFaults[int64(chunkStartOffset)] + mappedRangesStr := "" + for _, r := range mappedRangesForChunk { + mappedRangesStr += r.String() + } + + log.CtxWarningf(ctx, "Failed to resolve page fault %s due to err %s\nMapped ranges for chunk: %s", debugData.String(), err, mappedRangesStr) + return err + } + + // After memory has been copied to the VM, unmap the chunk to save memory + // usage on the executor + if err := memoryStore.UnmapChunk(int64(chunkStartOffset)); err != nil { + return err + } + } } } } @@ -391,6 +471,9 @@ func (h *Handler) resolvePageFault(uffd uintptr, faultingRegion uint64, src uint h.pageFaultTotalDuration += time.Since(start) }() + if *verboseLogging { + log.Warningf("Page fault for %v", faultingRegion) + } copyData := uffdioCopy{ Dst: faultingRegion, Src: src, @@ -398,32 +481,42 @@ func (h *Handler) resolvePageFault(uffd uintptr, faultingRegion uint64, src uint } _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uffd, UFFDIO_COPY, uintptr(unsafe.Pointer(©Data))) if errno != 0 { + log.Warningf("Got errno %d at %v", errno, faultingRegion) // If error is due to the page already being mapped, ignore. if errno == unix.EEXIST { return 0, nil } + // Means the copy coincided with an EVENT_REMOVE. + // UFFDIO_COPY is not allowed, to prevent race conditions. Ignore. + // https://github.com/torvalds/linux/commit/df2cc96e77011cf7989208b206da9817e0321028 + if errno == unix.EAGAIN { + return 0, nil + } return 0, status.InternalErrorf("UFFDIO_COPY failed with errno(%d)", errno) } return int64(copyData.Copy), nil } -// readFaultingAddress reads a notification from the uffd object and returns the faulting address +// readEvent reads a notification from the uffd object and returns the +// event type and faulting address // (i.e. 
the memory location the VM tried to access that triggered the page fault)
-func readFaultingAddress(uffd uintptr) (uint64, error) {
-	var event uffdMsg
+// The event is either a page fault or a remove notification; exactly one of
+// the returned pointers will be non-nil.
+func readEvent(uffd uintptr) (*Pagefault, *Remove, error) {
+	var event UffdMsg
 	_, _, errno := syscall.Syscall(syscall.SYS_READ, uffd, uintptr(unsafe.Pointer(&event)), unsafe.Sizeof(event))
 	if errno != 0 {
-		return 0, errno
+		return nil, nil, errno
 	}
 
-	if event.Event != C.UFFD_EVENT_PAGEFAULT {
-		return 0, status.InternalErrorf("unsupported uffd event type %v", event.Event)
-	}
-	if event.PageFault.Flags&C.UFFD_PAGEFAULT_FLAG_WP != 0 {
-		return 0, status.InternalErrorf("got message with WP flag, but write protection is not yet supported")
+	if event.event == C.UFFD_EVENT_PAGEFAULT {
+		pagefault := (*(*Pagefault)(unsafe.Pointer(&event.arg[0])))
+		return &pagefault, nil, nil
+	} else if event.event == C.UFFD_EVENT_REMOVE {
+		remove := (*(*Remove)(unsafe.Pointer(&event.arg[0])))
+		return nil, &remove, nil
 	}
-
-	return event.PageFault.Address, nil
+	return nil, nil, status.InternalErrorf("unsupported uffd event type %v", event.event)
 }
 
 // Gets the address of the start of the memory page containing `addr`
@@ -463,6 +556,10 @@ func (h *Handler) Stop() error {
 	return nil
 }
 
+func (h *Handler) RemovedAddresses() map[int64]string {
+	return h.removedAddresses
+}
+
 // Translate the faulting memory address in the guest to a persisted store offset
 // based on the memory mappings.
 func guestMemoryAddrToStoreOffset(addr uintptr, mapping GuestRegionUFFDMapping) uintptr {
diff --git a/enterprise/vmsupport/kernel/microvm-kernel-x86_64.config b/enterprise/vmsupport/kernel/microvm-kernel-x86_64.config
index 6efa519e9da..0d89de3f478 100644
--- a/enterprise/vmsupport/kernel/microvm-kernel-x86_64.config
+++ b/enterprise/vmsupport/kernel/microvm-kernel-x86_64.config
@@ -813,7 +813,7 @@ CONFIG_MEMORY_HOTPLUG_SPARSE=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
 CONFIG_MEMORY_BALLOON=y
-# CONFIG_BALLOON_COMPACTION is not set
+CONFIG_BALLOON_COMPACTION=y
 CONFIG_COMPACTION=y
 CONFIG_PAGE_REPORTING=y
 CONFIG_MIGRATION=y
diff --git a/proto/firecracker.proto b/proto/firecracker.proto
index 141ff3494c7..8cf575c84c6 100644
--- a/proto/firecracker.proto
+++ b/proto/firecracker.proto
@@ -123,6 +123,8 @@ message SnapshotManifest {
   // the snapshot creator is expected to change each time the snapshot is
   // updated.
   VMMetadata vm_metadata = 4;
+
+  map<int64, string> removed_addresses = 5;
 }
 
 // Represents a chunked file for use with copy-on-write snapshotting.
diff --git a/server/util/log/log.go b/server/util/log/log.go
index 655b11765b4..19105eda230 100644
--- a/server/util/log/log.go
+++ b/server/util/log/log.go
@@ -7,7 +7,6 @@ import (
 	"io"
 	"net/http"
 	"os"
-	"path"
 	"path/filepath"
 	"runtime"
 	"strconv"
@@ -82,9 +81,9 @@ func LogGRPCRequest(ctx context.Context, fullMethod string, dur time.Duration, e
 	}
 	// ByteStream and DistributedCache services share some method names.
 	// We disambiguate them in the logs by adding a D prefix to DistributedCache methods.
- fullMethod = strings.Replace(fullMethod, "distributed_cache.DistributedCache/", "D", 1) - shortPath := "/" + path.Base(fullMethod) - CtxDebugf(ctx, "%s %s %s [%s]", "gRPC", shortPath, fmtErr(err), formatDuration(dur)) + //fullMethod = strings.Replace(fullMethod, "distributed_cache.DistributedCache/", "D", 1) + //shortPath := "/" + path.Base(fullMethod) + //CtxDebugf(ctx, "%s %s %s [%s]", "gRPC", shortPath, fmtErr(err), formatDuration(dur)) if *LogErrorStackTraces { if se, ok := err.(interface { StackTrace() status.StackTrace diff --git a/tools/enable_local_firecracker.sh b/tools/enable_local_firecracker.sh index d644e617e6d..5e284678927 100755 --- a/tools/enable_local_firecracker.sh +++ b/tools/enable_local_firecracker.sh @@ -22,7 +22,7 @@ if ! command -v iptables &>/dev/null; then fi # Install firecracker to make sure the local version matches the one in deps.bzl. -tools/install_firecracker.sh +# tools/install_firecracker.sh if ! command -v jailer &>/dev/null; then echo "jailer could not be found (make sure /usr/local/bin is in PATH)"
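
For context, here is a minimal sketch of how the balloon hooks introduced above are intended to be driven between recycled runs: inflate the balloon so the guest returns free pages (which reach the UFFD handler as remove events and are tracked as cleaned pages), deflate it again, then pause while collecting the memory-savings stats. Only UpdateBalloon, PauseWithMemoryStats, and the CacheCowStats fields come from this change; the reclaimFreeMemory helper name and the 70% sizing are illustrative assumptions, not part of the diff.

package example

import (
	"context"
	"log"

	"github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/containers/firecracker"
)

// reclaimFreeMemory is a hypothetical helper showing the intended call order.
// memSizeMB is the VM's configured memory size; inflating to ~70% of it is an
// arbitrary illustrative choice. Note that UpdateBalloon currently sleeps ~10s
// after each resize (see the TODO in firecracker.go), so this adds latency.
func reclaimFreeMemory(ctx context.Context, c *firecracker.FirecrackerContainer, memSizeMB int64) error {
	// Inflate: the guest balloon driver hands free pages back to the host,
	// which the UFFD handler records as removed/cleaned addresses.
	if err := c.UpdateBalloon(ctx, memSizeMB*7/10); err != nil {
		return err
	}
	// Deflate so the workload can reuse the memory after the VM resumes.
	if err := c.UpdateBalloon(ctx, 0); err != nil {
		return err
	}
	// Pause and snapshot, collecting stats on how much dirty memory the
	// balloon allowed us to skip persisting.
	stats, err := c.PauseWithMemoryStats(ctx)
	if err != nil {
		return err
	}
	if stats != nil {
		log.Printf("memory snapshot: dirty=%dMB cleaned=%dMB still-to-save=%dMB",
			stats.DirtyMB, stats.CleanedMB, stats.NeedToSaveMB)
	}
	return nil
}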