Skip to content

Commit

Permalink
Fix Firecracker CPU stats (#7131)
Browse files Browse the repository at this point in the history
  • Loading branch information
bduffany committed Jul 31, 2024
1 parent 2628ef1 commit 5eb541d
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ const (
//
// NOTE: this is part of the snapshot cache key, so bumping this version
// will make existing cached snapshots unusable.
GuestAPIVersion = "11"
GuestAPIVersion = "12"

// How long to wait when dialing the vmexec server inside the VM.
vSocketDialTimeout = 60 * time.Second
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"os/exec"
"path/filepath"
"regexp"
"slices"
"sync"
"syscall"
"testing"
Expand Down Expand Up @@ -136,8 +137,8 @@ func TestGuestAPIVersion(t *testing.T) {
// Note that if you go with option 1, ALL VM snapshots will be invalidated
// which will negatively affect customer experience. Be careful!
const (
expectedHash = "2b869dd271a3bbcf19b13f383b3864de9eb836f16bf8f01aa388d863dda16f53"
expectedVersion = "11"
expectedHash = "a2ce5a10e715ce7233334ec2a06f3aaf4ae2238285f0d571677f9397a0ba62e1"
expectedVersion = "12"
)
assert.Equal(t, expectedHash, firecracker.GuestAPIHash)
assert.Equal(t, expectedVersion, firecracker.GuestAPIVersion)
Expand Down Expand Up @@ -464,6 +465,8 @@ func TestFirecrackerSnapshotAndResume(t *testing.T) {
echo "$PWD/count: $(cat count)"
)
done
# Accumulate some CPU usage.
cat /dev/zero | head -c 100000000 >/dev/null
`},
}

Expand All @@ -474,7 +477,8 @@ func TestFirecrackerSnapshotAndResume(t *testing.T) {
require.NotContains(t, string(res.AuxiliaryLogs["vm_log_tail.txt"]), "is not a multiple of sector size")

// Try pause, unpause, exec several times.
for i := 1; i <= 3; i++ {
var cpuMillisObservations []float64
for i := 1; i <= 4; i++ {
if err := c.Pause(ctx); err != nil {
t.Fatalf("unable to pause container: %s", err)
}
Expand All @@ -492,7 +496,16 @@ func TestFirecrackerSnapshotAndResume(t *testing.T) {

assert.Equal(t, fmt.Sprintf("/workspace/count: %d\n/root/count: %d\n", countBefore+1, i), string(res.Stdout))
require.NotContains(t, string(res.AuxiliaryLogs["vm_log_tail.txt"]), "is not a multiple of sector size")
cpuMillisObservations = append(cpuMillisObservations, float64(res.UsageStats.GetCpuNanos())/1e6)
}

// CPU usage should be reported per-task rather than accumulated over
// time. This means that the CPU usage observations should not be
// strictly increasing, or in the rare case that it's strictly
// increasing, the difference between the max and min reported usage
// should be pretty low.
spread := slices.Max(cpuMillisObservations) - slices.Min(cpuMillisObservations)
require.True(t, !slices.IsSorted(cpuMillisObservations) || spread < slices.Min(cpuMillisObservations), "unexpected CPU usage measurements %v", cpuMillisObservations)
}
}

Expand Down
20 changes: 17 additions & 3 deletions enterprise/server/vmexec/vmexec.go
Original file line number Diff line number Diff line change
Expand Up @@ -453,19 +453,29 @@ func (c *command) Run(ctx context.Context, msgs chan *message) (*vmxpb.ExecStrea
// command is running.
var peakFSU []*repb.UsageStats_FileSystemUsage
var peakMem int64
var baselineCPUNanos int64
commandDone := make(chan struct{})
statsDone := make(chan struct{})
go func() {
defer close(statsDone)
delay := initialStatsPollInterval
// Set initial delay to 0 so that we get a baseline measurement
// immediately.
delay := time.Duration(0)
for done := false; !done; {
select {
case <-commandDone:
// Collect stats one more time, then terminate the loop.
done = true
case <-time.After(delay):
}
delay = min(maxStatsPollInterval, time.Duration(float64(delay)*statsPollBackoff))
var isFirstSample bool
if delay == 0 {
isFirstSample = true
delay = initialStatsPollInterval
} else {
isFirstSample = false
delay = min(maxStatsPollInterval, time.Duration(float64(delay)*statsPollBackoff))
}

// Collect disk usage.
stats := &repb.UsageStats{}
Expand All @@ -491,7 +501,11 @@ func (c *command) Run(ctx context.Context, msgs chan *message) (*vmxpb.ExecStrea
ticks := int64(cpu.User + cpu.Nice + cpu.Sys + cpu.Irq + cpu.SoftIrq)
const nanosPerSec = 1e9
nanosPerTick := nanosPerSec / ticksPerSec
stats.CpuNanos = ticks * nanosPerTick
lifetimeCPUNanos := ticks * nanosPerTick
if isFirstSample {
baselineCPUNanos = lifetimeCPUNanos
}
stats.CpuNanos = lifetimeCPUNanos - baselineCPUNanos
}
msgs <- &message{Response: &vmxpb.ExecStreamedResponse{UsageStats: stats}}
}
Expand Down

0 comments on commit 5eb541d

Please sign in to comment.