Skip to content

Commit

Permalink
Fix the CPU burn instability.
Browse files Browse the repository at this point in the history
Signed-off-by: Super-long <[email protected]>
  • Loading branch information
Super-long committed May 25, 2022
1 parent 910dab7 commit aecf663
Showing 1 changed file with 24 additions and 11 deletions.
35 changes: 24 additions & 11 deletions exec/cpu/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,18 +337,20 @@ func getQuota(ctx context.Context, slopePercent float64, precpu bool, cpuIndex i

// The root cause of the complexity is that getUsed requires sleep.
func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu bool, cpuIndex int) {
var beforeCpuPercent float64 = 0
var beforeCpuPercent float64 = slopePercent
q := getQuota(ctx, slopePercent, precpu, cpuIndex)
cpu.Percent(time.Second, true)
cpu.Percent(0, true)
ds := period - q
if ds < 0 {
ds = 0
}
fmt.Println(q, ds, slopePercent)
for {
select {
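// Using this quota channel has two problems: the data is inaccurate, and q may be updated twice in a row before the default case runs, with both updates being inaccurate.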
case offset := <-quota:
q = q + offset
fmt.Println("xxxxxxxxxxxx", q, offset)
if q < 0 {
q = 0
}
Expand All @@ -362,24 +364,30 @@ func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu
// When the first case executes with a lag, it causes q
// to be increased by multiple offsets, resulting in a
// higher CPU load than expectedPercent.
// cpuPercent cannot be less than zero.
fmt.Println("+++++++++++", cpuPercent)
if cpuPercent == 0 || cpuPercent > slopePercent {
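// This branch handles q being updated incorrectly via quota. Suppose cpu_percent is 70 and the current system load is 10%; two situations lead into this branch:
// 1. q is updated by quota twice in a row and then default runs, with an offset of 60 both times.
// 2. q is updated by quota once, then default runs, then quota updates q again, with an offset of 60 both times.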
if cpuPercent > slopePercent {
totalCpuPercent, err := cpu.Percent(0, true)
if err != nil {
log.Fatalf(ctx, "get cpu usage fail, %s", err.Error())
continue
}
fmt.Println("sssssssssss", totalCpuPercent[cpuIndex])
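// This is effectively a retry strategy, because the check at line 371 exists to guard against inaccurate quota values.
// If this check is actually entered, there are two cases:
// 1. The data is inaccurate: leave q and ds unchanged and simply retry.
// 2. Other processes really have taken CPU time: q and ds will be corrected through quota.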
if totalCpuPercent[cpuIndex] >= slopePercent {
fmt.Println("current CPU load is higher than slopePercent.")
// This path can also be reached under normal conditions, which causes the CPU frequency to drop.
fmt.Println("current CPU load is higher than slopePercent.", q, ds, cpuPercent)
log.Debugf(ctx, "current CPU load is higher than slopePercent.")
// When the specified CPU frequency is greater than the expected CPU
// frequency of chaos_os, we expect the behavior to be that chaos_os
// does not occupy the CPU.
time.Sleep(time.Second)
cpu.Percent(time.Second, true)
continue
}
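// At this point we assume the data is probably valid and start computing q and ds from totalCpuPercent.
// beforeCpuPercent is initially set to slopePercent, which may be inaccurate;
// when there is other load on the system, this results in a higher initial load.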
other := totalCpuPercent[cpuIndex] - beforeCpuPercent
if other < 0 {
other = 0
Expand All @@ -391,9 +399,10 @@ func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu
q = int64(cpuPercent/float64(100)*float64(period))
ds = period - q

fmt.Println("xiufu: ", q, ds, cpuPercent, slopePercent, totalCpuPercent[cpuIndex])
fmt.Println("xiufu: ", q, ds, cpuPercent, slopePercent,beforeCpuPercent, totalCpuPercent[cpuIndex])
}
fmt.Println("------------", q, ds, cpuPercent, float64(q)/float64(q+ds)*100)
// When cpuPercent is zero, stress_cpu runs (returns) very quickly.
stress_cpu(time.Duration(q+ds), cpuPercent)
beforeCpuPercent = cpuPercent
}
Expand Down Expand Up @@ -579,6 +588,10 @@ func stress_cpu_factorial(name string) {
// Make a single CPU load rate reach cpuPercent% within the time interval.
// This function can also be used to implement something similar to stress-ng --cpu-load.
func stress_cpu(interval time.Duration, cpuPercent float64) {
if cpuPercent == 0 {
time.Sleep(time.Duration(interval)*time.Nanosecond)
return
}
bias := 0.0
startTime := time.Now().UnixNano()
nanoInterval := int64(interval/time.Nanosecond)
Expand Down

0 comments on commit aecf663

Please sign in to comment.