From aecf6632dbffc6ac0f0c12b2a52da6956e306deb Mon Sep 17 00:00:00 2001
From: Super-long <0x4f4f4f4f@gmail.com>
Date: Tue, 24 May 2022 22:30:32 +0800
Subject: [PATCH] Fix the CPU burn instability.

Signed-off-by: Super-long <0x4f4f4f4f@gmail.com>
---
 exec/cpu/cpu.go | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/exec/cpu/cpu.go b/exec/cpu/cpu.go
index 30f4a4f..86cd914 100644
--- a/exec/cpu/cpu.go
+++ b/exec/cpu/cpu.go
@@ -337,9 +337,9 @@ func getQuota(ctx context.Context, slopePercent float64, precpu bool, cpuIndex i
 // The root cause of the complexity is that getUsed requires sleep.
 func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu bool, cpuIndex int) {
-	var beforeCpuPercent float64 = 0
+	var beforeCpuPercent float64 = slopePercent
 	q := getQuota(ctx, slopePercent, precpu, cpuIndex)
-	cpu.Percent(time.Second, true)
+	cpu.Percent(0, true)
 	ds := period - q
 	if ds < 0 {
 		ds = 0
 	}
@@ -347,8 +347,10 @@ func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu
 	fmt.Println(q, ds, slopePercent)
 	for {
 		select {
+		// 使用这quota channel有两个问题,一个是数据不准,一个是可能在default之前连续更新两次q,还都不准。
 		case offset := <-quota:
 			q = q + offset
+			fmt.Println("xxxxxxxxxxxx", q, offset)
 			if q < 0 {
 				q = 0
 			}
@@ -362,24 +364,30 @@ func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu
 			// When the first case executes with a lag, it causes q
 			// to be increased by multiple offsets, resulting in a
 			// higher CPU load than expectedPercent.
-			// cpuPercent cannot be less than zero.
 			fmt.Println("+++++++++++", cpuPercent)
-			if cpuPercent == 0 || cpuPercent > slopePercent {
+			// 这个循环是为了处理quota错误的更新q的,假设cpu_percent为70,目前系统负载10%,有两种情况会使得进入此循环
+			// 1. q连续两次被quota更新,然后执行default,offset两次都是60
+			// 2. q先执行一次quota,再执行default,再执行quota,offset两次都是60
+			if cpuPercent > slopePercent {
 				totalCpuPercent, err := cpu.Percent(0, true)
 				if err != nil {
 					log.Fatalf(ctx, "get cpu usage fail, %s", err.Error())
 					continue
 				}
+				fmt.Println("sssssssssss", totalCpuPercent[cpuIndex])
+				// 这里其实是一个重试策略,因为这个371行的判断其实是为了防止quota不准的
+				// 如果真的进入这个判断的话有两种情况:
+				// 1. 一数据不准,那么不修改q,ds,直接重试
+				// 2. 确实有其他进程抢占了CPU,q,ds会通过quota修正
 				if totalCpuPercent[cpuIndex] >= slopePercent {
-					fmt.Println("current CPU load is higher than slopePercent.")
+					// 正常情况也可能跑到这里导致CPU频率下降
+					fmt.Println("current CPU load is higher than slopePercent.", q, ds, cpuPercent)
 					log.Debugf(ctx, "current CPU load is higher than slopePercent.")
-					// When the specified CPU frequency is greater than the expected CPU
-					// frequency of chaos_os, we expect the behavior to be that chaos_os
-					// does not occupy the CPU.
-					time.Sleep(time.Second)
-					cpu.Percent(time.Second, true)
 					continue
 				}
+				// 此时我们认为数据可能是正常的,开始基于totalCpuPercent计算q,ds
+				// beforeCpuPercent初始时设置为slopePercent,这可能是不准确的,
+				// 再有其他负载情况时,会造成起始负载较高
 				other := totalCpuPercent[cpuIndex] - beforeCpuPercent
 				if other < 0 {
 					other = 0
@@ -391,9 +399,10 @@ func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu
 
 				q = int64(cpuPercent/float64(100)*float64(period))
 				ds = period - q
-				fmt.Println("xiufu: ", q, ds, cpuPercent, slopePercent, totalCpuPercent[cpuIndex])
+				fmt.Println("xiufu: ", q, ds, cpuPercent, slopePercent,beforeCpuPercent, totalCpuPercent[cpuIndex])
 			}
 			fmt.Println("------------", q, ds, cpuPercent, float64(q)/float64(q+ds)*100)
+			// 当cpuPercent为零的时候stress_cpu会跑的很快
 			stress_cpu(time.Duration(q+ds), cpuPercent)
 			beforeCpuPercent = cpuPercent
 		}
@@ -579,6 +588,10 @@ func stress_cpu_factorial(name string) {
 // Make a single CPU load rate reach cpuPercent% within the time interval.
 // This function can also be used to implement something similar to stress-ng --cpu-load.
 func stress_cpu(interval time.Duration, cpuPercent float64) {
+	if cpuPercent == 0 {
+		time.Sleep(time.Duration(interval)*time.Nanosecond)
+		return
+	}
 	bias := 0.0
 	startTime := time.Now().UnixNano()
 	nanoInterval := int64(interval/time.Nanosecond)