Fix the CPU burn instability.

Signed-off-by: Super-long <[email protected]>
chaosblade-io · May 25, 2022 · aecf663 · aecf663
1 parent 910dab7
commit aecf663
Showing 1 changed file with 24 additions and 11 deletions.
diff --git a/exec/cpu/cpu.go b/exec/cpu/cpu.go
@@ -337,18 +337,20 @@ func getQuota(ctx context.Context, slopePercent float64, precpu bool, cpuIndex i
 
 // The root cause of the complexity is that getUsed requires sleep.
 func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu bool, cpuIndex int) {
-	var beforeCpuPercent float64 = 0
+	var beforeCpuPercent float64 = slopePercent
 	q := getQuota(ctx, slopePercent, precpu, cpuIndex)
-	cpu.Percent(time.Second, true)
+	cpu.Percent(0, true)
 	ds := period - q
 	if ds < 0 {
 		ds = 0
 	}
 	fmt.Println(q, ds, slopePercent)
 	for {
 		select {
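+			// Using this quota channel has two problems: the data is inaccurate, and q may be updated twice in a row before default runs, with both updates being inaccurate.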
 		case offset := <-quota:
 			q = q + offset
+			fmt.Println("xxxxxxxxxxxx", q, offset)
 			if q < 0 {
 				q = 0
 			}
@@ -362,24 +364,30 @@ func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu
 			// When the first case executes with a lag, it causes q 
 			// to be increased by multiple offsets, resulting in a 
 			// higher CPU load than expectedPercent.
-			// cpuPercent cannot be less than zero.
 			fmt.Println("+++++++++++", cpuPercent)
-			if cpuPercent == 0 || cpuPercent > slopePercent {
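+			// This check handles the case where quota has updated q incorrectly. Suppose cpu_percent is 70 and the current system load is 10%; two situations lead into this check:
+			// 1. q is updated by quota twice in a row and then default runs; the offset is 60 both times.
+			// 2. quota runs once, then default, then quota again; the offset is 60 both times.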
+			if cpuPercent > slopePercent {
 				totalCpuPercent, err := cpu.Percent(0, true)
 				if err != nil {
 					log.Fatalf(ctx, "get cpu usage fail, %s", err.Error())
 					continue
 				}
+				fmt.Println("sssssssssss", totalCpuPercent[cpuIndex])
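+				// This is effectively a retry strategy, because the enclosing `cpuPercent > slopePercent` check exists to guard against an inaccurate quota.
+				// If we really do enter this branch, there are two cases:
+				// 1. The data is inaccurate: leave q and ds unchanged and simply retry.
+				// 2. Other processes really are taking the CPU: q and ds will be corrected via quota.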
 				if totalCpuPercent[cpuIndex] >= slopePercent {
-					fmt.Println("current CPU load is higher than slopePercent.")
+					// Normal operation can also reach this point, which causes the CPU frequency to drop.
+					fmt.Println("current CPU load is higher than slopePercent.", q, ds, cpuPercent)
 					log.Debugf(ctx, "current CPU load is higher than slopePercent.")
-					// When the specified CPU frequency is greater than the expected CPU 
-					// frequency of chaos_os, we expect the behavior to be that chaos_os 
-					// does not occupy the CPU.
-					time.Sleep(time.Second)
-					cpu.Percent(time.Second, true)
 					continue
 				}
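+				// At this point we assume the data is probably valid and start computing q and ds from totalCpuPercent.
+				// beforeCpuPercent is initially set to slopePercent, which may be inaccurate;
+				// when other load is present, this causes a relatively high initial load.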
 				other := totalCpuPercent[cpuIndex] - beforeCpuPercent
 				if other < 0 {
 					other = 0
@@ -391,9 +399,10 @@ func burn(ctx context.Context, quota <-chan int64, slopePercent float64, precpu
 				q = int64(cpuPercent/float64(100)*float64(period))
 				ds = period - q
 
-				fmt.Println("xiufu: ", q, ds, cpuPercent, slopePercent, totalCpuPercent[cpuIndex])
+				fmt.Println("xiufu: ", q, ds, cpuPercent, slopePercent,beforeCpuPercent, totalCpuPercent[cpuIndex])
 			}
 			fmt.Println("------------", q, ds, cpuPercent, float64(q)/float64(q+ds)*100)
+			// When cpuPercent is zero, stress_cpu runs very fast.
 			stress_cpu(time.Duration(q+ds), cpuPercent)
 			beforeCpuPercent = cpuPercent
 		}
@@ -579,6 +588,10 @@ func stress_cpu_factorial(name string) {
 // Make a single CPU load rate reach cpuPercent% within the time interval.
 // This function can also be used to implement something similar to stress-ng --cpu-load.
 func stress_cpu(interval time.Duration, cpuPercent float64) {
+	if cpuPercent == 0 {
+		time.Sleep(time.Duration(interval)*time.Nanosecond)
+		return
+	}
 	bias := 0.0
 	startTime := time.Now().UnixNano()
 	nanoInterval := int64(interval/time.Nanosecond)