Enable CUDA checkpointing with multiple processes

When checkpointing a container, CRIU uses the cgroup path specified by the container runtime (e.g., runc) via the `--freeze-cgroup` option to pause all running processes in the container and obtain a consistent process tree. However, if the container has multiple processes and some of them have CUDA state, all of these processes must be "locked" before the cgroup is frozen; otherwise, the cuda-checkpoint tool may hang. To address this problem, this patch updates the `collect_pstree()` function to run the CUDA plugin `PAUSE_DEVICES` hook for all processes in the container cgroup prior to freezing. — Signed-off-by: Radostin Stoyanov <[email protected]>
checkpoint-restore · Aug 17, 2024 · 6043fc4 · 6043fc4
1 parent 5ba1f84
commit 6043fc4
Showing 1 changed file with 65 additions and 6 deletions.
diff --git a/criu/seize.c b/criu/seize.c
@@ -638,14 +638,15 @@ static int collect_children(struct pstree_item *item)
 			goto free;
 		}
 
-		ret = run_plugins(PAUSE_DEVICES, pid);
-		if (ret < 0 && ret != -ENOTSUP) {
-			goto free;
-		}
+		if (!opts.freeze_cgroup) {
+			ret = run_plugins(PAUSE_DEVICES, pid);
+			if (ret < 0 && ret != -ENOTSUP) {
+				goto free;
+			}
 
-		if (!opts.freeze_cgroup)
 			/* fails when meets a zombie */
 			__ignore_value(compel_interrupt_task(pid));
+		}
 
 		ret = compel_wait_task(pid, item->pid->real, parse_pid_status, NULL, &creds.s, NULL);
 		if (ret < 0) {
@@ -967,6 +968,54 @@ static int cgroup_version(void)
 	return -1;
 }
 
+static int pause_devices_in_cgroup(void)
+{
+	char buffer[4096];
+	char procs_path[PATH_MAX];
+	ssize_t bytes_read;
+	pid_t pid;
+	int procs_fd;
+
+	/* Open <opts.freeze_cgroup>/cgroup.procs; NOTE(review): snprintf() truncation of procs_path is not checked */
+	snprintf(procs_path, sizeof(procs_path), "%s/%s", opts.freeze_cgroup, "cgroup.procs");
+	procs_fd = open(procs_path, O_RDONLY);
+	if (procs_fd == -1) {
+		pr_perror("Failed to open cgroup.procs file");
+		return -1;
+	}
+
+	/* Read at most sizeof(buffer) - 1 bytes per pass; NOTE(review): each chunk is parsed on its own, so a PID straddling two read() calls would be split -- confirm */
+	while ((bytes_read = read(procs_fd, buffer, sizeof(buffer) - 1)) > 0) {
+		char *ptr, *end_ptr;
+		/* Null-terminate so strtol() and the scans below stop at the end of this chunk */
+		buffer[bytes_read] = '\0';
+		ptr = buffer;
+
+		/* Parse each decimal PID and run the PAUSE_DEVICES hook for it; the hook's return value is ignored here */
+		while (*ptr) {
+			pid = strtol(ptr, &end_ptr, 10);
+			if (ptr != end_ptr) {
+				run_plugins(PAUSE_DEVICES, pid);
+				ptr = end_ptr;
+			} else {
+				/* No number was parsed at ptr (e.g. only a trailing newline is left): skip to the end of this line */
+				while (*ptr && *ptr != '\n')
+					ptr++;
+				/* Step over the newline unless the end of the chunk was reached */
+				if (*ptr)
+					ptr++;
+			}
+		}
+	}
+	close(procs_fd); /* NOTE(review): runs before the read-error report below; if pr_perror() prints errno, close() may have changed it -- confirm */
+
+	if (bytes_read == -1) {
+		pr_perror("Failed to read cgroup.procs file");
+		return -1;
+	}
+	return 0;
+}
+
 int collect_pstree(void)
 {
 	pid_t pid = root_item->pid->real;
@@ -983,7 +1032,17 @@ int collect_pstree(void)
 	 */
 	alarm(opts.timeout);
 
-	ret = run_plugins(PAUSE_DEVICES, pid);
+	/*
+	 * To create a checkpoint of a container, we use the cgroup specified by the
+	 * container runtime (e.g., runc) to freeze all processes in the container.
+	 * However, if there are processes with CUDA state, we need to "lock" them
+	 * before freezing; otherwise, the cuda-checkpoint tool may hang.
+	 */
+	if (opts.freeze_cgroup)
+		ret = pause_devices_in_cgroup();
+	else
+		ret = run_plugins(PAUSE_DEVICES, pid);
+
 	if (ret < 0 && ret != -ENOTSUP) {
 		goto err;
 	}