Skip to content

Commit

Permalink
Enable CUDA checkpointing with multiple processes
Browse files Browse the repository at this point in the history
When checkpointing a container, CRIU uses the cgroup path specified by
the container runtime (e.g., runc) via the `--freeze-cgroup` option to
pause all running processes in the container and obtain a consistent
process tree. However, if the container has multiple processes with
CUDA state, we need to "lock" these processes before freezing the cgroup;
otherwise the cuda-checkpoint tool may hang.

To address this problem, this patch updates the collect_pstree function
to run the CUDA plugin PAUSE_DEVICES hook for all processes in the
container cgroup prior to freezing.

In addition, this change introduces a mechanism to disable the use of
freeze cgroups during process seizing, even if explicitly requested
via the --freeze-cgroup option.

The CUDA plugin is updated to utilize this new mechanism to ensure
compatibility.

Signed-off-by: Andrei Vagin <[email protected]>
Signed-off-by: Radostin Stoyanov <[email protected]>
  • Loading branch information
rst0git committed Sep 12, 2024
1 parent 4ca4a09 commit 76b2fa5
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 17 deletions.
2 changes: 1 addition & 1 deletion criu/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,7 @@ static int collect_cgroups(struct list_head *ctls)
if (ret < 0)
return ret;

if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") && add_freezer_state(current_controller))
if (opts.freeze_cgroup && !freeze_cgroup_disabled && !strcmp(cc->name, "freezer") && add_freezer_state(current_controller))
return -1;
}

Expand Down
3 changes: 3 additions & 0 deletions criu/include/seize.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,7 @@ extern bool alarm_timeouted(void);
extern char *task_comm_info(pid_t pid, char *comm, size_t size);
extern char *__task_comm_info(pid_t pid);

extern void dont_use_freeze_cgroup(void);
extern bool freeze_cgroup_disabled;

#endif
115 changes: 99 additions & 16 deletions criu/seize.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,19 @@ enum freezer_state { FREEZER_ERROR = -1, THAWED, FROZEN, FREEZING };
/* Track if we are running on cgroup v2 system. */
static bool cgroup_v2 = false;

/*
 * Set once a plugin has asked CRIU not to seize tasks through the freeze
 * cgroup. Code paths that honour --freeze-cgroup test this flag together
 * with opts.freeze_cgroup.
 */
bool freeze_cgroup_disabled = false;

/*
 * Opt out of cgroup-based freezing for the rest of the run.
 *
 * The request takes precedence over an explicit --freeze-cgroup option.
 * It exists for plugins (e.g., CUDA) that do not function correctly when
 * processes are frozen using cgroups. Calling it more than once is
 * harmless: the flag is simply set again.
 */
__attribute__((used)) void dont_use_freeze_cgroup(void)
{
	freeze_cgroup_disabled = true;
}

static enum freezer_state get_freezer_v1_state(int fd)
{
char state[32];
Expand Down Expand Up @@ -236,7 +249,7 @@ static int freezer_restore_state(void)
int fd;
int ret;

if (!opts.freeze_cgroup || origin_freezer_state != FROZEN)
if (!opts.freeze_cgroup || freeze_cgroup_disabled || origin_freezer_state != FROZEN)
return 0;

fd = freezer_open();
Expand Down Expand Up @@ -397,7 +410,7 @@ static int freezer_detach(void)
{
int i;

if (!opts.freeze_cgroup)
if (!opts.freeze_cgroup || freeze_cgroup_disabled)
return 0;

for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) {
Expand Down Expand Up @@ -638,14 +651,17 @@ static int collect_children(struct pstree_item *item)
goto free;
}

ret = run_plugins(PAUSE_DEVICES, pid);
if (ret < 0 && ret != -ENOTSUP) {
goto free;
if (!opts.freeze_cgroup) {
ret = run_plugins(PAUSE_DEVICES, pid);
if (ret < 0 && ret != -ENOTSUP) {
goto free;
}
}

if (!opts.freeze_cgroup)
if (!opts.freeze_cgroup || freeze_cgroup_disabled) {
/* fails when meets a zombie */
__ignore_value(compel_interrupt_task(pid));
}

ret = compel_wait_task(pid, item->pid->real, parse_pid_status, NULL, &creds.s, NULL);
if (ret < 0) {
Expand Down Expand Up @@ -831,7 +847,7 @@ static int collect_threads(struct pstree_item *item)

pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid);

if (!opts.freeze_cgroup && compel_interrupt_task(pid))
if ((!opts.freeze_cgroup || freeze_cgroup_disabled) && compel_interrupt_task(pid))
continue;

ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL);
Expand Down Expand Up @@ -887,7 +903,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i
{
int attempts = NR_ATTEMPTS, nr_inprogress = 1;

if (opts.freeze_cgroup)
if (opts.freeze_cgroup && !freeze_cgroup_disabled)
attempts = 1;

/*
Expand Down Expand Up @@ -967,6 +983,61 @@ static int cgroup_version(void)
return -1;
}

/*
 * Run the PAUSE_DEVICES plugin hook for every PID listed in the
 * cgroup.procs file located under opts.freeze_cgroup.
 *
 * The file is parsed one line at a time through stdio instead of in
 * fixed-size read() chunks: with chunked reads, a PID that straddles a
 * chunk boundary in a large cgroup would be split and handed to the
 * plugins as two unrelated PIDs.
 *
 * A hook result of -ENOTSUP is tolerated; any other negative result
 * aborts the walk.
 *
 * Returns 0 on success, -1 if the path does not fit into PATH_MAX, the
 * file cannot be opened or read, or a plugin fails to pause a device.
 */
static int pause_devices_in_cgroup(void)
{
	char procs_path[PATH_MAX];
	/* One PID per line; far larger than any decimal PID plus newline. */
	char line[64];
	FILE *procs;
	int len;
	int ret;
	int exit_code = -1;

	len = snprintf(procs_path, sizeof(procs_path), "%s/%s", opts.freeze_cgroup, "cgroup.procs");
	if (len < 0 || (size_t)len >= sizeof(procs_path)) {
		/* A truncated path would silently point at the wrong file. */
		pr_err("Path to cgroup.procs file is too long\n");
		return -1;
	}

	procs = fopen(procs_path, "r");
	if (!procs) {
		pr_perror("Failed to open cgroup.procs file");
		return -1;
	}

	while (fgets(line, sizeof(line), procs)) {
		char *end_ptr;
		pid_t pid;

		pid = strtol(line, &end_ptr, 10);
		if (end_ptr == line) {
			/* No digits on this line (e.g. blank line): skip it. */
			continue;
		}

		ret = run_plugins(PAUSE_DEVICES, pid);
		if (ret < 0 && ret != -ENOTSUP) {
			pr_err("Failed to pause GPU device\n");
			goto out;
		}
	}

	if (ferror(procs)) {
		/* Report before fclose() so that errno still describes the read. */
		pr_perror("Failed to read cgroup.procs file");
		goto out;
	}

	exit_code = 0;
out:
	fclose(procs);
	return exit_code;
}

int collect_pstree(void)
{
pid_t pid = root_item->pid->real;
Expand All @@ -983,7 +1054,17 @@ int collect_pstree(void)
*/
alarm(opts.timeout);

ret = run_plugins(PAUSE_DEVICES, pid);
/*
* To create a checkpoint of a container, we use the cgroup specified by the
* container runtime (e.g., runc) to freeze all processes in the container.
* However, if there are processes with CUDA state, we need to "lock" them
* before freezing; otherwise, the cuda-checkpoint tool may hang.
*/
if (opts.freeze_cgroup)
ret = pause_devices_in_cgroup();
else
ret = run_plugins(PAUSE_DEVICES, pid);

if (ret < 0 && ret != -ENOTSUP) {
goto err;
}
Expand All @@ -993,12 +1074,14 @@ int collect_pstree(void)

pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1);

if (opts.freeze_cgroup && freeze_processes())
goto err;

if (!opts.freeze_cgroup && compel_interrupt_task(pid)) {
set_cr_errno(ESRCH);
goto err;
if (opts.freeze_cgroup && !freeze_cgroup_disabled) {
if (freeze_processes())
goto err;
} else {
if (compel_interrupt_task(pid)) {
set_cr_errno(ESRCH);
goto err;
}
}

ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL);
Expand All @@ -1024,7 +1107,7 @@ int collect_pstree(void)
if (ret < 0)
goto err;

if (opts.freeze_cgroup && freezer_wait_processes()) {
if (opts.freeze_cgroup && !freeze_cgroup_disabled && freezer_wait_processes()) {
ret = -1;
goto err;
}
Expand Down
2 changes: 2 additions & 0 deletions plugins/cuda/cuda_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,8 @@ int cuda_plugin_init(int stage)
INIT_LIST_HEAD(&cuda_pids);
}

dont_use_freeze_cgroup();

return 0;
}

Expand Down

0 comments on commit 76b2fa5

Please sign in to comment.