plugins/amdgpu: Implement parallel restore

This patch implements the logic for offloading buffer object content
restoration. It has two parts: the first replaces the restoration of
buffer object contents in the target process with a parallel restore
command sent to the main CRIU process; the second implements the
`POST_FORKING` hook in the amdgpu plugin so that the main CRIU process
performs the buffer object content restoration itself.

Signed-off-by: Yanning Yang <[email protected]>
wweewrwer committed Dec 5, 2024
1 parent cd1b195 commit 729ef86
Showing 4 changed files with 212 additions and 53 deletions.
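
For orientation, here is a minimal sketch of the command layout the new code relies on. The actual definitions live in amdgpu_socket_utils.{c,h}, which this commit only adds to the Makefile, so the types and field names below are assumptions inferred from usage in the diff (restore_cmd.cmd_head.entry_num, entry->read_offset, restore_cmd.fds_write[entry->write_id], and so on):

/* Hypothetical sketch, not the committed definitions: shapes inferred
 * from how restore_device_parallel_worker() consumes the command. */
#include <stdint.h>

typedef struct {
	int id;		/* checkpoint image id, used to locate IMG_KFD_PAGES_FILE */
	int entry_num;	/* number of parallel_restore_entry records */
} parallel_restore_cmd_head;

typedef struct {
	int gpu_id;		/* user gpu_id that owns the BO */
	uint64_t size;		/* BO size in bytes */
	uint64_t read_offset;	/* offset of the BO's data in the pages image */
	int write_id;		/* index into fds_write[] (the BO's dmabuf fd) */
	int minor;		/* DRM render minor of the destination device */
} parallel_restore_entry;

typedef struct {
	parallel_restore_cmd_head cmd_head;
	parallel_restore_entry *entries;
	int *fds_write;		/* dmabuf fds transferred over the socket */
} parallel_restore_cmd;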
2 changes: 1 addition & 1 deletion plugins/amdgpu/Makefile
@@ -27,7 +27,7 @@ endif
 criu-amdgpu.pb-c.c: criu-amdgpu.proto
 	protoc-c --proto_path=. --c_out=. criu-amdgpu.proto
 
-amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c
+amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
 	$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)
 
 amdgpu_plugin_clean:
260 changes: 209 additions & 51 deletions plugins/amdgpu/amdgpu_plugin.c
@@ -28,11 +28,13 @@
 #include "xmalloc.h"
 #include "criu-log.h"
 #include "files.h"
+#include "restore.h"
 
 #include "common/list.h"
 #include "amdgpu_plugin_drm.h"
 #include "amdgpu_plugin_util.h"
 #include "amdgpu_plugin_topology.h"
+#include "amdgpu_socket_utils.h"
 
 #include "img-streamer.h"
 #include "image.h"
@@ -351,6 +353,9 @@ int amdgpu_plugin_init(int stage)
 	maps_init(&restore_maps);
 
 	if (stage == CR_PLUGIN_STAGE__RESTORE) {
+		if (install_parallel_sock() < 0) {
+			return -1;
+		}
 		/* Default Values */
 		kfd_fw_version_check = true;
 		kfd_sdma_fw_version_check = true;
@@ -1439,14 +1444,8 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
 
 static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
 {
-	struct thread_data *thread_datas;
-	int thread_i, ret = 0;
-
-	thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
-	if (!thread_datas) {
-		ret = -ENOMEM;
-		goto exit;
-	}
+	int ret = 0;
+	int offset = 0;
 
 	for (int i = 0; i < e->num_of_bos; i++) {
 		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
@@ -1489,65 +1488,41 @@
 		}
 	}
 
-	thread_i = 0;
+	pr_info("Begin to send parallel restore cmd\n");
+	ret = init_parallel_restore_cmd(e->num_of_bos, id);
+	if (ret)
+		goto exit_parallel;
+
 	for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
-		struct tp_node *dev;
-		int ret_thread = 0;
 		uint32_t target_gpu_id;
 
 		if (!e->device_entries[i]->gpu_id)
 			continue;
 
 		/* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */
+		struct tp_node *dev;
+		offset = 8;
 		target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
 
 		/* We need the fd for actual_gpu_id */
 		dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
 		if (!dev) {
 			pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
 			ret = -ENODEV;
-			goto exit;
+			goto exit_parallel;
 		}
 
-		thread_datas[thread_i].id = id;
-		thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
-		thread_datas[thread_i].bo_buckets = bo_buckets;
-		thread_datas[thread_i].bo_entries = e->bo_entries;
-		thread_datas[thread_i].pid = e->pid;
-		thread_datas[thread_i].num_of_bos = e->num_of_bos;
-
-		thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev);
-		if (thread_datas[thread_i].drm_fd < 0) {
-			ret = -thread_datas[thread_i].drm_fd;
-			goto exit;
-		}
-
-		ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents,
-					    (void *)&thread_datas[thread_i]);
-		if (ret_thread) {
-			pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread);
-			ret = -ret_thread;
-			goto exit;
-		}
-		thread_i++;
-	}
-
-	for (int i = 0; i < e->num_of_gpus; i++) {
-		pthread_join(thread_datas[i].thread, NULL);
-		pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
-
-		if (thread_datas[i].ret) {
-			ret = thread_datas[i].ret;
-			goto exit;
+		for (int j = 0; j < e->num_of_bos; j++) {
+			if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id)
+				continue;
+			if (bo_buckets[j].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
+				parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, bo_buckets[j].size,
+							offset, dev->drm_render_minor);
+				offset += bo_buckets[j].size;
+			}
 		}
 	}
+	ret = send_parallel_restore_cmd();
+exit_parallel:
+	free_parallel_restore_cmd();
 exit:
 	for (int i = 0; i < e->num_of_bos; i++) {
 		if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD)
 			close(bo_buckets[i].dmabuf_fd);
 	}
 
-	xfree(thread_datas);
 	return ret;
 }

@@ -1862,3 +1837,186 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
 }
 
 CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)

+int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size,
+			amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type)
+{
+	/* Wrap the dmabuf fd in a minimal bo_bucket so the existing
+	 * sdma_copy_bo() helper can be reused for parallel restore. */
+	return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer,
+			    buffer_size, h_dev, max_copy_size, type);
+}

+int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size)
+{
+	int ret = 0;
+	int drm_fd = -1;
+	uint32_t major, minor;
+	struct amdgpu_gpu_info gpu_info = { 0 };
+
+	drm_fd = open_drm_render_device(dev_minor);
+	if (drm_fd < 0) {
+		pr_err("Failed to open DRM render device (minor:%d)\n", dev_minor);
+		return -1;
+	}
+
+	ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev);
+	if (ret) {
+		pr_perror("Failed to initialize device");
+		return ret;
+	}
+
+	ret = amdgpu_query_gpu_info(*h_dev, &gpu_info);
+	if (ret) {
+		pr_perror("Failed to query GPU info via libdrm");
+		amdgpu_device_deinitialize(*h_dev);
+		return ret;
+	}
+
+	/* Older families are limited to SDMA_LINEAR_COPY_MAX_SIZE - 1 bytes per copy */
+	*max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
+								    SDMA_LINEAR_COPY_MAX_SIZE - 1;
+	return 0;
+}

+FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size)
+{
+	char img_path[40];
+	size_t image_size = 0;
+	FILE *bo_contents_fp = NULL;
+
+	snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id);
+	bo_contents_fp = open_img_file(img_path, false, &image_size);
+	if (!bo_contents_fp) {
+		pr_perror("Cannot fopen %s", img_path);
+		return NULL;
+	}
+
+	if (tot_size != image_size) {
+		pr_err("%s size mismatch (current:%zu expected:%zu)\n", img_path, image_size, tot_size);
+		fclose(bo_contents_fp);
+		return NULL;
+	}
+	return bo_contents_fp;
+}

+void *restore_device_parallel_worker(void *arg)
+{
+	int *ret = (int *)arg;
+
+	while (1) {
+		amdgpu_device_handle h_dev;
+		uint64_t max_copy_size;
+		size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0;
+		FILE *bo_contents_fp = NULL;
+		parallel_restore_entry *entry;
+		void *buffer = NULL;
+		int *vis = NULL;
+
+		pr_info("Begin to recv parallel restore cmd\n");
+		*ret = recv_parallel_restore_cmd();
+		if (*ret) {
+			/* A return value of 1 means the command socket was closed:
+			 * no more work to do, not an error. */
+			if (*ret == 1)
+				*ret = 0;
+			return NULL;
+		}
+
+		vis = xzalloc(restore_cmd.cmd_head.entry_num * sizeof(int));
+		if (!vis) {
+			*ret = -ENOMEM;
+			return NULL;
+		}
+
+		/* Process entries one GPU at a time: pick the first unvisited
+		 * entry, then handle every entry sharing its gpu_id. */
+		for (int i = 0; i < restore_cmd.cmd_head.entry_num; i++) {
+			if (vis[i] != 0)
+				continue;
+
+			total_bo_size = 0;
+			max_bo_size = 0;
+			for (int j = 0; j < restore_cmd.cmd_head.entry_num; j++) {
+				if (restore_cmd.entries[i].gpu_id == restore_cmd.entries[j].gpu_id) {
+					total_bo_size += restore_cmd.entries[j].size;
+
+					if (restore_cmd.entries[j].size > max_bo_size)
+						max_bo_size = restore_cmd.entries[j].size;
+				}
+			}
+			buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size;
+
+			*ret = init_dev(restore_cmd.entries[i].minor, &h_dev, &max_copy_size);
+			if (*ret < 0)
+				goto err;
+
+			bo_contents_fp = get_bo_contents_fp(restore_cmd.cmd_head.id, restore_cmd.entries[i].gpu_id,
+							    total_bo_size);
+			if (bo_contents_fp == NULL) {
+				*ret = -1;
+				goto err_sdma;
+			}
+
+			if (posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size)) {
+				pr_err("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE.\n");
+				*ret = -ENOMEM;
+				goto err_sdma;
+			}
+
+			/* Fill every BO of this GPU from the pages image via SDMA */
+			for (int j = i; j < restore_cmd.cmd_head.entry_num; j++) {
+				entry = &restore_cmd.entries[j];
+				if (restore_cmd.entries[i].gpu_id == entry->gpu_id) {
+					vis[j] = 1;
+					fseek(bo_contents_fp, entry->read_offset, SEEK_SET);
+					*ret = sdma_copy_bo_helper(entry->size, restore_cmd.fds_write[entry->write_id],
+								   bo_contents_fp, buffer, buffer_size, h_dev,
+								   max_copy_size, SDMA_OP_VRAM_WRITE);
+					if (*ret) {
+						pr_err("Failed to fill the BO using sDMA: entry[%d]\n", j);
+						goto err_sdma;
+					}
+				}
+			}
+
+err_sdma:
+			if (bo_contents_fp)
+				fclose(bo_contents_fp);
+			if (buffer)
+				xfree(buffer);
+			bo_contents_fp = NULL;
+			buffer = NULL;
+			amdgpu_device_deinitialize(h_dev);
+			if (*ret)
+				goto err;
+		}
+err:
+		xfree(vis);
+		free_parallel_restore_cmd();
+	}
+	return NULL;
+}

+int amdgpu_plugin_post_forking(void)
+{
+	pthread_t thread;
+	int thread_result = 0;
+	int ret = 0;
+
+	if (plugin_disabled)
+		return -ENOTSUP;
+
+	if (pthread_create(&thread, NULL, restore_device_parallel_worker, &thread_result) != 0) {
+		pr_perror("Failed to create parallel restore worker thread");
+		return -1;
+	}
+
+	pr_info("Wait for inprogress tasks\n");
+	ret = restore_wait_inprogress_tasks();
+	if (ret) {
+		pr_err("Failed to wait for in-progress tasks\n");
+		goto err;
+	}
+
+err:
+	pr_info("Close parallel restore server\n");
+	if (close_parallel_restore_server())
+		return -1;
+
+	pthread_join(thread, NULL);
+	if (thread_result)
+		return thread_result;
+
+	return ret;
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)
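
A note on structure: rather than sorting the entries by GPU, restore_device_parallel_worker() above makes one pass per distinct gpu_id, using a visited array to skip entries that have already been handled. A standalone sketch of that grouping pattern (hypothetical names, not part of the commit):

#include <stdio.h>
#include <stdlib.h>

/* Demo of the visited-array grouping used by the worker: for each
 * not-yet-visited entry, process every entry sharing its gpu_id,
 * then mark them all visited. O(n^2), but n stays small. */
struct entry { int gpu_id; size_t size; };

int main(void)
{
	struct entry e[] = { { 1, 10 }, { 2, 20 }, { 1, 30 }, { 2, 40 } };
	int n = sizeof(e) / sizeof(e[0]);
	int *vis = calloc(n, sizeof(*vis));

	if (!vis)
		return 1;

	for (int i = 0; i < n; i++) {
		size_t total = 0;

		if (vis[i])
			continue;
		for (int j = i; j < n; j++) {
			if (e[j].gpu_id != e[i].gpu_id)
				continue;
			vis[j] = 1;
			total += e[j].size;	/* first pass sums sizes; the worker then copies */
		}
		printf("gpu %d: total %zu bytes\n", e[i].gpu_id, total);
	}
	free(vis);
	return 0;
}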
2 changes: 1 addition & 1 deletion plugins/amdgpu/amdgpu_plugin_topology.c
@@ -45,7 +45,7 @@ bool kfd_capability_check = true;
  */
 int fd_next = -1;
 
-static int open_drm_render_device(int minor)
+int open_drm_render_device(int minor)
 {
 	char path[128];
 	int fd, ret_fd;
1 change: 1 addition & 0 deletions plugins/amdgpu/amdgpu_plugin_topology.h
@@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32
 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor);
 struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index);
 
+int open_drm_render_device(int minor);
 int node_get_drm_render_device(struct tp_node *node);
 void sys_close_drm_render_devices(struct tp_system *sys);
 
