From 73732e73abe319dac8ff910ea21bb414cd94a0ed Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Mon, 29 May 2023 17:17:53 +0530 Subject: [PATCH] memfd_secret: add memfd_secret file support See "man 2 memfd_secret". Fixes: #2188 Signed-off-by: Dhanuka Warusadura --- criu/Makefile.crtools | 1 + criu/cr-check.c | 10 + criu/cr-restore.c | 3 +- criu/files.c | 7 + criu/image-desc.c | 2 + criu/include/image-desc.h | 3 + criu/include/image.h | 1 + criu/include/kerndat.h | 2 + criu/include/magic.h | 2 + criu/include/memfd-secret.h | 25 ++ criu/include/pagemap.h | 1 + criu/include/protobuf-desc.h | 2 + criu/kerndat.c | 67 ++++ criu/memfd-secret.c | 462 +++++++++++++++++++++++++++ criu/page-xfer.c | 20 +- criu/pagemap.c | 3 + criu/pie/restorer.c | 2 +- criu/proc_parse.c | 14 + criu/protobuf-desc.c | 3 + images/Makefile | 1 + images/fdinfo.proto | 3 + images/vma.proto | 3 + test/zdtm/static/memfd-secret00.desc | 1 + 23 files changed, 632 insertions(+), 6 deletions(-) create mode 100644 criu/include/memfd-secret.h create mode 100644 criu/memfd-secret.c create mode 100644 test/zdtm/static/memfd-secret00.desc diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index f586449172..cc5f3c779f 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -40,6 +40,7 @@ obj-y += log.o obj-y += lsm.o obj-y += mem.o obj-y += memfd.o +obj-y += memfd-secret.o obj-y += mount.o obj-y += mount-v2.o obj-y += filesystems.o diff --git a/criu/cr-check.c b/criu/cr-check.c index cb083b16ca..ce926d7e04 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1340,6 +1340,14 @@ static int check_memfd_hugetlb(void) return 0; } +static int check_memfd_secret(void) +{ + if (!kdat.has_memfd_secret) + return -1; + + return 0; +} + static int check_network_lock_nftables(void) { if (!kdat.has_nftables_concat) { @@ -1502,6 +1510,7 @@ int cr_check(void) ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); ret |= check_ipv6_freebind(); + ret |= check_memfd_secret(); if (kdat.lsm == 
LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1623,6 +1632,7 @@ static struct feature_list feature_list[] = { { "openat2", check_openat2 }, { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, + { "memfd_secret", check_memfd_secret }, { NULL, NULL }, }; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 2700497216..19e63d7720 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -79,6 +79,7 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "memfd-secret.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -279,7 +280,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &memfd_secret_cinfo, }; /* These images are required to restore namespaces */ diff --git a/criu/files.c b/criu/files.c index 3b653e24be..e0933b7498 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "memfd-secret.h" #include "protobuf.h" #include "util.h" @@ -563,6 +564,9 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, /* TODO: Dump for hugetlb fd when memfd hugetlb is not supported */ if (is_memfd(p.stat.st_dev) || (kdat.has_memfd_hugetlb && is_hugetlb_dev(p.stat.st_dev, NULL))) ops = &memfd_dump_ops; + /* memfd_secret */ + else if (is_memfd_secret(p.stat.st_dev) && kdat.has_memfd_secret) + ops = &memfd_secret_dump_ops; else if (link.name[1] == '/') ops = ®file_dump_ops; else if (check_ns_proc(&link)) @@ -1778,6 +1782,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = 
collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__MEMFD_SECRET: + ret = collect_one_file_entry(fe, fe->memfd_secret->id, &fe->memfd_secret->base, &memfd_secret_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c0986..d9b38b5188 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -29,6 +29,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY(FDINFO, "fdinfo-%u"), FD_ENTRY(PAGEMAP, "pagemap-%lu"), FD_ENTRY(SHMEM_PAGEMAP, "pagemap-shmem-%lu"), + FD_ENTRY(SECRETMEM_PAGEMAP, "pagemap-secretmem-%lu"), FD_ENTRY(REG_FILES, "reg-files"), FD_ENTRY(EXT_FILES, "ext-files"), FD_ENTRY(NS_FILES, "ns-files"), @@ -67,6 +68,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY(REMAP_FPATH, "remap-fpath"), FD_ENTRY_F(GHOST_FILE, "ghost-file-%x", O_NOBUF), FD_ENTRY_F(MEMFD_INODE, "memfd", O_NOBUF), + FD_ENTRY_F(MEMFD_SECRET_INODE, "memfd-secret", O_NOBUF), FD_ENTRY(TCP_STREAM, "tcp-stream-%x"), FD_ENTRY(MNTS, "mountpoints-%u"), FD_ENTRY(NETDEV, "netdev-%u"), diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be645..76a072db93 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -52,6 +52,7 @@ enum { CR_FD_PSTREE, CR_FD_SHMEM_PAGEMAP, + CR_FD_SECRETMEM_PAGEMAP, CR_FD_GHOST_FILE, CR_FD_TCP_STREAM, CR_FD_FDINFO, @@ -69,6 +70,7 @@ enum { CR_FD_SECCOMP, CR_FD_APPARMOR, CR_FD_MEMFD_INODE, + CR_FD_MEMFD_SECRET_INODE, CR_FD_BPFMAP_FILE, CR_FD_BPFMAP_DATA, _CR_FD_GLOB_TO, @@ -113,6 +115,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_MEMFD_SECRET_FILE, CR_FD_AUTOFS, diff --git a/criu/include/image.h b/criu/include/image.h index 9a275565f9..fe1ab97610 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -84,6 +84,7 @@ #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 
13) #define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_MEMFD_SECRET (1 << 15) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index f5d409acbf..9f836a82d7 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -34,11 +34,13 @@ enum loginuid_func { struct kerndat_s { u32 magic1, magic2; dev_t shmem_dev; + dev_t secretmem_dev; int last_cap; u64 zero_page_pfn; bool has_dirty_track; bool has_memfd; bool has_memfd_hugetlb; + bool has_memfd_secret; bool has_fdinfo_lock; unsigned long task_size; bool ipv6; diff --git a/criu/include/magic.h b/criu/include/magic.h index 0e8c37234e..ff9cfa65e1 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -37,6 +37,7 @@ #define FDINFO_MAGIC 0x56213732 /* Dmitrov */ #define PAGEMAP_MAGIC 0x56084025 /* Vladimir */ #define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC +#define SECRETMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC #define PAGES_MAGIC RAW_IMAGE_MAGIC #define CORE_MAGIC 0x55053847 /* Kolomna */ #define IDS_MAGIC 0x54432030 /* Konigsberg */ @@ -95,6 +96,7 @@ #define AUTOFS_MAGIC 0x49353943 /* Sochi */ #define FILES_MAGIC 0x56303138 /* Toropets */ #define MEMFD_INODE_MAGIC 0x48453499 /* Dnipro */ +#define MEMFD_SECRET_INODE_MAGIC 0x44573468 /* Simferopol */ #define TIMENS_MAGIC 0x43114433 /* Beslan */ #define PIDNS_MAGIC 0x61157326 /* Surgut */ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ diff --git a/criu/include/memfd-secret.h b/criu/include/memfd-secret.h new file mode 100644 index 0000000000..54422aafd7 --- /dev/null +++ b/criu/include/memfd-secret.h @@ -0,0 +1,25 @@ +#ifndef __CR_MEMFD_SECRET_H__ +#define __CR_MEMFD_SECRET_H__ + +#include +#include +#include +#include + +#include "common/config.h" + +extern int is_memfd_secret(dev_t dev); +extern const struct fdtype_ops memfd_secret_dump_ops; +extern struct collect_image_info memfd_secret_cinfo; + +static inline int memfd_secret(unsigned int flags) +{ +#ifdef __NR_memfd_secret + return 
syscall(__NR_memfd_secret, flags);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif /* __NR_memfd_secret */
+}
+
+#endif /* __CR_MEMFD_SECRET_H__ */
diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
index 8c71805598..521674c47b 100644
--- a/criu/include/pagemap.h
+++ b/criu/include/pagemap.h
@@ -87,6 +87,7 @@ struct page_read {
 /* flags for open_page_read */
 #define PR_SHMEM 0x1
 #define PR_TASK 0x2
+#define PR_SECRETMEM 0x3
 
 #define PR_TYPE_MASK 0x3
 #define PR_MOD 0x4 /* Will need to modify */
diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h
index 3824de101f..12141ee917 100644
--- a/criu/include/protobuf-desc.h
+++ b/criu/include/protobuf-desc.h
@@ -81,6 +81,8 @@ enum {
 	PB_SK_QUEUES,
 	PB_IPCNS_MSG,
 	PB_IPCNS_MSG_ENT,
+	PB_MEMFD_SECRET_FILE,
+	PB_MEMFD_SECRET_INODE,
 
 	PB_MAX,
 };
diff --git a/criu/kerndat.c b/criu/kerndat.c
index fef5a46c19..f6167123e7 100644
--- a/criu/kerndat.c
+++ b/criu/kerndat.c
@@ -52,6 +52,7 @@
 #include "kcmp.h"
 #include "sched.h"
 #include "memfd.h"
+#include "memfd-secret.h"
 #include "mount-v2.h"
 #include "util-caps.h"
 
@@ -259,6 +260,35 @@ static int kerndat_get_shmemdev(void)
 	return -1;
 }
 
+/*
+ * Size the secretmem file @fd to one page, map it, ask kerndat_get_dev()
+ * for the device of that mapping and store it in kdat.secretmem_dev.
+ * Returns 0 on success and -1 on any failure.
+ */
+static int kerndat_get_secretmem_dev(int fd)
+{
+	void *secretmem;
+	dev_t dev;
+	int ret;
+
+	if (ftruncate(fd, PAGE_SIZE) < 0)
+		return -1;
+
+	secretmem = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	/* mmap() signals failure with MAP_FAILED (not NULL); it must never reach munmap() */
+	if (secretmem == MAP_FAILED)
+		return -1;
+
+	ret = kerndat_get_dev(&dev, secretmem, PAGE_SIZE);
+	munmap(secretmem, PAGE_SIZE);
+	if (ret)
+		return -1;
+
+	kdat.secretmem_dev = dev;
+	pr_info("Found secret-memory device at %lu\n", (unsigned long)kdat.secretmem_dev);
+
+	return 0;
+}
+
 /* Return -1 -- error
  * Return 0 -- successful but can't get any new device's numbers
  * Return 1 -- successful and get new device's numbers
@@ -533,6 +563,46 @@ static bool kerndat_has_memfd_hugetlb(void)
 	return 0;
 }
 
+/*
+ * Probe memfd_secret(2) and fill kdat.has_memfd_secret and kdat.secretmem_dev.
+ * Returns zero when the probe completed (feature present or absent) and
+ * non-zero on an unexpected failure; kerndat_init() treats non-zero as fatal.
+ */
+static bool kerndat_has_memfd_secret(void)
+{
+	int fd, ret;
+
+	kdat.has_memfd_secret = false;
+	kdat.secretmem_dev = 0;
+
+	fd = memfd_secret(0);
+	if (fd < 0) {
+		/* errno is only meaningful once the call is known to have failed */
+		switch (errno) {
+		case ENOSYS:
+			pr_warn("memfd_secret is not available (missing at build time or in the running kernel)\n");
+			return 0;
+		case EINVAL:
+		case EMFILE:
+		case ENOMEM:
+			return 0;
+		default:
+			pr_perror("Unexpected error from memfd_secret(0)");
+			return -1;
+		}
+	}
+
+	/* descriptor 0 is valid too, so every non-negative value gets here */
+	ret = kerndat_get_secretmem_dev(fd);
+	close(fd);
+	if (ret)
+		return -1;
+
+	kdat.has_memfd_secret = true;
+
+	return 0;
+}
+
 static int get_task_size(void)
 {
 	kdat.task_size = compel_task_size();
@@ -1818,6 +1888,10 @@ int kerndat_init(void)
 		pr_err("kerndat_has_memfd_hugetlb failed when initializing kerndat.\n");
 		ret = -1;
 	}
+	if (!ret && kerndat_has_memfd_secret()) {
+		pr_err("kerndat_has_memfd_secret failed when initializing kerndat.\n");
+		ret = -1;
+	}
 	if (!ret && kerndat_detect_stack_guard_gap()) {
 		pr_err("kerndat_detect_stack_guard_gap failed when initializing kerndat.\n");
 		ret = -1;
diff --git a/criu/memfd-secret.c b/criu/memfd-secret.c
new file mode 100644
index 0000000000..17bc76fc45
--- /dev/null
+++ b/criu/memfd-secret.c
@@ -0,0 +1,462 @@
+#include
+#include
+#include
+#include
+
+#include "memfd-secret.h"
+#include "log.h"
+#include "kerndat.h"
+#include "files.h"
+#include "protobuf.h"
+#include "images/memfd-secret.pb-c.h"
+#include "files-reg.h"
+#include "imgset.h"
+#include "util.h"
+#include "namespaces.h"
+#include "rst-malloc.h"
+#include "fdstore.h"
+#include "string.h"
+#include "page.h"
+#include "page-pipe.h"
+#include "image-desc.h"
+#include "page-xfer.h"
+#include "stats.h"
+#include "common/list.h"
+#include "pagemap.h"
+#include "mem.h"
+#include "types.h"
+
+#define PST_DIRTY 3
+
+struct memfd_secret_dump_inode {
+	struct list_head list;
+	u32 id;
+	u32 dev;
+	u32 ino;
+};
+
+struct memfd_secret_restore_inode {
+	struct list_head list;
+	u32 id;
+	mutex_t lock;
+	int fdstore_id;
+	
MemfdSecretInodeEntry msie;
+};
+
+static LIST_HEAD(memfd_secret_inodes);
+
+static u32 memfd_secret_inode_ids = 1;
+
+/* secretmem dump */
+
+struct secretmem_info {
+	unsigned long secretmem_id;
+	unsigned long size;
+};
+
+/*
+ * vmsplice() every buffer queued in @pp into its pipe, then pass the pipe
+ * to page_xfer_dump_pages(). Returns -1 when a buffer was not spliced in
+ * full, otherwise whatever page_xfer_dump_pages() returns.
+ */
+static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
+{
+	struct page_pipe_buf *ppb;
+
+	list_for_each_entry(ppb, &pp->bufs, l)
+		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs, SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
+		    ppb->pages_in * PAGE_SIZE) {
+			pr_perror("Can't get secretmem into page-pipe");
+			return -1;
+		}
+
+	return page_xfer_dump_pages(xfer, pp);
+}
+
+/*
+ * Dump the smi->size bytes mapped at @addr into the secretmem pagemap image
+ * identified by smi->secretmem_id.
+ *
+ * The content is first copied into an anonymous bounce mapping that spans
+ * every page of the file. The page pipe only records addresses and the data
+ * is consumed later in dump_pages(), so each queued page needs its own
+ * backing memory that stays valid until then.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int do_dump_one_secretmem(void *addr, struct secretmem_info *smi)
+{
+	struct page_pipe *pp;
+	struct page_xfer xfer;
+	unsigned long nrpages, pfn, len;
+	unsigned long pages[2] = {};
+	void *bounce;
+	int ret = -1;
+
+	/* nothing to copy; the restore side skips empty files as well */
+	if (smi->size == 0)
+		return 0;
+
+	nrpages = (smi->size + PAGE_SIZE - 1) / PAGE_SIZE;
+	len = nrpages * PAGE_SIZE;
+
+	/* page-aligned and zero-filled, so the tail of the last page is padding */
+	bounce = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (bounce == MAP_FAILED) {
+		pr_perror("Can't allocate bounce buffer for secretmem 0x%lx", smi->secretmem_id);
+		return -1;
+	}
+
+	/* secretmem areas can't be vmspliced */
+	memcpy(bounce, addr, smi->size);
+
+	pp = create_page_pipe((nrpages + 1) / 2, NULL, PP_CHUNK_MODE);
+	if (!pp)
+		goto err;
+
+	if (open_page_xfer(&xfer, CR_FD_SECRETMEM_PAGEMAP, smi->secretmem_id))
+		goto err_pp;
+
+	/*
+	 * The queued addresses live in the bounce buffer, so that is the base
+	 * they have to be related to (assumes the xfer code subtracts ->offset
+	 * from every address it records -- TODO confirm against page-xfer.c).
+	 */
+	xfer.offset = (unsigned long)bounce;
+
+	for (pfn = 0; pfn < nrpages; pfn++) {
+		/* no dirty tracking for secretmem: every page counts as dirty */
+		unsigned int pgstate = PST_DIRTY;
+		unsigned long pgaddr = (unsigned long)bounce + pfn * PAGE_SIZE;
+		int st;
+
+		if (xfer.parent && page_in_parent(pgstate == PST_DIRTY)) {
+			ret = page_pipe_add_hole(pp, pgaddr, PP_HOLE_PARENT);
+			st = 0;
+		} else {
+			ret = page_pipe_add_page(pp, pgaddr, 0);
+			st = 1;
+		}
+
+		if (ret)
+			goto err_xfer;
+
+		pages[st]++;
+	}
+
+	cnt_add(CNT_SECMEMPAGES_SCANNED, nrpages);
+	cnt_add(CNT_SECMEMPAGES_SKIPPED_PARENT, pages[0]);
+	cnt_add(CNT_SECMEMPAGES_WRITTEN, pages[1]);
+
+	ret = dump_pages(pp, &xfer);
+
+err_xfer:
+	xfer.close(&xfer);
+err_pp:
+	destroy_page_pipe(pp);
+err:
+	munmap(bounce, len);
+	return ret;
+}
+
+static int 
dump_one_memfd_secretmem(int fd, unsigned long secretmem_id, unsigned long size)
+{
+	int ret = -1;
+	void *addr;
+	struct secretmem_info smi;
+
+	if (size == 0)
+		return 0;
+
+	memset(&smi, 0, sizeof(smi));
+	smi.secretmem_id = secretmem_id;
+	smi.size = size;
+
+	addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED) {
+		pr_perror("Can't mmap secretmem 0x%lx", secretmem_id);
+		return ret;
+	}
+
+	ret = do_dump_one_secretmem(addr, &smi);
+	munmap(addr, size);
+
+	return ret;
+}
+
+/* secretmem restore */
+
+/*
+ * Read every pagemap entry of the secretmem image @secretmem_id into the
+ * @size bytes mapped at @addr. Entries are laid out back to back in the
+ * order they are read; an entry that would run past @size is rejected.
+ *
+ * Returns 0 once the image is exhausted and a negative value on error.
+ */
+static int do_restore_secretmem_content(void *addr, unsigned long size, unsigned long secretmem_id)
+{
+	unsigned long filled = 0;
+	struct page_read pr;
+	int ret;
+
+	ret = open_page_read(secretmem_id, &pr, PR_SECRETMEM);
+	if (ret <= 0)
+		return -1;
+
+	while (1) {
+		unsigned long vaddr, len;
+		unsigned nr_pages;
+
+		ret = pr.advance(&pr);
+		if (ret <= 0)
+			break;
+
+		vaddr = (unsigned long)decode_pointer(pr.pe->vaddr);
+		nr_pages = pr.pe->nr_pages;
+		len = (unsigned long)nr_pages * PAGE_SIZE;
+
+		/* filled never exceeds size, so the subtraction cannot wrap */
+		if (len > size - filled) {
+			pr_err("secretmem 0x%lx image is larger than the file (%lu bytes)\n", secretmem_id, size);
+			ret = -1;
+			break;
+		}
+
+		ret = pr.read_pages(&pr, vaddr, nr_pages, addr + filled, 0);
+		if (ret < 0)
+			break;
+
+		filled += len;
+	}
+
+	pr.close(&pr);
+	return ret;
+}
+
+/*
+ * Resize the freshly created secretmem file @fd to @size bytes and fill it
+ * from the image @secretmem_id. Returns 0 on success (including the empty
+ * file case) and -1 on failure.
+ */
+static int restore_secretmem_content(int fd, unsigned long secretmem_id, unsigned long size)
+{
+	void *addr;
+	int ret = 0;
+
+	if (size == 0)
+		return 0;
+
+	if (ftruncate(fd, size) < 0) {
+		pr_perror("Can't resize secretmem 0x%lx size=%lu", secretmem_id, size);
+		return -1;
+	}
+
+	addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	/* MAP_FAILED is not NULL; bail out before it can be handed to munmap() */
+	if (addr == MAP_FAILED) {
+		pr_perror("Can't mmap secretmem 0x%lx size=%lu", secretmem_id, size);
+		return -1;
+	}
+
+	if (do_restore_secretmem_content(addr, round_up(size, PAGE_SIZE), secretmem_id) < 0) {
+		/* errno is not known to be set here, hence no pr_perror() */
+		pr_err("Can't restore secretmem content\n");
+		ret = -1;
+	}
+
+	munmap(addr, size);
+	return ret;
+}
+
+/* fd dump */
+
+/*
+ * Tell whether @dev is the secretmem device found by kerndat. Always false
+ * when memfd_secret is unavailable, in which case kdat.secretmem_dev is 0
+ * and must not be matched against.
+ */
+int is_memfd_secret(dev_t dev)
+{
+	/* struct kerndat_s */
+	return kdat.has_memfd_secret && dev == kdat.secretmem_dev;
+}
+
+static int dump_memfd_secret_inode(int fd, struct memfd_secret_dump_inode *inode, const struct stat *st)
+{
+	
MemfdSecretInodeEntry msie = MEMFD_SECRET_INODE_ENTRY__INIT;
+	int ret = -1;
+	u32 secretmem_id;
+
+	secretmem_id = inode->ino;
+
+	pr_info("Dumping secretmem contents (id %#x, secretmem_id: %#x, size: %" PRIu64 ")\n", inode->id, secretmem_id,
+		st->st_size);
+
+	if (dump_one_memfd_secretmem(fd, secretmem_id, st->st_size) < 0)
+		return ret;
+
+	msie.inode_id = inode->id;
+	msie.uid = userns_uid(st->st_uid);
+	msie.gid = userns_gid(st->st_gid);
+	msie.size = st->st_size;
+	msie.secretmem_id = secretmem_id;
+
+	if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_SECRET_INODE), &msie, PB_MEMFD_SECRET_INODE))
+		return ret;
+
+	return 0;
+}
+
+/*
+ * Return the dump record for the inode described by @st, dumping the inode
+ * through @lfd the first time it is met. Records are kept on the
+ * memfd_secret_inodes list and matched by their (dev, ino) pair.
+ * Returns NULL on allocation or dump failure.
+ */
+static struct memfd_secret_dump_inode *dump_unique_memfd_secret_inode(int lfd, const struct stat *st)
+{
+	struct memfd_secret_dump_inode *known, *fresh;
+
+	list_for_each_entry(known, &memfd_secret_inodes, list) {
+		if (known->dev != st->st_dev || known->ino != st->st_ino)
+			continue;
+
+		/* already dumped: reuse the existing record */
+		return known;
+	}
+
+	fresh = xmalloc(sizeof(*fresh));
+	if (!fresh)
+		return NULL;
+
+	fresh->dev = st->st_dev;
+	fresh->ino = st->st_ino;
+	fresh->id = memfd_secret_inode_ids++;
+
+	if (dump_memfd_secret_inode(lfd, fresh, st) != 0) {
+		xfree(fresh);
+		return NULL;
+	}
+
+	/* only successfully dumped inodes become visible to later lookups */
+	list_add_tail(&fresh->list, &memfd_secret_inodes);
+
+	return fresh;
+}
+
+static int dump_one_memfd_secret(int lfd, u32 id, const struct fd_parms *p)
+{
+	MemfdSecretFileEntry msfe = MEMFD_SECRET_FILE_ENTRY__INIT;
+	FileEntry fe = FILE_ENTRY__INIT;
+	struct memfd_secret_dump_inode *inode;
+	struct fd_link _link, *link;
+
+	if (!p->link) {
+		if (fill_fdlink(lfd, p, &_link))
+			return -1;
+		link = &_link;
+	} else
+		link = p->link;
+
+	link_strip_deleted(link); /* link->name: ./secretmem */
+
+	inode = dump_unique_memfd_secret_inode(lfd, &p->stat);
+	if (!inode)
+		return -1;
+
+	msfe.id = id;
+	msfe.flags = p->flags;
+	msfe.pos = p->pos;
+	msfe.fown = (FownEntry *)&p->fown;
+	msfe.inode_id = inode->id;
+
+	fe.type = FD_TYPES__MEMFD_SECRET;
+	fe.id = msfe.id;
+	fe.memfd_secret = &msfe;
+
+	return 
pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE);
+}
+
+const struct fdtype_ops memfd_secret_dump_ops = {
+	.type = FD_TYPES__MEMFD_SECRET,
+	.dump = dump_one_memfd_secret,
+};
+
+/* fd restore */
+
+struct memfd_secret_info {
+	MemfdSecretFileEntry *msfe;
+	struct file_desc d;
+	struct memfd_secret_restore_inode *inode;
+};
+
+/*
+ * Look up the restore record for inode @id on memfd_secret_inodes, creating
+ * it with shmalloc() (no fdstore slot assigned yet) when it is not there.
+ * Returns NULL when the allocation fails.
+ */
+static struct memfd_secret_restore_inode *memfd_secret_alloc_inode(int id)
+{
+	struct memfd_secret_restore_inode *inode;
+
+	list_for_each_entry(inode, &memfd_secret_inodes, list)
+		if (inode->id == id)
+			return inode;
+
+	inode = shmalloc(sizeof(*inode));
+	if (!inode)
+		return NULL;
+
+	inode->id = id;
+	mutex_init(&inode->lock);
+	inode->fdstore_id = -1;
+
+	list_add_tail(&inode->list, &memfd_secret_inodes);
+	return inode;
+}
+
+/*
+ * Create the secretmem file for @inode from its image entry: restore the
+ * content and ownership, then park the descriptor in the fdstore and
+ * remember the slot in inode->fdstore_id.
+ *
+ * Returns the new descriptor on success and -1 on failure.
+ */
+static int memfd_secret_open_inode_nocache(struct memfd_secret_restore_inode *inode)
+{
+	MemfdSecretInodeEntry *msie = NULL;
+	struct cr_img *img = NULL;
+	int fd = -1;
+	int ret = -1;
+	int flags = 0;
+
+	img = open_image(CR_FD_MEMFD_SECRET_INODE, O_RSTR, inode->id);
+	if (!img)
+		goto out;
+
+	/*
+	 * The dump side appends one entry per inode to this image, so the
+	 * first entry is not necessarily ours: scan until the ids match.
+	 */
+	while (1) {
+		int rd = pb_read_one_eof(img, &msie, PB_MEMFD_SECRET_INODE);
+
+		if (rd < 0)
+			goto out;
+		if (rd == 0) {
+			pr_err("No memfd-secret inode entry with id %u\n", inode->id);
+			goto out;
+		}
+		if (msie->inode_id == inode->id)
+			break;
+
+		memfd_secret_inode_entry__free_unpacked(msie, NULL);
+		msie = NULL;
+	}
+
+	fd = memfd_secret(flags);
+	if (fd < 0) {
+		pr_perror("Can't create memfd_secret");
+		goto out;
+	}
+
+	if (restore_secretmem_content(fd, msie->secretmem_id, msie->size))
+		goto out;
+
+	if (cr_fchown(fd, msie->uid, msie->gid)) {
+		pr_perror("Can't change uid %d gid %d of memfd-secret", (int)msie->uid, (int)msie->gid);
+		goto out;
+	}
+
+	inode->fdstore_id = fdstore_add(fd);
+	if (inode->fdstore_id < 0)
+		goto out;
+
+	/* ownership of fd moves to the caller; keep the cleanup below off it */
+	ret = fd;
+	fd = -1;
+
+out:
+	if (fd != -1)
+		close(fd);
+	if (img)
+		close_image(img);
+	if (msie)
+		memfd_secret_inode_entry__free_unpacked(msie, NULL);
+
+	return ret;
+}
+
+static int memfd_secret_open_inode(struct memfd_secret_restore_inode *inode)
+{
+	int fd;
+
+	if (inode->fdstore_id != -1)
+		return fdstore_get(inode->fdstore_id);
+
+	mutex_lock(&inode->lock);
+	if (inode->fdstore_id != -1)
+		fd = 
fdstore_get(inode->fdstore_id);
+	else
+		fd = memfd_secret_open_inode_nocache(inode);
+	mutex_unlock(&inode->lock);
+
+	return fd;
+}
+
+/*
+ * Produce a descriptor for the file described by @d: either the inherited
+ * one or a descriptor of the backing inode. @fdflags is not used.
+ * Returns the descriptor, or -1 when the inode could not be opened.
+ */
+static int memfd_secret_open(struct file_desc *d, u32 *fdflags)
+{
+	struct memfd_secret_info *info = container_of(d, struct memfd_secret_info, d);
+	int fd;
+
+	if (inherited_fd(d, &fd))
+		return fd;
+
+	pr_info("Restoring memfd_secret id=%d\n", info->msfe->id);
+
+	fd = memfd_secret_open_inode(info->inode);
+
+	return fd < 0 ? -1 : fd;
+}
+
+/* file_desc_ops ->open hook: stores the opened descriptor in @new_fd. */
+static int memfd_secret_open_fe_fd(struct file_desc *fd, int *new_fd)
+{
+	int opened = memfd_secret_open(fd, NULL);
+
+	if (opened < 0)
+		return -1;
+
+	*new_fd = opened;
+	return 0;
+}
+
+static struct file_desc_ops memfd_secret_desc_ops = {
+	.type = FD_TYPES__MEMFD_SECRET,
+	.open = memfd_secret_open_fe_fd,
+};
+
+/*
+ * Collect callback for one memfd_secret file entry: attach the message and
+ * its inode record to the private info @o and register the file descriptor.
+ */
+static int collect_one_memfd_secret(void *o, ProtobufCMessage *msg, struct cr_img *i)
+{
+	struct memfd_secret_info *msfi = o;
+	MemfdSecretFileEntry *entry = pb_msg(msg, MemfdSecretFileEntry);
+
+	msfi->msfe = entry;
+	msfi->inode = memfd_secret_alloc_inode(entry->inode_id);
+	if (msfi->inode == NULL)
+		return -1;
+
+	return file_desc_add(&msfi->d, entry->id, &memfd_secret_desc_ops);
+}
+
+struct collect_image_info memfd_secret_cinfo = {
+	.fd_type = CR_FD_MEMFD_SECRET_FILE,
+	.pb_type = PB_MEMFD_SECRET_FILE,
+	.priv_size = sizeof(struct memfd_secret_info),
+	.collect = collect_one_memfd_secret,
+};
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index 94f4774148..376a4c96dc 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -62,6 +62,7 @@ static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
 
 #define PS_TYPE_PID (1)
 #define PS_TYPE_SHMEM (2)
+#define PS_TYPE_SECRETMEM (3)
 /*
  * XXX: When adding new types here check decode_pm for legacy
  * numbers that can be met from older CRIUs
@@ -73,6 +74,8 @@ static inline u64 encode_pm(int type, unsigned long id)
 		type = PS_TYPE_PID;
 	else if (type == CR_FD_SHMEM_PAGEMAP)
 		type = 
PS_TYPE_SHMEM; + else if (type == CR_FD_SECRETMEM_PAGEMAP) + type = PS_TYPE_SECRETMEM; else { BUG(); return 0; @@ -111,6 +114,10 @@ static int decode_pm(u64 dst_id, unsigned long *id) *id = dst_id >> PS_TYPE_BITS; type = CR_FD_SHMEM_PAGEMAP; break; + case PS_TYPE_SECRETMEM: + *id = dst_id >> PS_TYPE_BITS; + type = CR_FD_SECRETMEM_PAGEMAP; + break; default: type = -1; break; @@ -382,10 +389,15 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, unsigned lo * to exist in parent (either pagemap or hole) */ xfer->parent = NULL; - if (fd_type == CR_FD_PAGEMAP || fd_type == CR_FD_SHMEM_PAGEMAP) { - int ret; - int pfd; - int pr_flags = (fd_type == CR_FD_PAGEMAP) ? PR_TASK : PR_SHMEM; + if (fd_type == CR_FD_PAGEMAP || fd_type == CR_FD_SHMEM_PAGEMAP || fd_type == CR_FD_SECRETMEM_PAGEMAP) { + int ret, pfd, pr_flags; + + if (fd_type == CR_FD_PAGEMAP) + pr_flags = PR_TASK; + else if (fd_type == CR_FD_SECRETMEM_PAGEMAP) + pr_flags = PR_SECRETMEM; + else + pr_flags = PR_SHMEM; /* Image streaming lacks support for incremental images */ if (opts.stream) diff --git a/criu/pagemap.c b/criu/pagemap.c index 83f69bba37..48f8a516e4 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -778,6 +778,9 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p case PR_SHMEM: i_typ = CR_FD_SHMEM_PAGEMAP; break; + case PR_SECRETMEM: + i_typ = CR_FD_SECRETMEM_PAGEMAP; + break; default: BUG(); return -1; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index ba6f290dc8..dbacbc1498 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1875,7 +1875,7 @@ long __export_restore_task(struct task_restore_args *args) unsigned long m; vma_entry = args->vmas + i; - if (!vma_entry->has_madv || !vma_entry->madv) + if (!vma_entry->has_madv || !vma_entry->madv || vma_entry_is(vma_entry, VMA_AREA_MEMFD_SECRET)) continue; for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 16392e3864..0f6464b07c 
100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -42,6 +42,7 @@ #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" +#include "memfd-secret.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" @@ -79,6 +80,9 @@ static char *buf = __buf.buf; #define AIO_FNAME "/[aio]" +/* memfd_secret */ +#define SECRETMEM_NAME "/secretmem (deleted)" + /* check the @line starts with "%lx-%lx" format */ static bool __is_vma_range_fmt(char *line) { @@ -463,6 +467,14 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, st return 0; } + /* memfd_secret */ + if (!strncmp(fname, SECRETMEM_NAME, sizeof(SECRETMEM_NAME))) { + pr_info("Found memfd_secret fd mapping\n"); + vma->e->secretmem_id = buf.st_ino; + vma->e->status = VMA_AREA_MEMFD_SECRET; + return 0; + } + pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); return -1; } @@ -651,6 +663,8 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat close_safe(vm_file_fd); return 0; + } else if (is_memfd_secret(st_buf->st_dev)) { /* for memfd_secret case */ + vma_area->e->status |= VMA_AREA_MEMFD_SECRET; } if (vma_area->e->flags & MAP_PRIVATE) diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5be..6cb421e314 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "memfd-secret.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; @@ -121,6 +122,8 @@ void cr_pb_init(void) CR_PB_DESC(REMAP_FPATH, RemapFilePath, remap_file_path); CR_PB_DESC(NETDEV, NetDevice, net_device); CR_PB_MDESC_INIT(cr_pb_descs[PB_PAGEMAP_HEAD], PagemapHead, pagemap_head); + CR_PB_DESC(MEMFD_SECRET_FILE, MemfdSecretFile, memfd_secret_file); + CR_PB_DESC(MEMFD_SECRET_INODE, MemfdSecretInode, memfd_secret_inode); #include "protobuf-desc-gen.h" } diff --git a/images/Makefile b/images/Makefile index ca85b1a213..b42abe628d 100644 
--- a/images/Makefile +++ b/images/Makefile @@ -67,6 +67,7 @@ proto-obj-y += autofs.o proto-obj-y += macvlan.o proto-obj-y += sit.o proto-obj-y += memfd.o +proto-obj-y += memfd-secret.o proto-obj-y += timens.o proto-obj-y += img-streamer.o proto-obj-y += bpfmap-file.o diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c11860..26e7d0d64a 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -20,6 +20,7 @@ import "pipe.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; +import "memfd-secret.proto"; enum fd_types { UND = 0; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + MEMFD_SECRET = 20; /* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -78,4 +80,5 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; + optional memfd_secret_file_entry memfd_secret = 22; } diff --git a/images/vma.proto b/images/vma.proto index 0c07d51c6b..a19dbb0600 100644 --- a/images/vma.proto +++ b/images/vma.proto @@ -24,4 +24,7 @@ message vma_entry { /* file status flags */ optional uint32 fdflags = 10 [(criu).hex = true]; + + /* memfd_secret secretmem id */ + optional uint64 secretmem_id = 11; } diff --git a/test/zdtm/static/memfd-secret00.desc b/test/zdtm/static/memfd-secret00.desc new file mode 100644 index 0000000000..2bff09e633 --- /dev/null +++ b/test/zdtm/static/memfd-secret00.desc @@ -0,0 +1 @@ +{'feature': 'memfd_secret', 'flags': 'noauto'}