Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix swap handling for cgroups v2 #617

Merged
merged 3 commits into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/bindings.c
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,7 @@ static void __attribute__((constructor)) lxcfs_init(void)
{
__do_close int init_ns = -EBADF, root_fd = -EBADF,
pidfd = -EBADF;
__do_free char *cgroup = NULL;
int i = 0;
pid_t pid;
struct hierarchy *hierarchy;
Expand Down Expand Up @@ -920,7 +921,8 @@ static void __attribute__((constructor)) lxcfs_init(void)
lxcfs_info("Kernel supports pidfds");
}

can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
cgroup = get_pid_cgroup(pid, "memory");
mihalicyn marked this conversation as resolved.
Show resolved Hide resolved
can_use_swap = cgroup && cgroup_ops->can_use_swap(cgroup_ops, cgroup);
if (can_use_swap)
lxcfs_info("Kernel supports swap accounting");
else
Expand Down
33 changes: 12 additions & 21 deletions src/cgroups/cgfsng.c
Original file line number Diff line number Diff line change
Expand Up @@ -631,34 +631,25 @@ static int cgfsng_get_memory_slabinfo_fd(struct cgroup_ops *ops, const char *cgr
return openat(h->fd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
}

static bool cgfsng_can_use_swap(struct cgroup_ops *ops)
static bool cgfsng_can_use_swap(struct cgroup_ops *ops, const char *cgroup)
{
bool has_swap = false;
__do_free char *cgroup_rel = NULL, *junk_value = NULL;
const char *file;
struct hierarchy *h;

h = ops->get_hierarchy(ops, "memory");
if (!h)
return false;

if (is_unified_hierarchy(h)) {
if (faccessat(h->fd, "memory.swap.max", F_OK, 0))
return false;

if (faccessat(h->fd, "memory.swap.current", F_OK, 0))
return false;

has_swap = true;
} else {
if (faccessat(h->fd, "memory.memsw.limit_in_bytes", F_OK, 0))
return false;

if (faccessat(h->fd, "memory.memsw.usage_in_bytes", F_OK, 0))
return false;

has_swap = true;
}

return has_swap;
cgroup_rel = must_make_path_relative(cgroup, NULL);
file = is_unified_hierarchy(h) ? "memory.swap.current" : "memory.memsw.usage_in_bytes";
/* For v2, we need to look at the lower levels of the hierarchy because
* no 'memory.swap.current' file exists at the root. We must search
* upwards in the hierarchy in case memory accounting is disabled via
* cgroup.subtree_control for the given cgroup itself.
*/
int ret = cgroup_walkup_to_root(ops->cgroup2_root_fd, h->fd, cgroup_rel, file, &junk_value);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I briefly looked at my old helper again and while walking upwards we should verify that each component we're looking at is actually on a cgroup2 filesystem. If not, we should fail.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be a separate patch from this series.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll prepare fix for this as a separate PR

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return ret == 0;
}

static int cgfsng_get_memory_stats(struct cgroup_ops *ops, const char *cgroup,
Expand Down
2 changes: 1 addition & 1 deletion src/cgroups/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ struct cgroup_ops {
char **value);
int (*get_memory_slabinfo_fd)(struct cgroup_ops *ops,
const char *cgroup);
bool (*can_use_swap)(struct cgroup_ops *ops);
bool (*can_use_swap)(struct cgroup_ops *ops, const char *cgroup);

/* cpuset */
int (*get_cpuset_cpus)(struct cgroup_ops *ops, const char *cgroup,
Expand Down
123 changes: 77 additions & 46 deletions src/proc_fuse.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,21 +239,37 @@ __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
return 0;
}

static uint64_t get_memlimit(const char *cgroup, bool swap)
/**
* Gets a non-hierarchical memory controller limit, or UINT64_MAX if no limit is
* in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise
* reads the memory (RAM) limits.
*
* @returns 0 on success (and sets `*limit`), < 0 on error
*/
static int get_memlimit(const char *cgroup, bool swap, uint64_t *limit)
{
__do_free char *memlimit_str = NULL;
uint64_t memlimit = 0;
uint64_t memlimit = UINT64_MAX;
int ret;

if (swap)
ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
else
ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0)
lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s",
swap ? ".swap" : "", memlimit_str, cgroup);

return memlimit;
if (ret < 0)
return ret;

if (memlimit_str[0]) {
ret = safe_uint64(memlimit_str, &memlimit, 10);
if (ret < 0) {
lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s",
swap ? ".swap" : "", memlimit_str, cgroup);
return ret;
}
}
*limit = memlimit;
return 0;
}

/*
Expand Down Expand Up @@ -318,31 +334,44 @@ static char *gnu_dirname(char *path)
return path;
}

static uint64_t get_min_memlimit(const char *cgroup, bool swap)
/**
* Gets a hierarchical memory controller limit, or UINT64_MAX if no limit is
* in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise
* reads the memory (RAM) limits.
*
* @returns 0 on success (and sets `*limit`), < 0 on error
*/
static int get_min_memlimit(const char *cgroup, bool swap, uint64_t *limit)
{
__do_free char *copy = NULL;
uint64_t memlimit = 0, retlimit = 0;
uint64_t memlimit = UINT64_MAX, retlimit = UINT64_MAX;
int ret;

copy = strdup(cgroup);
if (!copy)
return log_error_errno(0, ENOMEM, "Failed to allocate memory");

retlimit = get_memlimit(copy, swap);
ret = get_memlimit(copy, swap, &retlimit);
if (ret < 0)
return ret;

/*
* If the cgroup doesn't start with / (probably won't happen), dirname()
* will terminate with "" instead of "/"
*/
while (*copy && strcmp(copy, "/") != 0) {
while (retlimit != 0 && *copy && strcmp(copy, "/") != 0) {
char *it = copy;

it = gnu_dirname(it);
memlimit = get_memlimit(it, swap);
if (memlimit > 0 && memlimit < retlimit)
ret = get_memlimit(it, swap, &memlimit);
if (ret < 0)
return ret;
if (memlimit < retlimit)
retlimit = memlimit;
};
}

return retlimit;
*limit = retlimit;
return 0;
}

static inline bool startswith(const char *line, const char *pref)
Expand All @@ -361,30 +390,30 @@ static void get_swap_info(const char *cgroup, uint64_t memlimit,
*swtotal = *swusage = 0;
*memswpriority = 1;

memswlimit = get_min_memlimit(cgroup, true);
if (memswlimit > 0) {
ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) != 0)
return;

if (liblxcfs_memory_is_cgroupv2()) {
*swtotal = memswlimit / 1024;
*swusage = memswusage / 1024;
} else {
if (memlimit > memswlimit)
*swtotal = 0;
else
*swtotal = (memswlimit - memlimit) / 1024;
if (memusage > memswusage || swtotal == 0)
*swusage = 0;
else
*swusage = (memswusage - memusage) / 1024;
}

ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
if (ret >= 0)
safe_uint64(memswpriority_str, memswpriority, 10);
ret = get_min_memlimit(cgroup, true, &memswlimit);
if (ret < 0)
return;
ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) < 0)
return;

if (liblxcfs_memory_is_cgroupv2()) {
*swtotal = memswlimit / 1024;
*swusage = memswusage / 1024;
} else {
if (memlimit > memswlimit)
*swtotal = 0;
else
*swtotal = (memswlimit - memlimit) / 1024;
if (memusage > memswusage || *swtotal == 0)
*swusage = 0;
else
*swusage = (memswusage - memusage) / 1024;
}

ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
if (ret >= 0)
safe_uint64(memswpriority_str, memswpriority, 10);
}

static int proc_swaps_read(char *buf, size_t size, off_t offset,
Expand Down Expand Up @@ -432,12 +461,12 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset,
return read_file_fuse("/proc/swaps", buf, size, d);
prune_init_slice(cgroup);

memlimit = get_min_memlimit(cgroup, false);

ret = get_min_memlimit(cgroup, false, &memlimit);
if (ret < 0)
return 0;
ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
if (ret < 0)
return 0;

if (safe_uint64(memusage_str, &memusage, 10) < 0)
lxcfs_error("Failed to convert memusage %s", memusage_str);

Expand All @@ -459,11 +488,13 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset,
}

if (wants_swap) {
/* The total amount of swap is always reported to be the
/* For cgroups v1, the total amount of swap is always reported to be the
lesser of the RAM+SWAP limit or the SWAP device size.
This is because the kernel can swap as much as it
wants and not only up to swtotal. */
swtotal = memlimit / 1024 + swtotal;
if (!liblxcfs_memory_is_cgroupv2())
swtotal = memlimit / 1024 + swtotal;

if (hostswtotal < swtotal) {
swtotal = hostswtotal;
}
Expand Down Expand Up @@ -1318,8 +1349,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
if (!cgroup_parse_memory_stat(cgroup, &mstat))
return read_file_fuse("/proc/meminfo", buf, size, d);

memlimit = get_min_memlimit(cgroup, false);

ret = get_min_memlimit(cgroup, false, &memlimit);
if (ret < 0)
return read_file_fuse("/proc/meminfo", buf, size, d);
/*
* Following values are allowed to fail, because swapaccount might be
* turned off for current kernel.
Expand Down Expand Up @@ -1359,11 +1391,10 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,

sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);

/* The total amount of swap is always reported to be the
/* In cgroups v1, the total amount of swap is always reported to be the
lesser of the RAM+SWAP limit or the SWAP device size.
This is because the kernel can swap as much as it
wants and not only up to swtotal. */

if (!liblxcfs_memory_is_cgroupv2())
swtotal += memlimit;

Expand Down
Loading