From e3f6216c0b6a3079bc42fdfbc9f0b2365f207d0e Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 21 Nov 2023 16:54:35 +1100 Subject: [PATCH] pmdabpf: prototype Kepler module (sustainable-computing.io) --- src/pmdas/bpf/README | 42 +-- src/pmdas/bpf/bpf.conf | 9 + src/pmdas/bpf/modules/GNUmakefile | 13 +- src/pmdas/bpf/modules/kepler.bpf.c | 307 +++++++++++++++++++++ src/pmdas/bpf/modules/kepler.bpf.h | 132 +++++++++ src/pmdas/bpf/modules/kepler.c | 424 +++++++++++++++++++++++++++++ src/pmdas/bpf/modules/kepler.h | 25 ++ src/pmdas/bpf/modules/module.h | 1 + src/pmdas/bpf/zmalloc.h | 23 -- 9 files changed, 931 insertions(+), 45 deletions(-) create mode 100644 src/pmdas/bpf/modules/kepler.bpf.c create mode 100644 src/pmdas/bpf/modules/kepler.bpf.h create mode 100644 src/pmdas/bpf/modules/kepler.c create mode 100644 src/pmdas/bpf/modules/kepler.h delete mode 100644 src/pmdas/bpf/zmalloc.h diff --git a/src/pmdas/bpf/README b/src/pmdas/bpf/README index e992735e8b..028c34aa2d 100644 --- a/src/pmdas/bpf/README +++ b/src/pmdas/bpf/README @@ -1,38 +1,40 @@ PCP PMDA to load linux BPF modules ================================== -This PMDA is capable of collecting and generating arbitrary metrics from kernel-side code running as pre-compiled -ELF BPF/eBPF modules. +This PMDA is capable of collecting and generating arbitrary metrics from +kernel-side code running as pre-compiled ELF BPF/eBPF modules. Comparison to other PCP PMDAS ============================= -- pmdabcc runs as python and is a little easier to develop and maintain, has more advanced configuration files, - however it requires significantly more runtime memory. It loads and compiles the BCC code through LLVM which - has a heavy footprint to work with the fact that kernel structures might change. By comparison, pmdabpf uses - pre-compiled ELF-based BPF CO-RE modules to avoid relocation constraints. +- pmdabcc runs as python and is a little easier to develop and maintain, + however it requires significantly more runtime memory. It loads and + compiles the BCC code through LLVM which has a heavy footprint to + work with the fact that kernel structures might change. By comparison, + pmdabpf uses pre-compiled ELF-based BPF CO-RE modules to avoid + relocation constraints. -- pmdabpftrace can load and run arbitrary bpftrace code, so it is very flexible, however it does this by shelling - out to bpftrace executable which again requires quite a lot of memory. +- pmdabpftrace can load and run arbitrary bpftrace code, so it is very + flexible, however it does this by shelling out to bpftrace executable + which again requires quite a lot of memory. Deployment ========== -The file `bpf.conf` lists modules, one per line, that will be started. +The file `bpf.conf` lists modules with their configuration information, +one section per module, that will be started if enabled is set to true. -A relatively new version of libbpf is required on the system. libbpf 0.1.0 is unlikely to work, however, libbpf 0.4.0 -has been shown to work. +A relatively new version of libbpf is required on the system - 1.0.0 is +known to work. Development =========== To develop additional modules: -- Follow the examples provided (runqlat and biolatency) to create a new module (which will output a .so). The entry - point in your .so will be a load_module() call that should return a newly allocated `struct module` object. -- Create your bpf code (this will become a .bpf.o). Use the various `_helpers` headers. 
-- Ensure `module.h` has correct unique setup for your cluster and metric ids. -- Add details to `pmns` and `help` files to ensure they match the `module.h` changes. - -TODO -==== -- allow configuration settings +- Follow the examples provided (runqlat and biolatency) to create a new + module (which will output a .so). The entry point in your .so will + be a load_module() call that should return a newly allocated `struct + module` object. +- Create your bpf code (this will become a .bpf.o). Use the various + `_helpers` headers. +- Add your module name to the global known-modules list in `module.h`. diff --git a/src/pmdas/bpf/bpf.conf b/src/pmdas/bpf/bpf.conf index b1a4d6ad33..f1545d5c81 100644 --- a/src/pmdas/bpf/bpf.conf +++ b/src/pmdas/bpf/bpf.conf @@ -135,3 +135,12 @@ enabled = false # cgroup - string - unset : Trace process in cgroup path [biosnoop.so] enabled = false + +# This tool traces scheduling activity for use in power management. +# +# Configuration options: +# Name - type - default +# +# process_count - int - 20 : number of processes to keep in cache +[kepler.so] +enabled = false diff --git a/src/pmdas/bpf/modules/GNUmakefile b/src/pmdas/bpf/modules/GNUmakefile index 3e45b2a0f7..a7e28a4c60 100644 --- a/src/pmdas/bpf/modules/GNUmakefile +++ b/src/pmdas/bpf/modules/GNUmakefile @@ -64,6 +64,15 @@ APPS_BPF = \ tcpconnlat.bpf.c \ vfsstat.bpf.c \ +# non-libbpf header files. +LOCAL_H = \ + kepler.bpf.h \ + +# non-libbpf *.bpf.c files. +LOCAL_BPF = \ + kepler.bpf.c \ + +PMDABPF_MODULES += kepler.so default_pcp default: $(PMDABPF_MODULES) @@ -99,7 +108,7 @@ $(APPS_BPF): vmlinux.h: $(PMDABPF_VMLINUXH) $(LN_S) -f $< $@ -%.bpf.o: %.bpf.c vmlinux.h $(APPS_H) $(APPS_BPF) +%.bpf.o: %.bpf.c vmlinux.h $(APPS_H) $(APPS_BPF) $(LOCAL_H) $(LOCAL_BPF) $(CLANG) -Wall -g -O2 -target bpf -D__TARGET_ARCH_$(PMDABPF_ARCH) \ -I. -c $< -o $@ $(LLVM_STRIP) -g $@ @@ -108,7 +117,7 @@ vmlinux.h: $(PMDABPF_VMLINUXH) $(BPFTOOL) gen skeleton $< > $@ %.o: %.c -%.o: %.c %.skel.h $(HELPERS_H) $(APPS_H) +%.o: %.c %.skel.h $(HELPERS_H) $(APPS_H) $(LOCAL_H) $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ %_helpers.o: %_helpers.c diff --git a/src/pmdas/bpf/modules/kepler.bpf.c b/src/pmdas/bpf/modules/kepler.bpf.c new file mode 100644 index 0000000000..46323cc177 --- /dev/null +++ b/src/pmdas/bpf/modules/kepler.bpf.c @@ -0,0 +1,307 @@ +/* +Copyright 2021. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// +build ignore + +#include + +#if (LINUX_KERNEL_VERSION >= KERNEL_VERSION(5, 12, 0)) +#define BPF_PERF_EVENT_READ_VALUE_AVAILABLE 1 +#endif + +#include "kepler.bpf.h" + +// processes and pid time +BPF_HASH(processes, u32, process_metrics_t); +BPF_HASH(pid_time, u32, u64); + +// perf counters +BPF_PERF_ARRAY(cpu_cycles_hc_reader); +BPF_ARRAY(cpu_cycles, u64); + +BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader); +BPF_ARRAY(cpu_ref_cycles, u64); + +BPF_PERF_ARRAY(cpu_instructions_hc_reader); +BPF_ARRAY(cpu_instructions, u64); + +BPF_PERF_ARRAY(cache_miss_hc_reader); +BPF_ARRAY(cache_miss, u64); + +// cpu freq counters +BPF_ARRAY(cpu_freq_array, u32); + +// setting sample rate or counter to 0 will make compiler to remove the code entirely. +int sample_rate = 1; +int counter_sched_switch = 0; + +static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts) +{ + u64 cpu_time = 0; + + // get pid time + pid_time_t prev_pid_key = {.pid = prev_pid}; + u64 *prev_ts; + prev_ts = bpf_map_lookup_elem(&pid_time, &prev_pid_key); + if (prev_ts) + { + // Probably a clock issue where the recorded on-CPU event had a + // timestamp later than the recorded off-CPU event, or vice versa. + if (cur_ts > *prev_ts) + { + cpu_time = (cur_ts - *prev_ts); /*nanosecond*/ + bpf_map_delete_elem(&pid_time, &prev_pid_key); + } + } + pid_time_t new_pid_key = {.pid = cur_pid}; + bpf_map_update_elem(&pid_time, &new_pid_key, &cur_ts, BPF_NOEXIST); + + return cpu_time; +} + +static inline u64 calc_delta(u64 *prev_val, u64 *val) +{ + u64 delta = 0; + if (prev_val) + { + if (*val > *prev_val) + delta = *val - *prev_val; + } + return delta; +} + +// although the "get_on_cpu_counters" has some code duplications, it is inline code and the compiles will improve this +static inline u64 get_on_cpu_cycles(u32 *cpu_id) +{ + u64 delta = 0; +#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE + struct bpf_perf_event_value c = {}; + int error = bpf_perf_event_read_value(&cpu_cycles_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = c.counter; + u64 *prev_val = bpf_map_lookup_elem(&cpu_cycles, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY); + } +#else + int ret = bpf_perf_event_read(&cpu_cycles_hc_reader, *cpu_id); + if (ret < 0) { + return delta; + } + u64 val = ret; + u64 *prev_val = bpf_map_lookup_elem(&cpu_cycles, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY); +#endif + + return delta; +} + +static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id) +{ + u64 delta = 0; +#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE + struct bpf_perf_event_value c = {}; + int error = bpf_perf_event_read_value(&cpu_ref_cycles_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = c.counter; + u64 *prev_val = bpf_map_lookup_elem(&cpu_ref_cycles, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_ref_cycles, cpu_id, &val, BPF_ANY); + } +#else + int ret = bpf_perf_event_read(&cpu_ref_cycles_hc_reader, *cpu_id); + if (ret < 0) { + return delta; + } + u64 val = ret; + u64 *prev_val = bpf_map_lookup_elem(&cpu_ref_cycles, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_ref_cycles, cpu_id, &val, BPF_ANY); +#endif + return delta; +} + +static inline u64 get_on_cpu_instr(u32 *cpu_id) +{ + u64 delta = 0; +#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE + struct bpf_perf_event_value c = {}; + int error = 
bpf_perf_event_read_value(&cpu_instructions_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = c.counter; + u64 *prev_val = bpf_map_lookup_elem(&cpu_instructions, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY); + } +#else + int ret = bpf_perf_event_read(&cpu_instructions_hc_reader, *cpu_id); + if (ret < 0) { + return delta; + } + u64 val = ret; + u64 *prev_val = bpf_map_lookup_elem(&cpu_instructions, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY); +#endif + return delta; +} + +static inline u64 get_on_cpu_cache_miss(u32 *cpu_id) +{ + u64 delta = 0; +#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE + struct bpf_perf_event_value c = {}; + int error = bpf_perf_event_read_value(&cache_miss_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = c.counter; + u64 *prev_val = bpf_map_lookup_elem(&cache_miss, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY); + } +#else + int ret = bpf_perf_event_read(&cache_miss_hc_reader, *cpu_id); + if (ret < 0) { + return delta; + } + u64 val = ret; + u64 *prev_val = bpf_map_lookup_elem(&cache_miss, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY); +#endif + return delta; +} + +// calculate the average cpu freq +static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta) +{ + u32 avg_freq = 0; + bpf_map_lookup_or_try_init(&cpu_freq_array, cpu_id, &avg_freq); + if (avg_freq == 0) + { + avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) * HZ; + } + else + { + avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) * HZ; + avg_freq /= 2; + } + bpf_map_update_elem(&cpu_freq_array, cpu_id, &avg_freq, BPF_ANY); + return avg_freq; +} + +SEC("tracepoint/sched/sched_switch") +int kepler_trace(struct sched_switch_args *ctx) +{ + u32 next_pid = ctx->next_pid; // the new pid that is to be scheduled + + // only do sampling if sample rate is set + if (sample_rate != 0) + { + if (counter_sched_switch > 0) + { + counter_sched_switch--; + return 0; + } + counter_sched_switch = sample_rate; + } + + u32 cur_pid = bpf_get_current_pid_tgid(); + u64 cgroup_id = bpf_get_current_cgroup_id(); // the cgroup id is the cgroup id of the running process (this is not next_pid or prev_pid) + u64 cur_ts = bpf_ktime_get_ns(); + u32 cpu_id = bpf_get_smp_processor_id(); + u32 prev_pid = ctx->prev_pid; + + u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id); + u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id); + u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id); + u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id); + u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta); + u64 on_cpu_time_delta = get_on_cpu_time(next_pid, prev_pid, cur_ts); + + // store process metrics + struct process_metrics_t *process_metrics; + process_metrics = bpf_map_lookup_elem(&processes, &prev_pid); + if (process_metrics) + { + // update process time + process_metrics->process_run_time += on_cpu_time_delta; + process_metrics->cpu_cycles += on_cpu_cycles_delta; + process_metrics->cpu_instr += on_cpu_instr_delta; + process_metrics->cache_miss += on_cpu_cache_miss_delta; + } + + process_metrics = bpf_map_lookup_elem(&processes, &cur_pid); + if 
(process_metrics == 0) + { + process_metrics_t new_process = {}; + new_process.pid = cur_pid; + new_process.cgroup_id = cgroup_id; + // bpf_probe_read(&new_process.comm, sizeof(new_process.comm), (void *)ctx->next_comm); + bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm)); + bpf_map_update_elem(&processes, &cur_pid, &new_process, BPF_NOEXIST); + } + return 0; +} + +SEC("tracepoint/irq/softirq_entry") +int kepler_irq_trace(struct trace_event_raw_softirq *ctx) +{ + u32 cur_pid = bpf_get_current_pid_tgid(); + struct process_metrics_t *process_metrics; + process_metrics = bpf_map_lookup_elem(&processes, &cur_pid); + if (process_metrics != 0) + { + if (ctx->vec < 10) { + process_metrics->vec_nr[ctx->vec] ++; + } + } + return 0; +} + +// count read page cache +SEC("kprobe/mark_page_accessed") +int kprobe__mark_page_accessed(struct pt_regs *ctx) +{ + u32 cur_pid = bpf_get_current_pid_tgid(); + struct process_metrics_t *process_metrics; + process_metrics = bpf_map_lookup_elem(&processes, &cur_pid); + if (process_metrics) + { + process_metrics->page_cache_hit ++; + } + return 0; +} + +// count write page cache +SEC("kprobe/set_page_dirty") +int kprobe__set_page_dirty(struct pt_regs *ctx) +{ + u32 cur_pid = bpf_get_current_pid_tgid(); + struct process_metrics_t *process_metrics; + process_metrics = bpf_map_lookup_elem(&processes, &cur_pid); + if (process_metrics) + { + process_metrics->page_cache_hit ++; + } + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/src/pmdas/bpf/modules/kepler.bpf.h b/src/pmdas/bpf/modules/kepler.bpf.h new file mode 100644 index 0000000000..0e29485841 --- /dev/null +++ b/src/pmdas/bpf/modules/kepler.bpf.h @@ -0,0 +1,132 @@ +/* +Copyright 2021. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + + +/* In Linux 5.4 asm_inline was introduced, but it's not supported by clang. + * Redefine it to just asm to enable successful compilation. 
+ * see https://github.com/iovisor/bcc/commit/2d1497cde1cc9835f759a707b42dea83bee378b8 for more details + */ +#include +#include +#ifdef asm_inline +#undef asm_inline +#define asm_inline asm +#endif + +typedef __u64 u64; +typedef __u32 u32; +typedef __u16 u16; + +#include +#include + +#ifndef NUM_CPUS +#define NUM_CPUS 128 +#endif + +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif + +// irq counter, 10 is the max number of irq vectors +#ifndef IRQ_MAX_LEN +#define IRQ_MAX_LEN 10 +#endif + +#ifndef CPU_REF_FREQ +#define CPU_REF_FREQ 2500 +#endif + +#ifndef HZ +#define HZ 1000 +#endif + +#ifndef MAP_SIZE +#define MAP_SIZE 32768 +#endif + +// array size is to be reset in userspace +#define BPF_ARRARY_MAP(_name, _type, _key_type, _value_type) \ + struct { \ + __uint(type, _type); \ + __type(key, _key_type); \ + __type(value, _value_type); \ + __uint(max_entries, NUM_CPUS); \ + } _name SEC(".maps"); + +#define BPF_HASH(_name, _key_type, _value_type) \ + struct { \ + __uint(type, BPF_MAP_TYPE_HASH); \ + __type(key, _key_type); \ + __type(value, _value_type); \ + __uint(max_entries, MAP_SIZE); \ + } _name SEC(".maps"); + +#define BPF_ARRAY(_name, _leaf_type) \ + BPF_ARRARY_MAP(_name, BPF_MAP_TYPE_ARRAY, u32, _leaf_type); + +#define BPF_PERF_ARRAY(_name) \ + BPF_ARRARY_MAP(_name, BPF_MAP_TYPE_PERF_EVENT_ARRAY, int, u32) + +static __always_inline void * +bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) +{ + void *val; + int err; + + val = bpf_map_lookup_elem(map, key); + if (val) + return val; + + err = bpf_map_update_elem(map, key, init, BPF_NOEXIST); + if (err && err != -17) + return 0; + + return bpf_map_lookup_elem(map, key); +} + +struct sched_switch_args { + unsigned long long pad; + char prev_comm[TASK_COMM_LEN]; + int prev_pid; + int prev_prio; + long long prev_state; + char next_comm[TASK_COMM_LEN]; + int next_pid; + int next_prio; +}; + +struct trace_event_raw_softirq { + unsigned long long pad; + unsigned int vec; +}; + +typedef struct process_metrics_t { + u64 cgroup_id; + u64 pid; + u64 process_run_time; + u64 cpu_cycles; + u64 cpu_instr; + u64 cache_miss; + u64 page_cache_hit; + u16 vec_nr[IRQ_MAX_LEN]; + char comm[TASK_COMM_LEN]; +} process_metrics_t; + +typedef struct pid_time_t { + u32 pid; +} pid_time_t; + diff --git a/src/pmdas/bpf/modules/kepler.c b/src/pmdas/bpf/modules/kepler.c new file mode 100644 index 0000000000..a8b3b4ddf1 --- /dev/null +++ b/src/pmdas/bpf/modules/kepler.c @@ -0,0 +1,424 @@ +/* + * + * Copyright (c) 2023 Red Hat. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ */ + +#include "module.h" + +#include +#include +#include +#include +#include + +#include "kepler.skel.h" +#include "kepler.h" + +#define PERF_BUFFER_PAGES 64 +#define PERF_POLL_TIMEOUT_MS 0 + +#define INDOM_COUNT 1 + +static struct env { + int process_count; +} env = { + .process_count = 20, +}; + +static pmdaInstid *kepler_instances; +static struct kepler_bpf *obj; +static struct perf_buffer *pb = NULL; +static int lost_events; +static int queuelength; + +/* cache array */ +struct tailq_entry { + struct event event; + TAILQ_ENTRY(tailq_entry) entries; +}; + +TAILQ_HEAD(tailhead, tailq_entry) head; + +static struct tailq_entry* allocElm(void) +{ + return malloc(sizeof(struct tailq_entry)); +} + +static void push(struct tailq_entry *elm) +{ + TAILQ_INSERT_TAIL(&head, elm, entries); + if (queuelength > env.process_count) + { + struct tailq_entry *l; + l = head.tqh_first; + TAILQ_REMOVE(&head, l, entries); + free(l); + queuelength--; + } + queuelength++; +} + +static bool get_item(unsigned int offset, struct tailq_entry** val) +{ + struct tailq_entry *i; + unsigned int iter = 0; + + TAILQ_FOREACH_REVERSE(i, &head, tailhead, entries) { + if (offset == iter) { + *val = i; + return true; + } + iter++; + } + return false; +} + +static unsigned int indom_id_mapping[INDOM_COUNT]; + +enum metric_name { + COMM, + PID, + LOST, + CGROUPID, + RUNTIME, + CPU_CYCLES, + CPU_INSTR, + CACHE_MISS, + PAGE_CACHE_HIT, + METRIC_COUNT /* last */ +}; +enum metric_indom { KEPLER_INDOM }; + +char* metric_names[METRIC_COUNT] = { + [COMM] = "kepler.comm", + [PID] = "kepler.pid", + [LOST] = "kepler.lost", + [CGROUPID] = "kepler.cgroup_id", + [RUNTIME] = "kepler.runtime", + [CPU_CYCLES] = "kepler.cycles", + [CPU_INSTR] = "kepler.instr", + [CACHE_MISS] = "kepler.cache_miss", + [PAGE_CACHE_HIT] = "kepler.page_cache_hit", +}; + +char* metric_text_oneline[METRIC_COUNT] = { + [COMM] = "Command name", + [PID] = "Process identifier", + [LOST] = "Number of lost events", + [CGROUPID] = "Control Group identifier for each process", + [RUNTIME] = "Run time of each process", + [CPU_CYCLES] = "Number of cycles for each process", + [CPU_INSTR] = "Number of instructions for each process", + [CACHE_MISS] = "Number of cache misses for each process", + [PAGE_CACHE_HIT] = "Number of page cache hits for each process", +}; + +char* metric_text_long[METRIC_COUNT] = { + [COMM] = "Command name", + [PID] = "Process identifier", + [LOST] = "Number of lost events", + [CGROUPID] = "Control Group identifier for each process", + [RUNTIME] = "Run time of each process", + [CPU_CYCLES] = "Number of cycles for each process", + [CPU_INSTR] = "Number of instructions for each process", + [CACHE_MISS] = "Number of cache misses for each process", + [PAGE_CACHE_HIT] = "Number of page cache hits for each process", +}; + +static unsigned int kepler_metric_count(void) +{ + return METRIC_COUNT; +} + +static char* kepler_metric_name(unsigned int metric) +{ + return metric_names[metric]; +} + +static unsigned int kepler_indom_count(void) +{ + return INDOM_COUNT; +} + +static void kepler_set_indom_serial(unsigned int local_indom_id, unsigned int global_id) +{ + indom_id_mapping[local_indom_id] = global_id; +} + +static int kepler_metric_text(int item, int type, char **buffer) +{ + if (type & PM_TEXT_ONELINE) { + *buffer = metric_text_oneline[item]; + } else { + *buffer = metric_text_long[item]; + } + + return 0; +} + +static void kepler_register(unsigned int cluster_id, pmdaMetric *metrics, pmdaIndom *indoms) +{ + /* bpf.kepler.comm */ + metrics[COMM] = (struct 
pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 0), + .type = PM_TYPE_STRING, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_INSTANT, + .units = PMDA_PMUNITS(0, 0, 0, 0, 0, 0), + } + }; + /* bpf.kepler.pid */ + metrics[PID] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 1), + .type = PM_TYPE_U32, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_DISCRETE, + .units = PMDA_PMUNITS(0, 0, 0, 0, 0, 0), + } + }; + /* bpf.kepler.cgroupid */ + metrics[CGROUPID] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 2), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_INSTANT, + .units = PMDA_PMUNITS(0, 0, 0, 0, 0, 0), + } + }; + /* bpf.kepler.lost */ + metrics[LOST] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 3), + .type = PM_TYPE_U32, + .indom = PM_INDOM_NULL, + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + /* bpf.kepler.runtime */ + metrics[RUNTIME] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 4), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 1, 0, 0, PM_TIME_NSEC, 0), + } + }; + /* bpf.kepler.cpu_cycles */ + metrics[CPU_CYCLES] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 5), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + /* bpf.kepler.cpu_instr */ + metrics[CPU_INSTR] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 6), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + /* bpf.kepler.cache_miss */ + metrics[CACHE_MISS] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 7), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + /* bpf.kepler.page_cache_hit */ + metrics[PAGE_CACHE_HIT] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 8), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + + /* KEPLER_INDOM */ + indoms[KEPLER_INDOM] = (struct pmdaIndom) + { + indom_id_mapping[KEPLER_INDOM], + env.process_count, + kepler_instances, + }; +} + +static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + struct event *event = data; + struct tailq_entry *elm = allocElm(); + + elm->event.cgroup_id = event->cgroup_id; + elm->event.pid = event->pid; + elm->event.process_run_time = event->process_run_time; + elm->event.cpu_cycles = event->cpu_cycles; + elm->event.cpu_instr = event->cpu_instr; + elm->event.cache_miss = event->cache_miss; + elm->event.page_cache_hit = event->page_cache_hit; + /* TODO: vec_nr */ + strncpy(elm->event.comm, event->comm, sizeof(event->comm)); + + push(elm); +} + +static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) +{ + lost_events += lost_cnt; +} + +static int kepler_init(dict *cfg, char *module_name) +{ + int err; + char *val; + + if ((val = pmIniFileLookup(cfg, module_name, "process_count"))) + env.process_count = atoi(val); + + obj = kepler_bpf__open(); + if (!obj) { + pmNotifyErr(LOG_ERR, "failed to open BPF object"); + return 1; + } + pmNotifyErr(LOG_INFO, 
"booting: %s", obj->skeleton->name); + + err = kepler_bpf__load(obj); + if (err) { + pmNotifyErr(LOG_ERR, "failed to load BPF object: %d", err); + return err != 0; + } + + err = kepler_bpf__attach(obj); + if (err) { + pmNotifyErr(LOG_ERR, "failed to attach BPF programs"); + return err != 0; + } + + /* internal/external instance ids */ + fill_instids(env.process_count, &kepler_instances); + + /* Initialize the tail queue. */ + TAILQ_INIT(&head); + + /* setup event callbacks */ + pb = perf_buffer__new(bpf_map__fd(obj->maps.processes), PERF_BUFFER_PAGES, + handle_event, handle_lost_events, NULL, NULL); + if (!pb) { + err = -errno; + pmNotifyErr(LOG_ERR, "failed to open perf buffer: %d", err); + return err != 0; + } + + return err != 0; +} + +static void kepler_shutdown() +{ + struct tailq_entry *itemp; + + free(kepler_instances); + perf_buffer__free(pb); + kepler_bpf__destroy(obj); + /* Free the entire cache queue. */ + while ((itemp = TAILQ_FIRST(&head))) { + TAILQ_REMOVE(&head, itemp, entries); + free(itemp); + } +} + +static void kepler_refresh(unsigned int item) +{ + perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); +} + +static int kepler_fetch_to_atom(unsigned int item, unsigned int inst, pmAtomValue *atom) +{ + struct tailq_entry *value; + + /* bpf.kepler.lost */ + if (item == LOST) { + atom->ul = lost_events; + return PMDA_FETCH_STATIC; + } + + if (inst == PM_IN_NULL) + return PM_ERR_INST; + + if (!get_item(inst, &value)) + return PMDA_FETCH_NOVALUES; + + switch (item) { + case COMM: /* bpf.kepler.comm */ + atom->cp = value->event.comm; + break; + case PID: /* bpf.kepler.pid */ + atom->ul = value->event.pid; + break; + case CGROUPID: /* bpf.kepler.cgroupid */ + atom->ull = value->event.cgroup_id; + break; + case RUNTIME: /* bpf.kepler.runtime */ + atom->ull = value->event.process_run_time; + break; + case CPU_CYCLES: /* bpf.kepler.cpu_cycles */ + atom->ull = value->event.cpu_cycles; + break; + case CPU_INSTR: /* bpf.kepler.cpu_instr */ + atom->ull = value->event.cpu_instr; + break; + case CACHE_MISS: /* bpf.kepler.cache_miss */ + atom->ull = value->event.cache_miss; + break; + case PAGE_CACHE_HIT: /* bpf.kepler.page_cache_hit */ + atom->ull = value->event.page_cache_hit; + break; + } + + return PMDA_FETCH_STATIC; +} + +struct module bpf_module = { + .init = kepler_init, + .register_metrics = kepler_register, + .metric_count = kepler_metric_count, + .indom_count = kepler_indom_count, + .set_indom_serial = kepler_set_indom_serial, + .shutdown = kepler_shutdown, + .refresh = kepler_refresh, + .fetch_to_atom = kepler_fetch_to_atom, + .metric_name = kepler_metric_name, + .metric_text = kepler_metric_text, +}; diff --git a/src/pmdas/bpf/modules/kepler.h b/src/pmdas/bpf/modules/kepler.h new file mode 100644 index 0000000000..764baf0704 --- /dev/null +++ b/src/pmdas/bpf/modules/kepler.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __KEPLER_H +#define __KEPLER_H + +#define TASK_COMM_LEN 16 +#define IRQ_MAX_LEN 10 +#define BASE_EVENT_SIZE (size_t)(&((struct event*)0)->args) +#define EVENT_SIZE(e) (BASE_EVENT_SIZE + e->args_size) + +typedef __u64 u64; +typedef __u16 u16; + +struct event { + u64 cgroup_id; + u64 pid; + u64 process_run_time; + u64 cpu_cycles; + u64 cpu_instr; + u64 cache_miss; + u64 page_cache_hit; + u16 vec_nr[IRQ_MAX_LEN]; + char comm[TASK_COMM_LEN]; +}; + +#endif /* __KEPLER_H */ diff --git a/src/pmdas/bpf/modules/module.h b/src/pmdas/bpf/modules/module.h index bb071d13ce..7aca3c1a19 100644 --- a/src/pmdas/bpf/modules/module.h +++ 
b/src/pmdas/bpf/modules/module.h @@ -113,6 +113,7 @@ char *all_modules[] = { "execsnoop", "exitsnoop" "fsslower", + "kepler", "mountsnoop", "oomkill", "opensnoop", diff --git a/src/pmdas/bpf/zmalloc.h b/src/pmdas/bpf/zmalloc.h deleted file mode 100644 index b5d166f633..0000000000 --- a/src/pmdas/bpf/zmalloc.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2017-2018,2020 Red Hat. - * - * This library is free software; you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public - * License for more details. - */ -#ifndef ZMALLOC_H -#define ZMALLOC_H 1 - -#include - -extern void *zmalloc(size_t); -extern void *zcalloc(size_t, size_t); -extern void zfree(void *); - -#endif /* ZMALLOC_H */
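Usage note (illustrative only, not part of the patch): the new module ships
disabled.  A minimal way to exercise it, assuming the bpf PMDA is already
installed and the metrics appear under bpf.kepler as the comments in
kepler.c indicate, is to enable the new stanza in bpf.conf and restart the
PMDA:

    # bpf.conf
    [kepler.so]
    enabled = true
    process_count = 40   # optional: deepen the process cache (default 20)

    # then inspect the new metrics, e.g.
    #   pminfo -f bpf.kepler
    #   pmval -t 2 bpf.kepler.cycles

The process_count value is read in kepler_init() via pmIniFileLookup() and
bounds both the TAILQ process cache and the instance domain size.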