From e3f6216c0b6a3079bc42fdfbc9f0b2365f207d0e Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 21 Nov 2023 16:54:35 +1100 Subject: [PATCH] pmdabpf: prototype Kepler module (sustainable-computing.io) --- src/pmdas/bpf/README | 42 +-- src/pmdas/bpf/bpf.conf | 9 + src/pmdas/bpf/modules/GNUmakefile | 13 +- src/pmdas/bpf/modules/kepler.bpf.c | 307 +++++++++++++++++++++ src/pmdas/bpf/modules/kepler.bpf.h | 132 +++++++++ src/pmdas/bpf/modules/kepler.c | 424 +++++++++++++++++++++++++++++ src/pmdas/bpf/modules/kepler.h | 25 ++ src/pmdas/bpf/modules/module.h | 1 + src/pmdas/bpf/zmalloc.h | 23 -- 9 files changed, 931 insertions(+), 45 deletions(-) create mode 100644 src/pmdas/bpf/modules/kepler.bpf.c create mode 100644 src/pmdas/bpf/modules/kepler.bpf.h create mode 100644 src/pmdas/bpf/modules/kepler.c create mode 100644 src/pmdas/bpf/modules/kepler.h delete mode 100644 src/pmdas/bpf/zmalloc.h diff --git a/src/pmdas/bpf/README b/src/pmdas/bpf/README index e992735e8b..028c34aa2d 100644 --- a/src/pmdas/bpf/README +++ b/src/pmdas/bpf/README @@ -1,38 +1,40 @@ PCP PMDA to load linux BPF modules ================================== -This PMDA is capable of collecting and generating arbitrary metrics from kernel-side code running as pre-compiled -ELF BPF/eBPF modules. +This PMDA is capable of collecting and generating arbitrary metrics from +kernel-side code running as pre-compiled ELF BPF/eBPF modules. Comparison to other PCP PMDAS ============================= -- pmdabcc runs as python and is a little easier to develop and maintain, has more advanced configuration files, - however it requires significantly more runtime memory. It loads and compiles the BCC code through LLVM which - has a heavy footprint to work with the fact that kernel structures might change. By comparison, pmdabpf uses - pre-compiled ELF-based BPF CO-RE modules to avoid relocation constraints. +- pmdabcc runs as python and is a little easier to develop and maintain, + however it requires significantly more runtime memory. It loads and + compiles the BCC code through LLVM which has a heavy footprint to + work with the fact that kernel structures might change. By comparison, + pmdabpf uses pre-compiled ELF-based BPF CO-RE modules to avoid + relocation constraints. -- pmdabpftrace can load and run arbitrary bpftrace code, so it is very flexible, however it does this by shelling - out to bpftrace executable which again requires quite a lot of memory. +- pmdabpftrace can load and run arbitrary bpftrace code, so it is very + flexible, however it does this by shelling out to bpftrace executable + which again requires quite a lot of memory. Deployment ========== -The file `bpf.conf` lists modules, one per line, that will be started. +The file `bpf.conf` lists modules with their configuration information, +one section per module, that will be started if enabled is set to true. -A relatively new version of libbpf is required on the system. libbpf 0.1.0 is unlikely to work, however, libbpf 0.4.0 -has been shown to work. +A relatively new version of libbpf is required on the system - 1.0.0 is +known to work. Development =========== To develop additional modules: -- Follow the examples provided (runqlat and biolatency) to create a new module (which will output a .so). The entry - point in your .so will be a load_module() call that should return a newly allocated `struct module` object. -- Create your bpf code (this will become a .bpf.o). Use the various `_helpers` headers. 
-- Ensure `module.h` has correct unique setup for your cluster and metric ids. -- Add details to `pmns` and `help` files to ensure they match the `module.h` changes. - -TODO -==== -- allow configuration settings +- Follow the examples provided (runqlat and biolatency) to create a new + module (which will output a .so). The entry point in your .so will + be a load_module() call that should return a newly allocated `struct + module` object. +- Create your bpf code (this will become a .bpf.o). Use the various + `_helpers` headers. +- Add your module name to the global known-modules list in `module.h`. diff --git a/src/pmdas/bpf/bpf.conf b/src/pmdas/bpf/bpf.conf index b1a4d6ad33..f1545d5c81 100644 --- a/src/pmdas/bpf/bpf.conf +++ b/src/pmdas/bpf/bpf.conf @@ -135,3 +135,12 @@ enabled = false # cgroup - string - unset : Trace process in cgroup path [biosnoop.so] enabled = false + +# This tool traces scheduling activity for use in power management. +# +# Configuration options: +# Name - type - default +# +# process_count - int - 20 : number of processes to keep in cache +[kepler.so] +enabled = false diff --git a/src/pmdas/bpf/modules/GNUmakefile b/src/pmdas/bpf/modules/GNUmakefile index 3e45b2a0f7..a7e28a4c60 100644 --- a/src/pmdas/bpf/modules/GNUmakefile +++ b/src/pmdas/bpf/modules/GNUmakefile @@ -64,6 +64,15 @@ APPS_BPF = \ tcpconnlat.bpf.c \ vfsstat.bpf.c \ +# non-libbpf header files. +LOCAL_H = \ + kepler.bpf.h \ + +# non-libbpf *.bpf.c files. +LOCAL_BPF = \ + kepler.bpf.c \ + +PMDABPF_MODULES += kepler.so default_pcp default: $(PMDABPF_MODULES) @@ -99,7 +108,7 @@ $(APPS_BPF): vmlinux.h: $(PMDABPF_VMLINUXH) $(LN_S) -f $< $@ -%.bpf.o: %.bpf.c vmlinux.h $(APPS_H) $(APPS_BPF) +%.bpf.o: %.bpf.c vmlinux.h $(APPS_H) $(APPS_BPF) $(LOCAL_H) $(LOCAL_BPF) $(CLANG) -Wall -g -O2 -target bpf -D__TARGET_ARCH_$(PMDABPF_ARCH) \ -I. -c $< -o $@ $(LLVM_STRIP) -g $@ @@ -108,7 +117,7 @@ vmlinux.h: $(PMDABPF_VMLINUXH) $(BPFTOOL) gen skeleton $< > $@ %.o: %.c -%.o: %.c %.skel.h $(HELPERS_H) $(APPS_H) +%.o: %.c %.skel.h $(HELPERS_H) $(APPS_H) $(LOCAL_H) $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ %_helpers.o: %_helpers.c diff --git a/src/pmdas/bpf/modules/kepler.bpf.c b/src/pmdas/bpf/modules/kepler.bpf.c new file mode 100644 index 0000000000..46323cc177 --- /dev/null +++ b/src/pmdas/bpf/modules/kepler.bpf.c @@ -0,0 +1,307 @@ +/* +Copyright 2021. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// +build ignore + +#include + +#if (LINUX_KERNEL_VERSION >= KERNEL_VERSION(5, 12, 0)) +#define BPF_PERF_EVENT_READ_VALUE_AVAILABLE 1 +#endif + +#include "kepler.bpf.h" + +// processes and pid time +BPF_HASH(processes, u32, process_metrics_t); +BPF_HASH(pid_time, u32, u64); + +// perf counters +BPF_PERF_ARRAY(cpu_cycles_hc_reader); +BPF_ARRAY(cpu_cycles, u64); + +BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader); +BPF_ARRAY(cpu_ref_cycles, u64); + +BPF_PERF_ARRAY(cpu_instructions_hc_reader); +BPF_ARRAY(cpu_instructions, u64); + +BPF_PERF_ARRAY(cache_miss_hc_reader); +BPF_ARRAY(cache_miss, u64); + +// cpu freq counters +BPF_ARRAY(cpu_freq_array, u32); + +// setting sample rate or counter to 0 will make compiler to remove the code entirely. +int sample_rate = 1; +int counter_sched_switch = 0; + +static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts) +{ + u64 cpu_time = 0; + + // get pid time + pid_time_t prev_pid_key = {.pid = prev_pid}; + u64 *prev_ts; + prev_ts = bpf_map_lookup_elem(&pid_time, &prev_pid_key); + if (prev_ts) + { + // Probably a clock issue where the recorded on-CPU event had a + // timestamp later than the recorded off-CPU event, or vice versa. + if (cur_ts > *prev_ts) + { + cpu_time = (cur_ts - *prev_ts); /*nanosecond*/ + bpf_map_delete_elem(&pid_time, &prev_pid_key); + } + } + pid_time_t new_pid_key = {.pid = cur_pid}; + bpf_map_update_elem(&pid_time, &new_pid_key, &cur_ts, BPF_NOEXIST); + + return cpu_time; +} + +static inline u64 calc_delta(u64 *prev_val, u64 *val) +{ + u64 delta = 0; + if (prev_val) + { + if (*val > *prev_val) + delta = *val - *prev_val; + } + return delta; +} + +// although the "get_on_cpu_counters" has some code duplications, it is inline code and the compiles will improve this +static inline u64 get_on_cpu_cycles(u32 *cpu_id) +{ + u64 delta = 0; +#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE + struct bpf_perf_event_value c = {}; + int error = bpf_perf_event_read_value(&cpu_cycles_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = c.counter; + u64 *prev_val = bpf_map_lookup_elem(&cpu_cycles, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY); + } +#else + int ret = bpf_perf_event_read(&cpu_cycles_hc_reader, *cpu_id); + if (ret < 0) { + return delta; + } + u64 val = ret; + u64 *prev_val = bpf_map_lookup_elem(&cpu_cycles, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY); +#endif + + return delta; +} + +static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id) +{ + u64 delta = 0; +#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE + struct bpf_perf_event_value c = {}; + int error = bpf_perf_event_read_value(&cpu_ref_cycles_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = c.counter; + u64 *prev_val = bpf_map_lookup_elem(&cpu_ref_cycles, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_ref_cycles, cpu_id, &val, BPF_ANY); + } +#else + int ret = bpf_perf_event_read(&cpu_ref_cycles_hc_reader, *cpu_id); + if (ret < 0) { + return delta; + } + u64 val = ret; + u64 *prev_val = bpf_map_lookup_elem(&cpu_ref_cycles, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_ref_cycles, cpu_id, &val, BPF_ANY); +#endif + return delta; +} + +static inline u64 get_on_cpu_instr(u32 *cpu_id) +{ + u64 delta = 0; +#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE + struct bpf_perf_event_value c = {}; + int error = 
bpf_perf_event_read_value(&cpu_instructions_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = c.counter; + u64 *prev_val = bpf_map_lookup_elem(&cpu_instructions, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY); + } +#else + int ret = bpf_perf_event_read(&cpu_instructions_hc_reader, *cpu_id); + if (ret < 0) { + return delta; + } + u64 val = ret; + u64 *prev_val = bpf_map_lookup_elem(&cpu_instructions, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY); +#endif + return delta; +} + +static inline u64 get_on_cpu_cache_miss(u32 *cpu_id) +{ + u64 delta = 0; +#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE + struct bpf_perf_event_value c = {}; + int error = bpf_perf_event_read_value(&cache_miss_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = c.counter; + u64 *prev_val = bpf_map_lookup_elem(&cache_miss, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY); + } +#else + int ret = bpf_perf_event_read(&cache_miss_hc_reader, *cpu_id); + if (ret < 0) { + return delta; + } + u64 val = ret; + u64 *prev_val = bpf_map_lookup_elem(&cache_miss, cpu_id); + delta = calc_delta(prev_val, &val); + bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY); +#endif + return delta; +} + +// calculate the average cpu freq +static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta) +{ + u32 avg_freq = 0; + bpf_map_lookup_or_try_init(&cpu_freq_array, cpu_id, &avg_freq); + if (avg_freq == 0) + { + avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) * HZ; + } + else + { + avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) * HZ; + avg_freq /= 2; + } + bpf_map_update_elem(&cpu_freq_array, cpu_id, &avg_freq, BPF_ANY); + return avg_freq; +} + +SEC("tracepoint/sched/sched_switch") +int kepler_trace(struct sched_switch_args *ctx) +{ + u32 next_pid = ctx->next_pid; // the new pid that is to be scheduled + + // only do sampling if sample rate is set + if (sample_rate != 0) + { + if (counter_sched_switch > 0) + { + counter_sched_switch--; + return 0; + } + counter_sched_switch = sample_rate; + } + + u32 cur_pid = bpf_get_current_pid_tgid(); + u64 cgroup_id = bpf_get_current_cgroup_id(); // the cgroup id is the cgroup id of the running process (this is not next_pid or prev_pid) + u64 cur_ts = bpf_ktime_get_ns(); + u32 cpu_id = bpf_get_smp_processor_id(); + u32 prev_pid = ctx->prev_pid; + + u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id); + u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id); + u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id); + u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id); + u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta); + u64 on_cpu_time_delta = get_on_cpu_time(next_pid, prev_pid, cur_ts); + + // store process metrics + struct process_metrics_t *process_metrics; + process_metrics = bpf_map_lookup_elem(&processes, &prev_pid); + if (process_metrics) + { + // update process time + process_metrics->process_run_time += on_cpu_time_delta; + process_metrics->cpu_cycles += on_cpu_cycles_delta; + process_metrics->cpu_instr += on_cpu_instr_delta; + process_metrics->cache_miss += on_cpu_cache_miss_delta; + } + + process_metrics = bpf_map_lookup_elem(&processes, &cur_pid); + if 
(process_metrics == 0) + { + process_metrics_t new_process = {}; + new_process.pid = cur_pid; + new_process.cgroup_id = cgroup_id; + // bpf_probe_read(&new_process.comm, sizeof(new_process.comm), (void *)ctx->next_comm); + bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm)); + bpf_map_update_elem(&processes, &cur_pid, &new_process, BPF_NOEXIST); + } + return 0; +} + +SEC("tracepoint/irq/softirq_entry") +int kepler_irq_trace(struct trace_event_raw_softirq *ctx) +{ + u32 cur_pid = bpf_get_current_pid_tgid(); + struct process_metrics_t *process_metrics; + process_metrics = bpf_map_lookup_elem(&processes, &cur_pid); + if (process_metrics != 0) + { + if (ctx->vec < 10) { + process_metrics->vec_nr[ctx->vec] ++; + } + } + return 0; +} + +// count read page cache +SEC("kprobe/mark_page_accessed") +int kprobe__mark_page_accessed(struct pt_regs *ctx) +{ + u32 cur_pid = bpf_get_current_pid_tgid(); + struct process_metrics_t *process_metrics; + process_metrics = bpf_map_lookup_elem(&processes, &cur_pid); + if (process_metrics) + { + process_metrics->page_cache_hit ++; + } + return 0; +} + +// count write page cache +SEC("kprobe/set_page_dirty") +int kprobe__set_page_dirty(struct pt_regs *ctx) +{ + u32 cur_pid = bpf_get_current_pid_tgid(); + struct process_metrics_t *process_metrics; + process_metrics = bpf_map_lookup_elem(&processes, &cur_pid); + if (process_metrics) + { + process_metrics->page_cache_hit ++; + } + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/src/pmdas/bpf/modules/kepler.bpf.h b/src/pmdas/bpf/modules/kepler.bpf.h new file mode 100644 index 0000000000..0e29485841 --- /dev/null +++ b/src/pmdas/bpf/modules/kepler.bpf.h @@ -0,0 +1,132 @@ +/* +Copyright 2021. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + + +/* In Linux 5.4 asm_inline was introduced, but it's not supported by clang. + * Redefine it to just asm to enable successful compilation. 
+ * see https://github.com/iovisor/bcc/commit/2d1497cde1cc9835f759a707b42dea83bee378b8 for more details + */ +#include +#include +#ifdef asm_inline +#undef asm_inline +#define asm_inline asm +#endif + +typedef __u64 u64; +typedef __u32 u32; +typedef __u16 u16; + +#include +#include + +#ifndef NUM_CPUS +#define NUM_CPUS 128 +#endif + +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif + +// irq counter, 10 is the max number of irq vectors +#ifndef IRQ_MAX_LEN +#define IRQ_MAX_LEN 10 +#endif + +#ifndef CPU_REF_FREQ +#define CPU_REF_FREQ 2500 +#endif + +#ifndef HZ +#define HZ 1000 +#endif + +#ifndef MAP_SIZE +#define MAP_SIZE 32768 +#endif + +// array size is to be reset in userspace +#define BPF_ARRARY_MAP(_name, _type, _key_type, _value_type) \ + struct { \ + __uint(type, _type); \ + __type(key, _key_type); \ + __type(value, _value_type); \ + __uint(max_entries, NUM_CPUS); \ + } _name SEC(".maps"); + +#define BPF_HASH(_name, _key_type, _value_type) \ + struct { \ + __uint(type, BPF_MAP_TYPE_HASH); \ + __type(key, _key_type); \ + __type(value, _value_type); \ + __uint(max_entries, MAP_SIZE); \ + } _name SEC(".maps"); + +#define BPF_ARRAY(_name, _leaf_type) \ + BPF_ARRARY_MAP(_name, BPF_MAP_TYPE_ARRAY, u32, _leaf_type); + +#define BPF_PERF_ARRAY(_name) \ + BPF_ARRARY_MAP(_name, BPF_MAP_TYPE_PERF_EVENT_ARRAY, int, u32) + +static __always_inline void * +bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) +{ + void *val; + int err; + + val = bpf_map_lookup_elem(map, key); + if (val) + return val; + + err = bpf_map_update_elem(map, key, init, BPF_NOEXIST); + if (err && err != -17) + return 0; + + return bpf_map_lookup_elem(map, key); +} + +struct sched_switch_args { + unsigned long long pad; + char prev_comm[TASK_COMM_LEN]; + int prev_pid; + int prev_prio; + long long prev_state; + char next_comm[TASK_COMM_LEN]; + int next_pid; + int next_prio; +}; + +struct trace_event_raw_softirq { + unsigned long long pad; + unsigned int vec; +}; + +typedef struct process_metrics_t { + u64 cgroup_id; + u64 pid; + u64 process_run_time; + u64 cpu_cycles; + u64 cpu_instr; + u64 cache_miss; + u64 page_cache_hit; + u16 vec_nr[IRQ_MAX_LEN]; + char comm[TASK_COMM_LEN]; +} process_metrics_t; + +typedef struct pid_time_t { + u32 pid; +} pid_time_t; + diff --git a/src/pmdas/bpf/modules/kepler.c b/src/pmdas/bpf/modules/kepler.c new file mode 100644 index 0000000000..a8b3b4ddf1 --- /dev/null +++ b/src/pmdas/bpf/modules/kepler.c @@ -0,0 +1,424 @@ +/* + * + * Copyright (c) 2023 Red Hat. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ */ + +#include "module.h" + +#include +#include +#include +#include +#include + +#include "kepler.skel.h" +#include "kepler.h" + +#define PERF_BUFFER_PAGES 64 +#define PERF_POLL_TIMEOUT_MS 0 + +#define INDOM_COUNT 1 + +static struct env { + int process_count; +} env = { + .process_count = 20, +}; + +static pmdaInstid *kepler_instances; +static struct kepler_bpf *obj; +static struct perf_buffer *pb = NULL; +static int lost_events; +static int queuelength; + +/* cache array */ +struct tailq_entry { + struct event event; + TAILQ_ENTRY(tailq_entry) entries; +}; + +TAILQ_HEAD(tailhead, tailq_entry) head; + +static struct tailq_entry* allocElm(void) +{ + return malloc(sizeof(struct tailq_entry)); +} + +static void push(struct tailq_entry *elm) +{ + TAILQ_INSERT_TAIL(&head, elm, entries); + if (queuelength > env.process_count) + { + struct tailq_entry *l; + l = head.tqh_first; + TAILQ_REMOVE(&head, l, entries); + free(l); + queuelength--; + } + queuelength++; +} + +static bool get_item(unsigned int offset, struct tailq_entry** val) +{ + struct tailq_entry *i; + unsigned int iter = 0; + + TAILQ_FOREACH_REVERSE(i, &head, tailhead, entries) { + if (offset == iter) { + *val = i; + return true; + } + iter++; + } + return false; +} + +static unsigned int indom_id_mapping[INDOM_COUNT]; + +enum metric_name { + COMM, + PID, + LOST, + CGROUPID, + RUNTIME, + CPU_CYCLES, + CPU_INSTR, + CACHE_MISS, + PAGE_CACHE_HIT, + METRIC_COUNT /* last */ +}; +enum metric_indom { KEPLER_INDOM }; + +char* metric_names[METRIC_COUNT] = { + [COMM] = "kepler.comm", + [PID] = "kepler.pid", + [LOST] = "kepler.lost", + [CGROUPID] = "kepler.cgroup_id", + [RUNTIME] = "kepler.runtime", + [CPU_CYCLES] = "kepler.cycles", + [CPU_INSTR] = "kepler.instr", + [CACHE_MISS] = "kepler.cache_miss", + [PAGE_CACHE_HIT] = "kepler.page_cache_hit", +}; + +char* metric_text_oneline[METRIC_COUNT] = { + [COMM] = "Command name", + [PID] = "Process identifier", + [LOST] = "Number of lost events", + [CGROUPID] = "Control Group identifier for each process", + [RUNTIME] = "Run time of each process", + [CPU_CYCLES] = "Number of cycles for each process", + [CPU_INSTR] = "Number of instructions for each process", + [CACHE_MISS] = "Number of cache misses for each process", + [PAGE_CACHE_HIT] = "Number of page cache hits for each process", +}; + +char* metric_text_long[METRIC_COUNT] = { + [COMM] = "Command name", + [PID] = "Process identifier", + [LOST] = "Number of lost events", + [CGROUPID] = "Control Group identifier for each process", + [RUNTIME] = "Run time of each process", + [CPU_CYCLES] = "Number of cycles for each process", + [CPU_INSTR] = "Number of instructions for each process", + [CACHE_MISS] = "Number of cache misses for each process", + [PAGE_CACHE_HIT] = "Number of page cache hits for each process", +}; + +static unsigned int kepler_metric_count(void) +{ + return METRIC_COUNT; +} + +static char* kepler_metric_name(unsigned int metric) +{ + return metric_names[metric]; +} + +static unsigned int kepler_indom_count(void) +{ + return INDOM_COUNT; +} + +static void kepler_set_indom_serial(unsigned int local_indom_id, unsigned int global_id) +{ + indom_id_mapping[local_indom_id] = global_id; +} + +static int kepler_metric_text(int item, int type, char **buffer) +{ + if (type & PM_TEXT_ONELINE) { + *buffer = metric_text_oneline[item]; + } else { + *buffer = metric_text_long[item]; + } + + return 0; +} + +static void kepler_register(unsigned int cluster_id, pmdaMetric *metrics, pmdaIndom *indoms) +{ + /* bpf.kepler.comm */ + metrics[COMM] = (struct 
pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 0), + .type = PM_TYPE_STRING, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_INSTANT, + .units = PMDA_PMUNITS(0, 0, 0, 0, 0, 0), + } + }; + /* bpf.kepler.pid */ + metrics[PID] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 1), + .type = PM_TYPE_U32, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_DISCRETE, + .units = PMDA_PMUNITS(0, 0, 0, 0, 0, 0), + } + }; + /* bpf.kepler.cgroupid */ + metrics[CGROUPID] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 2), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_INSTANT, + .units = PMDA_PMUNITS(0, 0, 0, 0, 0, 0), + } + }; + /* bpf.kepler.lost */ + metrics[LOST] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 3), + .type = PM_TYPE_U32, + .indom = PM_INDOM_NULL, + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + /* bpf.kepler.runtime */ + metrics[RUNTIME] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 4), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 1, 0, 0, PM_TIME_NSEC, 0), + } + }; + /* bpf.kepler.cpu_cycles */ + metrics[CPU_CYCLES] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 5), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + /* bpf.kepler.cpu_instr */ + metrics[CPU_INSTR] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 6), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + /* bpf.kepler.cache_miss */ + metrics[CACHE_MISS] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 7), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + /* bpf.kepler.page_cache_hit */ + metrics[PAGE_CACHE_HIT] = (struct pmdaMetric) + { + .m_desc = { + .pmid = PMDA_PMID(cluster_id, 8), + .type = PM_TYPE_U64, + .indom = indom_id_mapping[KEPLER_INDOM], + .sem = PM_SEM_COUNTER, + .units = PMDA_PMUNITS(0, 0, 1, 0, 0, PM_COUNT_ONE), + } + }; + + /* KEPLER_INDOM */ + indoms[KEPLER_INDOM] = (struct pmdaIndom) + { + indom_id_mapping[KEPLER_INDOM], + env.process_count, + kepler_instances, + }; +} + +static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + struct event *event = data; + struct tailq_entry *elm = allocElm(); + + elm->event.cgroup_id = event->cgroup_id; + elm->event.pid = event->pid; + elm->event.process_run_time = event->process_run_time; + elm->event.cpu_cycles = event->cpu_cycles; + elm->event.cpu_instr = event->cpu_instr; + elm->event.cache_miss = event->cache_miss; + elm->event.page_cache_hit = event->page_cache_hit; + /* TODO: vec_nr */ + strncpy(elm->event.comm, event->comm, sizeof(event->comm)); + + push(elm); +} + +static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) +{ + lost_events += lost_cnt; +} + +static int kepler_init(dict *cfg, char *module_name) +{ + int err; + char *val; + + if ((val = pmIniFileLookup(cfg, module_name, "process_count"))) + env.process_count = atoi(val); + + obj = kepler_bpf__open(); + if (!obj) { + pmNotifyErr(LOG_ERR, "failed to open BPF object"); + return 1; + } + pmNotifyErr(LOG_INFO, 
"booting: %s", obj->skeleton->name); + + err = kepler_bpf__load(obj); + if (err) { + pmNotifyErr(LOG_ERR, "failed to load BPF object: %d", err); + return err != 0; + } + + err = kepler_bpf__attach(obj); + if (err) { + pmNotifyErr(LOG_ERR, "failed to attach BPF programs"); + return err != 0; + } + + /* internal/external instance ids */ + fill_instids(env.process_count, &kepler_instances); + + /* Initialize the tail queue. */ + TAILQ_INIT(&head); + + /* setup event callbacks */ + pb = perf_buffer__new(bpf_map__fd(obj->maps.processes), PERF_BUFFER_PAGES, + handle_event, handle_lost_events, NULL, NULL); + if (!pb) { + err = -errno; + pmNotifyErr(LOG_ERR, "failed to open perf buffer: %d", err); + return err != 0; + } + + return err != 0; +} + +static void kepler_shutdown() +{ + struct tailq_entry *itemp; + + free(kepler_instances); + perf_buffer__free(pb); + kepler_bpf__destroy(obj); + /* Free the entire cache queue. */ + while ((itemp = TAILQ_FIRST(&head))) { + TAILQ_REMOVE(&head, itemp, entries); + free(itemp); + } +} + +static void kepler_refresh(unsigned int item) +{ + perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); +} + +static int kepler_fetch_to_atom(unsigned int item, unsigned int inst, pmAtomValue *atom) +{ + struct tailq_entry *value; + + /* bpf.kepler.lost */ + if (item == LOST) { + atom->ul = lost_events; + return PMDA_FETCH_STATIC; + } + + if (inst == PM_IN_NULL) + return PM_ERR_INST; + + if (!get_item(inst, &value)) + return PMDA_FETCH_NOVALUES; + + switch (item) { + case COMM: /* bpf.kepler.comm */ + atom->cp = value->event.comm; + break; + case PID: /* bpf.kepler.pid */ + atom->ul = value->event.pid; + break; + case CGROUPID: /* bpf.kepler.cgroupid */ + atom->ull = value->event.cgroup_id; + break; + case RUNTIME: /* bpf.kepler.runtime */ + atom->ull = value->event.process_run_time; + break; + case CPU_CYCLES: /* bpf.kepler.cpu_cycles */ + atom->ull = value->event.cpu_cycles; + break; + case CPU_INSTR: /* bpf.kepler.cpu_instr */ + atom->ull = value->event.cpu_instr; + break; + case CACHE_MISS: /* bpf.kepler.cache_miss */ + atom->ull = value->event.cache_miss; + break; + case PAGE_CACHE_HIT: /* bpf.kepler.page_cache_hit */ + atom->ull = value->event.page_cache_hit; + break; + } + + return PMDA_FETCH_STATIC; +} + +struct module bpf_module = { + .init = kepler_init, + .register_metrics = kepler_register, + .metric_count = kepler_metric_count, + .indom_count = kepler_indom_count, + .set_indom_serial = kepler_set_indom_serial, + .shutdown = kepler_shutdown, + .refresh = kepler_refresh, + .fetch_to_atom = kepler_fetch_to_atom, + .metric_name = kepler_metric_name, + .metric_text = kepler_metric_text, +}; diff --git a/src/pmdas/bpf/modules/kepler.h b/src/pmdas/bpf/modules/kepler.h new file mode 100644 index 0000000000..764baf0704 --- /dev/null +++ b/src/pmdas/bpf/modules/kepler.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __KEPLER_H +#define __KEPLER_H + +#define TASK_COMM_LEN 16 +#define IRQ_MAX_LEN 10 +#define BASE_EVENT_SIZE (size_t)(&((struct event*)0)->args) +#define EVENT_SIZE(e) (BASE_EVENT_SIZE + e->args_size) + +typedef __u64 u64; +typedef __u16 u16; + +struct event { + u64 cgroup_id; + u64 pid; + u64 process_run_time; + u64 cpu_cycles; + u64 cpu_instr; + u64 cache_miss; + u64 page_cache_hit; + u16 vec_nr[IRQ_MAX_LEN]; + char comm[TASK_COMM_LEN]; +}; + +#endif /* __KEPLER_H */ diff --git a/src/pmdas/bpf/modules/module.h b/src/pmdas/bpf/modules/module.h index bb071d13ce..7aca3c1a19 100644 --- a/src/pmdas/bpf/modules/module.h +++ 
b/src/pmdas/bpf/modules/module.h @@ -113,6 +113,7 @@ char *all_modules[] = { "execsnoop", "exitsnoop" "fsslower", + "kepler", "mountsnoop", "oomkill", "opensnoop", diff --git a/src/pmdas/bpf/zmalloc.h b/src/pmdas/bpf/zmalloc.h deleted file mode 100644 index b5d166f633..0000000000 --- a/src/pmdas/bpf/zmalloc.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2017-2018,2020 Red Hat. - * - * This library is free software; you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public - * License for more details. - */ -#ifndef ZMALLOC_H -#define ZMALLOC_H 1 - -#include - -extern void *zmalloc(size_t); -extern void *zcalloc(size_t, size_t); -extern void zfree(void *); - -#endif /* ZMALLOC_H */
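Usage note (illustrative only, not part of the patch): the new module ships
disabled.  A minimal way to exercise it, assuming the bpf PMDA is already
installed and the metrics appear under bpf.kepler as the comments in
kepler.c indicate, is to enable the new stanza in bpf.conf and restart the
PMDA:

    # bpf.conf
    [kepler.so]
    enabled = true
    process_count = 40   # optional: deepen the process cache (default 20)

    # then inspect the new metrics, e.g.
    #   pminfo -f bpf.kepler
    #   pmval -t 2 bpf.kepler.cycles

The process_count value is read in kepler_init() via pmIniFileLookup() and
bounds both the TAILQ process cache and the instance domain size.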