From 835dc2746398fc1e95c9f1faa7ff55d79599b7dc Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Tue, 19 Nov 2024 11:27:21 +0200 Subject: [PATCH 1/4] perf_cnt: add more build options to PERFORMANCE_COUNTERS The performance counter results are delivered via the logging subsystem and the logging overhead can interfere with the measurements themselves. To mitigate the impact, only a small set of performance counters should be enabled at the same time in build. To enable this, break the CONFIG_PERFORMANCE_COUNTERS Kconfig option into more fine-grained options and add separate options to enable LL task and audio component performance tracing. Signed-off-by: Kai Vehmanen --- Kconfig.sof | 20 ++++++++++++++++++++ app/perf_overlay.conf | 2 ++ src/audio/component.c | 4 ++-- src/include/sof/audio/component.h | 2 +- src/schedule/ll_schedule.c | 4 ++-- src/schedule/zephyr_ll.c | 4 ++-- 6 files changed, 29 insertions(+), 7 deletions(-) diff --git a/Kconfig.sof b/Kconfig.sof index 3bf3b337b040..0833201e817d 100644 --- a/Kconfig.sof +++ b/Kconfig.sof @@ -200,6 +200,26 @@ config PERFORMANCE_COUNTERS use the stamp() macro periodically to find out how long the cpu was in active/sleep state between the calls and estimate the cpu load. +config PERFORMANCE_COUNTERS_COMPONENT + bool "Use performance counters to track component execution" + default n + depends on PERFORMANCE_COUNTERS + help + Use performance counters to trace audio component execution. + This enables to observe average and peak execution times at + audio component level granularity. + Results are reported via logging subsystem. + +config PERFORMANCE_COUNTERS_LL_TASKS + bool "Use performance counters to track LL task execution" + default n + depends on PERFORMANCE_COUNTERS + help + Use performance counters to trace low-latency task execution. + This enables to observe average and peak execution times at + task level granularity. + Results are reported via logging subsystem. 
+ config DSP_RESIDENCY_COUNTERS bool "DSP residency counters" default n diff --git a/app/perf_overlay.conf b/app/perf_overlay.conf index 08edb74ee978..c50f62d01d31 100644 --- a/app/perf_overlay.conf +++ b/app/perf_overlay.conf @@ -1,4 +1,6 @@ CONFIG_PERFORMANCE_COUNTERS=y +CONFIG_PERFORMANCE_COUNTERS_COMPONENT=y +CONFIG_PERFORMANCE_COUNTERS_LL_TASKS=y CONFIG_SYS_HEAP_RUNTIME_STATS=y CONFIG_TIMING_FUNCTIONS=y CONFIG_ADSP_IDLE_CLOCK_GATING=n diff --git a/src/audio/component.c b/src/audio/component.c index 4956e9ec0359..33ee3e6f0419 100644 --- a/src/audio/component.c +++ b/src/audio/component.c @@ -490,7 +490,7 @@ int comp_copy(struct comp_dev *dev) */ if (cpu_is_me(dev->ipc_config.core) || dev->ipc_config.proc_domain == COMP_PROCESSING_DOMAIN_DP) { -#if CONFIG_PERFORMANCE_COUNTERS +#if CONFIG_PERFORMANCE_COUNTERS_COMPONENT perf_cnt_init(&dev->pcd); #endif @@ -506,7 +506,7 @@ int comp_copy(struct comp_dev *dev) comp_update_performance_data(dev, cycles_consumed); #endif -#if CONFIG_PERFORMANCE_COUNTERS +#if CONFIG_PERFORMANCE_COUNTERS_COMPONENT perf_cnt_stamp(&dev->pcd, perf_trace_null, dev); perf_cnt_average(&dev->pcd, comp_perf_avg_info, dev); #endif diff --git a/src/include/sof/audio/component.h b/src/include/sof/audio/component.h index eb5253bffa15..e429e9cbe07d 100644 --- a/src/include/sof/audio/component.h +++ b/src/include/sof/audio/component.h @@ -627,7 +627,7 @@ struct comp_dev { /* private data - core does not touch this */ void *priv_data; /**< private data */ -#if CONFIG_PERFORMANCE_COUNTERS +#if CONFIG_PERFORMANCE_COUNTERS_COMPONENT struct perf_cnt_data pcd; #endif diff --git a/src/schedule/ll_schedule.c b/src/schedule/ll_schedule.c index d310f7d13105..8881fd7bd224 100644 --- a/src/schedule/ll_schedule.c +++ b/src/schedule/ll_schedule.c @@ -68,7 +68,7 @@ DECLARE_TR_CTX(ll_tr, SOF_UUID(ll_sched_uuid), LOG_LEVEL_INFO); struct ll_schedule_data { struct list_item tasks; /* list of ll tasks */ atomic_t num_tasks; /* number of ll tasks */ -#if 
CONFIG_PERFORMANCE_COUNTERS +#if CONFIG_PERFORMANCE_COUNTERS_LL_TASKS struct perf_cnt_data pcd; #endif struct ll_schedule_domain *domain; /* scheduling domain */ @@ -76,7 +76,7 @@ struct ll_schedule_data { static const struct scheduler_ops schedule_ll_ops; -#if CONFIG_PERFORMANCE_COUNTERS +#if CONFIG_PERFORMANCE_COUNTERS_LL_TASKS static void perf_ll_sched_trace(struct perf_cnt_data *pcd, int ignored) { tr_info(&ll_tr, "perf ll_work peak plat %u cpu %u", diff --git a/src/schedule/zephyr_ll.c b/src/schedule/zephyr_ll.c index 49b1cc0d63a2..e8a83e09dd2a 100644 --- a/src/schedule/zephyr_ll.c +++ b/src/schedule/zephyr_ll.c @@ -134,13 +134,13 @@ static inline enum task_state do_task_run(struct task *task) { enum task_state state; -#if CONFIG_PERFORMANCE_COUNTERS +#if CONFIG_PERFORMANCE_COUNTERS_LL_TASKS perf_cnt_init(&task->pcd); #endif state = task_run(task); -#if CONFIG_PERFORMANCE_COUNTERS +#if CONFIG_PERFORMANCE_COUNTERS_LL_TASKS perf_cnt_stamp(&task->pcd, perf_trace_null, NULL); task_perf_cnt_avg(&task->pcd, task_perf_avg_info, &ll_tr, task); #endif From 06c150537b84231203fe5433618faca5e39d8a82 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Tue, 19 Nov 2024 11:38:20 +0200 Subject: [PATCH 2/4] app: perf_overlay: reduce amount of parallel perf monitors Disable CONFIG_PERFORMANCE_COUNTERS_LL_TASKS and CONFIG_SCHEDULE_LL_STATS_LOG by default in the performance overlay. This reduces logging overhead and makes the component level peak traces more reliable. The logging overhead has minimal impact to reported averages, but can be seen in peak execution measurements. 
Signed-off-by: Kai Vehmanen --- app/perf_overlay.conf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/app/perf_overlay.conf b/app/perf_overlay.conf index c50f62d01d31..24d4109a1bb8 100644 --- a/app/perf_overlay.conf +++ b/app/perf_overlay.conf @@ -1,7 +1,10 @@ CONFIG_PERFORMANCE_COUNTERS=y CONFIG_PERFORMANCE_COUNTERS_COMPONENT=y -CONFIG_PERFORMANCE_COUNTERS_LL_TASKS=y +# disable ll task level statistics to reduce logging overhead +#CONFIG_PERFORMANCE_COUNTERS_LL_TASKS=y CONFIG_SYS_HEAP_RUNTIME_STATS=y CONFIG_TIMING_FUNCTIONS=y CONFIG_ADSP_IDLE_CLOCK_GATING=n CONFIG_KCPS_DYNAMIC_CLOCK_CONTROL=n +# disable top-level statistics to reduce logging overhead +CONFIG_SCHEDULE_LL_STATS_LOG=n From 9f195f0d95ab0dde44c30f25ce552c482b667693 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Tue, 19 Nov 2024 11:45:36 +0200 Subject: [PATCH 3/4] perf_cnt: use alternate reporting to minimize logging overhead Implement simple alternate reporting for perf_cnt_average() and task_perf_cnt_avg(). By calling the reporting function only for every other measurement window, the overhead of reporting can be filtered out from data. This mostly affects the peak cycle reporting. For average values reporting has only minimal impact. 
Signed-off-by: Kai Vehmanen --- src/include/sof/lib/perf_cnt.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/include/sof/lib/perf_cnt.h b/src/include/sof/lib/perf_cnt.h index 471bbe283586..bcb2a0769630 100644 --- a/src/include/sof/lib/perf_cnt.h +++ b/src/include/sof/lib/perf_cnt.h @@ -95,11 +95,11 @@ struct perf_cnt_data { (uint32_t)((pcd)->cpu_delta_peak)) #define task_perf_cnt_avg(pcd, trace_m, arg, class) do { \ (pcd)->cpu_delta_sum += (pcd)->cpu_delta_last; \ - if (++(pcd)->period_cnt == 1 << PERF_CNT_CHECK_WINDOW_SIZE) { \ + if (!(++(pcd)->period_cnt & MASK(PERF_CNT_CHECK_WINDOW_SIZE - 1, 0))) { \ (pcd)->cpu_delta_sum >>= PERF_CNT_CHECK_WINDOW_SIZE; \ - trace_m(pcd, arg, class); \ + if ((pcd)->period_cnt & BIT(PERF_CNT_CHECK_WINDOW_SIZE)) \ + trace_m(pcd, arg, class); \ (pcd)->cpu_delta_sum = 0; \ - (pcd)->period_cnt = 0; \ (pcd)->plat_delta_peak = 0; \ (pcd)->cpu_delta_peak = 0; \ } \ @@ -115,11 +115,13 @@ struct perf_cnt_data { */ #define perf_cnt_average(pcd, trace_m, arg) do { \ (pcd)->cpu_delta_sum += (pcd)->cpu_delta_last; \ - if (++(pcd)->period_cnt == 1 << PERF_CNT_CHECK_WINDOW_SIZE) {\ + if (!(++(pcd)->period_cnt & MASK(PERF_CNT_CHECK_WINDOW_SIZE - 1, 0))) { \ (pcd)->cpu_delta_sum >>= PERF_CNT_CHECK_WINDOW_SIZE; \ - trace_m(pcd, arg); \ + (pcd)->peak_mcps_period_cnt &= MASK(PERF_CNT_CHECK_WINDOW_SIZE - 1, 0); \ + if ((pcd)->period_cnt & BIT(PERF_CNT_CHECK_WINDOW_SIZE)) { \ + trace_m(pcd, arg); \ + } \ (pcd)->cpu_delta_sum = 0; \ - (pcd)->period_cnt = 0; \ (pcd)->plat_delta_peak = 0; \ (pcd)->cpu_delta_peak = 0; \ (pcd)->peak_mcps_period_cnt = 0; \ From cc4a3318e03be50a53d1950a95deb90aa1df649d Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Tue, 19 Nov 2024 12:19:26 +0200 Subject: [PATCH 4/4] app: perf_overlay: add note on CONFIG_DMA_INTEL_ADSP_HDA_TIMING_L1_EXIT On many Intel platforms, the HD-DMA interrupts can interfere with component level performance measurements. 
Add a comment on how to disable the interrupts when doing component performance analysis. Signed-off-by: Kai Vehmanen --- app/perf_overlay.conf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/app/perf_overlay.conf b/app/perf_overlay.conf index 24d4109a1bb8..4f08b61fc3c3 100644 --- a/app/perf_overlay.conf +++ b/app/perf_overlay.conf @@ -8,3 +8,9 @@ CONFIG_ADSP_IDLE_CLOCK_GATING=n CONFIG_KCPS_DYNAMIC_CLOCK_CONTROL=n # disable top-level statistics to reduce logging overhead CONFIG_SCHEDULE_LL_STATS_LOG=n + +# vendor/target dependent options +# +# uncomment to disable Intel HD-DMA L1 exit ISR. this affects +# the peak execution times at component level +#CONFIG_DMA_INTEL_ADSP_HDA_TIMING_L1_EXIT=n