From eccc8f58dc74b5f4f76087c08b244290e3bb05e4 Mon Sep 17 00:00:00 2001
From: Serhiy Katsyuba
Date: Thu, 12 Oct 2023 17:20:35 +0200
Subject: [PATCH] ipc4: Add cross-core binding support

Implements binding of two pipelines from different cores so a stream
can travel cross-core.

Signed-off-by: Serhiy Katsyuba
---
 src/include/sof/schedule/ll_schedule_domain.h |  25 +++
 src/ipc/ipc4/helper.c                         | 155 +++++++++++++++---
 src/schedule/zephyr_domain.c                  |  60 +++++++
 zephyr/Kconfig                                |  11 ++
 4 files changed, 228 insertions(+), 23 deletions(-)

diff --git a/src/include/sof/schedule/ll_schedule_domain.h b/src/include/sof/schedule/ll_schedule_domain.h
index f555c256398b..9765df5ef0bc 100644
--- a/src/include/sof/schedule/ll_schedule_domain.h
+++ b/src/include/sof/schedule/ll_schedule_domain.h
@@ -44,6 +44,17 @@ struct ll_schedule_domain_ops {
 				   struct task *task, uint32_t num_tasks);
 	void (*domain_enable)(struct ll_schedule_domain *domain, int core);
 	void (*domain_disable)(struct ll_schedule_domain *domain, int core);
+#if CONFIG_CROSS_CORE_STREAM
+	/*
+	 * Unlike domain_disable(), these are intended to temporarily block LL
+	 * from starting its next cycle. Triggering (e.g., by means of a timer
+	 * interrupt) stays enabled and registered, but execution of the next
+	 * cycle is blocked. Once unblocked, if a trigger was registered while
+	 * blocked, the next cycle may start immediately.
+	 */
+	void (*domain_block)(struct ll_schedule_domain *domain);
+	void (*domain_unblock)(struct ll_schedule_domain *domain);
+#endif
 	void (*domain_set)(struct ll_schedule_domain *domain, uint64_t start);
 	void (*domain_clear)(struct ll_schedule_domain *domain);
 	bool (*domain_is_pending)(struct ll_schedule_domain *domain,
@@ -192,6 +203,20 @@ static inline void domain_disable(struct ll_schedule_domain *domain, int core)
 	}
 }
 
+#if CONFIG_CROSS_CORE_STREAM
+static inline void domain_block(struct ll_schedule_domain *domain)
+{
+	if (domain->ops->domain_block)
+		domain->ops->domain_block(domain);
+}
+
+static inline void domain_unblock(struct ll_schedule_domain *domain)
+{
+	if (domain->ops->domain_unblock)
+		domain->ops->domain_unblock(domain);
+}
+#endif
+
 static inline bool domain_is_pending(struct ll_schedule_domain *domain,
 				     struct task *task, struct comp_dev **comp)
 {
diff --git a/src/ipc/ipc4/helper.c b/src/ipc/ipc4/helper.c
index 91babe75b97d..29ef61dfb5c9 100644
--- a/src/ipc/ipc4/helper.c
+++ b/src/ipc/ipc4/helper.c
@@ -337,10 +337,78 @@ static struct comp_buffer *ipc4_create_buffer(struct comp_dev *src, bool is_shar
 	ipc_buf.size = buf_size;
 	ipc_buf.comp.id = IPC4_COMP_ID(src_queue, dst_queue);
 	ipc_buf.comp.pipeline_id = src->ipc_config.pipeline_id;
-	ipc_buf.comp.core = src->ipc_config.core;
+	ipc_buf.comp.core = cpu_get_id();
 	return buffer_new(&ipc_buf, is_shared);
 }
 
+#if CONFIG_CROSS_CORE_STREAM
+/*
+ * Disabling interrupts to block the next LL cycle is much faster than using
+ * a condition variable and a mutex. Since same-core binding is the most typical
+ * case, the slower condvar blocking mechanism is used only for the less typical
+ * cross-core binding.
+ *
+ * Note that disabling interrupts to block LL does not work for the cross-core
+ * binding case, as .bind() handlers are called on the corresponding cores using
+ * IDC tasks, and IDC requires interrupts to be enabled. Disabling only the timer
+ * interrupt instead of all interrupts might work. However, as the CPU could enter
+ * some power-down mode while waiting for the blocking IDC call response, it is not
+ * clear how safe it is to assume the CPU can wake up without a timer interrupt;
+ * that depends on the blocking IDC wait implementation. This is why the additional
+ * condvar mechanism to block LL was introduced: it does not disable any interrupts.
+ */
+
+#define ll_block(cross_core_bind) \
+	do { \
+		if (cross_core_bind) \
+			domain_block(sof_get()->platform_timer_domain); \
+		else \
+			irq_local_disable(flags); \
+	} while (0)
+
+#define ll_unblock(cross_core_bind) \
+	do { \
+		if (cross_core_bind) \
+			domain_unblock(sof_get()->platform_timer_domain); \
+		else \
+			irq_local_enable(flags); \
+	} while (0)
+
+/* Calling both ll_block() and ll_wait_finished_on_core() makes sure LL will not start its
+ * next cycle and that its current cycle on the specified core has finished.
+ */
+static int ll_wait_finished_on_core(struct comp_dev *dev)
+{
+	/* To make sure the (blocked) LL has finished its current cycle, it is
+	 * enough to send any blocking IDC to the core. Since the IDC task has lower
+	 * priority than the LL thread and cannot preempt it, the IDC task executes
+	 * only while the LL thread is not active, waiting for its next cycle.
+	 */
+
+	int ret;
+	struct ipc4_base_module_cfg dummy;
+
+	if (cpu_is_me(dev->ipc_config.core))
+		return 0;
+
+	/* Any blocking IDC that does not change component state could be utilized */
+	ret = comp_ipc4_get_attribute_remote(dev, COMP_ATTR_BASE_CONFIG, &dummy);
+	if (ret < 0) {
+		tr_err(&ipc_tr, "comp_ipc4_get_attribute_remote() failed for module %#x",
+		       dev_comp_id(dev));
+		return ret;
+	}
+
+	return 0;
+}
+
+#else
+
+#define ll_block(cross_core_bind) irq_local_disable(flags)
+#define ll_unblock(cross_core_bind) irq_local_enable(flags)
+
+#endif
+
 int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 {
 	struct ipc4_module_bind_unbind *bu;
@@ -364,14 +432,15 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 		return IPC4_INVALID_RESOURCE_ID;
 	}
 
-	bool is_shared = source->ipc_config.core != sink->ipc_config.core;
-
-	/* Pass IPC to target core if the buffer won't be shared and will be used
-	 * on different core
+	bool cross_core_bind = source->ipc_config.core != sink->ipc_config.core;
+
+	/* If both components are on the same core, process the IPC on that core,
+	 * otherwise stay on core 0
 	 */
-	if (!cpu_is_me(source->ipc_config.core) && !is_shared)
+	if (!cpu_is_me(source->ipc_config.core) && !cross_core_bind)
 		return ipc4_process_on_core(source->ipc_config.core, false);
 
+	/* these might call comp_ipc4_get_attribute_remote() if necessary */
 	ret = comp_get_attribute(source, COMP_ATTR_BASE_CONFIG, &source_src_cfg);
 	if (ret < 0) {
 		tr_err(&ipc_tr, "failed to get base config for module %#x", dev_comp_id(source));
@@ -397,7 +466,7 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 	else
 		buf_size = sink_src_cfg.ibs * 2;
 
-	buffer = ipc4_create_buffer(source, is_shared, buf_size, bu->extension.r.src_queue,
+	buffer = ipc4_create_buffer(source, cross_core_bind, buf_size, bu->extension.r.src_queue,
 				    bu->extension.r.dst_queue);
 	if (!buffer) {
 		tr_err(&ipc_tr, "failed to allocate buffer to bind %d to %d", src_id, sink_id);
@@ -418,12 +487,26 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 	source_set_min_available(audio_stream_get_source(&buffer->stream), sink_src_cfg.ibs);
 
 	/*
-	 * Connect and bind the buffer to both source and sink components with the interrupts
-	 * disabled to prevent the IPC task getting preempted which could result in buffers being
-	 * only half connected when a pipeline task gets executed. A spinlock isn't required
-	 * because all connected pipelines need to be on the same core.
+	 * Connect and bind the buffer to both source and sink components with LL processing
+	 * blocked on the corresponding core(s) to prevent the IPC or IDC task getting preempted,
+	 * which could result in buffers being only half connected when a pipeline task gets executed.
 	 */
-	irq_local_disable(flags);
+	ll_block(cross_core_bind);
+
+	if (cross_core_bind) {
+#if CONFIG_CROSS_CORE_STREAM
+		/* Make sure LL has finished on both cores */
+		if (!cpu_is_me(source->ipc_config.core))
+			if (ll_wait_finished_on_core(source) < 0)
+				goto free;
+		if (!cpu_is_me(sink->ipc_config.core))
+			if (ll_wait_finished_on_core(sink) < 0)
+				goto free;
+#else
+		tr_err(&ipc_tr, "Cross-core binding is disabled");
+		goto free;
+#endif
+	}
 
 	ret = comp_buffer_connect(source, source->ipc_config.core, buffer,
 				  PPL_CONN_DIR_COMP_TO_BUFFER);
@@ -432,7 +515,6 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 		goto free;
 	}
 
-
 	ret = comp_buffer_connect(sink, sink->ipc_config.core, buffer,
 				  PPL_CONN_DIR_BUFFER_TO_COMP);
 	if (ret < 0) {
@@ -440,7 +522,7 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 		goto e_sink_connect;
 	}
 
-
+	/* these might call comp_ipc4_bind_remote() if necessary */
 	ret = comp_bind(source, bu);
 	if (ret < 0)
 		goto e_src_bind;
@@ -461,7 +543,7 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 		source->direction_set = true;
 	}
 
-	irq_local_enable(flags);
+	ll_unblock(cross_core_bind);
 
 	return IPC4_SUCCESS;
 
@@ -472,7 +554,7 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 e_sink_connect:
 	pipeline_disconnect(source, buffer, PPL_CONN_DIR_COMP_TO_BUFFER);
 free:
-	irq_local_enable(flags);
+	ll_unblock(cross_core_bind);
 	buffer_free(buffer);
 	return IPC4_INVALID_RESOURCE_STATE;
 }
@@ -491,6 +573,7 @@ int ipc_comp_disconnect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 	uint32_t src_id, sink_id, buffer_id;
 	uint32_t flags;
 	int ret, ret1;
+	bool cross_core_unbind;
 
 	bu = (struct ipc4_module_bind_unbind *)_connect;
 	src_id = IPC4_COMP_ID(bu->primary.r.module_id, bu->primary.r.instance_id);
@@ -507,8 +590,12 @@ int ipc_comp_disconnect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 		return 0;
 	}
 
-	/* Pass IPC to target core if both modules has the same target core */
-	if (!cpu_is_me(src->ipc_config.core) && src->ipc_config.core == sink->ipc_config.core)
+	cross_core_unbind = src->ipc_config.core != sink->ipc_config.core;
+
+	/* Pass the IPC to the target core if both modules have the same target core,
+	 * otherwise stay on core 0
+	 */
+	if (!cpu_is_me(src->ipc_config.core) && !cross_core_unbind)
 		return ipc4_process_on_core(src->ipc_config.core, false);
 
 	buffer_id = IPC4_COMP_ID(bu->extension.r.src_queue, bu->extension.r.dst_queue);
@@ -527,17 +614,39 @@ int ipc_comp_disconnect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
 	/*
 	 * Disconnect and unbind buffer from source/sink components and continue to free the buffer
-	 * even in case of errors. Disable interrupts during disconnect and unbinding to prevent
-	 * the IPC task getting preempted which could result in buffers being only half connected
-	 * when a pipeline task gets executed. A spinlock isn't required because all connected
-	 * pipelines need to be on the same core.
+	 * even in case of errors. Block LL processing during disconnect and unbinding to prevent
+	 * the IPC or IDC task getting preempted, which could result in buffers being only half
+	 * connected when a pipeline task gets executed.
 	 */
-	irq_local_disable(flags);
+	ll_block(cross_core_unbind);
+
+	if (cross_core_unbind) {
+#if CONFIG_CROSS_CORE_STREAM
+		/* Make sure LL has finished on both cores */
+		if (!cpu_is_me(src->ipc_config.core))
+			if (ll_wait_finished_on_core(src) < 0) {
+				ll_unblock(cross_core_unbind);
+				return IPC4_FAILURE;
+			}
+		if (!cpu_is_me(sink->ipc_config.core))
+			if (ll_wait_finished_on_core(sink) < 0) {
+				ll_unblock(cross_core_unbind);
+				return IPC4_FAILURE;
+			}
+#else
+		tr_err(&ipc_tr, "Cross-core binding is disabled");
+		ll_unblock(cross_core_unbind);
+		return IPC4_FAILURE;
+#endif
+	}
+
 	pipeline_disconnect(src, buffer, PPL_CONN_DIR_COMP_TO_BUFFER);
 	pipeline_disconnect(sink, buffer, PPL_CONN_DIR_BUFFER_TO_COMP);
 
+	/* these might call comp_ipc4_bind_remote() if necessary */
 	ret = comp_unbind(src, bu);
 	ret1 = comp_unbind(sink, bu);
-	irq_local_enable(flags);
+
+	ll_unblock(cross_core_unbind);
 
 	buffer_free(buffer);
diff --git a/src/schedule/zephyr_domain.c b/src/schedule/zephyr_domain.c
index 1605dd6ded63..7fb9b4ae0465 100644
--- a/src/schedule/zephyr_domain.c
+++ b/src/schedule/zephyr_domain.c
@@ -50,6 +50,11 @@ struct zephyr_domain {
 	struct k_timer timer;
 	struct zephyr_domain_thread domain_thread[CONFIG_CORE_COUNT];
 	struct ll_schedule_domain *ll_domain;
+#if CONFIG_CROSS_CORE_STREAM
+	atomic_t block;
+	struct k_mutex block_mutex;
+	struct k_condvar block_condvar;
+#endif
 };
 
 /* perf measurement windows size 2^x */
@@ -67,6 +72,26 @@ static void zephyr_domain_thread_fn(void *p1, void *p2, void *p3)
 		/* immediately go to sleep, waiting to be woken up by the timer */
 		k_sem_take(&dt->sem, K_FOREVER);
 
+#if CONFIG_CROSS_CORE_STREAM
+		/*
+		 * If zephyr_domain->block is set, block the LL scheduler from starting its
+		 * next cycle.
+		 * Mutex locking might be somewhat expensive, hence the first check of
+		 * zephyr_domain->block is made without locking the mutex. If
+		 * zephyr_domain->block is not set, nothing needs to be done. Otherwise,
+		 * the usual condvar procedure is performed: the mutex is locked to check
+		 * zephyr_domain->block again, avoiding a race with the unblocking procedure
+		 * (clearing zephyr_domain->block and broadcasting the condvar).
+		 */
+		if (atomic_get(&zephyr_domain->block)) {
+			k_mutex_lock(&zephyr_domain->block_mutex, K_FOREVER);
+			if (atomic_get(&zephyr_domain->block))
+				k_condvar_wait(&zephyr_domain->block_condvar,
+					       &zephyr_domain->block_mutex, K_FOREVER);
+			k_mutex_unlock(&zephyr_domain->block_mutex);
+		}
+#endif
+
 		cycles0 = k_cycle_get_32();
 		dt->handler(dt->arg);
 		cycles1 = k_cycle_get_32();
@@ -221,9 +246,38 @@ static int zephyr_domain_unregister(struct ll_schedule_domain *domain,
 	return 0;
 }
 
+#if CONFIG_CROSS_CORE_STREAM
+static void zephyr_domain_block(struct ll_schedule_domain *domain)
+{
+	struct zephyr_domain *zephyr_domain = ll_sch_domain_get_pdata(domain);
+
+	tr_dbg(&ll_tr, "Blocking LL scheduler");
+
+	k_mutex_lock(&zephyr_domain->block_mutex, K_FOREVER);
+	atomic_set(&zephyr_domain->block, 1);
+	k_mutex_unlock(&zephyr_domain->block_mutex);
+}
+
+static void zephyr_domain_unblock(struct ll_schedule_domain *domain)
+{
+	struct zephyr_domain *zephyr_domain = ll_sch_domain_get_pdata(domain);
+
+	tr_dbg(&ll_tr, "Unblocking LL scheduler");
+
+	k_mutex_lock(&zephyr_domain->block_mutex, K_FOREVER);
+	atomic_set(&zephyr_domain->block, 0);
+	k_condvar_broadcast(&zephyr_domain->block_condvar);
+	k_mutex_unlock(&zephyr_domain->block_mutex);
+}
+#endif
+
 static const struct ll_schedule_domain_ops zephyr_domain_ops = {
 	.domain_register = zephyr_domain_register,
 	.domain_unregister = zephyr_domain_unregister,
+#if CONFIG_CROSS_CORE_STREAM
+	.domain_block = zephyr_domain_block,
+	.domain_unblock = zephyr_domain_unblock,
+#endif
 };
 
 struct ll_schedule_domain *zephyr_domain_init(int clk)
@@ -239,6 +293,12 @@ struct ll_schedule_domain *zephyr_domain_init(int clk)
 
 	zephyr_domain->ll_domain = domain;
 
+#if CONFIG_CROSS_CORE_STREAM
+	atomic_set(&zephyr_domain->block, 0);
+	k_mutex_init(&zephyr_domain->block_mutex);
+	k_condvar_init(&zephyr_domain->block_condvar);
+#endif
+
 	ll_sch_domain_set_pdata(domain, zephyr_domain);
 
 	return domain;
diff --git a/zephyr/Kconfig b/zephyr/Kconfig
index 86c94a1de9a4..ea8c1d6e809f 100644
--- a/zephyr/Kconfig
+++ b/zephyr/Kconfig
@@ -56,4 +56,15 @@ config ZEPHYR_DP_SCHEDULER
 	  DP modules can be located in dieffrent cores than LL pipeline modules,
 	  may have different tick (i.e. 300ms for speech reccognition, etc.)
 
+config CROSS_CORE_STREAM
+	bool "Enable cross-core connected pipelines"
+	default y if IPC_MAJOR_4
+	help
+	  Enables support for connecting pipelines from different cores
+	  together, so a stream can travel from one core to another.
+	  Note, this is different from "multicore" support. In SOF,
+	  "multicore" support means different streams can be processed
+	  on different cores; however, each stream is processed
+	  entirely on a single core.
+
 endif
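
Reviewer note: for anyone unfamiliar with the condvar gating added to zephyr_domain.c above, below is a minimal standalone sketch of the same block/unblock pattern, written against the generic Zephyr kernel API. It is not part of the patch; the struct name "gate" and the gate_* function names are illustrative only, and the sketch uses the conventional while-loop guard around k_condvar_wait() rather than mirroring the patch line for line.

/* Sketch of the block/unblock gating pattern (illustrative names, not SOF code) */
#include <zephyr/kernel.h>
#include <zephyr/sys/atomic.h>

struct gate {
	atomic_t blocked;        /* fast-path flag checked without the mutex */
	struct k_mutex lock;     /* protects the blocked/condvar handshake */
	struct k_condvar cond;   /* signalled when the gate is opened again */
};

static void gate_init(struct gate *g)
{
	atomic_set(&g->blocked, 0);
	k_mutex_init(&g->lock);
	k_condvar_init(&g->cond);
}

/* Worker side (e.g. the LL thread at the top of each cycle): the cheap atomic
 * check keeps the common, unblocked case free of mutex traffic; the mutex is
 * taken only when the gate is actually closed.
 */
static void gate_wait_if_blocked(struct gate *g)
{
	if (atomic_get(&g->blocked)) {
		k_mutex_lock(&g->lock, K_FOREVER);
		while (atomic_get(&g->blocked))
			k_condvar_wait(&g->cond, &g->lock, K_FOREVER);
		k_mutex_unlock(&g->lock);
	}
}

/* Controller side (e.g. the IPC task) around the critical update */
static void gate_block(struct gate *g)
{
	k_mutex_lock(&g->lock, K_FOREVER);
	atomic_set(&g->blocked, 1);
	k_mutex_unlock(&g->lock);
}

static void gate_unblock(struct gate *g)
{
	k_mutex_lock(&g->lock, K_FOREVER);
	atomic_set(&g->blocked, 0);
	k_condvar_broadcast(&g->cond);
	k_mutex_unlock(&g->lock);
}

The second check under the mutex is what closes the race with gate_unblock(), which clears the flag and broadcasts while holding the same mutex, so a worker can never miss the wake-up between its flag check and its wait.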