diff --git a/arch/Kconfig b/arch/Kconfig index f3e4f14dfbc89f..f1295eec38e87b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -24,6 +24,7 @@ config ARC imply XIP select ARCH_HAS_THREAD_LOCAL_STORAGE select ARCH_SUPPORTS_ROM_START + select ARCH_HAS_DIRECTED_IPIS help ARC architecture @@ -50,6 +51,7 @@ config ARM64 select USE_SWITCH_SUPPORTED select IRQ_OFFLOAD_NESTED if IRQ_OFFLOAD select BARRIER_OPERATIONS_ARCH + select ARCH_HAS_DIRECTED_IPIS help ARM64 (AArch64) architecture @@ -115,6 +117,7 @@ config RISCV select USE_SWITCH_SUPPORTED select USE_SWITCH select SCHED_IPI_SUPPORTED if SMP + select ARCH_HAS_DIRECTED_IPIS select BARRIER_OPERATIONS_BUILTIN imply XIP help @@ -129,6 +132,7 @@ config XTENSA select ARCH_HAS_CODE_DATA_RELOCATION select ARCH_HAS_TIMING_FUNCTIONS select ARCH_MEM_DOMAIN_DATA if USERSPACE + select ARCH_HAS_DIRECTED_IPIS help Xtensa architecture @@ -746,6 +750,13 @@ config ARCH_HAS_RESERVED_PAGE_FRAMES memory mappings. The architecture will need to implement arch_reserved_pages_update(). +config ARCH_HAS_DIRECTED_IPIS + bool + help + This hidden configuration should be selected by the architecture if + it has an implementation for arch_sched_directed_ipi() which allows + for IPIs to be directed to specific CPUs. + config CPU_HAS_DCACHE bool help diff --git a/arch/arc/core/smp.c b/arch/arc/core/smp.c index 9f8ee38a4a1055..aa12623db8014a 100644 --- a/arch/arc/core/smp.c +++ b/arch/arc/core/smp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -130,21 +131,27 @@ static void sched_ipi_handler(const void *unused) z_sched_ipi(); } -/* arch implementation of sched_ipi */ -void arch_sched_ipi(void) +void arch_sched_directed_ipi(uint32_t cpu_bitmap) { - uint32_t i; + unsigned int i; + unsigned int num_cpus = arch_num_cpus(); - /* broadcast sched_ipi request to other cores + /* Send sched_ipi request to other cores * if the target is current core, hardware will ignore it */ - unsigned int num_cpus = arch_num_cpus(); for (i = 0U; i < num_cpus; i++) { - z_arc_connect_ici_generate(i); + if ((cpu_bitmap & BIT(i)) != 0) { + z_arc_connect_ici_generate(i); + } } } +void arch_sched_broadcast_ipi(void) +{ + arch_sched_directed_ipi(IPI_ALL_CPUS_MASK); +} + int arch_smp_init(void) { struct arc_connect_bcr bcr; diff --git a/arch/arc/include/kernel_arch_func.h b/arch/arc/include/kernel_arch_func.h index 65a497e02d0786..ca382a274f4b1b 100644 --- a/arch/arc/include/kernel_arch_func.h +++ b/arch/arc/include/kernel_arch_func.h @@ -64,8 +64,6 @@ extern void z_arc_userspace_enter(k_thread_entry_t user_entry, void *p1, extern void z_arc_fatal_error(unsigned int reason, const struct arch_esf *esf); -extern void arch_sched_ipi(void); - extern void z_arc_switch(void *switch_to, void **switched_from); static inline void arch_switch(void *switch_to, void **switched_from) diff --git a/arch/arm/core/cortex_a_r/Kconfig b/arch/arm/core/cortex_a_r/Kconfig index 3ec57cc408e1bc..4095a277c61388 100644 --- a/arch/arm/core/cortex_a_r/Kconfig +++ b/arch/arm/core/cortex_a_r/Kconfig @@ -131,6 +131,7 @@ config AARCH32_ARMV8_R bool select ATOMIC_OPERATIONS_BUILTIN select SCHED_IPI_SUPPORTED if SMP + select ARCH_HAS_DIRECTED_IPIS help This option signifies the use of an ARMv8-R AArch32 processor implementation. 
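The per-architecture changes in this patch all follow one pattern: the architecture selects ARCH_HAS_DIRECTED_IPIS, implements arch_sched_directed_ipi() by walking the CPU bitmap and triggering the SoC-specific IPI for each set bit, and implements arch_sched_broadcast_ipi() as a directed IPI to every CPU. A minimal sketch of that shape is below; soc_trigger_ipi() is a hypothetical stand-in for the real per-core trigger (z_arc_connect_ici_generate() on ARC, SGI writes on Arm/ARM64, the MSIP register on RISC-V), and is not part of this patch.

/*
 * Sketch of the new arch IPI hooks for a hypothetical port.
 * soc_trigger_ipi() stands in for the SoC-specific register write
 * that raises an IPI on a single core.
 */
#include <zephyr/kernel.h>
#include <zephyr/sys/util.h>
#include <ipi.h>	/* kernel-internal header; IPI_ALL_CPUS_MASK is added there by this patch */

void soc_trigger_ipi(unsigned int cpu);	/* hypothetical SoC primitive */

void arch_sched_directed_ipi(uint32_t cpu_bitmap)
{
	unsigned int num_cpus = arch_num_cpus();

	for (unsigned int i = 0; i < num_cpus; i++) {
		if ((cpu_bitmap & BIT(i)) != 0) {
			/* Raise an IPI on each CPU named in the bitmap */
			soc_trigger_ipi(i);
		}
	}
}

void arch_sched_broadcast_ipi(void)
{
	/* Broadcast is simply "directed at every CPU" */
	arch_sched_directed_ipi(IPI_ALL_CPUS_MASK);
}

Where the hardware ignores a self-targeted IPI (as the ARC comment below notes), the loop can stay this simple; ports that must skip the current CPU or offline CPUs (as the RISC-V change does) add those checks inside the loop.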
diff --git a/arch/arm/core/cortex_a_r/smp.c b/arch/arm/core/cortex_a_r/smp.c index 9e06730f91396c..379b7663d016b5 100644 --- a/arch/arm/core/cortex_a_r/smp.c +++ b/arch/arm/core/cortex_a_r/smp.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "boot.h" #include "zephyr/cache.h" #include "zephyr/kernel/thread_stack.h" @@ -210,7 +211,7 @@ void arch_secondary_cpu_init(void) #ifdef CONFIG_SMP -static void broadcast_ipi(unsigned int ipi) +static void send_ipi(unsigned int ipi, uint32_t cpu_bitmap) { uint32_t mpidr = MPIDR_TO_CORE(GET_MPIDR()); @@ -220,6 +221,10 @@ static void broadcast_ipi(unsigned int ipi) unsigned int num_cpus = arch_num_cpus(); for (int i = 0; i < num_cpus; i++) { + if ((cpu_bitmap & BIT(i)) == 0) { + continue; + } + uint32_t target_mpidr = cpu_map[i]; uint8_t aff0; @@ -239,10 +244,14 @@ void sched_ipi_handler(const void *unused) z_sched_ipi(); } -/* arch implementation of sched_ipi */ -void arch_sched_ipi(void) +void arch_sched_broadcast_ipi(void) +{ + send_ipi(SGI_SCHED_IPI, IPI_ALL_CPUS_MASK); +} + +void arch_sched_directed_ipi(uint32_t cpu_bitmap) { - broadcast_ipi(SGI_SCHED_IPI); + send_ipi(SGI_SCHED_IPI, cpu_bitmap); } int arch_smp_init(void) diff --git a/arch/arm64/core/smp.c b/arch/arm64/core/smp.c index 8777c400766fce..31dfcf337e4212 100644 --- a/arch/arm64/core/smp.c +++ b/arch/arm64/core/smp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -180,7 +181,7 @@ void arch_secondary_cpu_init(int cpu_num) #ifdef CONFIG_SMP -static void broadcast_ipi(unsigned int ipi) +static void send_ipi(unsigned int ipi, uint32_t cpu_bitmap) { uint64_t mpidr = MPIDR_TO_CORE(GET_MPIDR()); @@ -190,6 +191,10 @@ static void broadcast_ipi(unsigned int ipi) unsigned int num_cpus = arch_num_cpus(); for (int i = 0; i < num_cpus; i++) { + if ((cpu_bitmap & BIT(i)) == 0) { + continue; + } + uint64_t target_mpidr = cpu_map[i]; uint8_t aff0; @@ -209,10 +214,14 @@ void sched_ipi_handler(const void *unused) z_sched_ipi(); } -/* arch implementation of sched_ipi */ -void arch_sched_ipi(void) +void arch_sched_broadcast_ipi(void) +{ + send_ipi(SGI_SCHED_IPI, IPI_ALL_CPUS_MASK); +} + +void arch_sched_directed_ipi(uint32_t cpu_bitmap) { - broadcast_ipi(SGI_SCHED_IPI); + send_ipi(SGI_SCHED_IPI, cpu_bitmap); } #ifdef CONFIG_USERSPACE @@ -232,7 +241,7 @@ void mem_cfg_ipi_handler(const void *unused) void z_arm64_mem_cfg_ipi(void) { - broadcast_ipi(SGI_MMCFG_IPI); + send_ipi(SGI_MMCFG_IPI, IPI_ALL_CPUS_MASK); } #endif diff --git a/arch/riscv/core/smp.c b/arch/riscv/core/smp.c index 68147f8880a653..b5b94aac25cf09 100644 --- a/arch/riscv/core/smp.c +++ b/arch/riscv/core/smp.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -86,14 +87,15 @@ static atomic_val_t cpu_pending_ipi[CONFIG_MP_MAX_NUM_CPUS]; #define IPI_SCHED 0 #define IPI_FPU_FLUSH 1 -void arch_sched_ipi(void) +void arch_sched_directed_ipi(uint32_t cpu_bitmap) { unsigned int key = arch_irq_lock(); unsigned int id = _current_cpu->id; unsigned int num_cpus = arch_num_cpus(); for (unsigned int i = 0; i < num_cpus; i++) { - if (i != id && _kernel.cpus[i].arch.online) { + if ((i != id) && _kernel.cpus[i].arch.online && + ((cpu_bitmap & BIT(i)) != 0)) { atomic_set_bit(&cpu_pending_ipi[i], IPI_SCHED); MSIP(_kernel.cpus[i].arch.hartid) = 1; } @@ -102,6 +104,11 @@ void arch_sched_ipi(void) arch_irq_unlock(key); } +void arch_sched_broadcast_ipi(void) +{ + arch_sched_directed_ipi(IPI_ALL_CPUS_MASK); +} + #ifdef CONFIG_FPU_SHARING void arch_flush_fpu_ipi(unsigned int cpu) { diff 
--git a/arch/x86/core/intel64/smp.c b/arch/x86/core/intel64/smp.c index a73ba9c8f38c36..b0232f21984132 100644 --- a/arch/x86/core/intel64/smp.c +++ b/arch/x86/core/intel64/smp.c @@ -34,7 +34,7 @@ int arch_smp_init(void) * it is not clear exactly how/where/why to abstract this, as it * assumes the use of a local APIC (but there's no other mechanism). */ -void arch_sched_ipi(void) +void arch_sched_broadcast_ipi(void) { z_loapic_ipi(0, LOAPIC_ICR_IPI_OTHERS, CONFIG_SCHED_IPI_VECTOR); } diff --git a/doc/kernel/services/smp/smp.rst b/doc/kernel/services/smp/smp.rst index ca1e0149ad55e6..4b178432bd5597 100644 --- a/doc/kernel/services/smp/smp.rst +++ b/doc/kernel/services/smp/smp.rst @@ -180,13 +180,17 @@ handle the newly-runnable load. So where possible, Zephyr SMP architectures should implement an interprocessor interrupt. The current framework is very simple: the -architecture provides a :c:func:`arch_sched_ipi` call, which when invoked -will flag an interrupt on all CPUs (except the current one, though -that is allowed behavior) which will then invoke the :c:func:`z_sched_ipi` -function implemented in the scheduler. The expectation is that these -APIs will evolve over time to encompass more functionality -(e.g. cross-CPU calls), and that the scheduler-specific calls here -will be implemented in terms of a more general framework. +architecture provides at least an :c:func:`arch_sched_broadcast_ipi` call, +which when invoked will flag an interrupt on all CPUs (except the current one, +though that is allowed behavior). If the architecture supports directed IPIs +(see :kconfig:option:`CONFIG_ARCH_HAS_DIRECTED_IPIS`), then the +architecture also provides a :c:func:`arch_sched_directed_ipi` call, which +when invoked will flag an interrupt on the specified CPUs. When an interrupt is +flagged on the CPUs, the :c:func:`z_sched_ipi` function implemented in the +scheduler will be invoked on those CPUs. The expectation is that these +APIs will evolve over time to encompass more functionality (e.g. cross-CPU +calls), and that the scheduler-specific calls here will be implemented in +terms of a more general framework. Note that not all SMP architectures will have a usable IPI mechanism (either missing, or just undocumented/unimplemented). In those cases diff --git a/include/zephyr/arch/arch_interface.h b/include/zephyr/arch/arch_interface.h index 797a60bbaa58c7..d7c33e511ce503 100644 --- a/include/zephyr/arch/arch_interface.h +++ b/include/zephyr/arch/arch_interface.h @@ -494,10 +494,18 @@ static inline uint32_t arch_proc_id(void); /** * Broadcast an interrupt to all CPUs * - * This will invoke z_sched_ipi() on other CPUs in the system. + * This will invoke z_sched_ipi() on all other CPUs in the system. */ -void arch_sched_ipi(void); +void arch_sched_broadcast_ipi(void); +/** + * Direct IPIs to the specified CPUs + * + * This will invoke z_sched_ipi() on the CPUs identified by @a cpu_bitmap.
+ * + * @param cpu_bitmap A bitmap indicating which CPUs need the IPI + */ +void arch_sched_directed_ipi(uint32_t cpu_bitmap); int arch_smp_init(void); diff --git a/include/zephyr/kernel_structs.h b/include/zephyr/kernel_structs.h index baa2046f07c8e0..cf7daff9a6cf79 100644 --- a/include/zephyr/kernel_structs.h +++ b/include/zephyr/kernel_structs.h @@ -240,8 +240,8 @@ struct z_kernel { #endif #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED) - /* Need to signal an IPI at the next scheduling point */ - bool pending_ipi; + /* Identify CPUs to send IPIs to at the next scheduling point */ + atomic_t pending_ipi; #endif }; diff --git a/kernel/Kconfig.smp b/kernel/Kconfig.smp index 22279270b19f0c..da83d1624e060a 100644 --- a/kernel/Kconfig.smp +++ b/kernel/Kconfig.smp @@ -56,12 +56,11 @@ config MP_MAX_NUM_CPUS config SCHED_IPI_SUPPORTED bool help - True if the architecture supports a call to - arch_sched_ipi() to broadcast an interrupt that will call - z_sched_ipi() on other CPUs in the system. Required for - k_thread_abort() to operate with reasonable latency - (otherwise we might have to wait for the other thread to - take an interrupt, which can be arbitrarily far in the + True if the architecture supports a call to arch_sched_broadcast_ipi() + to broadcast an interrupt that will call z_sched_ipi() on other CPUs + in the system. Required for k_thread_abort() to operate with + reasonable latency (otherwise we might have to wait for the other + thread to take an interrupt, which can be arbitrarily far in the future). config TRACE_SCHED_IPI @@ -73,6 +72,24 @@ config TRACE_SCHED_IPI depends on SCHED_IPI_SUPPORTED depends on MP_MAX_NUM_CPUS>1 +config IPI_OPTIMIZE + bool "Optimize IPI delivery" + default n + depends on SCHED_IPI_SUPPORTED && MP_MAX_NUM_CPUS>1 + help + When selected, the kernel will attempt to determine the minimum + set of CPUs that need an IPI to trigger a reschedule in response to + a thread newly made ready for execution. This increases the + computation required at every scheduler operation by a value that is + O(N) in the number of CPUs, and in exchange reduces the number of + interrupts delivered. Which to choose is going to depend on + application behavior. If the architecture also supports directing + IPIs to specific CPUs then this has the potential to significantly + reduce the number of IPIs (and consequently ISRs) processed by the + system as the number of CPUs increases. If not, the only benefit + would be to not issue any IPIs if the newly readied thread is of + lower priority than all the threads currently executing on other CPUs. + config KERNEL_COHERENCE bool "Place all shared data into coherent memory" depends on ARCH_HAS_COHERENCE diff --git a/kernel/include/ipi.h b/kernel/include/ipi.h index 77105cac16834e..b353a676d4624d 100644 --- a/kernel/include/ipi.h +++ b/kernel/include/ipi.h @@ -7,13 +7,25 @@ #ifndef ZEPHYR_KERNEL_INCLUDE_IPI_H_ #define ZEPHYR_KERNEL_INCLUDE_IPI_H_ +#include +#include +#include + +#define IPI_ALL_CPUS_MASK ((1 << CONFIG_MP_MAX_NUM_CPUS) - 1) + +#define IPI_CPU_MASK(cpu_id) \ + (IS_ENABLED(CONFIG_IPI_OPTIMIZE) ?
BIT(cpu_id) : IPI_ALL_CPUS_MASK) + + /* defined in ipi.c when CONFIG_SMP=y */ #ifdef CONFIG_SMP -void flag_ipi(void); +void flag_ipi(uint32_t ipi_mask); void signal_pending_ipi(void); +atomic_val_t ipi_mask_create(struct k_thread *thread); #else -#define flag_ipi() do { } while (false) +#define flag_ipi(ipi_mask) do { } while (false) #define signal_pending_ipi() do { } while (false) #endif /* CONFIG_SMP */ + #endif /* ZEPHYR_KERNEL_INCLUDE_IPI_H_ */ diff --git a/kernel/ipi.c b/kernel/ipi.c index 99693c0ecbfcfb..ee01c4594251ca 100644 --- a/kernel/ipi.c +++ b/kernel/ipi.c @@ -13,15 +13,58 @@ extern void z_trace_sched_ipi(void); #endif -void flag_ipi(void) +void flag_ipi(uint32_t ipi_mask) { #if defined(CONFIG_SCHED_IPI_SUPPORTED) if (arch_num_cpus() > 1) { - _kernel.pending_ipi = true; + atomic_or(&_kernel.pending_ipi, (atomic_val_t)ipi_mask); } #endif /* CONFIG_SCHED_IPI_SUPPORTED */ } +/* Create a bitmask of CPUs that need an IPI. Note: sched_spinlock is held. */ +atomic_val_t ipi_mask_create(struct k_thread *thread) +{ + if (!IS_ENABLED(CONFIG_IPI_OPTIMIZE)) { + return (CONFIG_MP_MAX_NUM_CPUS > 1) ? IPI_ALL_CPUS_MASK : 0; + } + + uint32_t ipi_mask = 0; + uint32_t num_cpus = (uint32_t)arch_num_cpus(); + uint32_t id = _current_cpu->id; + struct k_thread *cpu_thread; + bool executable_on_cpu = true; + + for (uint32_t i = 0; i < num_cpus; i++) { + if (id == i) { + continue; + } + + /* + * An IPI absolutely does not need to be sent if ... + * 1. the CPU is not active, or + * 2. can not execute on the target CPU + * ... and might not need to be sent if ... + * 3. the target CPU's active thread is not preemptible, or + * 4. the target CPU's active thread has a higher priority + * (Items 3 & 4 may be overridden by a metaIRQ thread) + */ + +#if defined(CONFIG_SCHED_CPU_MASK) + executable_on_cpu = ((thread->base.cpu_mask & BIT(i)) != 0); +#endif + + cpu_thread = _kernel.cpus[i].current; + if ((cpu_thread != NULL) && + (((z_sched_prio_cmp(cpu_thread, thread) < 0) && + (thread_is_preemptible(cpu_thread))) || + thread_is_metairq(thread)) && executable_on_cpu) { + ipi_mask |= BIT(i); + } + } + + return (atomic_val_t)ipi_mask; +} void signal_pending_ipi(void) { @@ -34,9 +77,15 @@ void signal_pending_ipi(void) */ #if defined(CONFIG_SCHED_IPI_SUPPORTED) if (arch_num_cpus() > 1) { - if (_kernel.pending_ipi) { - _kernel.pending_ipi = false; - arch_sched_ipi(); + uint32_t cpu_bitmap; + + cpu_bitmap = (uint32_t)atomic_clear(&_kernel.pending_ipi); + if (cpu_bitmap != 0) { +#ifdef CONFIG_ARCH_HAS_DIRECTED_IPIS + arch_sched_directed_ipi(cpu_bitmap); +#else + arch_sched_broadcast_ipi(); +#endif } } #endif /* CONFIG_SCHED_IPI_SUPPORTED */ diff --git a/kernel/sched.c b/kernel/sched.c index 506ad57a141b21..67e5645bc6f245 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -348,11 +348,11 @@ static void update_cache(int preempt_ok) #endif /* CONFIG_SMP */ } -static bool thread_active_elsewhere(struct k_thread *thread) +static struct _cpu *thread_active_elsewhere(struct k_thread *thread) { - /* True if the thread is currently running on another CPU. - * There are more scalable designs to answer this question in - * constant time, but this is fine for now. + /* Returns pointer to _cpu if the thread is currently running on + * another CPU. There are more scalable designs to answer this + * question in constant time, but this is fine for now. 
*/ #ifdef CONFIG_SMP int currcpu = _current_cpu->id; @@ -362,12 +362,12 @@ static bool thread_active_elsewhere(struct k_thread *thread) for (int i = 0; i < num_cpus; i++) { if ((i != currcpu) && (_kernel.cpus[i].current == thread)) { - return true; + return &_kernel.cpus[i]; } } #endif /* CONFIG_SMP */ ARG_UNUSED(thread); - return false; + return NULL; } static void ready_thread(struct k_thread *thread) @@ -384,13 +384,14 @@ static void ready_thread(struct k_thread *thread) queue_thread(thread); update_cache(0); - flag_ipi(); + + flag_ipi(ipi_mask_create(thread)); } } void z_ready_thread_locked(struct k_thread *thread) { - if (!thread_active_elsewhere(thread)) { + if (thread_active_elsewhere(thread) == NULL) { ready_thread(thread); } } @@ -398,7 +399,7 @@ void z_ready_thread_locked(struct k_thread *thread) void z_ready_thread(struct k_thread *thread) { K_SPINLOCK(&_sched_spinlock) { - if (!thread_active_elsewhere(thread)) { + if (thread_active_elsewhere(thread) == NULL) { ready_thread(thread); } } @@ -466,11 +467,18 @@ static void z_thread_halt(struct k_thread *thread, k_spinlock_key_t key, * halt itself in the IPI. Otherwise it's unscheduled, so we * can clean it up directly. */ - if (thread_active_elsewhere(thread)) { + + struct _cpu *cpu = thread_active_elsewhere(thread); + + if (cpu != NULL) { thread->base.thread_state |= (terminate ? _THREAD_ABORTING : _THREAD_SUSPENDING); #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED) - arch_sched_ipi(); +#ifdef CONFIG_ARCH_HAS_DIRECTED_IPIS + arch_sched_directed_ipi(IPI_CPU_MASK(cpu->id)); +#else + arch_sched_broadcast_ipi(); +#endif #endif if (arch_is_in_isr()) { thread_halt_spin(thread, key); @@ -731,19 +739,38 @@ void z_unpend_thread(struct k_thread *thread) bool z_thread_prio_set(struct k_thread *thread, int prio) { bool need_sched = 0; + int old_prio = thread->base.prio; K_SPINLOCK(&_sched_spinlock) { need_sched = z_is_thread_ready(thread); if (need_sched) { - /* Don't requeue on SMP if it's the running thread */ if (!IS_ENABLED(CONFIG_SMP) || z_is_thread_queued(thread)) { dequeue_thread(thread); thread->base.prio = prio; queue_thread(thread); + + if (old_prio > prio) { + flag_ipi(ipi_mask_create(thread)); + } } else { + /* + * This is a running thread on SMP. Update its + * priority, but do not requeue it. An IPI is + * needed if the priority is both being lowered + * and it is running on another CPU. + */ + thread->base.prio = prio; + + struct _cpu *cpu; + + cpu = thread_active_elsewhere(thread); + if ((cpu != NULL) && (old_prio < prio)) { + flag_ipi(IPI_CPU_MASK(cpu->id)); + } } + update_cache(1); } else { thread->base.prio = prio; @@ -1006,8 +1033,8 @@ void z_impl_k_thread_priority_set(k_tid_t thread, int prio) bool need_sched = z_thread_prio_set((struct k_thread *)thread, prio); - flag_ipi(); - if (need_sched && (_current->base.sched_locked == 0U)) { + if ((need_sched) && (IS_ENABLED(CONFIG_SMP) || + (_current->base.sched_locked == 0U))) { z_reschedule_unlocked(); } } @@ -1219,7 +1246,7 @@ void z_impl_k_wakeup(k_tid_t thread) z_mark_thread_as_not_suspended(thread); - if (!thread_active_elsewhere(thread)) { + if (thread_active_elsewhere(thread) == NULL) { ready_thread(thread); } diff --git a/kernel/timeslicing.c b/kernel/timeslicing.c index 07ae497c7f91e6..be91d9606f51e2 100644 --- a/kernel/timeslicing.c +++ b/kernel/timeslicing.c @@ -58,11 +58,10 @@ static void slice_timeout(struct _timeout *timeout) slice_expired[cpu] = true; /* We need an IPI if we just handled a timeslice expiration - * for a different CPU. 
Ideally this would be able to target - * the specific core, but that's not part of the API yet. + * for a different CPU. */ - if (IS_ENABLED(CONFIG_SMP) && cpu != _current_cpu->id) { - flag_ipi(); + if (cpu != _current_cpu->id) { + flag_ipi(IPI_CPU_MASK(cpu)); } } diff --git a/soc/espressif/esp32/esp32-mp.c b/soc/espressif/esp32/esp32-mp.c index c380df6c8b77a6..ca2de23e1e8b86 100644 --- a/soc/espressif/esp32/esp32-mp.c +++ b/soc/espressif/esp32/esp32-mp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -290,10 +291,12 @@ void arch_cpu_start(int cpu_num, k_thread_stack_t *stack, int sz, smp_log("ESP32: APPCPU initialized"); } -void arch_sched_ipi(void) +void arch_sched_directed_ipi(uint32_t cpu_bitmap) { const int core_id = esp_core_id(); + ARG_UNUSED(cpu_bitmap); + if (core_id == 0) { DPORT_WRITE_PERI_REG(DPORT_CPU_INTR_FROM_CPU_0_REG, DPORT_CPU_INTR_FROM_CPU_0); } else { @@ -301,6 +304,11 @@ void arch_sched_ipi(void) } } +void arch_sched_broadcast_ipi(void) +{ + arch_sched_directed_ipi(IPI_ALL_CPUS_MASK); +} + IRAM_ATTR bool arch_cpu_active(int cpu_num) { return cpus_active[cpu_num]; diff --git a/soc/intel/intel_adsp/ace/multiprocessing.c b/soc/intel/intel_adsp/ace/multiprocessing.c index 68b8693a520d4b..1c9b3fa3d32256 100644 --- a/soc/intel/intel_adsp/ace/multiprocessing.c +++ b/soc/intel/intel_adsp/ace/multiprocessing.c @@ -21,6 +21,7 @@ #include #include #include +#include #define CORE_POWER_CHECK_NUM 128 @@ -209,7 +210,7 @@ void soc_mp_startup(uint32_t cpu) #ifndef CONFIG_XTENSA_MMU ALWAYS_INLINE #endif -static void send_ipi(uint32_t msg) +static void send_ipi(uint32_t msg, uint32_t cpu_bitmap) { uint32_t curr = arch_proc_id(); @@ -217,24 +218,30 @@ static void send_ipi(uint32_t msg) unsigned int num_cpus = arch_num_cpus(); for (int core = 0; core < num_cpus; core++) { - if (core != curr && soc_cpus_active[core]) { + if ((core != curr) && soc_cpus_active[core] && + ((cpu_bitmap & BIT(core)) != 0)) { IDC[core].agents[1].ipc.idr = msg | INTEL_ADSP_IPC_BUSY; } } } -void arch_sched_ipi(void) -{ - send_ipi(0); -} - #if defined(CONFIG_XTENSA_MMU) && (CONFIG_MP_MAX_NUM_CPUS > 1) void xtensa_mmu_tlb_ipi(void) { - send_ipi(IPI_TLB_FLUSH); + send_ipi(IPI_TLB_FLUSH, IPI_ALL_CPUS_MASK); } #endif +void arch_sched_broadcast_ipi(void) +{ + send_ipi(0, IPI_ALL_CPUS_MASK); +} + +void arch_sched_directed_ipi(uint32_t cpu_bitmap) +{ + send_ipi(0, cpu_bitmap); +} + #if CONFIG_MP_MAX_NUM_CPUS > 1 int soc_adsp_halt_cpu(int id) { diff --git a/soc/intel/intel_adsp/cavs/multiprocessing.c b/soc/intel/intel_adsp/cavs/multiprocessing.c index 2a38f20355da01..d87cd435e57598 100644 --- a/soc/intel/intel_adsp/cavs/multiprocessing.c +++ b/soc/intel/intel_adsp/cavs/multiprocessing.c @@ -8,6 +8,7 @@ #include #include #include +#include /* IDC power up message to the ROM firmware. 
This isn't documented * anywhere, it's basically just a magic number (except the high bit, @@ -121,18 +122,29 @@ void soc_start_core(int cpu_num) IDC[curr_cpu].core[cpu_num].itc = IDC_MSG_POWER_UP; } -void arch_sched_ipi(void) +static void send_ipi(uint32_t cpu_bitmap) { uint32_t curr = arch_proc_id(); unsigned int num_cpus = arch_num_cpus(); for (int c = 0; c < num_cpus; c++) { - if (c != curr && soc_cpus_active[c]) { + if ((c != curr) && soc_cpus_active[c] && + ((cpu_bitmap & BIT(c)) != 0)) { IDC[curr].core[c].itc = BIT(31); } } } +void arch_sched_broadcast_ipi(void) +{ + send_ipi(IPI_ALL_CPUS_MASK); +} + +void arch_sched_directed_ipi(uint32_t cpu_bitmap) +{ + send_ipi(cpu_bitmap); +} + void idc_isr(const void *param) { ARG_UNUSED(param); diff --git a/submanifests/optional.yaml b/submanifests/optional.yaml index 35f5d931167694..a6a9048d17f4e5 100644 --- a/submanifests/optional.yaml +++ b/submanifests/optional.yaml @@ -34,7 +34,7 @@ manifest: groups: - optional - name: sof - revision: a44758883f3f6cfb6c67b19bc76fcb01f77ca50b + revision: 3f1716b0da7a48358bc265586b90b2252745c14c path: modules/audio/sof remote: upstream groups: diff --git a/tests/kernel/ipi_optimize/CMakeLists.txt b/tests/kernel/ipi_optimize/CMakeLists.txt new file mode 100644 index 00000000000000..f32de519289c3b --- /dev/null +++ b/tests/kernel/ipi_optimize/CMakeLists.txt @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20.0) +find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE}) +project(smp) + +target_sources(app PRIVATE src/main.c) + +target_include_directories(app PRIVATE + ${ZEPHYR_BASE}/kernel/include + ${ZEPHYR_BASE}/arch/${ARCH}/include + ) diff --git a/tests/kernel/ipi_optimize/boards/qemu_cortex_a53_qemu_cortex_a53_smp.conf b/tests/kernel/ipi_optimize/boards/qemu_cortex_a53_qemu_cortex_a53_smp.conf new file mode 100644 index 00000000000000..f0ee34b467edd4 --- /dev/null +++ b/tests/kernel/ipi_optimize/boards/qemu_cortex_a53_qemu_cortex_a53_smp.conf @@ -0,0 +1,4 @@ +# Copyright (c) 2022 Carlo Caione +# SPDX-License-Identifier: Apache-2.0 + +CONFIG_MP_MAX_NUM_CPUS=4 diff --git a/tests/kernel/ipi_optimize/boards/qemu_cortex_a53_qemu_cortex_a53_smp.overlay b/tests/kernel/ipi_optimize/boards/qemu_cortex_a53_qemu_cortex_a53_smp.overlay new file mode 100644 index 00000000000000..5bb497069dd8fd --- /dev/null +++ b/tests/kernel/ipi_optimize/boards/qemu_cortex_a53_qemu_cortex_a53_smp.overlay @@ -0,0 +1,19 @@ +/* Copyright 2022 Carlo Caione + * SPDX-License-Identifier: Apache-2.0 + */ + +/ { + cpus { + cpu@2 { + device_type = "cpu"; + compatible = "arm,cortex-a53"; + reg = <2>; + }; + + cpu@3 { + device_type = "cpu"; + compatible = "arm,cortex-a53"; + reg = <3>; + }; + }; +}; diff --git a/tests/kernel/ipi_optimize/prj.conf b/tests/kernel/ipi_optimize/prj.conf new file mode 100644 index 00000000000000..f337c89ff5bb41 --- /dev/null +++ b/tests/kernel/ipi_optimize/prj.conf @@ -0,0 +1,5 @@ +CONFIG_ZTEST=y +CONFIG_SMP=y +CONFIG_TRACE_SCHED_IPI=y +CONFIG_IPI_OPTIMIZE=y +CONFIG_SYS_CLOCK_TICKS_PER_SEC=50 diff --git a/tests/kernel/ipi_optimize/src/main.c b/tests/kernel/ipi_optimize/src/main.c new file mode 100644 index 00000000000000..029b79b6d3a85a --- /dev/null +++ b/tests/kernel/ipi_optimize/src/main.c @@ -0,0 +1,475 @@ +/* + * Copyright (c) 2024 Intel Corporation. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include + +#define STACK_SIZE (1024 + CONFIG_TEST_EXTRA_STACK_SIZE) + +#define NUM_THREADS (CONFIG_MP_MAX_NUM_CPUS - 1) + +#define DELAY_FOR_IPIS 200 + +static struct k_thread thread[NUM_THREADS]; +static struct k_thread alt_thread; + +static bool alt_thread_created; + +static K_THREAD_STACK_ARRAY_DEFINE(stack, NUM_THREADS, STACK_SIZE); +static K_THREAD_STACK_DEFINE(alt_stack, STACK_SIZE); + +static uint32_t ipi_count[CONFIG_MP_MAX_NUM_CPUS]; +static struct k_spinlock ipilock; +static atomic_t busy_started; +static volatile bool alt_thread_done; + +static K_SEM_DEFINE(sem, 0, 1); + +void z_trace_sched_ipi(void) +{ + k_spinlock_key_t key; + + key = k_spin_lock(&ipilock); + ipi_count[_current_cpu->id]++; + k_spin_unlock(&ipilock, key); +} + +static void clear_ipi_counts(void) +{ + k_spinlock_key_t key; + + key = k_spin_lock(&ipilock); + memset(ipi_count, 0, sizeof(ipi_count)); + k_spin_unlock(&ipilock, key); +} + +static void get_ipi_counts(uint32_t *set, size_t n_elem) +{ + k_spinlock_key_t key; + + key = k_spin_lock(&ipilock); + memcpy(set, ipi_count, n_elem * sizeof(*set)); + k_spin_unlock(&ipilock, key); +} + +static void busy_thread_entry(void *p1, void *p2, void *p3) +{ + int key; + uint32_t id; + + key = arch_irq_lock(); + id = _current_cpu->id; + arch_irq_unlock(key); + + atomic_or(&busy_started, BIT(id)); + + while (1) { + } +} + +static bool wait_until_busy_threads_ready(uint32_t id) +{ + uint32_t all; + uint32_t value; + unsigned int i; + + all = IPI_ALL_CPUS_MASK ^ BIT(id); + for (i = 0; i < 10; i++) { + k_busy_wait(1000); + + value = (uint32_t)atomic_get(&busy_started); + if (value == all) { + break; + } + } + + return (i < 10); +} + +static void pending_thread_entry(void *p1, void *p2, void *p3) +{ + int key; + + k_sem_take(&sem, K_FOREVER); + + while (!alt_thread_done) { + key = arch_irq_lock(); + arch_spin_relax(); + arch_irq_unlock(key); + } +} + +static void alt_thread_create(int priority, const char *desc) +{ + k_thread_create(&alt_thread, alt_stack, STACK_SIZE, + pending_thread_entry, NULL, NULL, NULL, + priority, 0, K_NO_WAIT); + alt_thread_created = true; + + /* Verify alt_thread is pending */ + + k_busy_wait(10000); + zassert_true(z_is_thread_pending(&alt_thread), + "%s priority thread has not pended.\n", desc); +} + +uint32_t busy_threads_create(int priority) +{ + unsigned int i; + uint32_t id; + int key; + + atomic_clear(&busy_started); + + for (i = 0; i < NUM_THREADS; i++) { + k_thread_create(&thread[i], stack[i], STACK_SIZE, + busy_thread_entry, NULL, NULL, NULL, + priority, 0, K_NO_WAIT); + } + + /* Align to tick boundary to minimize probability of timer ISRs */ + + k_sleep(K_TICKS(1)); + key = arch_irq_lock(); + id = _current_cpu->id; + arch_irq_unlock(key); + + /* + * Spin until all busy threads are ready. It is assumed that as this + * thread and the busy threads are cooperative that they will not be + * rescheduled to execute on a different CPU. + */ + + zassert_true(wait_until_busy_threads_ready(id), + "1 or more 'busy threads' not ready.\n"); + + return id; +} + +void busy_threads_priority_set(int priority, int delta) +{ + unsigned int i; + + for (i = 0; i < NUM_THREADS; i++) { + k_thread_priority_set(&thread[i], priority); + priority += delta; + } +} + +/** + * Verify that arch_sched_broadcast_ipi() broadcasts IPIs as expected. 
+ */ +ZTEST(ipi, test_arch_sched_broadcast_ipi) +{ + uint32_t set[CONFIG_MP_MAX_NUM_CPUS]; + uint32_t id; + int priority; + unsigned int j; + + priority = k_thread_priority_get(k_current_get()); + + id = busy_threads_create(priority - 1); + + /* Broadcast the IPI. All other CPUs ought to receive and process it */ + + clear_ipi_counts(); + arch_sched_broadcast_ipi(); + k_busy_wait(DELAY_FOR_IPIS); + get_ipi_counts(set, CONFIG_MP_MAX_NUM_CPUS); + + for (j = 0; j < CONFIG_MP_MAX_NUM_CPUS; j++) { + if (id == j) { + zassert_true(set[j] == 0, + "Broadcast-Expected 0, got %u\n", + set[j]); + } else { + zassert_true(set[j] == 1, + "Broadcast-Expected 1, got %u\n", + set[j]); + } + } +} + +#ifdef CONFIG_ARCH_HAS_DIRECTED_IPIS +/** + * Verify that arch_sched_directed_ipi() directs IPIs as expected. + */ +ZTEST(ipi, test_arch_sched_directed_ipi) +{ + uint32_t set[CONFIG_MP_MAX_NUM_CPUS]; + uint32_t id; + int priority; + unsigned int j; + + priority = k_thread_priority_get(k_current_get()); + + id = busy_threads_create(priority - 1); + + /* + * Send an IPI to each CPU, one at a time. Verify that only the + * targeted CPU received the IPI. + */ + for (unsigned int i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++) { + if (i == id) { + continue; + } + + clear_ipi_counts(); + arch_sched_directed_ipi(BIT(i)); + k_busy_wait(DELAY_FOR_IPIS); + get_ipi_counts(set, CONFIG_MP_MAX_NUM_CPUS); + + for (j = 0; j < CONFIG_MP_MAX_NUM_CPUS; j++) { + if (i == j) { + zassert_true(set[j] == 1, + "Direct-Expected 1, got %u\n", + set[j]); + } else { + zassert_true(set[j] == 0, + "Direct-Expected 0, got %u\n", + set[j]); + } + } + } +} +#endif + +/** + * Verify that waking a thread whose priority is lower than any other + * currently executing thread does not result in any IPIs being sent. + */ +ZTEST(ipi, test_low_thread_wakes_no_ipis) +{ + uint32_t set[CONFIG_MP_MAX_NUM_CPUS]; + uint32_t id; + int priority; + unsigned int i; + + priority = k_thread_priority_get(k_current_get()); + atomic_clear(&busy_started); + + alt_thread_create(5, "Low"); + + id = busy_threads_create(priority - 1); + + /* + * Lower the priority of the busy threads now that we know that they + * have started. As this is expected to generate IPIs, busy wait for + * some small amount of time to give them time to be processed. + */ + + busy_threads_priority_set(0, 0); + k_busy_wait(DELAY_FOR_IPIS); + + /* + * Low priority thread is pended. Current thread is cooperative. + * Other CPUs are executing preemptible threads @ priority 0. + */ + + clear_ipi_counts(); + k_sem_give(&sem); + k_busy_wait(DELAY_FOR_IPIS); + get_ipi_counts(set, CONFIG_MP_MAX_NUM_CPUS); + + zassert_true(z_is_thread_ready(&alt_thread), + "Low priority thread is not ready.\n"); + + alt_thread_done = true; + + for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++) { + zassert_true(set[i] == 0, + "CPU %u unexpectedly received IPI.\n", i); + } +} + +/** + * Verify that waking a thread whose priority is higher than all currently + * executing threads results in the proper IPIs being sent and processed. + */ +ZTEST(ipi, test_high_thread_wakes_some_ipis) +{ + uint32_t set[CONFIG_MP_MAX_NUM_CPUS]; + uint32_t id; + int priority; + unsigned int i; + + priority = k_thread_priority_get(k_current_get()); + atomic_clear(&busy_started); + + alt_thread_create(priority - 1 - NUM_THREADS, "High"); + + id = busy_threads_create(priority - 1); + + /* + * Lower the priority of the busy threads now that we know that they + * have started and are busy waiting. 
As this is expected to generate + * IPIs, busy wait for some small amount of time to give them time to + * be processed. + */ + + busy_threads_priority_set(0, 1); + k_busy_wait(DELAY_FOR_IPIS); + + /* + * High priority thread is pended. Current thread is cooperative. + * Other CPUs are executing preemptible threads. + */ + + clear_ipi_counts(); + k_sem_give(&sem); + k_busy_wait(DELAY_FOR_IPIS); + get_ipi_counts(set, CONFIG_MP_MAX_NUM_CPUS); + + zassert_true(z_is_thread_ready(&alt_thread), + "High priority thread is not ready.\n"); + + alt_thread_done = true; + + for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++) { + if (i == id) { + continue; + } + + zassert_true(set[i] == 1, "CPU%u got %u IPIs", i, set[i]); + } + + zassert_true(set[id] == 0, "Current CPU got %u IPI(s).\n", set[id]); +} + +/** + * Verify that lowering the priority of an active thread results in an IPI. + * If directed IPIs are enabled, then only the CPU executing that active + * thread ought to receive the IPI. Otherwise if IPIs are broadcast, then all + * other CPUs save the current CPU ought to receive IPIs. + */ +ZTEST(ipi, test_thread_priority_set_lower) +{ + uint32_t set[CONFIG_MP_MAX_NUM_CPUS]; + uint32_t id; + int priority; + unsigned int i; + + priority = k_thread_priority_get(k_current_get()); + + id = busy_threads_create(priority - 1); + + clear_ipi_counts(); + k_thread_priority_set(&thread[0], priority); + k_busy_wait(DELAY_FOR_IPIS); + get_ipi_counts(set, CONFIG_MP_MAX_NUM_CPUS); + + for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++) { + if (i == id) { + continue; + } + +#ifdef CONFIG_ARCH_HAS_DIRECTED_IPIS + unsigned int j; + + for (j = 0; j < NUM_THREADS; j++) { + if (_kernel.cpus[i].current == &thread[j]) { + break; + } + } + + zassert_true(j < NUM_THREADS, + "CPU%u not executing expected thread\n", i); + + if (j == 0) { + zassert_true(set[i] == 1, "CPU%u got %u IPIs.\n", + i, set[i]); + } else { + zassert_true(set[i] == 0, "CPU%u got %u IPI(s).\n", + i, set[i]); + } +#else + zassert_true(set[i] == 1, "CPU%u got %u IPIs", i, set[i]); +#endif + } + + zassert_true(set[id] == 0, "Current CPU got %u IPI(s).\n", set[id]); +} + +/* + * Verify that IPIs are not sent to CPUs that are executing cooperative + * threads. + */ +ZTEST(ipi, test_thread_coop_no_ipis) +{ + uint32_t set[CONFIG_MP_MAX_NUM_CPUS]; + uint32_t id; + int priority; + unsigned int i; + + priority = k_thread_priority_get(k_current_get()); + atomic_clear(&busy_started); + + alt_thread_create(priority - 1 - NUM_THREADS, "High"); + + id = busy_threads_create(priority - 1); + + /* + * High priority thread is pended. Current thread is cooperative. + * Other CPUs are executing lower priority cooperative threads. + */ + + clear_ipi_counts(); + k_sem_give(&sem); + k_busy_wait(DELAY_FOR_IPIS); + get_ipi_counts(set, CONFIG_MP_MAX_NUM_CPUS); + + zassert_true(z_is_thread_ready(&alt_thread), + "High priority thread is not ready.\n"); + + alt_thread_done = true; + + for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++) { + zassert_true(set[i] == 0, "CPU%u got %u IPIs", i, set[i]); + } +} + +static void *ipi_tests_setup(void) +{ + /* + * Sleep a bit to guarantee that all CPUs enter an idle thread + * from which they can exit correctly to run the test. + */ + + k_sleep(K_MSEC(20)); + + return NULL; +} + +static void cleanup_threads(void *fixture) +{ + unsigned int i; + + ARG_UNUSED(fixture); + + /* + * Ensure that spawned busy threads are aborted before + * proceeding to the next test. 
+ */ + + for (i = 0; i < NUM_THREADS; i++) { + k_thread_abort(&thread[i]); + } + + /* Ensure alt_thread ,if it was created, also gets aborted */ + + if (alt_thread_created) { + k_thread_abort(&alt_thread); + } + alt_thread_created = false; + + alt_thread_done = false; +} + +ZTEST_SUITE(ipi, NULL, ipi_tests_setup, NULL, cleanup_threads, NULL); diff --git a/tests/kernel/ipi_optimize/testcase.yaml b/tests/kernel/ipi_optimize/testcase.yaml new file mode 100644 index 00000000000000..49227a720cbac8 --- /dev/null +++ b/tests/kernel/ipi_optimize/testcase.yaml @@ -0,0 +1,6 @@ +tests: + kernel.ipi_optimize.smp: + tags: + - kernel + - smp + filter: (CONFIG_MP_MAX_NUM_CPUS > 1) diff --git a/tests/kernel/smp/src/main.c b/tests/kernel/smp/src/main.c index f73a1dfdbbb6ea..7f556793e670c2 100644 --- a/tests/kernel/smp/src/main.c +++ b/tests/kernel/smp/src/main.c @@ -695,8 +695,8 @@ void z_trace_sched_ipi(void) * - To verify architecture layer provides a mechanism to issue an interprocessor * interrupt to all other CPUs in the system that calls the scheduler IPI. * We simply add a hook in z_sched_ipi(), in order to check if it has been - * called once in another CPU except the caller, when arch_sched_ipi() is - * called. + * called once in another CPU except the caller, when arch_sched_broadcast_ipi() + * is called. * * Testing techniques: * - Interface testing, function and block box testing, @@ -711,7 +711,7 @@ void z_trace_sched_ipi(void) * * Test Procedure: * -# In main thread, given a global variable sched_ipi_has_called equaled zero. - * -# Call arch_sched_ipi() then sleep for 100ms. + * -# Call arch_sched_broadcast_ipi() then sleep for 100ms. * -# In z_sched_ipi() handler, increment the sched_ipi_has_called. * -# In main thread, check the sched_ipi_has_called is not equaled to zero. * -# Repeat step 1 to 4 for 3 times. @@ -727,7 +727,7 @@ void z_trace_sched_ipi(void) * - This test using for the platform that support SMP, in our current scenario * , only x86_64 and arc supported. * - * @see arch_sched_ipi() + * @see arch_sched_broadcast_ipi() */ #ifdef CONFIG_SCHED_IPI_SUPPORTED ZTEST(smp, test_smp_ipi) @@ -741,7 +741,7 @@ ZTEST(smp, test_smp_ipi) for (int i = 0; i < 3 ; i++) { /* issue a sched ipi to tell other CPU to run thread */ sched_ipi_has_called = 0; - arch_sched_ipi(); + arch_sched_broadcast_ipi(); /* Need to wait longer than we think, loaded CI * systems need to wait for host scheduling to run the