-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement thread pinning for threads with high-frequency communication
- Loading branch information
Showing
14 changed files
with
399 additions
and
118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,83 @@ | ||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <string_view> | ||
#include <vector> | ||
|
||
namespace celerity { | ||
namespace detail { | ||
// The goal of this thread pinning mechanism, when enabled, is to ensure that threads which benefit from fast communication | ||
// are pinned to cores that are close to each other in terms of cache hierarchy. | ||
// It currently accomplishes this by pinning threads to cores in a round-robin fashion according to their order in the `thread_type` enum. | ||
// | ||
// In terms of interface design, the goal is to provide a very simple entry point (`pin_this_thread`), that is safe to use from any thread at any time, | ||
// and does not require polluting any other modules with state related to thread pinning. The `thread_pinner` RAII class offers the only way to manage the | ||
// lifetime of the pinning mechanism, and prevents misuse. The implementation safely removes pinning from any thread it previously pinned on teardown. | ||
// | ||
// TODO: A future extension would be to respect NUMA for threads performing memory operations, but this requires in-depth knowledge of the system's topology. | ||
namespace celerity::detail::thread_pinning { | ||
|
||
uint32_t affinity_cores_available(); | ||
// Distance between the base values of consecutive `thread_type` categories.
// The gap leaves room for indexed members such as `first_backend_worker + i`.
inline constexpr uint32_t thread_type_step = 10000;
|
||
/* a priori we need 3 threads, plus 1 for parallel-task workers and at least one more for host-task. | ||
This depends on the application invoking celerity. */ | ||
constexpr static uint64_t min_cores_needed = 5; | ||
// The threads Celerity interacts with ("user") and creates (everything else), identified for the purpose of pinning. | ||
// Note: this is not an enum class to make interactions such as specifying `first_backend_worker+i` easier | ||
enum thread_type : uint32_t { | ||
user = 0 * thread_type_step, | ||
scheduler = 1 * thread_type_step, | ||
executor = 2 * thread_type_step, | ||
first_backend_worker = 3 * thread_type_step, | ||
first_host_queue = 4 * thread_type_step, | ||
}; | ||
// Returns a human-readable name for `t_type`: "user", "scheduler", "executor", "backend_worker_<i>", "host_queue_<i>",
// or "unknown" for values that belong to none of these.
std::string thread_type_to_string(const thread_type t_type);
|
||
} // namespace detail | ||
} // namespace celerity | ||
// User-level configuration of the thread pinning mechanism (set by the user via environment variables)
struct environment_configuration {
    // Whether pinning is requested at all; we want thread pinning to be enabled by default
    bool enabled = true;

    // First core to pin to when no explicit core list is given
    uint32_t starting_from_core = 1;

    // Explicit list of core ids requested by the user; empty if none was specified
    std::vector<uint32_t> hardcoded_core_ids;
};
|
||
// Parses and validates the environment variable string, returning the corresponding configuration.
// Accepted forms: empty (defaults), "auto", "from:<core>", a comma-separated core list, or a boolean.
// Input that cannot be parsed is reported as an error and yields the default configuration.
environment_configuration parse_validate_env(const std::string_view str);
|
||
// Configures the pinning mechanism.
// For now, only "standard" threads are pinned. These are the threads that benefit from rapid communication
// between each other, i.e. scheduler -> executor -> backend workers.
// Extensible for future use where some threads might benefit from NUMA-aware per-GPU pinning.
struct runtime_configuration {
    // Whether or not to perform pinning
    bool enabled = false;

    // Number of devices that will need corresponding threads
    uint32_t num_devices = 1;

    // Number of processes running in legacy mode
    uint32_t num_legacy_processes = 1;
    // Process index of current process running in legacy mode
    uint32_t legacy_process_index = 0;

    // The core to start pinning "standard" threads to
    uint32_t standard_core_start_id = 1;

    // If set, this list of core ids will be used for pinning instead of the default round-robin scheme
    // The list must contain exactly as many elements as there are standard threads
    std::vector<uint32_t> hardcoded_core_ids = {}; // NOLINT(readability-redundant-member-init) -- to allow partial designated init elsewhere
};
|
||
// An RAII class for managing thread pinning | ||
// Only one instance of this class may be active at a time (this is enforced by the implementation) | ||
// Threads pinned by this class will be unpinned when the instance is destroyed | ||
class thread_pinner { | ||
public: | ||
thread_pinner(const runtime_configuration& cfg); | ||
~thread_pinner(); | ||
thread_pinner(const thread_pinner&) = delete; | ||
thread_pinner& operator=(const thread_pinner&) = delete; | ||
thread_pinner(thread_pinner&&) = default; | ||
thread_pinner& operator=(thread_pinner&&) = default; | ||
}; | ||
|
||
// Pins the invoking thread of type `t_type` according to the current configuration
// This is a no-op if the thread pinning machinery is not currently initialized (by a `thread_pinner` instance)
void pin_this_thread(const thread_type t_type);
|
||
} // namespace celerity::detail::thread_pinning |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#pragma once | ||
|
||
// non-platform-specific code for thread pinning | ||
|
||
#include "affinity.h" | ||
#include "log.h" | ||
|
||
#include <cstdint> | ||
#include <ranges> | ||
#include <string_view> | ||
#include <vector> | ||
|
||
#include <libenvpp/env.hpp> | ||
|
||
namespace celerity::detail::thread_pinning { | ||
|
||
std::string thread_type_to_string(const thread_type t_type) { | ||
switch(t_type) { | ||
case thread_type::user: return "user"; | ||
case thread_type::scheduler: return "scheduler"; | ||
case thread_type::executor: return "executor"; | ||
default: break; | ||
} | ||
if(t_type >= thread_type::first_backend_worker && t_type < thread_type::first_host_queue) { | ||
return fmt::format("backend_worker_{}", t_type - thread_type::first_backend_worker); | ||
} | ||
if(t_type >= thread_type::first_host_queue) { return fmt::format("host_queue_{}", t_type - thread_type::first_host_queue); } | ||
return "unknown"; | ||
} | ||
|
||
// Parses the value of the CELERITY_THREAD_PINNING setting into an `environment_configuration`.
//
// Recognized forms, checked in this order:
//   - empty string:         default configuration
//   - "auto":               enabled, starting from core 1
//   - "from:<n>":           enabled, starting from core <n>
//   - "<a>,<b>,...":        enabled, using exactly the listed core ids (any value containing a comma)
//   - anything else:        interpreted as a boolean that enables / disables pinning
//
// If a value cannot be parsed, an error is logged and the default configuration is returned.
environment_configuration parse_validate_env(const std::string_view str) {
    using namespace std::string_view_literals;
    constexpr const char* error_msg =
        "Cannot parse CELERITY_THREAD_PINNING setting, needs to be either 'auto', 'from:#', comma-separated core list, or bool: {}";

    if(str.empty()) return {};

    // "auto" case
    constexpr uint32_t auto_start_from_core = 1;
    if(str == "auto") { return {true, auto_start_from_core, {}}; }

    // "from:" case
    constexpr auto from_prefix = "from:"sv;
    if(str.starts_with(from_prefix)) {
        try {
            const auto from = env::default_parser<uint32_t>{}(std::string(str.substr(from_prefix.size())));
            return {true, from, {}};
        } catch(const env::parser_error& e) {
            CELERITY_ERROR(error_msg, e.what());
            return {};
        }
    }

    // core list case
    if(str.find(',') != std::string_view::npos) {
        std::vector<uint32_t> core_ids;
        for(auto cs : std::views::split(str, ","sv)) {
            try {
                core_ids.push_back(env::default_parser<uint32_t>{}(std::string(cs.begin(), cs.end())));
            } catch(const env::parser_error& e) {
                // a single malformed entry invalidates the whole list
                CELERITY_ERROR(error_msg, e.what());
                return {};
            }
        }
        // the starting core is not meaningful when an explicit list is given
        return {true, 0, core_ids};
    }

    // if all else fails, assume we have a boolean
    try {
        return {env::default_parser<bool>{}(str), auto_start_from_core, {}};
    } catch(const env::parser_error& e) { CELERITY_ERROR(error_msg, e.what()); }
    return {};
}
|
||
} // namespace celerity::detail::thread_pinning |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.