From d71fc11022d8d8c91338843c941d997cd4f32e0e Mon Sep 17 00:00:00 2001 From: Frank Lange Date: Sun, 7 Jan 2024 20:07:24 +0100 Subject: [PATCH] Change use of map to unordered_map where possible Summary: This diff replaces std::map with std::unordered_map where element order is irrelevant, e.g. for ID mappings. --- dynolog/src/KernelCollectorBase.cpp | 5 +- dynolog/src/KernelCollectorBase.h | 6 +- dynolog/src/LibkinetoConfigManager.h | 7 ++- dynolog/src/Metrics.cpp | 4 +- dynolog/src/PerfMonitor.h | 6 +- dynolog/src/metric_frame/MetricFrame.h | 4 +- dynolog/tests/KernelCollecterTest.cpp | 4 +- hbt/src/mon/Monitor.h | 17 +++--- hbt/src/perf_event/BuiltinMetrics.cpp | 82 +++++++++++++------------- hbt/src/perf_event/Metrics.h | 8 +-- hbt/src/perf_event/PmuDevices.h | 13 ++-- 11 files changed, 81 insertions(+), 75 deletions(-) diff --git a/dynolog/src/KernelCollectorBase.cpp b/dynolog/src/KernelCollectorBase.cpp index ef02c3cb..b925da60 100644 --- a/dynolog/src/KernelCollectorBase.cpp +++ b/dynolog/src/KernelCollectorBase.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include DEFINE_bool( @@ -110,7 +111,7 @@ void KernelCollectorBase::readCpuStats() { void KernelCollectorBase::readNetworkStats() { auto devices = pfs_.get_net().get_dev(); - std::map rxtxNew_; + std::unordered_map rxtxNew_; size_t nicDevCount = 0; for (const auto& device : devices) { @@ -168,7 +169,7 @@ bool KernelCollectorBase::isMonitoringInterfaceActive(std::string interface) { } void KernelCollectorBase::updateNetworkStatsDelta( - const std::map& rxtxNew) { + const std::unordered_map& rxtxNew) { rxtxDelta_.clear(); for (const auto& [devName, devRxtxNew] : rxtxNew) { if (rxtx_.find(devName) == rxtx_.end()) { diff --git a/dynolog/src/KernelCollectorBase.h b/dynolog/src/KernelCollectorBase.h index 2485ceb7..6ee405d7 100644 --- a/dynolog/src/KernelCollectorBase.h +++ b/dynolog/src/KernelCollectorBase.h @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include "dynolog/src/Types.h" #include 
"pfs/procfs.hpp" @@ -47,10 +47,10 @@ class KernelCollectorBase { std::vector perCoreCpuTime_; // Save more recent net device stats - std::map rxtx_, rxtxDelta_; + std::unordered_map rxtx_, rxtxDelta_; void updateNetworkStatsDelta( - const std::map& rxtxNew); + const std::unordered_map& rxtxNew); bool isMonitoringInterfaceActive(std::string interface); // Should match googletest/include/gtest/gtest_prod.h diff --git a/dynolog/src/LibkinetoConfigManager.h b/dynolog/src/LibkinetoConfigManager.h index 78d34088..5998004d 100644 --- a/dynolog/src/LibkinetoConfigManager.h +++ b/dynolog/src/LibkinetoConfigManager.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "dynolog/src/LibkinetoTypes.h" @@ -69,12 +70,12 @@ class LibkinetoConfigManager { // Map of pid ancestry -> LibkinetoProcess using ProcessMap = std::map, LibkinetoProcess>; - std::map jobs_; + std::unordered_map jobs_; // Map of gpu id -> pids - using InstancesPerGpuMap = std::map>; + using InstancesPerGpuMap = std::unordered_map>; // Job id -> InstancesPerGpu - std::map jobInstancesPerGpu_; + std::unordered_map jobInstancesPerGpu_; mutable std::mutex mutex_; void setOnDemandConfigForProcess( diff --git a/dynolog/src/Metrics.cpp b/dynolog/src/Metrics.cpp index 82b8d7e5..8cd7f3af 100644 --- a/dynolog/src/Metrics.cpp +++ b/dynolog/src/Metrics.cpp @@ -6,7 +6,7 @@ #include "dynolog/src/Metrics.h" #include -#include +#include namespace dynolog { @@ -34,7 +34,7 @@ const std::vector getAllMetrics() { .type = MetricType::Instant, .desc = "How long the system has been running in seconds."}, }; - static std::map cpustats = { + static std::unordered_map cpustats = { {"cpu_u_ms", "user"}, {"cpu_s_ms", "system"}, {"cpu_n_ms", "nice"}, diff --git a/dynolog/src/PerfMonitor.h b/dynolog/src/PerfMonitor.h index 148af7c4..852aae70 100644 --- a/dynolog/src/PerfMonitor.h +++ b/dynolog/src/PerfMonitor.h @@ -9,6 +9,8 @@ #include "hbt/src/mon/Monitor.h" #include "hbt/src/perf_event/BuiltinMetrics.h" +#include + 
namespace hbt = facebook::hbt; namespace dynolog { @@ -39,8 +41,8 @@ class PerfMonitor { const hbt::CpuSet& monCpus_; std::shared_ptr pmuDeviceManager_; const MuxGroupId defaultMuxGroupId_; - std::map> readValues_; - std::map> countReaders_; + std::unordered_map> readValues_; + std::unordered_map> countReaders_; }; // singleton object for default Metrics and PmuDeviceManager diff --git a/dynolog/src/metric_frame/MetricFrame.h b/dynolog/src/metric_frame/MetricFrame.h index 87ba02a0..ec51424a 100644 --- a/dynolog/src/metric_frame/MetricFrame.h +++ b/dynolog/src/metric_frame/MetricFrame.h @@ -10,8 +10,8 @@ #include "dynolog/src/metric_frame/MetricSeries.h" #include -#include #include +#include #include #include @@ -38,7 +38,7 @@ class MetricFrameMap : public MetricFrameBase { void show(std::ostream& s) const override; protected: - std::map series_; + std::unordered_map series_; }; using VectorSeriesDefList = std::vector; diff --git a/dynolog/tests/KernelCollecterTest.cpp b/dynolog/tests/KernelCollecterTest.cpp index 4c032e92..0ab0feaa 100644 --- a/dynolog/tests/KernelCollecterTest.cpp +++ b/dynolog/tests/KernelCollecterTest.cpp @@ -109,8 +109,8 @@ TEST(KernelCollecterTest, NetworkStatsTest) { } TEST(KernelCollecterTest, UpdateNetworkStatsDeltaTest) { - std::map oneDevice; - std::map twoDevices; + std::unordered_map oneDevice; + std::unordered_map twoDevices; KernelCollectorBase kb{get_test_root()}; diff --git a/hbt/src/mon/Monitor.h b/hbt/src/mon/Monitor.h index dee05d45..6201592b 100644 --- a/hbt/src/mon/Monitor.h +++ b/hbt/src/mon/Monitor.h @@ -14,6 +14,7 @@ #include #include +#include #ifdef HBT_ENABLE_TRACING #include "hbt/src/mon/TraceMonitor.h" @@ -200,13 +201,13 @@ class Monitor { /// Read counts for all events opened in sampling mode /// in all TraceCollectors. 
auto readSamplingCounts() const { - using TraceCollectorReadValues = std::map< + using TraceCollectorReadValues = std::unordered_map< std::string, std::optional>; std::lock_guard lock{mutex_}; - std::map rvs; + std::unordered_map rvs; for (auto& [k, tm] : trace_monitors_) { HBT_THROW_ASSERT_IF(tm == nullptr); @@ -220,10 +221,10 @@ class Monitor { /// Read counts for all events opened in counting mode /// in all PerCpuCountReaders. - std::map> readAllCounts() + std::unordered_map> readAllCounts() const { std::lock_guard lock{mutex_}; - std::map> rvs; + std::unordered_map> rvs; for (auto& [k, cr] : count_readers_) { HBT_THROW_ASSERT_IF(cr == nullptr); @@ -234,10 +235,10 @@ class Monitor { /// Read counts for all events opened in counting mode /// in all PerCpuCountReaders. - std::map>> + std::unordered_map>> readAllCountsPerCpu() const { std::lock_guard lock{mutex_}; - std::map>> rvs; + std::unordered_map>> rvs; for (auto& [k, cr] : count_readers_) { HBT_THROW_ASSERT_IF(cr == nullptr); @@ -343,10 +344,10 @@ class Monitor { #ifdef HBT_ENABLE_BPERF /// Read counts for all events opened in counting mode /// in all BPerfCountReaders. 
- std::map> + std::unordered_map> readAllBPerfCounts(bool skip_offset = false) const { std::lock_guard lock{mutex_}; - std::map> rvs; + std::unordered_map> rvs; for (auto& [k, cr] : bperf_count_readers_) { HBT_THROW_ASSERT_IF(cr == nullptr); diff --git a/hbt/src/perf_event/BuiltinMetrics.cpp b/hbt/src/perf_event/BuiltinMetrics.cpp index edd7f248..f2c8e585 100644 --- a/hbt/src/perf_event/BuiltinMetrics.cpp +++ b/hbt/src/perf_event/BuiltinMetrics.cpp @@ -18,8 +18,8 @@ #include "hbt/src/perf_event/json_events/generated/intel/JsonEvents.h" #endif // USE_JSON_GENERATED_PERF_EVENTS -#include #include +#include namespace facebook::hbt::perf_event { @@ -522,7 +522,7 @@ std::shared_ptr makeAvailableMetrics() { "instructions", "Number of CPU instructions retired since the counter is enabled.", "Number of CPU instructions retired since the counter is enabled.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "instructions", @@ -539,7 +539,7 @@ std::shared_ptr makeAvailableMetrics() { "cycles", "Number of CPU clock cycles since the counter is enabled.", "Number of CPU clock cycles since the counter is enabled.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "cycles", @@ -555,7 +555,7 @@ std::shared_ptr makeAvailableMetrics() { "instructions_per_cycle", "Average number of instructions executed each clock cycle.", "Average number of instructions executed each clock cycle.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -583,7 +583,7 @@ std::shared_ptr makeAvailableMetrics() { "Core-originated cacheable demand requests missed L2", "Counts core-originated cacheable requests that miss the L2 cache. " "Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches from L1 and L2. 
", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "l2_cache_misses", @@ -600,7 +600,7 @@ std::shared_ptr makeAvailableMetrics() { "Core-originated cacheable demand requests missed L2", "Counts core-originated cacheable requests that miss the L2 cache. " "Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches from L2. ", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -634,7 +634,7 @@ std::shared_ptr makeAvailableMetrics() { "Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches from L1 and L2. " "It does not include all misses to the L3." "Also count number of instructions in the same period to calculate l3 misses per instruction.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -659,7 +659,7 @@ std::shared_ptr makeAvailableMetrics() { "dram_access_reads", "Memory bandwidth used for read events.", "Memory bandwidth used for read events. The value is inferred from Intel offcore counters.", - std::map{ + std::unordered_map{ {CpuArch::BDW, EventRefs{EventRef{ "dram_access_reads", @@ -714,7 +714,7 @@ std::shared_ptr makeAvailableMetrics() { " executed by AVX vector instruction set." "Each instruction can be converted to operations by multipying the count" " with 1, 4, 8, 16 respectively.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -752,7 +752,7 @@ std::shared_ptr makeAvailableMetrics() { " executed by AVX vector instruction set." 
"Each instruction can be converted to operations by multipying the count" " with 1, 2, 4, 8 respectively.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -787,7 +787,7 @@ std::shared_ptr makeAvailableMetrics() { "cpu_clock", "High-resolution sys and user CPU clock", "High-resolution sys and user CPU clock", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -810,7 +810,7 @@ std::shared_ptr makeAvailableMetrics() { "generic_sw", "All generic software events every context switch", "All generic software events. They are never multiplexed.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -863,7 +863,7 @@ std::shared_ptr makeAvailableMetrics() { "page_faults", "Software Page faults", "Major and minor page faults", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -893,7 +893,7 @@ std::shared_ptr makeAvailableMetrics() { "system_calls", "System calls Tracepoint", "System calls Tracepoint Event", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -917,7 +917,7 @@ std::shared_ptr makeAvailableMetrics() { "dqos", "System-derived estimation of Dyno QoS", "IPC and Scheduler stats. Requires root. Make sure /proc/sys/kernel/sched_schedstats is set.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -1002,7 +1002,7 @@ std::shared_ptr makeAvailableMetrics() { "ipc", "IPC including user, kernel, and hypervisor.", "Intructions-per-Cycle (IPC) including user, kernel, and hypervisor. ", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -1025,7 +1025,7 @@ std::shared_ptr makeAvailableMetrics() { "cs_ipc", "Context switch-based IPC including user, kernel, and hypervisor.", "Context switch-based Intructions-per-Cycle (IPC) including user, kernel, and hypervisor. 
", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -1054,7 +1054,7 @@ std::shared_ptr makeAvailableMetrics() { "cycles_breakdown", "Cycles in user, kernel and idle.", "Time (ref-cycles) and cycles spent in user (ring 3) or kernel (ring 0)", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{ EventRef{ @@ -1084,7 +1084,7 @@ std::shared_ptr makeAvailableMetrics() { "topdown_l4_mem", "External memory (DRAM) bandwidth and latency", "External memory (DRAM) bandwidth and latency.", - std::map{ + std::unordered_map{ {CpuArch::BDX, EventRefs{ EventRef{ @@ -1122,7 +1122,7 @@ std::shared_ptr makeAvailableMetrics() { "topdown_l3_icache", "Fraction of cycles the CPU was stalled due to instruction cache misses", "Fraction of cycles the CPU was stalled due to instruction cache misses.", - std::map{ + std::unordered_map{ {CpuArch::BDX, EventRefs{ EventRef{ @@ -1146,7 +1146,7 @@ std::shared_ptr makeAvailableMetrics() { "topdown_l3_L1_bound", "Fraction of cycles the CPU was stalled due to L1 data cache misses", "Fraction of cycles the CPU was stalled due to L1 data cache misses.", - std::map{ + std::unordered_map{ {CpuArch::BDX, EventRefs{ EventRef{ @@ -1176,7 +1176,7 @@ std::shared_ptr makeAvailableMetrics() { "topdown_l3_L2_bound", "Estimates how often the CPU was stalled due to L2 cache accesses by loads.", "Estimates how often the CPU was stalled due to L2 cache accesses by loads.", - std::map{ + std::unordered_map{ {CpuArch::BDX, EventRefs{ EventRef{ @@ -1220,7 +1220,7 @@ std::shared_ptr makeAvailableMetrics() { "Provides how many CPU is halted, how much is spent in kernel and user space " " and a breakdown of how many pipeline slots are wasted due " "to each hardware bottleneck.", - std::map{ + std::unordered_map{ {CpuArch::SKX, EventRefs{ // Note: There are two events for CPU_CLK_UNHALTED: @@ -1333,7 +1333,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_ICACHE_MISSES", "L2 code requests", "Counts the total number of L2 code 
requests.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "icache_misses", @@ -1350,7 +1350,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_ICACHE_MISSES_PERF", "Level 1 instruction cache load operation misses", "Level 1 instruction cache load operation misses", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "icache_misses_perf", @@ -1366,7 +1366,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_DCACHE_MISSES", "Counts the number of cache lines replaced in L1 data cache.", "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "dcache_misses", @@ -1382,7 +1382,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_ITLB_MISSES", "Code miss in all TLB levels causes a page walk that completes. (All page sizes)", "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "itlb_misses", @@ -1398,7 +1398,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_L2_MISSES", "L2 cache lines filling L2", "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "core_l2_misses", @@ -1414,7 +1414,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_LLC_MISSES", "Core-originated cacheable requests that missed L3 (Except hardware prefetches to the L3)", "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. 
It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "llc_misses", @@ -1430,7 +1430,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_BRANCH_MISSES", "All mispredicted branch instructions retired.", "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "branch_misses", @@ -1446,7 +1446,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_BRANCH_INSTRUCTIONS", "All branch instructions retired.", "Counts all branch instructions retired.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "branch_misses", @@ -1462,7 +1462,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_L2_PREFETCH_HITS", "SW prefetch requests that hit L2 cache.", "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "prefetch_hits", @@ -1478,7 +1478,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_L2_PREFETCH_MISSES", "SW prefetch requests that miss L2 cache.", "Counts Software prefetch requests that miss the L2 cache. 
Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "prefetch_misses", @@ -1494,7 +1494,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_FLOPS_DP_SCALAR", "Counts number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "flops_dp_scalar", @@ -1510,7 +1510,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_FLOPS_SP_SCALAR", "Counts number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. 
Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "flops_sp_scalar", @@ -1526,7 +1526,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_FLOPS_DP_SSE", "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "flops_dp_sse", @@ -1542,7 +1542,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_FLOPS_SP_SSE", "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. 
Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "flops_sp_sse", @@ -1558,7 +1558,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_FLOPS_DP_AVX", "Counts number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. 
The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "flops_dp_avx", @@ -1574,7 +1574,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_FLOPS_SP_AVX", "Counts number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "flops_sp_avx", @@ -1590,7 +1590,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_FLOPS_DP_AVX2", "Counts number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. 
FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "Number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "flops_dp_avx2", @@ -1606,7 +1606,7 @@ void addCoreMetrics(std::shared_ptr& metrics) { "HW_CORE_FLOPS_SP_AVX2", "Counts number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "Number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. 
The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", - std::map{ + std::unordered_map{ {std::nullopt, EventRefs{EventRef{ "flops_sp_avx2", diff --git a/hbt/src/perf_event/Metrics.h b/hbt/src/perf_event/Metrics.h index 0cb2aef5..d2bf9512 100644 --- a/hbt/src/perf_event/Metrics.h +++ b/hbt/src/perf_event/Metrics.h @@ -9,8 +9,8 @@ #include "hbt/src/perf_event/PmuDevices.h" #include "hbt/src/perf_event/PmuEvent.h" -#include #include +#include #include namespace facebook::hbt::perf_event { @@ -46,7 +46,7 @@ struct MetricDesc { MetricId id; std::string brief_desc; std::string full_desc; - std::map event_refs_by_arch; + std::unordered_map event_refs_by_arch; uint64_t default_sampling_period; System::Permissions req_permissions; std::vector dives; @@ -56,7 +56,7 @@ struct MetricDesc { MetricId id, const std::string& brief_desc, const std::string& full_desc, - const std::map& event_refs_by_arch, + const std::unordered_map& event_refs_by_arch, uint64_t default_sampling_period, const System::Permissions& req_permissions, const std::vector& dives, @@ -223,7 +223,7 @@ class Metrics { } protected: - std::map> metric_descs_; + std::unordered_map> metric_descs_; }; } // namespace facebook::hbt::perf_event diff --git a/hbt/src/perf_event/PmuDevices.h b/hbt/src/perf_event/PmuDevices.h index 57873938..3ac480af 100644 --- a/hbt/src/perf_event/PmuDevices.h +++ b/hbt/src/perf_event/PmuDevices.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,7 @@ struct LibPfm4EventGroup { } }; -using LibPfm4EventGroups = std::map; +using LibPfm4EventGroups = std::unordered_map; /// An instance representing a system's PMU (a Performance Monitoring Unit). /// It can be statically enumerated PMU or a dynamic one. 
@@ -136,8 +137,8 @@ class PmuDevice { uint8_t len; }; - using SysFsDeviceCaps = std::map; - using SysFsDeviceFormat = std::map; + using SysFsDeviceCaps = std::unordered_map; + using SysFsDeviceFormat = std::unordered_map; // Entries in format subfolder // (/sys/devices//format). @@ -256,8 +257,8 @@ class PmuDevice { bool in_sysfs_; // Alias as key, original event ID as value. - std::map> event_defs_; - std::map aliases_; + std::unordered_map> event_defs_; + std::unordered_map aliases_; // PMUs that are not per-core can be opened for any // CPU within a CPU group. In uncore PMUs, this is @@ -273,7 +274,7 @@ class PmuDevice { } }; -using PerCpuEventConfs = std::map; +using PerCpuEventConfs = std::unordered_map; /// Container for all types and instances of PMUs in the system. class PmuDeviceManager {