Skip to content

Commit

Permalink
feat(nvidia): Add cupti support
Browse files Browse the repository at this point in the history
This commit adds a --nvidia option that injects a library into the program under measurement. The injected library records entry into and exit from CUDA kernels via CUPTI.
  • Loading branch information
cvonelm committed Aug 29, 2024
1 parent a323e32 commit b50f989
Show file tree
Hide file tree
Showing 26 changed files with 883 additions and 64 deletions.
54 changes: 54 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ IfUpdatedUnsetAll(lo2s_USE_STATIC_LIBS
Libpfm_USE_STATIC_LIBS
X86Adapt_STATIC
x86_energy_STATIC
CUDA_USE_STATIC_LIBS
)

if(lo2s_USE_STATIC_LIBS STREQUAL "OFF")
Expand All @@ -45,6 +46,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "OFF")
set(x86_energy_STATIC OFF CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS OFF CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS OFF CACHE BOOL "")
set(CUDA_USE_STATIC_LIBS OFF CACHE BOOL "")
endif()

if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY")
Expand All @@ -56,6 +58,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY")
set(x86_energy_STATIC ON CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS ON CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS ON CACHE BOOL "")
set(CUDA_USE_STATIC_LIBS ON CACHE BOOL "")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
endif()

Expand All @@ -68,6 +71,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "ALL")
set(x86_energy_STATIC ON CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS ON CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS ON CACHE BOOL "")
set(CUDA_USE_STATIC_LIBS ON CACHE BOOL "")

# Doesn't seem to work with clang, even though it should,
# but at least it doesn't complain about it either
Expand Down Expand Up @@ -107,6 +111,7 @@ find_package(Sensors)
# Optional dependencies: none of these is marked REQUIRED, so configuration
# continues when a package is missing.
find_package(Veosinfo)
find_package(Libpfm)
find_package(PkgConfig)
# CUDAToolkit_FOUND is the condition of the USE_CUPTI dependent option.
find_package(CUDAToolkit)

if(PkgConfig_FOUND)
pkg_check_modules(Audit audit)
Expand All @@ -129,6 +134,8 @@ CMAKE_DEPENDENT_OPTION(USE_LIBAUDIT "Use libaudit for syscall name resolution" O
add_feature_info("USE_LIBAUDIT" USE_LIBAUDIT "Use libaudit for syscall name resolution.")
CMAKE_DEPENDENT_OPTION(USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards." ON "Veosinfo_FOUND" OFF)
add_feature_info("USE_VEOSINFO" USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards.")
# USE_CUPTI is a dependent option: it is offered to the user (default ON) only
# while CUDAToolkit_FOUND is true, and evaluates to OFF otherwise.
CMAKE_DEPENDENT_OPTION(USE_CUPTI "Use CUPTI to record CUDA activity." ON "CUDAToolkit_FOUND" OFF)
# Report the option in the feature summary.
add_feature_info("USE_CUPTI" USE_CUPTI "Use CUPTI to record CUDA activity.")
# system configuration checks
CHECK_INCLUDE_FILES(linux/hw_breakpoint.h HAVE_HW_BREAKPOINT_H)
CHECK_STRUCT_HAS_MEMBER("struct perf_event_attr" clockid linux/perf_event.h HAVE_PERF_EVENT_ATTR_CLOCKID)
Expand All @@ -144,6 +151,13 @@ if(NOT CLOCK_GETTIME_FOUND)
unset(CMAKE_REQUIRED_LIBRARIES)
endif()

# Probe for shm_open(). First try with the default link libraries only; the
# result is stored in SHM_OPEN_FOUND.
check_function_exists(shm_open SHM_OPEN_FOUND)
if(NOT SHM_OPEN_FOUND)
# Retry with "rt" added to the probe's link line. The outcome is kept in a
# separate variable (SHM_OPEN_FOUND_WITH_RT) so that later code can tell
# whether "rt" has to be linked explicitly.
set(CMAKE_REQUIRED_LIBRARIES "rt")
check_function_exists(shm_open SHM_OPEN_FOUND_WITH_RT)
# NOTE: this clears CMAKE_REQUIRED_LIBRARIES completely instead of restoring
# whatever value it held before the probe.
unset(CMAKE_REQUIRED_LIBRARIES)
endif()

CHECK_STRUCT_HAS_BITFIELD("struct perf_event_attr" context_switch linux/perf_event.h HAVE_PERF_RECORD_SWITCH)

if(NOT HAVE_PERF_RECORD_SWITCH)
Expand Down Expand Up @@ -226,6 +240,14 @@ if(NOT CLOCK_GETTIME_FOUND)
endif()
endif()

# shm_open() was not found with the default link libraries: link "rt" into
# lo2s when the configure probe found the function there, otherwise report a
# configuration error.
if(NOT SHM_OPEN_FOUND AND SHM_OPEN_FOUND_WITH_RT)
    target_link_libraries(lo2s PRIVATE rt)
elseif(NOT SHM_OPEN_FOUND)
    message(SEND_ERROR "Could not find the function shm_open(), but it is required.")
endif()

# handle x86_adapt dependency
if(X86Adapt_FOUND)
target_sources(lo2s PRIVATE
Expand Down Expand Up @@ -306,6 +328,38 @@ if (USE_LIBAUDIT)
endif()
endif()

# Absolute path at which the CUDA injection library gets installed. The value
# is substituted into build_config.hpp(.in) as LO2S_CUDA_INJECTIONLIB_PATH and
# is set regardless of USE_CUPTI.
#
# CMAKE_INSTALL_FULL_LIBDIR (GNUInstallDirs) is preferred because it stays
# correct when CMAKE_INSTALL_LIBDIR was given as an absolute path; blindly
# prepending CMAKE_INSTALL_PREFIX would produce a bogus path in that case.
# The previous prefix/libdir concatenation is kept as a fallback.
if(DEFINED CMAKE_INSTALL_FULL_LIBDIR)
    set(LO2S_CUDA_INJECTIONLIB_PATH "${CMAKE_INSTALL_FULL_LIBDIR}/liblo2s_injection.so")
else()
    set(LO2S_CUDA_INJECTIONLIB_PATH
        "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/liblo2s_injection.so")
endif()

# Build and install the shared library that is injected into the measured
# CUDA program (src/cupti/lib.cpp), and enable HAVE_CUDA for lo2s itself.
if(USE_CUPTI)
    # Pick the CUPTI flavour that matches the static/shared library policy.
    if(CUDA_USE_STATIC_LIBS)
        set(lo2s_cupti_target CUDA::cupti_static)
    else()
        set(lo2s_cupti_target CUDA::cupti)
    endif()

    # FindCUDAToolkit only defines the CUDA::cupti* imported targets when the
    # CUPTI library itself was located, so a found toolkit alone is not enough
    # to guarantee that the link below can succeed. Check the target as well.
    if(CUDAToolkit_FOUND AND TARGET ${lo2s_cupti_target})
        add_library(lo2s_injection SHARED src/cupti/lib.cpp)
        target_include_directories(lo2s_injection PRIVATE
            include
            ${CMAKE_CURRENT_BINARY_DIR}/include)

        target_link_libraries(lo2s_injection PRIVATE
            ${lo2s_cupti_target}
            fmt::fmt
            Nitro::log
            Nitro::env
            Nitro::dl
            Nitro::options
            otf2xx::Writer)

        # Link "rt" as well when the configure probe only found shm_open()
        # there.
        if(SHM_OPEN_FOUND_WITH_RT)
            target_link_libraries(lo2s_injection PRIVATE rt)
        endif()

        target_compile_definitions(lo2s PUBLIC HAVE_CUDA)
        install(TARGETS lo2s_injection LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
    else()
        message(SEND_ERROR "Cupti not found but requested.")
    endif()
endif()



# generate version string used in lo2s
if(Git_FOUND)
Expand Down
4 changes: 4 additions & 0 deletions include/lo2s/build_config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@


#cmakedefine LO2S_COPYRIGHT_YEAR "@LO2S_COPYRIGHT_YEAR@"

// The CUDA injection library installation path

#cmakedefine LO2S_CUDA_INJECTIONLIB_PATH "@LO2S_CUDA_INJECTIONLIB_PATH@"
4 changes: 4 additions & 0 deletions include/lo2s/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ struct Config
bool use_nec;
std::chrono::microseconds nec_read_interval;
std::chrono::milliseconds nec_check_interval;
// Nvidia CUPTI
bool use_nvidia;
std::string cuda_injectionlib_path;
uint64_t nvidia_ringbuf_size;
};

const Config& config();
Expand Down
50 changes: 50 additions & 0 deletions include/lo2s/cupti/events.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* This file is part of the lo2s software.
* Linux OTF2 sampling
*
* Copyright (c) 2024,
* Technische Universitaet Dresden, Germany
*
* lo2s is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* lo2s is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with lo2s. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstdint>

namespace lo2s
{
namespace cupti
{
// Discriminator stored in every event_header; tells the consumer which
// record layout the header belongs to.
enum class EventType : uint64_t
{
// The record is an event_kernel (cupti::Reader casts it to that type).
CUPTI_KERNEL = 1,
};

// Common prefix of every CUPTI event record placed in the ring buffer.
struct event_header
{
// Kind of record this header starts.
EventType type;
// Size of the whole record, header included: cupti::Reader requests and
// then pops exactly this many bytes for one event. Presumably counted in
// bytes, matching the sizeof-based header request -- confirm against the
// writer side.
uint64_t size;
};

// Record describing one CUDA kernel execution.
struct event_kernel
{
struct event_header header;
// Start and end of the kernel. cupti::Reader passes both through the perf
// time converter before writing them to the trace; the clock domain and
// unit are not visible here -- TODO confirm.
uint64_t start;
uint64_t end;
// Kernel name, read by cupti::Reader as a NUL-terminated C string.
// Declared with a single element; presumably the record is allocated with
// extra trailing bytes (accounted for in header.size) that hold the full
// name -- confirm against the code that fills the record.
char name[1];
};

} // namespace cupti
} // namespace lo2s
98 changes: 98 additions & 0 deletions include/lo2s/cupti/reader.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* This file is part of the lo2s software.
* Linux OTF2 sampling
*
* Copyright (c) 2016,
* Technische Universitaet Dresden, Germany
*
* lo2s is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* lo2s is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with lo2s. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <lo2s/config.hpp>
#include <lo2s/cupti/events.hpp>
#include <lo2s/log.hpp>
#include <lo2s/perf/time/converter.hpp>
#include <lo2s/ringbuf.hpp>
#include <lo2s/trace/trace.hpp>
#include <lo2s/types.hpp>

#include <chrono>
#include <cstdlib>
#include <string>

extern "C"
{
#include <sys/timerfd.h>
#include <unistd.h>
}

namespace lo2s
{
namespace cupti
{

/**
 * Drains CUPTI kernel events of one process from its "cupti" ring buffer and
 * writes each of them to the trace as a calling-context enter/leave pair.
 *
 * The reader also owns a timer file descriptor, created from
 * config().userspace_read_interval, which is exposed through fd().
 *
 * NOTE(review): nothing in this class closes timer_fd_; confirm whether the
 * owner is expected to do so or whether the descriptor is leaked.
 */
class Reader
{
public:
    /**
     * @param trace   trace that provides the CUDA writer and the CUDA calling
     *                contexts
     * @param process process whose ring buffer is read; its pid is used to
     *                open the ring buffer and to look up the executable name
     */
    Reader(trace::Trace& trace, Process process)
    : process_(process), trace_(trace), time_converter_(perf::time::Converter::instance()),
      // NOTE(review): the meaning of the boolean flag is defined by
      // RingBufReader and is not visible here.
      ringbuf_reader_("cupti", process.as_pid_t(), true, config().nvidia_ringbuf_size),
      timer_fd_(timerfd_from_ns(config().userspace_read_interval)),
      executable_name_(get_process_exe(process))
    {
    }

    /**
     * Consumes every complete event that is currently available.
     *
     * Kernel events are written to the trace; events of any other type are
     * popped without being processed. Reading stops as soon as the ring
     * buffer cannot provide a complete event any more.
     */
    void read()
    {
        struct event_header* header = nullptr;

        while ((header = reinterpret_cast<struct event_header*>(
                    ringbuf_reader_.get(sizeof(struct event_header)))) != nullptr)
        {
            // Read the size once, so that exactly the amount that was
            // requested from the ring buffer is popped afterwards.
            const uint64_t event_size = header->size;

            // An event is never smaller than its own header. A smaller value
            // means the record is malformed, and popping it could fail to
            // advance the buffer, so stop instead of looping on it.
            if (event_size < sizeof(struct event_header))
            {
                break;
            }

            if (header->type == EventType::CUPTI_KERNEL)
            {
                struct event_kernel* kernel =
                    reinterpret_cast<struct event_kernel*>(ringbuf_reader_.get(event_size));

                // get() signals "not available" with nullptr (see the loop
                // condition). Leave the event in the buffer for the next
                // read() instead of dereferencing a null pointer.
                if (kernel == nullptr)
                {
                    break;
                }

                auto& writer = trace_.cuda_writer(Thread(process_.as_thread()));

                std::string kernel_name = kernel->name;
                auto& cu_cctx = trace_.cuda_calling_context(executable_name_, kernel_name);

                // NOTE(review): the literal 2 is presumably the unwind
                // distance expected by the writer -- confirm.
                writer.write_calling_context_enter(time_converter_(kernel->start), cu_cctx.ref(),
                                                   2);
                writer.write_calling_context_leave(time_converter_(kernel->end), cu_cctx.ref());
            }

            ringbuf_reader_.pop(event_size);
        }
    }

    /** @return the timer file descriptor created in the constructor. */
    int fd()
    {
        return timer_fd_;
    }

private:
    Process process_;
    trace::Trace& trace_;
    perf::time::Converter& time_converter_;
    RingBufReader ringbuf_reader_;
    int timer_fd_;
    std::string executable_name_;
};
} // namespace cupti
} // namespace lo2s
8 changes: 8 additions & 0 deletions include/lo2s/measurement_scope.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ enum class MeasurementScopeType
NEC_METRIC,
BIO,
SYSCALL,
CUDA,
UNKNOWN
};

Expand Down Expand Up @@ -79,6 +80,11 @@ struct MeasurementScope
return { MeasurementScopeType::SYSCALL, s };
}

// Creates a measurement scope of type CUDA for the execution scope s.
static MeasurementScope cuda(ExecutionScope s)
{
return { MeasurementScopeType::CUDA, s };
}

friend bool operator==(const MeasurementScope& lhs, const MeasurementScope& rhs)
{
return (lhs.scope == rhs.scope) && lhs.type == rhs.type;
Expand Down Expand Up @@ -111,6 +117,8 @@ struct MeasurementScope
return fmt::format("block layer I/O events for {}", scope.name());
case MeasurementScopeType::SYSCALL:
return fmt::format("syscall events for {}", scope.name());
case lo2s::MeasurementScopeType::CUDA:
return fmt::format("cuda kernel events for {}", scope.name());
default:
throw new std::runtime_error("Unknown ExecutionScopeType!");
}
Expand Down
2 changes: 1 addition & 1 deletion include/lo2s/monitor/abstract_process_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class AbstractProcessMonitor
virtual void insert_process(Process parent, Process process, std::string proc_name,
bool spawn = false) = 0;
virtual void insert_thread(Process process, Thread thread, std::string name = "",
bool spawn = false) = 0;
bool spawn = false, bool is_process = false) = 0;

virtual void exit_thread(Thread thread) = 0;

Expand Down
3 changes: 2 additions & 1 deletion include/lo2s/monitor/process_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ class ProcessMonitor : public AbstractProcessMonitor, public MainMonitor
~ProcessMonitor();
void insert_process(Process parent, Process child, std::string proc_name,
bool spawn = false) override;
void insert_thread(Process parent, Thread child, std::string name, bool spawn = false) override;
void insert_thread(Process parent, Thread child, std::string name, bool spawn = false,
bool is_process = false) override;

void exit_thread(Thread thread) override;

Expand Down
7 changes: 5 additions & 2 deletions include/lo2s/monitor/scope_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@
#include <lo2s/monitor/main_monitor.hpp>
#include <lo2s/monitor/poll_monitor.hpp>

#include <lo2s/cupti/reader.hpp>
#include <lo2s/perf/counter/group/writer.hpp>
#include <lo2s/perf/counter/userspace/writer.hpp>

#include <lo2s/perf/sample/writer.hpp>
#include <lo2s/perf/syscall/writer.hpp>

#include <array>
#include <chrono>
#include <memory>
#include <thread>

#include <cstddef>
Expand All @@ -50,7 +51,8 @@ namespace monitor
class ScopeMonitor : public PollMonitor
{
public:
ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec);
ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec,
bool is_process = false);

void initialize_thread() override;
void finalize_thread() override;
Expand All @@ -74,6 +76,7 @@ class ScopeMonitor : public PollMonitor
std::unique_ptr<perf::sample::Writer> sample_writer_;
std::unique_ptr<perf::counter::group::Writer> group_counter_writer_;
std::unique_ptr<perf::counter::userspace::Writer> userspace_counter_writer_;
std::unique_ptr<cupti::Reader> cupti_reader_;
};
} // namespace monitor
} // namespace lo2s
4 changes: 2 additions & 2 deletions include/lo2s/monitor/system_process_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ class SystemProcessMonitor : public AbstractProcessMonitor
virtual void insert_process(Process parent, Process process, std::string proc_name,
bool spawn) override;

virtual void insert_thread(Process process, Thread thread, std::string name,
bool spawn) override;
virtual void insert_thread(Process process, Thread thread, std::string name, bool spawn,
bool is_process) override;

virtual void exit_thread(Thread thread) override;

Expand Down
Loading

0 comments on commit b50f989

Please sign in to comment.