Skip to content

Commit

Permalink
dev/licenses(#312)
Browse files Browse the repository at this point in the history
feat: parse licenses config

feat: queryLicenses

feat: licenses malloc and free

refactor

feat: Add licenses configuration to the config template file

refactor

feat: Use the cqueue -L parameter to query task information for specified licenses.

fix: No error is reported when the task requests non-existent licenses.

merge master

feat: display 'licenses' for the pending queue reason.

fix: ctld restarts without allocating licenses.

merge master

refactor
  • Loading branch information
huerni committed Aug 16, 2024
1 parent 78cd701 commit 96f35d4
Show file tree
Hide file tree
Showing 51 changed files with 3,566 additions and 975 deletions.
46 changes: 23 additions & 23 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,44 +48,44 @@ set(CMAKE_CXX_STANDARD 20)

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CRANE_ENABLE_TESTS ON)
endif()
endif ()

# Set the minimal log level based on the build type if it has not been explicitly set by the user
if(CRANE_MIN_LOG_LEVEL STREQUAL "OFF")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
if (CRANE_MIN_LOG_LEVEL STREQUAL "OFF")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CRANE_MIN_LOG_LEVEL "TRACE")
else()
else ()
set(CRANE_MIN_LOG_LEVEL "DEBUG")
endif()
endif()
endif ()
endif ()

# Validate the user-provided log level
if(NOT CRANE_MIN_LOG_LEVEL MATCHES "INFO|DEBUG|TRACE")
if (NOT CRANE_MIN_LOG_LEVEL MATCHES "INFO|DEBUG|TRACE")
message(FATAL_ERROR "Invalid log level: ${CRANE_MIN_LOG_LEVEL}. Must be INFO, DEBUG, or TRACE.")
endif()
endif ()

add_compile_definitions(CRANE_LOG_LEVEL=CRANE_LOG_LEVEL_${CRANE_MIN_LOG_LEVEL})
message(STATUS "Minimal log level is set to ${CRANE_MIN_LOG_LEVEL}")

# Generate version text and building timestamp
if(EXISTS "${CMAKE_SOURCE_DIR}/VERSION")
if (EXISTS "${CMAKE_SOURCE_DIR}/VERSION")
# For a release version, the version file should be provided
file(READ "${CMAKE_SOURCE_DIR}/VERSION" VERSION_CONTENT)
else()
else ()
# Otherwise, use git hash as the version
message(WARNING "No VERSION file found. Use git hash as the version string.")
execute_process(
COMMAND git rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE VERSION_CONTENT
OUTPUT_STRIP_TRAILING_WHITESPACE
COMMAND git rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE VERSION_CONTENT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
endif()
endif ()

if("${VERSION_CONTENT}" STREQUAL "")
if ("${VERSION_CONTENT}" STREQUAL "")
# if not in git repository, fall back to "Unknown"
set(VERSION_CONTENT "Unknown")
endif()
endif ()

string(TIMESTAMP BUILD_TIMESTAMP "%a, %d %b %Y %H:%M:%S %z")

Expand Down Expand Up @@ -303,12 +303,12 @@ set(CPACK_COMPONENTS_GROUPING ONE_PER_GROUP)
set(CPACK_RPM_COMPONENT_INSTALL ON)

cpack_add_component(cranedc
DISPLAY_NAME "craned"
DESCRIPTION "craned component"
DISPLAY_NAME "craned"
DESCRIPTION "craned component"
GROUP craned)
cpack_add_component(cranectldc
DISPLAY_NAME "cranectld"
DESCRIPTION "cranectld component"
DISPLAY_NAME "cranectld"
DESCRIPTION "cranectld component"
GROUP cranectld)

cpack_add_component_group(craned)
Expand All @@ -323,13 +323,13 @@ install(TARGETS craned
DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
COMPONENT cranedc
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_WRITE GROUP_EXECUTE WORLD_READ WORLD_WRITE WORLD_EXECUTE
)
)

install(TARGETS cranectld
DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
COMPONENT cranectldc
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_WRITE GROUP_EXECUTE WORLD_READ WORLD_WRITE WORLD_EXECUTE
)
)

install(FILES
${CMAKE_BINARY_DIR}/etc/cranectld.service
Expand Down
12 changes: 6 additions & 6 deletions dependencies/cmake/grpc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ if (CRANE_USE_GITEE_SOURCE)
set(RE2_SRC_URL "https://gitee.com/zenglingbo/crane-sched-deps/raw/master/re2-2022-06-01.tar.gz")
set(GRPC_SRC_URL "https://gitee.com/zenglingbo/crane-sched-deps/raw/master/grpc-1.51.0.tar.gz")
else ()
set(ABSL_SRC_URL "https://github.com/abseil/abseil-cpp/archive/refs/tags/20220623.1.tar.gz")
set(ABSL_SRC_URL "https://github.com/abseil/abseil-cpp/releases/download/20240116.2/abseil-cpp-20240116.2.tar.gz")
set(C_ARES_SRC_URL "https://github.com/c-ares/c-ares/releases/download/cares-1_18_1/c-ares-1.18.1.tar.gz")
set(PROTOBUF_SRC_URL "https://github.com/protocolbuffers/protobuf/releases/download/v21.8/protobuf-cpp-3.21.8.tar.gz")
set(PROTOBUF_SRC_URL "https://github.com/protocolbuffers/protobuf/releases/download/v27.2/protobuf-27.2.tar.gz")
set(RE2_SRC_URL "https://github.com/google/re2/archive/refs/tags/2022-06-01.tar.gz")
set(GRPC_SRC_URL "https://github.com/grpc/grpc/archive/refs/tags/v1.51.0.tar.gz")
set(GRPC_SRC_URL "https://github.com/grpc/grpc/archive/refs/tags/v1.65.2.tar.gz")
endif ()

set(ABSL_PROPAGATE_CXX_STD ON)
Expand All @@ -21,7 +21,7 @@ FetchContent_Declare(absl
OVERRIDE_FIND_PACKAGE

URL ${ABSL_SRC_URL}
URL_HASH SHA256=91ac87d30cc6d79f9ab974c51874a704de9c2647c40f6932597329a282217ba8
URL_HASH SHA256=733726b8c3a6d39a4120d7e45ea8b41a434cdacde401cba500f14236c49b39dc
INACTIVITY_TIMEOUT 5
)
FetchContent_MakeAvailable(absl)
Expand Down Expand Up @@ -60,7 +60,7 @@ FetchContent_Declare(protobuf
OVERRIDE_FIND_PACKAGE

URL ${PROTOBUF_SRC_URL}
URL_HASH SHA256=f6251f2d00aad41b34c1dfa3d752713cb1bb1b7020108168a4deaa206ba8ed42
URL_HASH SHA256=e4ff2aeb767da6f4f52485c2e72468960ddfe5262483879ef6ad552e52757a77
INACTIVITY_TIMEOUT 5
)
FetchContent_MakeAvailable(protobuf)
Expand Down Expand Up @@ -110,7 +110,7 @@ FetchContent_Declare(grpc
OVERRIDE_FIND_PACKAGE

URL ${GRPC_SRC_URL}
URL_HASH SHA256=7f42363711eb483a0501239fd5522467b31d8fe98d70d7867c6ca7b52440d828
URL_HASH SHA256=0ff2e0a6abf195cf23b4ce808570bcbb2ff4b5bee453af0b45afd496e661f2c0
INACTIVITY_TIMEOUT 5
)
FetchContent_MakeAvailable(grpc)
Expand Down
26 changes: 22 additions & 4 deletions etc/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,25 @@ PriorityWeightQ0S: 1000000
# list of configuration information of the computing machine
# Nodes and partitions settings
Nodes:
- name: "cn[15-18]"
- name: "cn[15-16]"
cpu: 2
memory: 2G

- name: "cn[17-18]"
cpu: 2
memory: 2G
gres:
- name: gpu
type: a100
DeviceFileRegex: /dev/nvidia[0-3]
DeviceFileList:
- /dev/dri/renderer[0-3]
EnvInjector: nvidia

- name: gpu
type: h100
file: /dev/nvidia[4-7]

# partition information list
Partitions:
- name: CPU
Expand All @@ -76,10 +91,10 @@ Partitions:
- name: GPU
nodes: "cn[17-18]"
priority: 3
# Optional default memory per cpu in MB
DefaultMemPerCpu: 100
# Optional default memory per cpu in MB, 0 lets the scheduler decide
DefaultMemPerCpu: 0
# Optional maximum memory per cpu in MB, 0 indicates no limit
MaxMemPerCpu: 100
MaxMemPerCpu: 0

DefaultPartition: CPU

Expand All @@ -94,6 +109,9 @@ PendingQueueMaxSize: 900000
# Default value is 100000.
ScheduledBatchSize: 100000

# Available licenses and their quantities (only needs to be configured in the ctld config file).
# Licenses: fluent:30,ansys:100

# Scheduler will reject all jobs beyond processing capacity set by PendingQueueMaxSize
# if this option is set to true.
# Default value is false.
Expand Down
52 changes: 48 additions & 4 deletions protos/Crane.proto
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ message ExecuteTasksReply {
message CreateCgroupForTasksRequest {
repeated uint32 task_id_list = 1;
repeated uint32 uid_list = 2;
repeated Resources res_list = 3;
repeated ResourceInNode res_list = 3;
repeated string execution_node = 4;
}

message CreateCgroupForTasksReply{}
Expand Down Expand Up @@ -174,6 +175,7 @@ message QueryTaskIdFromPortForwardReply{
string cgroup_path = 4;
}


// Request message of the CraneCtld QueryCranedInfo RPC.
message QueryCranedInfoRequest {
// NOTE(review): presumably the name of the craned node to query; how an empty
// name is treated is not visible in this excerpt — confirm against the handler.
string craned_name = 1;
}
Expand All @@ -190,6 +192,15 @@ message QueryPartitionInfoReply {
repeated PartitionInfo partition_info = 1;
}

// Request message of the CraneCtld QueryLicensesInfo RPC.
message QueryLicensesInfoRequest {
// NOTE(review): presumably the name of the license to look up; whether an empty
// name selects all licenses is not visible in this excerpt — confirm against the handler.
string license_name = 1;
}

// Reply message of the CraneCtld QueryLicensesInfo RPC.
message QueryLicensesInfoReply {
// NOTE(review): presumably false when the query could not be served — confirm.
bool ok = 1;
// List of LicenseInfo entries; LicenseInfo is declared outside this excerpt.
repeated LicenseInfo license_info_list = 2;
}

message ModifyTaskRequest {
enum TargetAttributes {
TimeLimit = 0;
Expand Down Expand Up @@ -332,6 +343,27 @@ message MigrateSshProcToCgroupReply {
bool ok = 1;
}

// Request message of the Craned QueryTaskEnvVariables RPC.
message QueryTaskEnvVariablesRequest{
// Id of the task being queried.
uint32 task_id = 1;
}

// Reply message of the Craned QueryTaskEnvVariables RPC.
message QueryTaskEnvVariablesReply{
// NOTE(review): presumably false when the lookup fails — confirm.
bool ok = 1;
// NOTE(review): name and value look like parallel lists (name[i] pairs with
// value[i]); the pairing is not enforced by the schema — confirm with the producer.
repeated string name = 2;
repeated string value = 3;
}

// Request message of the Craned QueryTaskEnvVariablesForward RPC.
message QueryTaskEnvVariablesForwardRequest{
// Id of the task being queried.
uint32 task_id = 1;
// NOTE(review): presumably the node the request should be forwarded to, i.e. the
// node executing the task — confirm against the forwarding implementation.
string execution_node = 2;
}

// Reply message of the Craned QueryTaskEnvVariablesForward RPC.
// Carries the same fields as QueryTaskEnvVariablesReply.
message QueryTaskEnvVariablesForwardReply{
// NOTE(review): presumably false when the forwarded lookup fails — confirm.
bool ok = 1;
// NOTE(review): name and value look like parallel lists (name[i] pairs with
// value[i]); the pairing is not enforced by the schema — confirm with the producer.
repeated string name = 2;
repeated string value = 3;
}

message QueryClusterInfoRequest {
repeated string filter_partitions = 1;
repeated string filter_nodes = 2;
Expand All @@ -354,9 +386,10 @@ message QueryTasksInfoRequest{
repeated TaskStatus filter_task_states = 6;
repeated string filter_users = 7;
repeated string filter_accounts = 8;
TimeInterval filter_submit_time_interval = 9;
TimeInterval filter_start_time_interval = 10;
TimeInterval filter_end_time_interval = 11;
repeated string filter_licenses = 9;
TimeInterval filter_submit_time_interval = 10;
TimeInterval filter_start_time_interval = 11;
TimeInterval filter_end_time_interval = 12;

bool option_include_completed_tasks = 15;
}
Expand Down Expand Up @@ -519,6 +552,13 @@ message StreamCtldReply {
}
}

// Request message of the Craned QueryActualDres RPC; it carries no fields.
message QueryActualDresRequest{}

// Reply message of the Craned QueryActualDres RPC.
message QueryActualDresReply{
// NOTE(review): presumably false when the query fails — confirm.
bool ok = 1;
// DedicatedResourceInNode is declared outside this excerpt.
// NOTE(review): presumably the dedicated resources actually present on the
// answering node — confirm.
DedicatedResourceInNode dres_in_node = 2;
}

message StreamCrunRequest{
enum CrunRequestType {
TASK_REQUEST = 0;
Expand Down Expand Up @@ -677,6 +717,7 @@ service CraneCtld {
/* RPCs called from ccontrol */
rpc QueryCranedInfo(QueryCranedInfoRequest) returns (QueryCranedInfoReply);
rpc QueryPartitionInfo(QueryPartitionInfoRequest) returns (QueryPartitionInfoReply);
rpc QueryLicensesInfo(QueryLicensesInfoRequest) returns (QueryLicensesInfoReply);
rpc ModifyTask(ModifyTaskRequest) returns (ModifyTaskReply);
rpc ModifyNode(ModifyCranedStateRequest) returns (ModifyCranedStateReply);

Expand Down Expand Up @@ -707,6 +748,7 @@ service Craned {
rpc CreateCgroupForTasks(CreateCgroupForTasksRequest) returns(CreateCgroupForTasksReply);
rpc ReleaseCgroupForTasks(ReleaseCgroupForTasksRequest) returns(ReleaseCgroupForTasksReply);

rpc QueryActualDres(QueryActualDresRequest) returns(QueryActualDresReply);
/*
If the task is an interactive task, the resource uuid is also revoked.
If there's no process in this interactive task, just deallocate all the resources.
Expand All @@ -724,6 +766,8 @@ service Craned {
/* ----------------------------------- Called from Pam Module --------------------------------------------------- */
rpc QueryTaskIdFromPortForward(QueryTaskIdFromPortForwardRequest) returns (QueryTaskIdFromPortForwardReply);
rpc MigrateSshProcToCgroup(MigrateSshProcToCgroupRequest) returns (MigrateSshProcToCgroupReply);
rpc QueryTaskEnvVariables(QueryTaskEnvVariablesRequest) returns (QueryTaskEnvVariablesReply);
rpc QueryTaskEnvVariablesForward(QueryTaskEnvVariablesForwardRequest) returns (QueryTaskEnvVariablesForwardReply);
}

service CraneForeD {
Expand Down
Loading

0 comments on commit 96f35d4

Please sign in to comment.