Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

User QoS resource total limit #370

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions dependencies/cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ add_subdirectory(mongodb-cxx-driver)
add_subdirectory(ranges-v3)
add_subdirectory(backward-cpp)
add_subdirectory(fpm)
add_subdirectory(parallel-hashmap)

#add_subdirectory(mariadb-connector-c)

Expand Down
11 changes: 11 additions & 0 deletions dependencies/cmake/parallel-hashmap/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
include(FetchContent)

set(HASHMAP_SRC_URL "https://github.com/greg7mdp/parallel-hashmap/archive/refs/tags/v1.4.1.tar.gz")

FetchContent_Declare(parallel-hashmap
URL ${HASHMAP_SRC_URL}
URL_HASH SHA256=949874f4207b8735422438b23b884fb1f4b926689bb5eebff38cc4d357d09cd2
INACTIVITY_TIMEOUT 5
)

FetchContent_MakeAvailable(parallel-hashmap)
6 changes: 4 additions & 2 deletions src/CraneCtld/AccountManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "AccountManager.h"

#include "AccountMetaContainer.h"
#include "protos/PublicDefs.pb.h"
#include "range/v3/algorithm/contains.hpp"

Expand Down Expand Up @@ -832,6 +833,7 @@ AccountManager::CraneExpected<void> AccountManager::ModifyQos(
// Mongodb
Qos qos;
g_db_client->SelectQos("name", name, &qos);

*m_qos_map_[name] = std::move(qos);

return {};
Expand Down Expand Up @@ -969,8 +971,8 @@ result::result<void, std::string> AccountManager::CheckAndApplyQosLimitOnTask(
} else if (task->time_limit > qos_share_ptr->max_time_limit_per_task)
return result::fail("time-limit reached the user's limit.");

if (static_cast<double>(task->cpus_per_task) >
qos_share_ptr->max_cpus_per_user)
if (!g_account_meta_container->CheckAndMallocQosResourceFromUser(
user_share_ptr->name, *task, *qos_share_ptr))
return result::fail("cpus-per-task reached the user's limit.");

return {};
Expand Down
65 changes: 65 additions & 0 deletions src/CraneCtld/AccountMetaContainer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/**
* Copyright (c) 2024 Peking University and Peking University
* Changsha Institute for Computing and Digital Economy
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#include "AccountMetaContainer.h"

#include "AccountManager.h"

namespace Ctld {

bool AccountMetaContainer::CheckAndMallocQosResourceFromUser(
const std::string& username, const TaskInCtld& task, const Qos& qos) {
if (static_cast<double>(task.cpus_per_task) > qos.max_cpus_per_user ||
qos.max_jobs_per_user == 0)
return false;

bool result = true;

ResourceView resource_view{};
resource_view.GetAllocatableRes().cpu_count = task.cpus_per_task;

user_meta_map_[username].qos_resource_in_use.try_emplace_l(
task.qos,
[&](std::pair<const std::string, QosResource>& pair) {
auto& val = pair.second;
if (val.resource.CpuCount() + static_cast<double>(task.cpus_per_task) >
qos.max_cpus_per_user ||
val.jobs_per_user >= qos.max_jobs_per_user) {
result = false;
return;
}

val.resource.GetAllocatableRes().cpu_count += task.cpus_per_task;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里应该直接加task.requested_node_res_view*node

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

噢噢对

val.jobs_per_user++;
},
QosResource{resource_view, 1});

return result;
}

void AccountMetaContainer::FreeQosResource(const std::string& username,
const TaskInCtld& task) {
user_meta_map_[username].qos_resource_in_use.modify_if(
task.qos, [&](std::pair<const std::string, QosResource>& pair) {
auto& val = pair.second;
val.resource.GetAllocatableRes().cpu_count -= task.cpus_per_task;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

同上

val.jobs_per_user--;
});
}

} // namespace Ctld
46 changes: 46 additions & 0 deletions src/CraneCtld/AccountMetaContainer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/**
* Copyright (c) 2024 Peking University and Peking University
* Changsha Institute for Computing and Digital Economy
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include "CtldPublicDefs.h"
// Precompiled header comes first!

namespace Ctld {

class AccountMetaContainer final {
public:
using UserResourceMetaMap = std::unordered_map<std::string, // username
ResourcePerUser>;

AccountMetaContainer() = default;
~AccountMetaContainer() = default;

bool CheckAndMallocQosResourceFromUser(const std::string& username,
const TaskInCtld& task,
const Qos& qos);

void FreeQosResource(const std::string& username, const TaskInCtld& task);

private:
UserResourceMetaMap user_meta_map_;
};

inline std::unique_ptr<Ctld::AccountMetaContainer> g_account_meta_container;

} // namespace Ctld
4 changes: 4 additions & 0 deletions src/CraneCtld/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ add_executable(cranectld
CranedMetaContainer.cpp
AccountManager.h
AccountManager.cpp
AccountMetaContainer.h
AccountMetaContainer.cpp
EmbeddedDbClient.cpp
EmbeddedDbClient.h
CraneCtld.cpp
Expand All @@ -38,6 +40,8 @@ target_link_libraries(cranectld PRIVATE
absl::synchronization
absl::flat_hash_map

phmap

crane_proto_lib

bs_thread_pool
Expand Down
3 changes: 3 additions & 0 deletions src/CraneCtld/CraneCtld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <filesystem>

#include "AccountManager.h"
#include "AccountMetaContainer.h"
#include "CranedKeeper.h"
#include "CranedMetaContainer.h"
#include "CtldGrpcServer.h"
Expand Down Expand Up @@ -683,6 +684,8 @@ void InitializeCtldGlobalVariables() {
g_meta_container = std::make_unique<CranedMetaContainer>();
g_meta_container->InitFromConfig(g_config);

g_account_meta_container = std::make_unique<AccountMetaContainer>();

bool ok;
g_embedded_db_client = std::make_unique<Ctld::EmbeddedDbClient>();
ok = g_embedded_db_client->Init(g_config.CraneCtldDbPath);
Expand Down
3 changes: 1 addition & 2 deletions src/CraneCtld/CtldGrpcServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -959,8 +959,7 @@ CtldServer::SubmitTaskToScheduler(std::unique_ptr<TaskInCtld> task) {
task->Username(), task->partition_id, task->account));
}

auto enable_res =
g_account_manager->CheckIfUserOfAccountIsEnabled(
auto enable_res = g_account_manager->CheckIfUserOfAccountIsEnabled(
task->Username(), task->account);
if (enable_res.has_error()) {
return result::fail(enable_res.error());
Expand Down
3 changes: 3 additions & 0 deletions src/CraneCtld/CtldPreCompiledHeader.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@
#include <absl/synchronization/mutex.h>
#include <absl/time/time.h> // NOLINT(modernize-deprecated-headers)

// parallel-hashmap
#include <parallel_hashmap/phmap.h>

// Thread pool
#include <BS_thread_pool.hpp>

Expand Down
16 changes: 16 additions & 0 deletions src/CraneCtld/CtldPublicDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,22 @@ inline bool CheckIfTimeLimitIsValid(absl::Duration d) {
return CheckIfTimeLimitSecIsValid(sec);
}

struct QosResource {
ResourceView resource;
uint32_t jobs_per_user;
};

struct ResourcePerUser {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个东西可以放AccountMetaContainer.h里面吧

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

可以的,我开始就是放在里面的

using QosToQosResourceMap = phmap::parallel_flat_hash_map<
std::string, // QosName
QosResource, phmap::priv::hash_default_hash<std::string>,
phmap::priv::hash_default_eq<std::string>,
std::allocator<std::pair<const std::string, QosResource>>, 4,
std::shared_mutex>;

QosToQosResourceMap qos_resource_in_use;
};

} // namespace Ctld

inline std::unique_ptr<BS::thread_pool> g_thread_pool;
5 changes: 3 additions & 2 deletions src/CraneCtld/TaskScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "TaskScheduler.h"

#include "AccountManager.h"
#include "AccountMetaContainer.h"
#include "CranedKeeper.h"
#include "CranedMetaContainer.h"
#include "CtldPublicDefs.h"
Expand Down Expand Up @@ -521,7 +522,6 @@ void TaskScheduler::PutRecoveredTaskIntoRunningQueueLock_(
for (const CranedId& craned_id : task->CranedIds())
g_meta_container->MallocResourceFromNode(craned_id, task->TaskId(),
task->Resources());

// The order of LockGuards matters.
LockGuard running_guard(&m_running_task_map_mtx_);
LockGuard indexes_guard(&m_task_indexes_mtx_);
Expand Down Expand Up @@ -956,6 +956,7 @@ void TaskScheduler::ScheduleThread_() {
auto& task = it.first;
for (CranedId const& craned_id : task->CranedIds())
g_meta_container->FreeResourceFromNode(craned_id, task->TaskId());
g_account_meta_container->FreeQosResource(task->Username(), *task);
}

// Construct the map for cgroups to be released of all failed tasks
Expand Down Expand Up @@ -1670,6 +1671,7 @@ void TaskScheduler::CleanTaskStatusChangeQueueCb_() {
for (CranedId const& craned_id : task->CranedIds()) {
g_meta_container->FreeResourceFromNode(craned_id, task_id);
}
g_account_meta_container->FreeQosResource(task->Username(), *task);

task_raw_ptr_vec.emplace_back(task.get());
task_ptr_vec.emplace_back(std::move(task));
Expand Down Expand Up @@ -2508,7 +2510,6 @@ void MinLoadFirst::NodeSelect(
for (CranedId const& craned_id : craned_ids)
g_meta_container->MallocResourceFromNode(craned_id, task->TaskId(),
task->Resources());

std::unique_ptr<TaskInCtld> moved_task;

// Move task out of pending_task_map and insert it to the
Expand Down
Loading