From 92dcc56370e2add8f81a4681b496b3dd215e0a30 Mon Sep 17 00:00:00 2001 From: Adrien Faure Date: Thu, 26 Oct 2023 17:32:05 +0200 Subject: [PATCH] [cgroup_manager] adapt process stealing for nixos --- .../job_resource_manager_cgroups_nixos.pl | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/oar/tools/job_resource_manager_cgroups_nixos.pl b/oar/tools/job_resource_manager_cgroups_nixos.pl index c5203e11..c6f52e10 100644 --- a/oar/tools/job_resource_manager_cgroups_nixos.pl +++ b/oar/tools/job_resource_manager_cgroups_nixos.pl @@ -160,11 +160,12 @@ exit_myself(13,"Directory $Cpuset->{oar_tmp_directory} does not exist and cannot be created"); } + my @cgroup_list = (); if (defined($Cpuset_path_job)){ if (open(LOCKFILE,"> $Cpuset->{oar_tmp_directory}/job_manager_lock_file")){ flock(LOCKFILE,LOCK_EX) or exit_myself(17,"flock failed: $!"); if (!(-r $Cgroup_directory_collection_links.'/cpuset/tasks')){ - my @cgroup_list = ("cpuset","cpu","cpuacct","devices","freezer"); + @cgroup_list = ("cpuset","cpu","cpuacct","devices","freezer"); push(@cgroup_list, "memory") if ($Enable_mem_cg eq "YES"); push(@cgroup_list, "blkio") if ($Enable_blkio_cg eq "YES"); push(@cgroup_list, "net_cls") if ($Enable_net_cls_cg eq "YES"); @@ -483,6 +484,48 @@ exit_myself(10,"Error opening $Cpuset->{ssh_keys}->{public}->{file_name}"); } } + + # Finally steal processes if needed. + if(!($Cpuset->{evolving_migrate_processes_from_cpusetpath} eq "undef") and (-d $Cgroup_directory_collection_links.'/cpuset/'.$Cpuset->{cpuset_path}.'/'.$Cpuset->{evolving_migrate_processes_from_cpusetpath} )) { + my $Cpuset_migrate_from; + my $Previous_oarexec_pid_file; + my $Previous_oarexec_pid; + + $Previous_oarexec_pid_file = $Cpuset->{evolving_migrate_processes_from_oarexec_pid_file}; + $Cpuset_migrate_from = $Cpuset->{cpuset_path}.'/'.$Cpuset->{evolving_migrate_processes_from_cpusetpath}; + + # If this file exists, we are on the head node of the previous job + if(-e $Previous_oarexec_pid_file) { + $Previous_oarexec_pid_file = $Cpuset->{evolving_migrate_processes_from_oarexec_pid_file}; + # Thus, we get the pid of the oarexec because we do not want to migrate it + $Previous_oarexec_pid=`cat $Previous_oarexec_pid_file`; + } else { + # If we are not on a head node, we set pid to -1 which should be impossible + $Previous_oarexec_pid = "-1"; + } + + print_log(3, "Migrate processes from " . $Cpuset_migrate_from . " to ours. Old oarexec pid was: $Previous_oarexec_pid."); + + system('set -ux + echo "all cgroups: '.join(' ',@cgroup_list).'" + for cg_path in '.$Cgroup_directory_collection_links.'/* ; do + cg=$(basename $cg_path) + echo "Treat cgroup: $cg" + for d in '.$Cgroup_directory_collection_links.'/$cg/'.$Cpuset_migrate_from.'; do + if [ -f "$d/tasks" ]; then + echo "migration ! yaouh: $d" + PROCESSES=$(/run/current-system/sw/bin/cat $d/tasks | grep -v -E "\b+'.$Previous_oarexec_pid.'\b+" | head -n 1) + while [ "$PROCESSES" != "" ]; do + echo "write into: '.$Cgroup_directory_collection_links.'/$cg/'.$Cpuset_path_job.'/tasks" + OARDO_BECOME_USER=root oardodo /run/current-system/sw/bin/echo "$PROCESSES" >> '.$Cgroup_directory_collection_links.'/$cg/'.$Cpuset_path_job.'/tasks + PROCESSES=$(/run/current-system/sw/bin/cat $d/tasks | grep -v '.$Previous_oarexec_pid.' | head -n 1) + done + fi + done + done'); + } else { + print_log(3, "No processes to steal, maybe " . $Cgroup_directory_collection_links.'/cpuset/'.$Cpuset->{cpuset_path}.'/'.$Cpuset->{evolving_migrate_processes_from_cpusetpath} . " is empty ?"); + } }elsif ($ARGV[0] eq "clean"){ # delete ssh key files if ($Cpuset->{ssh_keys}->{private}->{key} ne ""){