From e43a5954524442e901644426181daf8c67f996dd Mon Sep 17 00:00:00 2001 From: Johan Guldmyr Date: Mon, 21 Sep 2015 09:01:01 +0300 Subject: [PATCH] Initial Release - copy from internal repo. --- .gitignore | 6 + README.md | 93 ++++++++++ group_vars/all/all | 9 + launcopenstackinstance.yml | 27 +++ roles/common/files/iptables | 15 ++ roles/common/tasks/main.yml | 65 +++++++ roles/common/vars/main.yml | 0 roles/slurm_common/files/slurm | 3 + roles/slurm_common/tasks/main.yml | 91 ++++++++++ roles/slurm_common/templates/slurm.conf.j2 | 136 +++++++++++++++ roles/slurm_compute/tasks/main.yml | 15 ++ roles/slurm_login/tasks/main.yml | 12 ++ roles/slurm_service/files/slurm | 3 + roles/slurm_service/tasks/main.yml | 161 ++++++++++++++++++ roles/slurm_service/templates/slurm.conf.j2 | 136 +++++++++++++++ .../slurm_service/templates/slurmdbd.conf.j2 | 37 ++++ site.yml | 6 + slurm_compute.yml | 13 ++ slurm_login.yml | 12 ++ slurm_service.yml | 15 ++ stage | 8 + 21 files changed, 863 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 group_vars/all/all create mode 100644 launcopenstackinstance.yml create mode 100644 roles/common/files/iptables create mode 100644 roles/common/tasks/main.yml create mode 100644 roles/common/vars/main.yml create mode 100644 roles/slurm_common/files/slurm create mode 100644 roles/slurm_common/tasks/main.yml create mode 100644 roles/slurm_common/templates/slurm.conf.j2 create mode 100644 roles/slurm_compute/tasks/main.yml create mode 100644 roles/slurm_login/tasks/main.yml create mode 100644 roles/slurm_service/files/slurm create mode 100644 roles/slurm_service/tasks/main.yml create mode 100644 roles/slurm_service/templates/slurm.conf.j2 create mode 100644 roles/slurm_service/templates/slurmdbd.conf.j2 create mode 100644 site.yml create mode 100644 slurm_compute.yml create mode 100644 slurm_login.yml create mode 100644 slurm_service.yml create mode 100644 stage diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d163b9d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# ignore the file that should contain the mysql_slurm_password +group_vars/*/mysql + +# Backup files +*~ +*.swp diff --git a/README.md b/README.md new file mode 100644 index 0000000..f114fb7 --- /dev/null +++ b/README.md @@ -0,0 +1,93 @@ +# Creates a slurm cluster in pouta + +Tested with slurm versions: + - 14.11.0 + - 14.11.3 + - 15.08.0 + +## How-To + +### Launch the Openstack instances: + + - ansible-playbook launcopenstackinstance.yml # launches the VMs. source the openstack-rc script before running this playbook. Also update the playbook to include the names of your key and tenant. + - see the group_vars/all/all file for default variables used for launching an OS instance + +### Initial configuration of the instances and your workstation: + + - First yum -y install nc on the bastion host. + - Then setup SSH config so you don't have to have a public IP on each instance. Change the Hostname in "Host bastion" to the service node. + +Put this in ~/.ssh/config : + +
+# http://edgeofsanity.net/article/2012/10/15/ssh-leap-frog.html
+# This applies to all hosts in your ssh config.
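+# ControlMaster/ControlPath below multiplex all connections to the same host
+# over one TCP session, which speeds up repeated SSH connections such as
+# Ansible's.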
+ControlMaster auto
+ControlPath ~/.ssh/ssh_control_%h_%p_%r
+
+# Always ssh as cloud-user and don't save hostkeys
+Host bastion
+  User cloud-user
+  StrictHostKeyChecking no
+  UserKnownHostsFile /dev/null
+  Hostname 86.50.168.39
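+  # replace this with the floating/public IP of your own service node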
+
+Host slurm*
+  User cloud-user
+  StrictHostKeyChecking no
+  UserKnownHostsFile /dev/null
+  ForwardAgent yes
+  ProxyCommand ssh bastion nc %h %p
+
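+# Optional alternative (a sketch, not used by this repo): with OpenSSH 5.4 or
+# newer, "ProxyCommand ssh -W %h:%p bastion" tunnels through the bastion
+# without needing nc installed on it.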
+ + - Second update the /etc/hosts on the service node (playbook should update the others): + + +# For second cluster +192.168.36.126 slurm2-compute1 +192.168.36.125 slurm2-service +192.168.36.124 slurm2-login + +# For first cluster +192.168.36.129 slurm-compute1 +192.168.36.128 slurm-service +192.168.36.127 slurm-login + + +### Then we can finally run the slurm configuration playbooks: + +\_ Update the files in group_vars/ to your settings + +You also need to add a mysql_slurm_password: "PASSWORD" string somewhere. This will be used to set a password for the slurm mysql user. + + +#### Description of the playbooks: + + - site.yml - calls the slurm*.yml playbooks + - slurm_*.yml # The playbooks that configure the servers + - set slurm_version - this is used to determine which version to download from schedmd.com + +#### Run them in this order: + + - Update stage to have the right IP addresses and hostnames. + - configuring 1st slurm: ansible-playbook site.yml + + - Update stage to have the right IP addresses and hostnames. + - configuring 2st slurm: ansible-playbook site.yml + +#### Add cloud-user to slurm + +
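+# Run these once on the service node after slurmdbd is up; they create a "csc"
+# account in the accounting database and add cloud-user to it: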
+sacctmgr create account name=csc
+sacctmgr create user name=cloud-user account=csc
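+
+# optional sanity check - the new association should now be listed:
+sacctmgr list associations
+
+# quick smoke test from the login node (the "serial" partition comes from
+# slurm.conf.j2); this should print the compute node's hostname:
+srun -N1 -p serial hostname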
+
+ +### Make changes to slurm.conf and distribute it to nodes and restart/reconfigure: + + - Would be nice with a role /tag where one could just run ansible-playbook site.yml --tag new-slur-config and it pushes new config and restarts/reconfigs as necessary. + +# Authors: + + - Marco Passerini (original author) + - Johan Guldmyr (updates done as part of FGCI work) + diff --git a/group_vars/all/all b/group_vars/all/all new file mode 100644 index 0000000..b8573c1 --- /dev/null +++ b/group_vars/all/all @@ -0,0 +1,9 @@ +--- + +tenant_name: "tenant_name" +key_name: "key_name" +image: "CentOS-7.0" +flavor: "small" +security_groups: "default,slurm" +network_name: "default" +auth_url: "https://pouta.csc.fi:5000/v2.0/" diff --git a/launcopenstackinstance.yml b/launcopenstackinstance.yml new file mode 100644 index 0000000..51a1933 --- /dev/null +++ b/launcopenstackinstance.yml @@ -0,0 +1,27 @@ +--- + - name: launch an instance + hosts: localhost + gather_facts: False + + tasks: + - name: launch the Slurm servers + nova_compute: + state: present + login_username: "{{ lookup('env','OS_USERNAME') }}" + login_password: "{{ lookup('env','OS_PASSWORD') }}" + login_tenant_name: "{{ tenant_name }}" + auth_url: "{{ auth_url }}" + name: "{{ item }}" + image: "{{ image }}" + key_name: "{{ key_name }}" + wait_for: 200 + flavor: "{{ flavor }}" + security_groups: "{{ security_groups }}" + nics: + - net-id: "{{ network_nameĀ }}" + meta: + hostname: slurms + with_items: + - slurm2-login + - slurm2-service + - slurm2-compute1 diff --git a/roles/common/files/iptables b/roles/common/files/iptables new file mode 100644 index 0000000..f51db86 --- /dev/null +++ b/roles/common/files/iptables @@ -0,0 +1,15 @@ +# Firewall configuration written by system-config-firewall +# Manual customization of this file is not recommended. 
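+# Note: the "-A INPUT -i eth0 -j ACCEPT" rule below leaves the cluster-internal
+# interface wide open, so slurmctld (port 6817), slurmd (6818) and the
+# MpiParams port range (12000-12999) from slurm.conf are all reachable.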
+*filter +:INPUT ACCEPT [0:0] +:FORWARD ACCEPT [0:0] +:OUTPUT ACCEPT [0:0] +-A INPUT -m state --state ESTABLISHED,RELATED -j ACCEPT +-A INPUT -p icmp -j ACCEPT +-A INPUT -i lo -j ACCEPT +-A INPUT -m state --state NEW -m tcp -p tcp --dport 22 -j ACCEPT +-A INPUT -i eth0 -j ACCEPT +-A INPUT -j REJECT --reject-with icmp-host-prohibited +-A FORWARD -j REJECT --reject-with icmp-host-prohibited +COMMIT + diff --git a/roles/common/tasks/main.yml b/roles/common/tasks/main.yml new file mode 100644 index 0000000..840f1b3 --- /dev/null +++ b/roles/common/tasks/main.yml @@ -0,0 +1,65 @@ +--- + - name: update all packages first + yum: pkg=* state=latest + + - name: Remove duplicate CentOS repos + file: name=/etc/yum.repos.d/centos6-latest.repo state=absent + + - name: install EPEL6 + yum: name='http://www.nic.funet.fi/pub/mirrors/fedora.redhat.com/pub/epel/6/i386/epel-release-6-8.noarch.rpm' state=present + when: major_relase is "6" + + - name: install software + yum: name="{{item}}" state=present + with_items: + - "xterm" + - "sssd" + - "gcc" + - "make" + - "gcc-c++" + - "wget" + - "vim" + - "man" + - "rpm-build" + - "pam" + - "pam-devel" + - "hwloc" + - "munge" + - "munge-devel" + - "munge-libs" + - "readline-devel" + - "openssl-devel" + - "perl-ExtUtils-MakeMaker" + - "lua" + - "lua-devel" + - "lua-posix" + - "lua-filesystem" + + - debug: msg="{{groups['all']}}" + tags: debug + + + +# - name: create /etc/hosts +# lineinfile: dest=/etc/hosts regexp='.*{{ item }}$' line="{{ hostvars[item]['ansible_default_ipv4']['address'] }} {{item}}" state=present +# when: hostvars[item]['ansible_default_ipv4']['address'] is defined +# with_items: groups['all'] +# tags: debug + + - name: Add cluster hosts to local /etc/hosts + sudo: yes + action: lineinfile + state=present + dest=/etc/hosts + line="{{ hostvars[item]['ssh_host'] }} {{ item }}" + when: hostvars[item]['ssh_host'] is defined + with_items: groups.all + tags: debug + + - name: copy iptables settings + copy: src=iptables + dest=/etc/sysconfig/iptables owner=root mode=600 + + + - name: restart iptables + service: name=iptables state=restarted diff --git a/roles/common/vars/main.yml b/roles/common/vars/main.yml new file mode 100644 index 0000000..e69de29 diff --git a/roles/slurm_common/files/slurm b/roles/slurm_common/files/slurm new file mode 100644 index 0000000..5794311 --- /dev/null +++ b/roles/slurm_common/files/slurm @@ -0,0 +1,3 @@ + auth required pam_localuser.so + account required pam_unix.so + session required pam_limits.so diff --git a/roles/slurm_common/tasks/main.yml b/roles/slurm_common/tasks/main.yml new file mode 100644 index 0000000..0c2956a --- /dev/null +++ b/roles/slurm_common/tasks/main.yml @@ -0,0 +1,91 @@ +--- + +# - name: Copy slurm files +# synchronize: mode=pull src={{ item }} dest=/root/rpmbuild/RPMS/x86_64/ +# delegate_to: slurm-service +# with_items: +# - "/root/rpmbuild/RPMS/x86_64/slurm-plugins-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-perlapi-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-slurmdb-direct-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-sql-{{ slurm_version }}-1.el6.x86_64.rpm" + #- "roles/slurm_common/files/slurm-lua-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-pam_slurm-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-sjstat-{{ slurm_version }}-1.el6.x86_64.rpm" +# - 
"/root/rpmbuild/RPMS/x86_64/slurm-slurmdbd-{{ slurm_version }}-1.el6.x86_64.rpm" + #- "roles/slurm_common/files/slurm-spank-x11-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-torque-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-devel-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-munge-{{ slurm_version }}-1.el6.x86_64.rpm" +# - "/root/rpmbuild/RPMS/x86_64/slurm-sjobexit-{{ slurm_version }}-1.el6.x86_64.rpm" + + + + - name: distribute the slurm RPMs to the nodes + copy: src={{ item }} + dest=/root/rpmbuild/RPMS/x86_64/ + with_items: + - "roles/slurm_common/files/slurm-plugins-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-perlapi-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-slurmdb-direct-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-sql-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-lua-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-devel-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-pam_slurm-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-sjstat-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-slurmdbd-{{ slurm_version }}-1.el6.x86_64.rpm" + #- "roles/slurm_common/files/slurm-spank-x11-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-torque-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-munge-{{ slurm_version }}-1.el6.x86_64.rpm" + - "roles/slurm_common/files/slurm-sjobexit-{{ slurm_version }}-1.el6.x86_64.rpm" + + + - name: install Slurm + #yum: name="/root/rpmbuild/RPMS/x86_64/{{ item }}-{{ slurm_version }}-1.el6.x86_64.rpm" state=present + yum: name={{ item }} state=present + tags: slurm + with_items: + + - "/root/rpmbuild/RPMS/x86_64/slurm-plugins-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-perlapi-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-slurmdb-direct-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sql-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-lua-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-devel-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-pam_slurm-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sjstat-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-slurmdbd-{{ slurm_version }}-1.el6.x86_64.rpm" + #- "/root/rpmbuild/RPMS/x86_64/slurm-spank-x11-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-torque-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-munge-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sjobexit-{{ slurm_version }}-1.el6.x86_64.rpm" + + + - name: pam.d/slurm + copy: src=slurm dest=/etc/pam.d/slurm owner=root mode="644" + tags: slurm + + + - name: slurm.conf + template: src=slurm.conf.j2 dest=/etc/slurm/slurm.conf owner=root mode="644" + tags: slurm + + - name: add slurm user + user: name=slurm shell=/sbin/nologin createhome=no home=/nonexixtent system=yes append=yes + tags: slurm + + - name: add slurm log dir + file: path="/var/log/slurm" 
state=directory owner=slurm group=slurm mode=750 + tags: slurm + + - name: add slurm tmp dir + file: path="/tmp/slurmd" state=directory owner=slurm group=slurm mode=750 + tags: slurm + + - name: add slurm tmp dir + file: path="/tmp/slurmstate" state=directory owner=slurm group=slurm mode=750 + tags: slurm + + diff --git a/roles/slurm_common/templates/slurm.conf.j2 b/roles/slurm_common/templates/slurm.conf.j2 new file mode 100644 index 0000000..935ff4e --- /dev/null +++ b/roles/slurm_common/templates/slurm.conf.j2 @@ -0,0 +1,136 @@ +# +# Example slurm.conf file. Please run configurator.html +# (in doc/html) to build a configuration file customized +# for your environment. +# +# +# slurm.conf file generated by configurator.html. +# +# See the slurm.conf man page for more information. +# +ClusterName=test_cluster +ControlMachine={{ hostvars[groups['slurm_service'][0]]['inventory_hostname'] }} +ControlAddr={{ hostvars[groups['slurm_service'][0]]['inventory_hostname'] }} +#BackupController=service02 +#BackupAddr=service02 +# +SlurmUser=slurm +#SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation=/tmp/slurmstate +SlurmdSpoolDir=/tmp/slurmd +SwitchType=switch/none +MpiDefault=none +MpiParams=ports=12000-12999 +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +#ProctrackType=proctrack/cgroup +#PluginDir= +CacheGroups=0 +FirstJobId=2230000 +ReturnToService=1 +MaxJobCount=30000 +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +PropagateResourceLimitsExcept=MEMLOCK,RLIMIT_AS,RLIMIT_CPU,RLIMIT_NPROC,RLIMIT_CORE,RLIMIT_DATA,RLIMIT_RSS,STACK +EnforcePartLimits=YES +#Prolog=/etc/slurm/prolog +#Epilog=/etc/slurm/epilog +#PrologSlurmctld=/etc/slurm/slurmctld_prolog +#EpilogSlurmctld=/etc/slurm/slurmctld_epilog +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +#TaskPlugin=task/cgroup +TaskPlugin=task/none +#TrackWCKey=no +#TreeWidth=50 +#TmpFs= +UsePAM=1 +RebootProgram=/sbin/reboot +# +#HealthCheckInterval=1800 +#HealthCheckProgram=/etc/slurm/health_check +#HealthCheckNodeState=IDLE + +# +# +#GresTypes=mic,gpu +#GresTypes=gpu +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=600 +InactiveLimit=1800 +MinJobAge=300 +MessageTimeout=99 +KillWait=10 +CompleteWait=12 +Waittime=0 +KillOnBadExit=1 +KeepAliveTime=60 +# +# SCHEDULING +SchedulerType=sched/backfill +SchedulerParameters = bf_max_job_user=30,bf_continue,bf_interval=60,bf_resolution=180,max_job_bf=300,defer_rpc_cnt=10 +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType=select/cons_res +SelectTypeParameters=CR_Core_Memory,CR_CORE_DEFAULT_DIST_BLOCK +DefMemPerCPU=512 +FastSchedule=2 +PriorityType=priority/multifactor +PriorityDecayHalfLife=7-0 +#PriorityUsageResetPeriod=14-0 +PriorityWeightFairshare=1000 +PriorityWeightAge=500 +PriorityWeightPartition=1000 +PriorityWeightJobSize=1000 +PriorityMaxAge=6-0 +Licenses=mdcs:256 +#MaxSubmitJobs=2000 +# +# LOGGING +SlurmctldDebug=4 +SlurmctldLogFile=/var/log/slurm/Slurmctld.log +SlurmdDebug=3 +#DebugFlags=backfill +SlurmdLogFile=/var/log/slurm/Slurmd.log +#JobCompType=jobcomp/filetxt +#JobCompLoc=/slurmdb/log/jobcomp.log +#JobCompLoc= +# +# ACCOUNTING +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=energy=30,task=30 +AcctGatherEnergyType=acct_gather_energy/rapl +AcctGatherNodeFreq=30 +#JobAcctGatherFrequency=task=30 +# + +# +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost={{ 
hostvars[groups['slurm_service'][0]]['inventory_hostname'] }} +AccountingStorageLoc=slurm_acct_db +#AccountingStoragePass= +AccountingStorageUser=slurm +AccountingStorageEnforce=associations,limits + + +#JobSubmitPlugins=lua + +# +# TOPOLOGY +# +#TopologyPlugin=topology/tree +# COMPUTE NODES +NodeName={{ hostvars[groups['slurm_compute'][0]]['inventory_hostname'] }} Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 RealMemory=3500 TmpDisk=1000 Weight=10 + +PartitionName=serial Nodes={{ hostvars[groups['slurm_compute'][0]]['inventory_hostname'] }} Default=YES MaxNodes=1 Shared=No DefaultTime=5 MaxTime=3-0 State=UP + diff --git a/roles/slurm_compute/tasks/main.yml b/roles/slurm_compute/tasks/main.yml new file mode 100644 index 0000000..3df4188 --- /dev/null +++ b/roles/slurm_compute/tasks/main.yml @@ -0,0 +1,15 @@ +--- + + - name: get munge key for distribution to nodes + copy: src=roles/slurm_common/files/munge.key + dest=/etc/munge/munge.key + owner=munge + group=munge + mode=400 + + + - name: start munge + service: name=munge state=restarted + + - name: start slurm + service: name=slurm state=restarted diff --git a/roles/slurm_login/tasks/main.yml b/roles/slurm_login/tasks/main.yml new file mode 100644 index 0000000..2d45928 --- /dev/null +++ b/roles/slurm_login/tasks/main.yml @@ -0,0 +1,12 @@ +--- + + - name: get munge key for distribution to nodes + copy: src=roles/slurm_common/files/munge.key + dest=/etc/munge/munge.key + owner=munge + group=munge + mode=400 + + - name: start munge + service: name=munge state=restarted + diff --git a/roles/slurm_service/files/slurm b/roles/slurm_service/files/slurm new file mode 100644 index 0000000..5794311 --- /dev/null +++ b/roles/slurm_service/files/slurm @@ -0,0 +1,3 @@ + auth required pam_localuser.so + account required pam_unix.so + session required pam_limits.so diff --git a/roles/slurm_service/tasks/main.yml b/roles/slurm_service/tasks/main.yml new file mode 100644 index 0000000..5317915 --- /dev/null +++ b/roles/slurm_service/tasks/main.yml @@ -0,0 +1,161 @@ +--- + - name: name install service specific packages + yum: name="{{item}}" state=present + with_items: + - "mysql" + - "mysql-server" + - "mysql-devel" + - "lua-devel" + - "MySQL-python" + when: major_relase is "6" + + - name: start mysql + service: name=mysqld state=started + register: mysql_start + + + - name: create slurm acct db + mysql_db: name=slurm_acct_db state=present + + - name: create slurm db user + mysql_user: name=slurm password="d*d_ev_Eod42_24dArXx-" priv=slurm_acct_db.*:ALL state=present + + + - name: create munge key + command: /usr/sbin/create-munge-key creates=/etc/munge/munge.key + + - name: get munge key for distribution to nodes + fetch: src=/etc/munge/munge.key + dest=roles/slurm_common/files/munge.key + fail_on_missing=yes + flat=yes + + - name: create build dirs + file: path="/root/rpmbuild/{{ item }}" state=directory + with_items: + - "BUILD" + - "RPMS" + - "SOURCES" + - "SPECS" + - "SRPMS" + + - name: download Slurm source + get_url: url="http://www.schedmd.com/download/latest/slurm-{{ slurm_version }}.tar.bz2" dest="/root/rpmbuild/SOURCES/slurm-{{ slurm_version }}.tar.bz2" + tags: slurm + + - name: Extract Slurm source 2 + command: "tar -xjf /root/rpmbuild/SOURCES/slurm-{{ slurm_version}}.tar.bz2 -C /root/rpmbuild/SOURCES/ creates=/root/rpmbuild/SOURCES/slurm-{{ slurm_version }}" + tags: slurm + + - name: build Slurm + command: "{{ item }} chdir=/root/rpmbuild/SOURCES/slurm-{{ slurm_version}}/ creates=/root/rpmbuild/RPMS/x86_64/slurm-{{ slurm_version 
}}-1.el{{major_release}}.x86_64.rpm" + with_items: + - rpmbuild -bb --with lua slurm.spec + #- ./configure + #- "/usr/bin/make" + #- "/usr/bin/make install" + tags: + - slurm + + - name: get slurm rpms to then distribute them to the nodes + fetch: src={{ item }} + dest=roles/slurm_common/files/ + fail_on_missing=yes + flat=yes + with_items: + - "/root/rpmbuild/RPMS/x86_64/slurm-plugins-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-perlapi-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-slurmdb-direct-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sql-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-lua-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-devel-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-pam_slurm-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sjstat-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-slurmdbd-{{ slurm_version }}-1.el6.x86_64.rpm" + #- "/root/rpmbuild/RPMS/x86_64/slurm-spank-x11-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-torque-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-munge-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sjobexit-{{ slurm_version }}-1.el6.x86_64.rpm" + + - name: add slurm etc dir + file: path="/etc/slurm" state=directory owner=root group=root mode=755 + tags: slurm + + - name: slurmdbd.conf + template: src=slurmdbd.conf.j2 dest=/etc/slurm/slurmdbd.conf owner=root mode="640" + tags: slurm + + - name: start munge + service: name=munge state=restarted + + + + + - name: install Slurm + #yum: name="/root/rpmbuild/RPMS/x86_64/{{ item }}-{{ slurm_version }}-1.el6.x86_64.rpm" state=present + yum: name={{ item }} state=present + tags: slurm + with_items: + + - "/root/rpmbuild/RPMS/x86_64/slurm-plugins-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-perlapi-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-slurmdb-direct-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sql-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-lua-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-devel-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-pam_slurm-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sjstat-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-slurmdbd-{{ slurm_version }}-1.el6.x86_64.rpm" + #- "/root/rpmbuild/RPMS/x86_64/slurm-spank-x11-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-torque-{{ slurm_version }}-1.el6.x86_64.rpm" + + - "/root/rpmbuild/RPMS/x86_64/slurm-munge-{{ slurm_version }}-1.el6.x86_64.rpm" + - "/root/rpmbuild/RPMS/x86_64/slurm-sjobexit-{{ slurm_version }}-1.el6.x86_64.rpm" + + + - name: pam.d/slurm + copy: src=slurm dest=/etc/pam.d/slurm owner=root mode="644" + tags: slurm + + + - name: slurm.conf + template: src=slurm.conf.j2 dest=/etc/slurm/slurm.conf owner=root mode="644" + tags: slurm + + - name: add slurm user + user: name=slurm shell=/sbin/nologin createhome=no home=/nonexixtent system=yes append=yes + tags: slurm + + - name: 
add slurm log dir + file: path="/var/log/slurm" state=directory owner=slurm group=slurm mode=750 + tags: slurm + + - name: add slurm tmp dir + file: path="/tmp/slurmd" state=directory owner=slurm group=slurm mode=750 + tags: slurm + + - name: add slurm tmp dir + file: path="/tmp/slurmstate" state=directory owner=slurm group=slurm mode=750 + tags: slurm + + + + + + + + + + - name: start slurmdbd + service: name=slurmdbd state=restarted + + - name: add cluster to accounting + command: "sacctmgr -i add cluster test_cluster" + ignore_errors: yes + + - name: start slurmctld + service: name=slurm state=restarted + + diff --git a/roles/slurm_service/templates/slurm.conf.j2 b/roles/slurm_service/templates/slurm.conf.j2 new file mode 100644 index 0000000..935ff4e --- /dev/null +++ b/roles/slurm_service/templates/slurm.conf.j2 @@ -0,0 +1,136 @@ +# +# Example slurm.conf file. Please run configurator.html +# (in doc/html) to build a configuration file customized +# for your environment. +# +# +# slurm.conf file generated by configurator.html. +# +# See the slurm.conf man page for more information. +# +ClusterName=test_cluster +ControlMachine={{ hostvars[groups['slurm_service'][0]]['inventory_hostname'] }} +ControlAddr={{ hostvars[groups['slurm_service'][0]]['inventory_hostname'] }} +#BackupController=service02 +#BackupAddr=service02 +# +SlurmUser=slurm +#SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation=/tmp/slurmstate +SlurmdSpoolDir=/tmp/slurmd +SwitchType=switch/none +MpiDefault=none +MpiParams=ports=12000-12999 +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +#ProctrackType=proctrack/cgroup +#PluginDir= +CacheGroups=0 +FirstJobId=2230000 +ReturnToService=1 +MaxJobCount=30000 +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +PropagateResourceLimitsExcept=MEMLOCK,RLIMIT_AS,RLIMIT_CPU,RLIMIT_NPROC,RLIMIT_CORE,RLIMIT_DATA,RLIMIT_RSS,STACK +EnforcePartLimits=YES +#Prolog=/etc/slurm/prolog +#Epilog=/etc/slurm/epilog +#PrologSlurmctld=/etc/slurm/slurmctld_prolog +#EpilogSlurmctld=/etc/slurm/slurmctld_epilog +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +#TaskPlugin=task/cgroup +TaskPlugin=task/none +#TrackWCKey=no +#TreeWidth=50 +#TmpFs= +UsePAM=1 +RebootProgram=/sbin/reboot +# +#HealthCheckInterval=1800 +#HealthCheckProgram=/etc/slurm/health_check +#HealthCheckNodeState=IDLE + +# +# +#GresTypes=mic,gpu +#GresTypes=gpu +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=600 +InactiveLimit=1800 +MinJobAge=300 +MessageTimeout=99 +KillWait=10 +CompleteWait=12 +Waittime=0 +KillOnBadExit=1 +KeepAliveTime=60 +# +# SCHEDULING +SchedulerType=sched/backfill +SchedulerParameters = bf_max_job_user=30,bf_continue,bf_interval=60,bf_resolution=180,max_job_bf=300,defer_rpc_cnt=10 +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType=select/cons_res +SelectTypeParameters=CR_Core_Memory,CR_CORE_DEFAULT_DIST_BLOCK +DefMemPerCPU=512 +FastSchedule=2 +PriorityType=priority/multifactor +PriorityDecayHalfLife=7-0 +#PriorityUsageResetPeriod=14-0 +PriorityWeightFairshare=1000 +PriorityWeightAge=500 +PriorityWeightPartition=1000 +PriorityWeightJobSize=1000 +PriorityMaxAge=6-0 +Licenses=mdcs:256 +#MaxSubmitJobs=2000 +# +# LOGGING +SlurmctldDebug=4 +SlurmctldLogFile=/var/log/slurm/Slurmctld.log +SlurmdDebug=3 +#DebugFlags=backfill +SlurmdLogFile=/var/log/slurm/Slurmd.log +#JobCompType=jobcomp/filetxt +#JobCompLoc=/slurmdb/log/jobcomp.log +#JobCompLoc= +# +# 
ACCOUNTING +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=energy=30,task=30 +AcctGatherEnergyType=acct_gather_energy/rapl +AcctGatherNodeFreq=30 +#JobAcctGatherFrequency=task=30 +# + +# +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost={{ hostvars[groups['slurm_service'][0]]['inventory_hostname'] }} +AccountingStorageLoc=slurm_acct_db +#AccountingStoragePass= +AccountingStorageUser=slurm +AccountingStorageEnforce=associations,limits + + +#JobSubmitPlugins=lua + +# +# TOPOLOGY +# +#TopologyPlugin=topology/tree +# COMPUTE NODES +NodeName={{ hostvars[groups['slurm_compute'][0]]['inventory_hostname'] }} Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 RealMemory=3500 TmpDisk=1000 Weight=10 + +PartitionName=serial Nodes={{ hostvars[groups['slurm_compute'][0]]['inventory_hostname'] }} Default=YES MaxNodes=1 Shared=No DefaultTime=5 MaxTime=3-0 State=UP + diff --git a/roles/slurm_service/templates/slurmdbd.conf.j2 b/roles/slurm_service/templates/slurmdbd.conf.j2 new file mode 100644 index 0000000..ce65046 --- /dev/null +++ b/roles/slurm_service/templates/slurmdbd.conf.j2 @@ -0,0 +1,37 @@ +# +# See the slurmdbd.conf man page for more information. +# +# Archive info +ArchiveJobs=yes +#ArchiveDir="/tmp" +#ArchiveSteps=yes +#ArchiveScript= +#JobPurge=12 +#StepPurge=1 +# +# Authentication info +AuthType=auth/munge +#AuthInfo=/var/run/munge/munge.socket.2 +# +# slurmDBD info +DbdAddr={{ hostvars[groups['slurm_service'][0]]['inventory_hostname'] }} +DbdHost=localhost +#DbdPort=7031 +SlurmUser=slurm +#MessageTimeout=300 +DebugLevel=4 +#DefaultQOS=normal,standby +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/var/run/slurmdbd.pid +#PluginDir=/usr/lib/slurm +#PrivateData=accounts,users,usage,jobs +#TrackWCKey=yes +# +# Database info +StorageType=accounting_storage/mysql +StorageHost=localhost +StoragePort=1234 +StoragePass=d*d_ev_Eod42_24dArXx- +StorageUser=slurm +StorageLoc=slurm_acct_db + diff --git a/site.yml b/site.yml new file mode 100644 index 0000000..be3529b --- /dev/null +++ b/site.yml @@ -0,0 +1,6 @@ +--- +# file: site.yml + - include: slurm_service.yml + - include: slurm_compute.yml + - include: slurm_login.yml + diff --git a/slurm_compute.yml b/slurm_compute.yml new file mode 100644 index 0000000..2a27293 --- /dev/null +++ b/slurm_compute.yml @@ -0,0 +1,13 @@ +--- +- hosts: slurm2-compute1 + user: cloud-user + sudo: yes + vars: + slurm_version: "15.08.0" + roles: + - common + - slurm_common + - slurm_compute + + + diff --git a/slurm_login.yml b/slurm_login.yml new file mode 100644 index 0000000..07e073b --- /dev/null +++ b/slurm_login.yml @@ -0,0 +1,12 @@ +--- +- hosts: slurm2-login + user: cloud-user + sudo: yes + vars: + slurm_version: "15.08.0" + roles: + - common + - slurm_common + - slurm_login + + diff --git a/slurm_service.yml b/slurm_service.yml new file mode 100644 index 0000000..59633d8 --- /dev/null +++ b/slurm_service.yml @@ -0,0 +1,15 @@ +--- +- hosts: slurm2-service + user: cloud-user + sudo: yes + vars: + slurm_version: "15.08.0" + roles: + - common + + - slurm_service +# - slurm_common + + + + diff --git a/stage b/stage new file mode 100644 index 0000000..b7609aa --- /dev/null +++ b/stage @@ -0,0 +1,8 @@ +[slurm_compute] +slurm2-compute1 ssh_host=192.168.36.33 + +[slurm_service] +slurm2-service ssh_host=192.168.36.32 + +[slurm2_login] +slurm2-login ssh_host=192.168.36.101
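+
+# Example entries for the first cluster, reusing the hostnames and addresses
+# from the /etc/hosts sample in README.md (adjust them to your own instances):
+#[slurm_compute]
+#slurm-compute1 ssh_host=192.168.36.129
+#
+#[slurm_service]
+#slurm-service ssh_host=192.168.36.128
+#
+#[slurm_login]
+#slurm-login ssh_host=192.168.36.127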