# Slurm is an open source, fault-tolerant, and highly scalable cluster management and job scheduling system for large and small Linux clusters.
# Service accounts for MUNGE and Slurm on the controller node.
# Fixed numeric IDs are used (966 for munge, 967 for slurm); the part of this
# file that sets up the slurmd node creates the same accounts with the same IDs.
export MUNGEUSER=966 SLURMUSER=967

# Groups first: each useradd below names its primary group with -g.
groupadd -g "$MUNGEUSER" munge
groupadd -g "$SLURMUSER" slurm

# munge: home directory /var/lib/munge, no login shell.
useradd -m -u "$MUNGEUSER" -g munge -d /var/lib/munge -s /sbin/nologin \
  -c "MUNGE Uid 'N' Gid Emporium" munge
# slurm: home directory /var/lib/slurm, bash as login shell.
useradd -m -u "$SLURMUSER" -g slurm -d /var/lib/slurm -s /bin/bash \
  -c "SLURM workload manager" slurm
# Install MUNGE, the credential service this setup uses for authentication
# between nodes.
yum install -y munge munge-libs munge-devel
# Generate the secret key. The slurmd-node steps later in this file copy
# /etc/munge/munge.key from this host.
create-munge-key
# Every MUNGE directory belongs to the munge user.
chown -R munge: /etc/munge/ /var/log/munge/ /var/lib/munge/ /run/munge/
# Key, log and state directories are private to munge.
chmod 0700 /etc/munge/ /var/log/munge/ /var/lib/munge/
# The runtime directory holds the socket that clients connect to, so it must
# remain searchable by everyone (0755). With 0700 only munge and root could
# reach the socket, and other accounts such as slurm could not authenticate.
chmod 0755 /run/munge/
systemctl enable --now munge
# Install and start MariaDB; the accounting database is created in it later.
yum install mariadb-server -y
systemctl enable --now mariadb
# Interactive hardening (root password, removal of insecure defaults).
# A single run is sufficient.
mysql_secure_installation
# Create the InnoDB tuning file with the contents shown on the #--| lines.
vim /etc/my.cnf.d/innodb.cnf
#--| [mysqld]
#--| innodb_buffer_pool_size=1024M
#--| innodb_log_file_size=64M
#--| innodb_lock_wait_timeout=900
# Restart so the new settings are picked up. While the server is stopped the
# existing redo log files are moved aside, so MariaDB can recreate them with
# the new innodb_log_file_size instead of tripping over the old size.
systemctl stop mariadb
mv /var/lib/mysql/ib_logfile? /tmp/
systemctl start mariadb
# Build dependencies for the Slurm RPMs.
# - The package that ships the rpmbuild command is named rpm-build.
# - mariadb-devel supplies the MariaDB client headers; without them the build
#   leaves out the accounting_storage/mysql plugin that slurmdbd on this host
#   relies on.
yum install openssl openssl-devel pam-devel rpm-build numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad mariadb-devel -y
yum install pmix pmix-devel -y
# Download the release tarball and build binary RPMs directly from it (-tb),
# pointing the PMIx support at the installation under /usr.
wget https://download.schedmd.com/slurm/slurm-20.11.3.tar.bz2
rpmbuild --define '_with_pmix --with-pmix=/usr' -tb slurm-20.11.3.tar.bz2
# Install every slurm-* package produced by the build (root's rpmbuild tree).
cd /root/rpmbuild/RPMS/x86_64/
yum --nogpgcheck localinstall slurm-*
# Open an interactive MariaDB session as root (the password is prompted for)
# and enter the two statements shown on the #--mysql> lines.
mysql --user=root --password
#--mysql> grant all on slurm_acct_db.* TO 'slurm'@'localhost' identified by 'some_pass' with grant option;
#--mysql> create database slurm_acct_db;
# Register slurmdbd for start at boot, then start it immediately.
systemctl enable slurmdbd
systemctl start slurmdbd
# check status
systemctl status slurmdbd
# Spool and log locations for slurmctld, owned by the slurm user.
# mkdir -p keeps these steps re-runnable when a directory already exists.
mkdir -p /var/spool/slurmctld
chown slurm:slurm /var/spool/slurmctld
chmod 755 /var/spool/slurmctld
mkdir -p /var/log/slurm
# Pre-create the log files so they exist before the final chown.
touch /var/log/slurm/slurmctld.log \
  /var/log/slurm/slurm_jobacct.log \
  /var/log/slurm/slurm_jobcomp.log
chown -R slurm:slurm /var/log/slurm/
# Launch the slurmctld service
# Enable slurmctld at boot and start it now.
systemctl enable --now slurmctld
# check status
systemctl status slurmctld
# Put a node back into service. NODENAME is not set anywhere in these steps,
# so stop with a clear message rather than sending an empty NodeName to
# scontrol when it has not been defined.
scontrol update NodeName="${NODENAME:?set NODENAME to the node to resume}" State=RESUME
# Service accounts for MUNGE and Slurm on the slurmd node.
# The numeric IDs (966 for munge, 967 for slurm) are the same ones used for
# these accounts in the controller-node part of this file.
export MUNGEUSER=966 SLURMUSER=967

# Groups first: each useradd below names its primary group with -g.
groupadd -g "$MUNGEUSER" munge
groupadd -g "$SLURMUSER" slurm

# munge: home directory /var/lib/munge, no login shell.
useradd -m -u "$MUNGEUSER" -g munge -d /var/lib/munge -s /sbin/nologin \
  -c "MUNGE Uid 'N' Gid Emporium" munge
# slurm: home directory /var/lib/slurm, bash as login shell.
useradd -m -u "$SLURMUSER" -g slurm -d /var/lib/slurm -s /bin/bash \
  -c "SLURM workload manager" slurm
# Install MUNGE on this node.
yum install -y munge munge-libs munge-devel
# Use the key from the master node instead of generating a new one.
# Replace <masternode_hostname> with the real host name before running.
scp <masternode_hostname>:/etc/munge/munge.key /etc/munge/
# Every MUNGE directory (and the copied key) belongs to the munge user.
chown -R munge: /etc/munge/ /var/log/munge/ /var/lib/munge/ /run/munge/
# Key, log and state directories are private to munge.
chmod 0700 /etc/munge/ /var/log/munge/ /var/lib/munge/
# The runtime directory holds the socket that clients connect to, so it must
# remain searchable by everyone (0755). With 0700 only munge and root could
# reach the socket.
chmod 0755 /run/munge/
systemctl enable --now munge
# Local round trip: encode a credential and decode it on this node.
munge -n | unmunge
# Cross-node check: encode here, decode on the master node.
munge -n | ssh <masternode_hostname> unmunge
# Build dependencies for the Slurm RPMs. The package that ships the rpmbuild
# command is named rpm-build.
yum install openssl openssl-devel pam-devel rpm-build numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad -y
yum install pmix pmix-devel -y
# Download the release tarball and build binary RPMs directly from it (-tb),
# pointing the PMIx support at the installation under /usr.
wget https://download.schedmd.com/slurm/slurm-20.11.3.tar.bz2
rpmbuild --define '_with_pmix --with-pmix=/usr' -tb slurm-20.11.3.tar.bz2
# Install every slurm-* package produced by the build (root's rpmbuild tree).
cd /root/rpmbuild/RPMS/x86_64/
yum --nogpgcheck localinstall slurm-*
# Copy the Slurm configuration files from the master node.
# Replace <masternode_hostname> with the real host name before running.
scp <masternode_hostname>:/etc/slurm/*.conf /etc/slurm/
# Spool and log locations for slurmd.
# mkdir -p keeps these steps re-runnable when a directory already exists.
mkdir -p /var/spool/slurmd
chown slurm: /var/spool/slurmd
chmod 755 /var/spool/slurmd
mkdir -p /var/log/slurm/
touch /var/log/slurm/slurmd.log
# The target is a single regular file, so no recursive chown is needed.
chown slurm:slurm /var/log/slurm/slurmd.log
# Enable slurmd at boot and start it now.
systemctl enable --now slurmd
# check status
systemctl status slurmd