Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ COMPOSE_PROJECT_NAME=slurm
# - Selecting version-specific configuration files
SLURM_VERSION=25.11.4

# Lmod version (https://github.com/TACC/Lmod/releases)
LMOD_VERSION=9.1.2

# Spack version (https://github.com/spack/spack/releases)
# Spack is cloned at image build time into /usr/local/spack (baked into the image).
# Installed packages and generated Lmod modules are stored in the spack_root volume (/opt/spack).
SPACK_VERSION=v1.1.1

# SlurmDB MySQL credentials
# Default values shown below (suitable for local development/testing only)
MYSQL_USER=slurm
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,5 @@ docker-compose.override.yml
rpmbuild/
archive/
.claude/
*.md
!README*.md
101 changes: 99 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

ARG SLURM_VERSION
ARG GOSU_VERSION=1.19
ARG LMOD_VERSION=9.1.2
ARG SPACK_VERSION=v1.1.1
# BUILDER_BASE and RUNTIME_BASE overridden when GPU_ENABLE=true is set in .env
ARG BUILDER_BASE=rockylinux/rockylinux:9
ARG RUNTIME_BASE=rockylinux/rockylinux:9
Expand Down Expand Up @@ -118,7 +120,70 @@ RUN set -ex \
&& ls -lh /root/rpmbuild/RPMS/${RPM_ARCH}/

# ============================================================================
# Stage 3: Runtime image
# Stage 3: Build Lmod from source
# (hardcoded Rocky Linux 9 base — GPU CUDA images are not needed to build Lmod)
# ============================================================================
FROM rockylinux/rockylinux:9 AS lmod-builder

ARG LMOD_VERSION

RUN set -ex \
&& dnf -y install dnf-plugins-core epel-release \
&& dnf config-manager --set-enabled crb \
&& dnf -y install \
bc \
gcc \
lua \
lua-devel \
lua-posix \
lua-filesystem \
tcl \
tcl-devel \
procps-ng \
python3 \
wget \
&& dnf clean all \
&& rm -rf /var/cache/dnf

RUN set -ex \
&& wget -O /tmp/Lmod-${LMOD_VERSION}.tar.gz \
https://github.com/TACC/Lmod/archive/refs/tags/${LMOD_VERSION}.tar.gz \
&& tar -xzf /tmp/Lmod-${LMOD_VERSION}.tar.gz -C /tmp \
&& cd /tmp/Lmod-${LMOD_VERSION} \
&& ./configure --prefix=/usr/local \
&& make install

# ============================================================================
# Stage 4: Clone Spack from source
# (hardcoded Rocky Linux 9 — no GPU overhead needed for this stage)
# ============================================================================
FROM rockylinux/rockylinux:9 AS spack-builder

ARG SPACK_VERSION

RUN dnf -y install git gcc && dnf clean all

RUN git clone --depth=1 --branch "${SPACK_VERSION}" \
https://github.com/spack/spack.git /usr/local/spack

# Write site-level Spack config:
# - modules.yaml: generate Lmod modulefiles using the system GCC as the core compiler
# - config.yaml: store installed packages and generated modules under /opt/spack (the named volume)
# - packages.yaml: pin target to the base ISA (x86_64 or aarch64) so the module
# path is predictable and matches the MODULEPATH baked into lmod.sh
RUN set -ex \
&& GCC_VER=$(gcc --version | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1) \
&& ARCH=$(uname -m) \
&& mkdir -p /usr/local/spack/etc/spack \
&& printf 'modules:\n default:\n enable:\n - lmod\n roots:\n lmod: /opt/spack/modules\n lmod:\n core_compilers:\n - gcc@%s\n' \
"${GCC_VER}" > /usr/local/spack/etc/spack/modules.yaml \
&& printf 'config:\n install_tree:\n root: /opt/spack\n' \
> /usr/local/spack/etc/spack/config.yaml \
&& printf 'packages:\n all:\n require:\n - "target=%s"\n' \
"${ARCH}" > /usr/local/spack/etc/spack/packages.yaml

# ============================================================================
# Stage 5: Runtime image
# ============================================================================
FROM ${RUNTIME_BASE}

Expand All @@ -141,7 +206,13 @@ RUN set -ex \
apptainer \
bash-completion \
bzip2 \
bzip2-devel \
file \
gcc \
gcc-c++ \
gcc-gfortran \
gettext \
git \
hdf5 \
http-parser \
hwloc \
Expand All @@ -150,18 +221,24 @@ RUN set -ex \
libaec \
libyaml \
lua \
lua-posix \
lua-filesystem \
lz4 \
make \
mariadb \
munge \
numactl \
openssh-server \
patch \
perl \
procps-ng \
psmisc \
python3.12 \
readline \
tcl \
vim-enhanced \
wget \
xz \
libjwt \
&& dnf clean all \
&& rm -rf /var/cache/dnf \
Expand All @@ -171,10 +248,29 @@ RUN set -ex \
&& sed -i 's/^#\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config \
&& sed -i 's/^#\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config

# Install gosu (built from source in stage 1)
# Install gosu
COPY --from=gosu-builder /go/bin/gosu /usr/local/bin/gosu
RUN gosu --version && gosu nobody true

# Install Lmod
COPY --from=lmod-builder /usr/local/lmod /usr/local/lmod

# Install Spack
COPY --from=spack-builder /usr/local/spack /usr/local/spack

# Configure Lmod system-wide and source Spack's shell integration.
# MODULEPATH uses the base ISA arch (x86_64/aarch64 from uname -m) which matches
# the target forced in packages.yaml, avoiding microarch mismatches (e.g. zen2).
RUN ARCH=$(uname -m) \
&& printf '%s\n' \
'source /usr/local/lmod/lmod/init/bash' \
'export SPACK_ROOT=/usr/local/spack' \
"export MODULEPATH=\"/opt/spack/modules/linux-rocky9-${ARCH}/Core:/opt/modulefiles\"" \
'source /usr/local/spack/share/spack/setup-env.sh' \
> /etc/profile.d/lmod.sh \
&& chmod 644 /etc/profile.d/lmod.sh \
&& mkdir -p /opt/modulefiles /opt/spack /opt/spack/modules

COPY --from=builder /root/rpmbuild/RPMS/*/*.rpm /tmp/rpms/

# Install Slurm RPMs
Expand Down Expand Up @@ -248,6 +344,7 @@ RUN set -ex \
&& chmod 644 /etc/slurm/slurm.conf /etc/slurm/cgroup.conf \
&& chmod 600 /etc/slurm/slurmdbd.conf \
&& rm -rf /tmp/slurm-config

COPY --chown=slurm:slurm --chmod=0600 examples /root/examples

COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
Expand Down
38 changes: 31 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,11 @@ docker tag giovtorres/slurm-docker-cluster:latest slurm-docker-cluster:25.11.4
# Option B: Build from source
make build

# Start the cluster
# Then, start the cluster
make up
make status # verify nodes are idle
make test # run full test suite
make help # see all available commands
```

**Supported Slurm versions:** 25.11.x, 25.05.x (last two Major.Minor releases)
**Supported Slurm versions:** 25.11, 25.05

**Supported architectures (auto-detected):** AMD64, ARM64

Expand Down Expand Up @@ -194,17 +191,44 @@ make test-gpu

> **Note:** GPU testing is not included in CI (GitHub-hosted runners have no GPUs). Run `make test-gpu` manually on a host with an NVIDIA GPU and `nvidia-container-toolkit` installed.

## 📦 Software Installation

[Spack](https://spack.io) is included in the image and integrates with [Lmod](https://lmod.readthedocs.io) so installed packages appear immediately as modules. All nodes share the same Spack and module tree.

```bash
make shell

spack install python@3.14
module avail
module load python/3.14.0
python --version
```

Modules are also available in batch jobs without any extra setup:

```bash
sbatch --wrap="module load python/3.14.0 && python3 --version"
```

To add a custom modulefile outside of Spack, drop a `.lua` file into the `opt_modulefiles` volume — it appears immediately on all nodes without a restart:

```bash
docker exec slurmctld mkdir -p /opt/modulefiles/myapp
docker cp myapp/1.0.lua slurmctld:/opt/modulefiles/myapp/1.0.lua
module avail
```

## 🔄 Cluster Management

Run `make` to see all available commands. Common ones:

```bash
make down # Stop cluster (keeps data)
make clean # Remove all containers and volumes
make rebuild # Clean, rebuild, and restart
make logs # View container logs
```

> **Note:** If `ELASTICSEARCH_HOST` is set in `.env`, monitoring containers are automatically managed.

## 🐳 Docker Hub

Pre-built multi-arch images (amd64 + arm64) are published on each [GitHub release](https://github.com/giovtorres/slurm-docker-cluster/releases):
Expand Down
10 changes: 10 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ services:
context: .
args:
SLURM_VERSION: ${SLURM_VERSION:-25.11.4}
LMOD_VERSION: ${LMOD_VERSION:-9.1.2}
SPACK_VERSION: ${SPACK_VERSION:-v1.1.1}
GPU_ENABLE: ${GPU_ENABLE:-false}
BUILDER_BASE: ${BUILDER_BASE:-rockylinux/rockylinux:9}
RUNTIME_BASE: ${RUNTIME_BASE:-rockylinux/rockylinux:9}
Expand Down Expand Up @@ -69,6 +71,8 @@ services:
- etc_slurm:/etc/slurm:z
- slurm_jobdir:/data:z
- var_log_slurm:/var/log/slurm:z
- opt_modulefiles:/opt/modulefiles
- spack_root:/opt/spack
- ${SSH_AUTHORIZED_KEYS:-/dev/null}:/tmp/authorized_keys_host:ro,z
ports:
- "${SSH_PORT:-3022}:22"
Expand Down Expand Up @@ -125,6 +129,8 @@ services:
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
- home_ood:/home/ood # shared with OOD for job I/O
- opt_modulefiles:/opt/modulefiles
- spack_root:/opt/spack
expose:
- "6818"
depends_on:
Expand Down Expand Up @@ -157,6 +163,8 @@ services:
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
- home_ood:/home/ood # shared with OOD for job I/O
- opt_modulefiles:/opt/modulefiles
- spack_root:/opt/spack
expose:
- "6818"
depends_on:
Expand Down Expand Up @@ -265,6 +273,8 @@ volumes:
var_lib_mysql:
var_log_slurm:
home_ood:
opt_modulefiles:
spack_root:
elasticsearch_data:

networks:
Expand Down
Loading
Loading