Merged
Changes from 1 commit
16 changes: 16 additions & 0 deletions src/prime_rl/templates/inference.sbatch.j2
@@ -107,6 +107,22 @@ echo "INFER_URLS=${INFER_URLS}"
{{ pre_run_command }}
{% endif %}

# Cleanup stale node-local state from prior jobs. Orphan python/torchrun/vllm
# processes and vLLM/torch IPC files under /dev/shm and /tmp can survive SLURM
# termination and cause the next launch to hang (e.g. decode engines stuck at
# "Waiting for READY message from DP Coordinator"). Harmless on clean nodes.
srun bash -c '
pkill -9 -f "python.*prime_rl" 2>/dev/null
pkill -9 -f "torchrun" 2>/dev/null
pkill -9 -f "vllm" 2>/dev/null
pkill -9 -f "prime_rl" 2>/dev/null
sleep 2
rm -rf /dev/shm/vllm-* /dev/shm/vllm_* /tmp/vllm-* /tmp/vllm_* /tmp/torch-* /tmp/torchelastic_* 2>/dev/null
procs=$(ps -ef | grep -E "python|torchrun|vllm" | grep -v grep | wc -l)
Collaborator:

can we add vllm router processes as well?

Member Author:

Good call — added explicit vllm-router entries to both the pkill list and the procs-count regex in all three templates in 79e72b1. Functionally a no-op since the broader vllm pattern already matched vllm-router as a substring, but better to be explicit.

Member Author:

Good point — vllm::router is the kernel comm (prctl'd process name), not the cmdline, so pkill -f misses it. Just pushed 11a4e7a adding pkill -9 "vllm" and pkill -9 "vllm::.*" (no -f) to match against comm, plus broadened the procs count to ps -eo comm,args so we see these in the per-node status line.
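The comm-vs-cmdline distinction above can be checked in isolation. A minimal sketch, assuming a Linux procfs (writing `/proc/self/comm` is how `prctl(PR_SET_NAME)` surfaces there); the `vllm::router` name below is just a renamed throwaway shell, not a real router process:

```shell
# Rename a background shell's kernel comm; its cmdline is untouched.
# (the trailing ":" keeps bash from exec-ing sleep, which would reset comm)
bash -c 'echo -n "vllm::router" > /proc/self/comm; sleep 5; :' &
pid=$!
sleep 1

# pgrep -f matches /proc/<pid>/cmdline, which still begins with "bash -c",
# so an anchored pattern on the comm name finds nothing:
cmdline_hits=$(pgrep -c -f "^vllm::router" || true)

# Plain pgrep matches the comm, so the renamed process is found:
comm_hits=$(pgrep -c -x "vllm::router" || true)

echo "cmdline_hits=$cmdline_hits comm_hits=$comm_hits"
kill "$pid" 2>/dev/null
```

This is why `pkill -f` misses a `prctl`'d name while plain `pkill` catches it.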

gpu=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "{s+=\$1} END {print s}")
echo "[node-cleanup] $(hostname) procs=$procs gpu_mem=${gpu}MiB"
'
Cursor Bugbot:

Cleanup script kills its own shell via pkill -f

High Severity

The srun bash -c '<script>' approach makes the entire script text part of the bash process's /proc/<pid>/cmdline. When pkill -9 -f "python.*prime_rl" (or any subsequent pkill -f pattern) runs as a child of that bash process, it matches the parent bash shell's own cmdline — which contains every pattern as a literal substring — and sends SIGKILL to it. This kills the cleanup shell on the very first pkill -f command, making the entire cleanup block non-functional. The job continues because there's no set -e, but no stale processes are killed, no temp files are removed, and no status line is printed.

Additional Locations (2)

Reviewed by Cursor Bugbot for commit 11a4e7a.
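The failure mode Bugbot describes is easy to reproduce outside SLURM, and a common mitigation (not part of this PR) is the character-class trick, which keeps the literal pattern out of the searching shell's own cmdline. A hedged sketch with `pgrep` standing in for `pkill` so nothing is killed, and `stale_job_marker` as a made-up pattern:

```shell
# Naive: the inner shell's cmdline contains the pattern, so pgrep -f
# matches the searcher itself even though no such process exists.
# (the trailing ":" stops bash from exec-ing pgrep directly)
naive=$(bash -c 'pgrep -c -f "stale_job_marker"; :')

# Bracketed: "[s]tale_job_marker" matches the same processes as a regex,
# but the shell's cmdline now holds "[s]tale...", which that regex
# does not match, so the searcher no longer counts itself.
bracketed=$(bash -c 'pgrep -c -f "[s]tale_job_marker"; :')

echo "naive=$naive bracketed=$bracketed"
```

With `pkill -9` in place of `pgrep -c`, the naive form SIGKILLs the cleanup shell on its first line, which is exactly the non-functional-cleanup behavior reported above.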


# Run inference
srun bash -c '
cd $PROJECT_DIR
16 changes: 16 additions & 0 deletions src/prime_rl/templates/multi_node_rl.sbatch.j2
@@ -151,6 +151,22 @@ uv sync --all-extras
{{ pre_run_command }}
{% endif %}

# Cleanup stale node-local state from prior jobs. Orphan python/torchrun/vllm
# processes and vLLM/torch IPC files under /dev/shm and /tmp can survive SLURM
# termination and cause the next launch to hang (e.g. decode engines stuck at
# "Waiting for READY message from DP Coordinator"). Harmless on clean nodes.
srun bash -c '
pkill -9 -f "python.*prime_rl" 2>/dev/null
pkill -9 -f "torchrun" 2>/dev/null
pkill -9 -f "vllm" 2>/dev/null
pkill -9 -f "prime_rl" 2>/dev/null
sleep 2
rm -rf /dev/shm/vllm-* /dev/shm/vllm_* /tmp/vllm-* /tmp/vllm_* /tmp/torch-* /tmp/torchelastic_* 2>/dev/null
procs=$(ps -ef | grep -E "python|torchrun|vllm" | grep -v grep | wc -l)
gpu=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "{s+=\$1} END {print s}")
echo "[node-cleanup] $(hostname) procs=$procs gpu_mem=${gpu}MiB"
'

# Run RL
srun bash -c '
# Source environment
15 changes: 15 additions & 0 deletions src/prime_rl/templates/multi_node_sft.sbatch.j2
@@ -59,6 +59,21 @@ cd $PROJECT_DIR && uv sync --all-extras
{{ pre_run_command }}
{% endif %}

# Cleanup stale node-local state from prior jobs. Orphan python/torchrun/vllm
# processes and vLLM/torch IPC files under /dev/shm and /tmp can survive SLURM
# termination and cause the next launch to hang. Harmless on clean nodes.
srun bash -c '
pkill -9 -f "python.*prime_rl" 2>/dev/null
pkill -9 -f "torchrun" 2>/dev/null
pkill -9 -f "vllm" 2>/dev/null
pkill -9 -f "prime_rl" 2>/dev/null
sleep 2
rm -rf /dev/shm/vllm-* /dev/shm/vllm_* /tmp/vllm-* /tmp/vllm_* /tmp/torch-* /tmp/torchelastic_* 2>/dev/null
procs=$(ps -ef | grep -E "python|torchrun|vllm" | grep -v grep | wc -l)
gpu=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "{s+=\$1} END {print s}")
echo "[node-cleanup] $(hostname) procs=$procs gpu_mem=${gpu}MiB"
'
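The status line's GPU total is a plain awk column sum over nvidia-smi's one-integer-per-GPU output. A standalone sketch with `printf` supplying made-up per-GPU numbers in place of `nvidia-smi`:

```shell
# --query-gpu=memory.used --format=csv,noheader,nounits emits one MiB
# integer per line; awk accumulates field 1 and prints the total.
# (The templates write awk "{s+=\$1} END {print s}" with escaped $1
# only because the srun script body is already single-quoted.)
gpu=$(printf '%s\n' 11042 11038 512 0 | awk '{s+=$1} END {print s}')
echo "gpu_mem=${gpu}MiB"   # 22592MiB for these sample values
```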

# Run SFT
srun bash -c '
# Setup environment