Skip to content

Commit bcd99ca

Browse files
committed
Add corona profile
1 parent b33b7d3 commit bcd99ca

2 files changed

Lines changed: 106 additions & 1 deletion

File tree

hpc_launcher/systems/autodetect.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from hpc_launcher.systems.lc.el_capitan_family import ElCapitan
1616
from hpc_launcher.systems.lc.cts2 import CTS2
1717
from hpc_launcher.systems.lc.sierra_family import Sierra
18+
from hpc_launcher.systems.lc.corona import Corona
1819
import logging
1920
import socket
2021
import re
@@ -213,7 +214,10 @@ def autodetect_current_system(quiet: bool = False) -> System:
213214
if sys in ("lassen", "sierra", "rzanzel"):
214215
return Sierra(sys)
215216

216-
# Try to find current system via other means
217+
if sys == "corona":
218+
return Corona(sys)
219+
220+
# Try to find current system via other means
217221
(generic_name, num_gpus, mem_per_gpu, gpu_arch) = find_gpus()
218222
num_cpus = count_cpus()
219223
scheduler = find_scheduler()

hpc_launcher/systems/lc/corona.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# Copyright (c) 2014-2026, Lawrence Livermore National Security, LLC.
2+
# Produced at the Lawrence Livermore National Laboratory.
3+
# Written by the LBANN Research Team (B. Van Essen, et al.) listed in
4+
# the CONTRIBUTORS file. See the top-level LICENSE file for details.
5+
#
6+
# LLNL-CODE-697807.
7+
# All rights reserved.
8+
#
9+
# This file is part of LBANN: Livermore Big Artificial Neural Network
10+
# Toolkit. For details, see http://software.llnl.gov/LBANN or
11+
# https://github.com/LBANN and https://github.com/LLNL/LBANN.
12+
#
13+
# SPDX-License-Identifier: (Apache-2.0)
14+
from hpc_launcher.schedulers.scheduler import Scheduler
15+
from hpc_launcher.schedulers.flux import FluxScheduler
16+
from hpc_launcher.systems.system import System, SystemParams
17+
import os
18+
19+
20+
# Corona (toss_4_x86_64_ib): AMD MI50 (gfx906), 48 CPU cores/node, 8 GPUs/node,
21+
# 32 GiB VRAM/GPU. This system does not have a Slingshot network, so do not
22+
# enable any Slingshot/Cassini RCCL-OFI tuning by default.
23+
# Node description shared by every Corona partition. Per the hardware notes
# for this system: 48 CPU cores, 8 GPUs, gfx906 (MI50), 32.0 GiB per GPU.
# NOTE(review): the trailing `2` and "flux" are presumably the NUMA-domain
# count and the scheduler name -- confirm against SystemParams' field order.
_mi50_node = SystemParams(48, 8, "gfx906", 32.0, 2, "flux")

# Keyed by system name; each value is a (partition name, partition table)
# pair. The leading "pbatch" is presumably the default partition -- confirm
# against System.__init__. Both partitions share the same node description.
_corona_partitions = dict.fromkeys(("pbatch", "pdebug"), _mi50_node)
_system_params = {
    "corona": ("pbatch", _corona_partitions),
}
33+
34+
35+
class Corona(System):
    """
    LLNL LC system profile for Corona (AMD GPUs, non-Slingshot network).

    Supplies the environment variables and scheduler tweaks used when
    launching jobs on Corona, and selects Flux as the preferred scheduler.
    """

    def __init__(self, system_name: str):
        """Initialize the profile for ``system_name`` from the Corona table."""
        super().__init__(system_name, _system_params)

    @staticmethod
    def _prepend_ld_library_path(path: str) -> tuple[str, str]:
        """Return an env entry that puts ``path`` ahead of LD_LIBRARY_PATH.

        The ``${LD_LIBRARY_PATH}`` reference is emitted literally; it is
        presumably expanded by whatever consumes the entry (e.g. the
        generated launch script) -- confirm against the caller.
        """
        return ("LD_LIBRARY_PATH", path + ":${LD_LIBRARY_PATH}")

    def environment_variables(self) -> list[tuple[str, str]]:
        """Build the list of ``(name, value)`` environment settings for a job.

        Optional settings are driven by the caller's environment (``TMPDIR``,
        ``CRAY_LD_LIBRARY_PATH``, ``ROCM_PATH``, ``LBANN_USE_THIS_OFI_PLUGIN``).
        A variable that is unset *or empty* is ignored, so an empty value can
        never inject an empty ``LD_LIBRARY_PATH`` component (which the dynamic
        loader treats as the current directory) or a relative ``llvm/lib``.

        Returns:
            The Corona defaults followed by every entry of
            ``self._aux_env_list``, in order.
        """
        env_list: list[tuple[str, str]] = [
            # ROCm/RCCL tuning that is not network-fabric specific
            ("NCCL_MIN_NCHANNELS", "24"),
            # MIOpen cache locations (avoid home filesystem contention)
            ("MIOPEN_DEBUG_DISABLE_FIND_DB", "0"),
            ("MIOPEN_DISABLE_CACHE", "0"),
        ]

        tmpdir = os.environ.get("TMPDIR")
        if tmpdir:
            env_list.append(("MIOPEN_USER_DB_PATH", f"{tmpdir}/MIOpen_user_db"))
            env_list.append(
                ("MIOPEN_CUSTOM_CACHE_DIR", f"{tmpdir}/MIOpen_custom_cache")
            )

        # If running on a Cray environment, preserve CRAY_LD_LIBRARY_PATH.
        cray_ld_library_path = os.getenv("CRAY_LD_LIBRARY_PATH")
        if cray_ld_library_path:
            env_list.append(self._prepend_ld_library_path(cray_ld_library_path))

        # Ensure ROCm LLVM libs are visible if ROCM_PATH is set.
        rocm_path = os.getenv("ROCM_PATH")
        if rocm_path:
            env_list.append(
                self._prepend_ld_library_path(
                    os.path.join(rocm_path, "llvm", "lib")
                )
            )

        # Allow user override to add a specific OFI plugin path, even though
        # Corona is expected not to need RCCL-OFI by default.
        different_ofi_plugin = os.getenv("LBANN_USE_THIS_OFI_PLUGIN")
        if different_ofi_plugin:
            env_list.append(self._prepend_ld_library_path(different_ofi_plugin))

        env_list.extend(self._aux_env_list)

        return env_list

    def customize_scheduler(self, scheduler):
        """Apply Corona-specific settings to ``scheduler`` in place.

        Flux schedulers get the ``--exclusive`` launch argument. Independently
        of the scheduler type, a non-empty ``LBANN_USE_THIS_RCCL`` environment
        variable replaces ``scheduler.ld_preloads`` with that single library.
        """
        if isinstance(scheduler, FluxScheduler):
            # Flag-style argument: no value is associated with the option.
            scheduler.common_launch_args["--exclusive"] = None

        use_this_rccl = os.getenv("LBANN_USE_THIS_RCCL")
        if use_this_rccl:
            scheduler.ld_preloads = [use_this_rccl]

    @property
    def preferred_scheduler(self) -> type[Scheduler]:
        """Scheduler class to use on Corona: always ``FluxScheduler``."""
        return FluxScheduler

0 commit comments

Comments
 (0)