|
| 1 | +# Copyright (c) 2014-2026, Lawrence Livermore National Security, LLC. |
| 2 | +# Produced at the Lawrence Livermore National Laboratory. |
| 3 | +# Written by the LBANN Research Team (B. Van Essen, et al.) listed in |
| 4 | +# the CONTRIBUTORS file. See the top-level LICENSE file for details. |
| 5 | +# |
| 6 | +# LLNL-CODE-697807. |
| 7 | +# All rights reserved. |
| 8 | +# |
| 9 | +# This file is part of LBANN: Livermore Big Artificial Neural Network |
| 10 | +# Toolkit. For details, see http://software.llnl.gov/LBANN or |
| 11 | +# https://github.com/LBANN and https://github.com/LLNL/LBANN. |
| 12 | +# |
| 13 | +# SPDX-License-Identifier: (Apache-2.0) |
| 14 | +from hpc_launcher.schedulers.scheduler import Scheduler |
| 15 | +from hpc_launcher.schedulers.flux import FluxScheduler |
| 16 | +from hpc_launcher.systems.system import System, SystemParams |
| 17 | +import os |
| 18 | + |
| 19 | + |
# Node description for Corona (toss_4_x86_64_ib): AMD MI50 (gfx906) GPUs,
# 48 CPU cores and 8 GPUs per node, 32 GiB of VRAM per GPU. Corona has no
# Slingshot network, so no Slingshot/Cassini RCCL-OFI tuning is enabled by
# default.
# NOTE(review): the fifth positional argument (2) is not described by the
# comment above -- confirm its meaning against the SystemParams definition.
_mi50_node = SystemParams(48, 8, "gfx906", 32.0, 2, "flux")

# Keyed by system name; each value pairs a partition name (presumably the
# default partition -- confirm against System) with a per-partition table of
# node parameters. Every Corona partition shares the same MI50 description.
_system_params = {
    "corona": ("pbatch", dict.fromkeys(("pbatch", "pdebug"), _mi50_node)),
}
| 33 | + |
| 34 | + |
class Corona(System):
    """
    LLNL LC system profile for Corona (AMD GPUs, non-Slingshot network).

    Provides the environment variables to export for jobs on Corona and the
    Corona-specific scheduler adjustments. Flux is the preferred scheduler.
    """

    def __init__(self, system_name: str):
        """Initialize the profile for ``system_name`` from the Corona table."""
        super().__init__(system_name, _system_params)

    def environment_variables(self) -> list[tuple[str, str]]:
        """Return the ``(name, value)`` environment settings for Corona jobs.

        The list starts with fixed RCCL/MIOpen settings, followed by entries
        that depend on the caller's environment (``TMPDIR``,
        ``CRAY_LD_LIBRARY_PATH``, ``ROCM_PATH``,
        ``LBANN_USE_THIS_OFI_PLUGIN``), and ends with the entries of
        ``self._aux_env_list``.

        Environment variables that are unset *or empty* are ignored. An empty
        value would otherwise add a zero-length or relative entry to
        ``LD_LIBRARY_PATH``, which the dynamic loader resolves against the
        current working directory.

        ``LD_LIBRARY_PATH`` may appear several times; each value ends with the
        literal text ``:${LD_LIBRARY_PATH}`` so that the entries chain when
        they are expanded later (presumably by the shell running the launch
        script -- confirm against the scheduler implementation).
        """
        env_list: list[tuple[str, str]] = [
            # ROCm/RCCL tuning that is not network-fabric specific
            ("NCCL_MIN_NCHANNELS", "24"),
            # Keep the MIOpen find-db and cache enabled
            ("MIOPEN_DEBUG_DISABLE_FIND_DB", "0"),
            ("MIOPEN_DISABLE_CACHE", "0"),
        ]

        # MIOpen cache locations (avoid home filesystem contention)
        tmpdir = os.environ.get("TMPDIR")
        if tmpdir:
            env_list.append(("MIOPEN_USER_DB_PATH", f"{tmpdir}/MIOpen_user_db"))
            env_list.append(
                ("MIOPEN_CUSTOM_CACHE_DIR", f"{tmpdir}/MIOpen_custom_cache")
            )

        # If running on a Cray environment, preserve CRAY_LD_LIBRARY_PATH.
        cray_lib_path = os.getenv("CRAY_LD_LIBRARY_PATH")
        if cray_lib_path:
            env_list.append(
                ("LD_LIBRARY_PATH", cray_lib_path + ":${LD_LIBRARY_PATH}")
            )

        # Ensure ROCm LLVM libs are visible if ROCM_PATH is set.
        rocm_path = os.getenv("ROCM_PATH")
        if rocm_path:
            rocm_llvm_lib = os.path.join(rocm_path, "llvm", "lib")
            env_list.append(
                ("LD_LIBRARY_PATH", rocm_llvm_lib + ":${LD_LIBRARY_PATH}")
            )

        # Allow user override to add a specific OFI plugin path, even though
        # Corona is expected not to need RCCL-OFI by default.
        different_ofi_plugin = os.getenv("LBANN_USE_THIS_OFI_PLUGIN")
        if different_ofi_plugin:
            env_list.append(
                ("LD_LIBRARY_PATH", different_ofi_plugin + ":${LD_LIBRARY_PATH}")
            )

        # Auxiliary entries come last.
        env_list.extend(self._aux_env_list)

        return env_list

    def customize_scheduler(self, scheduler) -> None:
        """Apply Corona-specific settings to ``scheduler`` in place.

        For a ``FluxScheduler`` the ``--exclusive`` launch argument is added
        (with no value). If ``LBANN_USE_THIS_RCCL`` is set to a non-empty
        value, the scheduler's ``ld_preloads`` list is replaced with that
        single entry, regardless of the scheduler type.
        """
        if isinstance(scheduler, FluxScheduler):
            scheduler.common_launch_args["--exclusive"] = None

        # An empty override is treated as unset so no empty preload is emitted.
        use_this_rccl = os.getenv("LBANN_USE_THIS_RCCL")
        if use_this_rccl:
            scheduler.ld_preloads = [use_this_rccl]

    @property
    def preferred_scheduler(self) -> type[Scheduler]:
        """Scheduler class to use by default on Corona (Flux)."""
        return FluxScheduler
0 commit comments