Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 94 additions & 25 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,22 @@ def _skip_reason(reason):


def is_mem_monitoring_supported():
    """Report whether per-process GPU memory monitoring can run on this host.

    Returns:
        tuple[bool, str | None]: ``(True, None)`` when a supported GPU
        management utility runs successfully, otherwise ``(False, reason)``
        with a human-readable explanation usable as a pytest skip reason.
    """
    # Monitoring relies on Linux-only driver interfaces and vendor CLI tools.
    if not sys.platform.startswith("linux"):
        return False, "mem-monitoring only supported on linux"

    # Probe the NVIDIA and AMD management utilities in turn; the first one
    # that exits successfully (code 0) wins.
    utilities = ("nvidia-smi", "rocm-smi")
    for exe in utilities:
        try:
            subprocess.check_output([exe], stderr=subprocess.STDOUT, timeout=10)
            return True, None
        except (OSError, subprocess.TimeoutExpired, subprocess.CalledProcessError):
            # OSError covers FileNotFoundError (tool not installed) and other
            # exec failures; the remaining types cover hangs and non-zero exits.
            pass

    return False, f"None of the executables {utilities} exited successfully (code 0)."


def pytest_make_parametrize_id(config, val, argname):
Expand Down Expand Up @@ -224,19 +234,81 @@ def _get_gpu_indices():
return tuple(map(int, cuda_visible_devices.split(",")))

if sys.platform == "linux":
nvidia_gpu_interface_path = "/proc/driver/nvidia/gpus/"
try:
return tuple(range(len(os.listdir(nvidia_gpu_interface_path))))
except FileNotFoundError:
warnings.warn(
f"'{nvidia_gpu_interface_path}' is not available. Multi-GPU support will be disabled. This is expected "
"on WSL2 where the NVIDIA proc interface is not mounted.",
stacklevel=2,
)
if NVIDIA_GPU_INTERFACE_PATH.is_dir():
return tuple(i for i, _ in enumerate(NVIDIA_GPU_INTERFACE_PATH.iterdir()))
if KFD_SYSFS_PATH.is_dir():
return tuple(i for i, _ in enumerate(get_kfd_gpu_nodes_properties()))

warnings.warn(
f"'{NVIDIA_GPU_INTERFACE_PATH!s}' or '{KFD_SYSFS_PATH!s}' is not available. Multi-GPU support will be disabled. This is expected "
"on WSL2 where the NVIDIA proc interface is not mounted.",
stacklevel=2,
)

return (0,)


def parse_kfd_node_properties(kfd_properties_str: str):
    """Parse the contents of a KFD topology node ``properties`` file.

    Each non-empty line has the form ``<name> <integer value>``; the result
    maps every property name to its integer value.
    """
    properties = {}
    for raw_line in kfd_properties_str.split("\n"):
        line = raw_line.strip()
        if not line:
            # Blank lines carry no property; skip them.
            continue
        name, value = line.split()
        properties[name] = int(value)
    return properties


KFD_SYSFS_PATH = Path("/sys/devices/virtual/kfd/kfd/topology")


def get_kfd_gpu_nodes_properties():
    """Collect the properties of every GPU node exposed by the KFD driver.

    Returns:
        dict[int, dict[str, int]]: mapping from KFD node id to its parsed
        properties, restricted to GPU nodes (nodes reporting a non-zero
        ``cpu_cores_count`` are CPU nodes and are skipped). Nodes are visited
        in ascending numeric order so that enumerating the result yields a
        deterministic node ranking.
    """
    # KFD_SYSFS_PATH is already a Path; no need to re-wrap it.
    kfd_sysfs_path_nodes = KFD_SYSFS_PATH / "nodes"

    gpu_nodes_properties = {}

    # Sort numerically: iterdir() yields entries in arbitrary order, and a
    # lexicographic sort would place node "10" before node "2".
    for node_path in sorted(kfd_sysfs_path_nodes.iterdir(), key=lambda p: int(p.name)):
        node_props = parse_kfd_node_properties((node_path / "properties").read_text())

        # GPU nodes advertise zero CPU cores.
        if node_props["cpu_cores_count"] == 0:
            gpu_nodes_properties[int(node_path.name)] = node_props

    return gpu_nodes_properties


def amdgpu_get_node_rank_from_uuid(device_uuid):
    """Map a torch/HIP device UUID to the rank of the matching KFD GPU node.

    The HIP UUID reported by torch is itself the hex encoding of a 16-char
    hex string (a "hex of hex"): decoding the UUID bytes yields ASCII hex
    digits, which are then parsed as the node's ``unique_id`` integer.

    Returns:
        int: 0-based rank of the GPU node whose ``unique_id`` matches, or -1
        when no node matches.
    """
    device_uuid = device_uuid.replace("-", "")
    # Each hex-digit pair encodes one character of the inner hex string
    # (e.g. "36" -> "6"); latin-1 maps every byte value 1:1 to a character,
    # matching the previous per-byte chr() conversion exactly.
    hip_uuid = bytes.fromhex(device_uuid).decode("latin-1")
    unique_id = int(hip_uuid, 16)

    gpu_nodes_properties = get_kfd_gpu_nodes_properties()

    for node_rank, gpu_props in enumerate(gpu_nodes_properties.values()):
        if gpu_props["unique_id"] == unique_id:
            return node_rank

    return -1


NVIDIA_GPU_INTERFACE_PATH = Path("/proc/driver/nvidia/gpus/")


def nvidia_get_node_rank_from_uuid(device_uuid):
    """Map a CUDA device UUID to the index of the matching /proc GPU entry.

    Scans each ``/proc/driver/nvidia/gpus/<bus-id>/information`` file for a
    ``GPU UUID: GPU-<uuid>`` line.

    Returns:
        int: 0-based index of the matching entry, or -1 when none matches.
    """
    # Sort the entries so index assignment is deterministic across calls:
    # iterdir() yields directory entries in arbitrary order.
    for device_idx, device_path in enumerate(sorted(NVIDIA_GPU_INTERFACE_PATH.iterdir())):
        device_info = (device_path / "information").read_text()
        if re.search(rf"GPU UUID:\s+GPU-{device_uuid}", device_info):
            return device_idx

    return -1
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is starting to look very fragile. Is there any library out there that could be used to hide all this logic?



def _torch_get_gpu_idx(device):
if sys.platform == "darwin":
return 0
Expand All @@ -247,19 +319,16 @@ def _torch_get_gpu_idx(device):
device_property = torch.cuda.get_device_properties(device)
device_uuid = str(device_property.uuid)

nvidia_gpu_interface_path = "/proc/driver/nvidia/gpus/"
try:
for device_idx, device_path in enumerate(os.listdir(nvidia_gpu_interface_path)):
with open(os.path.join(nvidia_gpu_interface_path, device_path, "information"), "r") as f:
device_info = f.read()
if re.search(rf"GPU UUID:\s+GPU-{device_uuid}", device_info):
return device_idx
except FileNotFoundError:
warnings.warn(
f"'{nvidia_gpu_interface_path}' is not available. Multi-GPU support will be disabled. This is expected "
"on WSL2 where the NVIDIA proc interface is not mounted.",
stacklevel=2,
)
if NVIDIA_GPU_INTERFACE_PATH.is_dir():
return nvidia_get_node_rank_from_uuid(device_uuid)
if KFD_SYSFS_PATH.is_dir():
return amdgpu_get_node_rank_from_uuid(device_uuid)

warnings.warn(
f"'{NVIDIA_GPU_INTERFACE_PATH!s}' or '{KFD_SYSFS_PATH!s}' is not available. Multi-GPU support will be disabled. This is expected "
"on WSL2 where the NVIDIA proc interface is not mounted.",
stacklevel=2,
)

return -1

Expand Down
64 changes: 62 additions & 2 deletions tests/monitor_test_mem.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ def parse_test_name(test_name: str) -> dict[str, str]:
return filtered_params


def get_cuda_usage() -> dict[int, int]:
output = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
def parse_nvidia_smi_output_for_mem_per_process(output: str) -> dict[int, int]:
section = 0
subsec = 0
res = {}
Expand All @@ -70,6 +69,67 @@ def get_cuda_usage() -> dict[int, int]:
return res


def parse_rocm_smi_output_for_mem_per_process(output: str) -> dict[int, int]:
    """Parse ``rocm-smi --showpids`` output into a ``{pid: VRAM in MB}`` map.

    rocm-smi reports VRAM in bytes; values are converted to MB (2**20 bytes)
    so the result matches the units produced by the nvidia-smi parser.

    Raises:
        RuntimeError: if the expected PID/VRAM table header cannot be found,
            i.e. the rocm-smi output format has changed.
    """
    NO_PIDS = "No KFD PIDs currently running"

    if NO_PIDS in output:
        return {}

    PID = "PID"
    VRAM_USED = "VRAM USED"
    TABLE_BORDER_STR = "===="

    # Drop everything before the table header line starting with "PID".
    pid_idx = output.find(PID)
    output = output[pid_idx:]

    lines = output.split("\n")
    header = lines[0]

    # the header line looks like:
    #
    # PID PROCESS NAME GPU(s) VRAM USED SDMA USED CU OCCUPANCY
    #
    # we extract the span from each of them (it assumes they end with at least
    # 2 whitespaces)
    header_to_span = {header_m.group(0).strip(): header_m.span() for header_m in re.finditer(r"(\S+( |$))+ *", header)}

    # Fail loudly even under `python -O` (unlike a bare assert) when the
    # table layout no longer matches what this parser expects.
    if PID not in header_to_span or VRAM_USED not in header_to_span:
        raise RuntimeError("The way rocm-smi presents information has changed and could not be parsed anymore")

    pid_to_mem_use = {}
    for raw_line in lines[1:]:
        line = raw_line.strip()
        if not line or line.startswith(TABLE_BORDER_STR):
            continue

        # Slice each row using the header column spans, then strip padding.
        header_to_info_str = {h: line[start:end].strip() for h, (start, end) in header_to_span.items()}

        pid = int(header_to_info_str[PID])
        mem_bytes = int(header_to_info_str[VRAM_USED])

        # Use MB as does nvidia-smi
        pid_to_mem_use[pid] = mem_bytes >> 20

    return pid_to_mem_use


def get_cuda_usage() -> dict[int, int]:
    """Return per-process GPU memory usage in MB, keyed by PID.

    Probes nvidia-smi first, then rocm-smi, mirroring the order used by the
    support check in conftest.py.

    Raises:
        RuntimeError: when neither utility can be executed successfully.
    """
    # OSError covers a missing or unrunnable executable; CalledProcessError
    # covers a tool that is installed but exits non-zero (e.g. nvidia-smi on
    # a ROCm host), so the fallback below still gets a chance to run.
    probe_errors = (OSError, subprocess.CalledProcessError)

    try:
        output = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
        return parse_nvidia_smi_output_for_mem_per_process(output)
    except probe_errors:
        pass

    try:
        output = subprocess.check_output(["rocm-smi", "--showpids"]).decode("utf-8")
        return parse_rocm_smi_output_for_mem_per_process(output)
    except probe_errors:
        pass

    raise RuntimeError("Neither nvidia-smi nor rocm-smi available.")


def get_test_name_by_pid() -> dict[int, str]:
test_by_psid = {}
for proc in psutil.process_iter(["pid", "cmdline"]):
Expand Down
37 changes: 37 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from genesis.utils.urdf import compose_inertial_properties

from .utils import assert_allclose
from . import monitor_test_mem


TOL = 1e-7
Expand Down Expand Up @@ -518,3 +519,39 @@ def test_polar_decomposition_batched_pure_rotation(side, tol):
np_reconstructed = np_P @ np_U

assert_allclose(np_A, np_reconstructed, tol=tol)


def test_parse_rocm_smi_output_for_mem_per_process():
    """Check the rocm-smi PID-table parser against a captured sample output.

    The expected values are the sampled VRAM byte counts converted to MB
    (right-shifted by 20), keyed by PID.
    """

    output = """


============================ ROCm System Management Interface ============================
===================================== KFD Processes ======================================
KFD process information:
PID PROCESS NAME GPU(s) VRAM USED SDMA USED CU OCCUPANCY
19191 [pytest-xdist r 1 2680758272 0 0
19188 [pytest-xdist r 1 2680078336 0 0
19206 [pytest-xdist r 1 2867257344 0 0
19194 [pytest-xdist r 1 2680664064 0 0
19200 [pytest-xdist r 1 2866966528 0 0
19209 [pytest-xdist r 1 2680922112 0 0
19197 [pytest-xdist r 1 2686627840 0 0
19203 [pytest-xdist r 1 2681561088 0 0
==========================================================================================
================================== End of ROCm SMI Log ===================================
"""

    expected_output = {
        19191: 2556,
        19188: 2555,
        19206: 2734,
        19194: 2556,
        19200: 2734,
        19209: 2556,
        19197: 2562,
        19203: 2557,
    }
    results = monitor_test_mem.parse_rocm_smi_output_for_mem_per_process(output)
    assert results == expected_output