Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,22 @@ def _skip_reason(reason):


def is_mem_monitoring_supported():
if sys.platform.startswith("linux"):
return False, "mem-monitoring only supported on linux"
Comment thread
v01dXYZ marked this conversation as resolved.
Outdated

try:
assert sys.platform.startswith("linux")
subprocess.check_output(["nvidia-smi"], stderr=subprocess.STDOUT, timeout=10)
return True, None
except Exception as exc: # platform or nvidia-smi unavailable
return False, exc
except Exception as exc:
pass

try:
subprocess.check_output(["rocm-smi"], stderr=subprocess.STDOUT, timeout=10)
return True, None
except Exception as exc:
pass
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • Avoid duplicated logic
  • Never catch generic Exception unless it is absolutely necessary:
utilities = ("nvidia-smi", "rocm-smi")
for exe in utilities:
    try:
        subprocess.check_output([exe], stderr=subprocess.STDOUT, check=True, timeout=10)
        return True, None
    except (FileNotFoundError, subprocess.TimeoutExpired, subprocess.CalledProcessError, OSError):
        pass

return False, f"None of the executables {utilities} exited successfully (code 0)."


return False, "neither nvidia-smi nor rocm-smi executable found or runnable"


def pytest_make_parametrize_id(config, val, argname):
Expand Down
59 changes: 57 additions & 2 deletions tests/monitor_test_mem.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ def parse_test_name(test_name: str) -> dict[str, str]:
return filtered_params


def get_cuda_usage() -> dict[int, int]:
output = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
def parse_nvidia_smi_output_for_mem_per_process(output: str) -> dict[int, int]:
section = 0
subsec = 0
res = {}
Expand All @@ -70,6 +69,62 @@ def get_cuda_usage() -> dict[int, int]:
return res


def parse_rocm_smi_output_for_mem_per_process(output: str) -> dict[int, int]:
    """Extract per-process VRAM usage from the text printed by ``rocm-smi --showpids``.

    The process table is located through the first occurrence of ``PID`` in
    ``output``; the remainder of that line is taken as the table header. Column
    boundaries are derived from the header, then the ``PID`` and ``VRAM USED``
    cells of every following row are sliced out by character position.

    Args:
        output: Decoded standard output of ``rocm-smi --showpids``.

    Returns:
        Mapping from process id to used VRAM, converted from bytes to MiB
        (rounded down). Empty when the output carries the "no process" notice.

    Raises:
        AssertionError: If no header exposing both the ``PID`` and ``VRAM USED``
            columns can be found.
        ValueError: If a table row holds a non-integer ``PID`` or ``VRAM USED`` cell.
    """
    PID = "PID"
    VRAM_USED = "VRAM USED"
    TABLE_BORDER_STR = "===="
    # NOTE(review): rocm-smi appears to print this notice instead of the table
    # when no compute process is running -- confirm against the installed
    # version. The notice itself contains "PID", so it has to be recognised
    # before searching for the header.
    NO_PROCESS_MSG = "No KFD PIDs currently running"

    if NO_PROCESS_MSG in output:
        return {}

    pid_idx = output.find(PID)
    # Without any "PID" there is no table at all: use an empty header so that
    # the format check below reports it.
    table_lines = output[pid_idx:].split("\n") if pid_idx >= 0 else [""]
    header, rows = table_lines[0], table_lines[1:]

    # The header line looks like:
    #
    #   PID      PROCESS NAME      GPU(s)    VRAM USED    SDMA USED    CU OCCUPANCY
    #
    # A column title is a run of words separated by single spaces; the padding
    # that follows it (at least two spaces, or the end of the line) belongs to
    # the same column, which gives the character span of every column.
    header_to_span = {
        header_m.group(0).strip(): header_m.span() for header_m in re.finditer(r"(\S+( |$))+ *", header)
    }

    # Raised explicitly rather than through ``assert`` so that the check still
    # runs under ``python -O``; the exception type is kept for existing callers.
    if PID not in header_to_span or VRAM_USED not in header_to_span:
        raise AssertionError("The way rocm-smi presents information has changed and could not be parsed anymore")

    pid_start, pid_end = header_to_span[PID]
    mem_start, mem_end = header_to_span[VRAM_USED]

    pid_to_mem_use = {}
    for row in rows:
        # The header was cut at "PID", so rows are stripped to share its origin.
        row = row.strip()
        if not row or row.startswith(TABLE_BORDER_STR):
            continue

        # ``int`` ignores the padding left around each sliced cell.
        pid = int(row[pid_start:pid_end])
        mem_bytes = int(row[mem_start:mem_end])

        # Use MB as does nvidia-smi
        pid_to_mem_use[pid] = mem_bytes >> 20

    return pid_to_mem_use


def get_cuda_usage() -> dict[int, int]:
try:
output = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
return parse_nvidia_smi_output_for_mem_per_process(output)
except FileNotFoundError:
pass
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Catch nvidia-smi command failures before ROCm fallback

is_mem_monitoring_supported() now allows monitoring when nvidia-smi fails but rocm-smi works, yet get_cuda_usage() only falls back when nvidia-smi is missing. On ROCm hosts where nvidia-smi is present but exits non-zero (raising subprocess.CalledProcessError), the monitor process crashes before trying rocm-smi, so --mem-monitoring-filepath is reported as supported but fails at runtime.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume that a ROCm machine has no NVIDIA tooling installed; otherwise it would be hard to tell the two apart.


try:
output = subprocess.check_output(["rocm-smi", "--showpids"]).decode("utf-8")
return parse_rocm_smi_output_for_mem_per_process(output)
except FileNotFoundError:
pass

raise RuntimeError("Neither nvidia-smi nor rocm-smi available.")


def get_test_name_by_pid() -> dict[int, str]:
test_by_psid = {}
for proc in psutil.process_iter(["pid", "cmdline"]):
Expand Down
37 changes: 37 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from genesis.utils.urdf import compose_inertial_properties

from .utils import assert_allclose
from . import monitor_test_mem


TOL = 1e-7
Expand Down Expand Up @@ -518,3 +519,39 @@ def test_polar_decomposition_batched_pure_rotation(side, tol):
np_reconstructed = np_P @ np_U

assert_allclose(np_A, np_reconstructed, tol=tol)


def test_parse_rocm_smi_output_for_mem_per_process():
    """Check that the rocm-smi process-table parser maps each PID to its VRAM use in MiB.

    The parser under test lives in ``tests/monitor_test_mem.py``. It locates
    cells by character position under the header, so the columns of the sample
    below must stay aligned and separated by at least two spaces.
    """
    output = """


============================ ROCm System Management Interface ============================
===================================== KFD Processes ======================================
KFD process information:
PID     PROCESS NAME      GPU(s)   VRAM USED    SDMA USED   CU OCCUPANCY
19191   [pytest-xdist r   1        2680758272   0           0
19188   [pytest-xdist r   1        2680078336   0           0
19206   [pytest-xdist r   1        2867257344   0           0
19194   [pytest-xdist r   1        2680664064   0           0
19200   [pytest-xdist r   1        2866966528   0           0
19209   [pytest-xdist r   1        2680922112   0           0
19197   [pytest-xdist r   1        2686627840   0           0
19203   [pytest-xdist r   1        2681561088   0           0
==========================================================================================
================================== End of ROCm SMI Log ===================================
"""

    # VRAM USED is reported in bytes; the parser converts it to MiB (>> 20).
    expected_output = {
        19191: 2556,
        19188: 2555,
        19206: 2734,
        19194: 2556,
        19200: 2734,
        19209: 2556,
        19197: 2562,
        19203: 2557,
    }
    results = monitor_test_mem.parse_rocm_smi_output_for_mem_per_process(output)
    assert results == expected_output