-
Notifications
You must be signed in to change notification settings - Fork 2.7k
[MISC] Add full support of AMD GPU to unit test and benchmark infra. #2680
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
87d60d8
436babb
cf0a407
453d137
7e69881
6b83ea0
d149a69
862b4c6
b7cddb2
8721784
079aea5
c8c5640
eec2c82
60dbf58
b0760a7
3f4cd65
339cead
aba6b21
e098ebf
d702678
8260523
f608785
f39e51c
959e671
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,12 +107,22 @@ def _skip_reason(reason): | |
|
|
||
|
|
||
| def is_mem_monitoring_supported(): | ||
| if not sys.platform.startswith("linux"): | ||
| return False, "mem-monitoring only supported on linux" | ||
|
|
||
| try: | ||
| assert sys.platform.startswith("linux") | ||
| subprocess.check_output(["nvidia-smi"], stderr=subprocess.STDOUT, timeout=10) | ||
| return True, None | ||
| except Exception as exc: # platform or nvidia-smi unavailable | ||
| return False, exc | ||
| except Exception as exc: | ||
| pass | ||
|
|
||
| try: | ||
| subprocess.check_output(["rocm-smi"], stderr=subprocess.STDOUT, timeout=10) | ||
| return True, None | ||
| except Exception as exc: | ||
| pass | ||
|
|
||
| return False, "neither nvidia-smi nor rocm-smi executable found or runnable" | ||
|
|
||
|
|
||
| def pytest_make_parametrize_id(config, val, argname): | ||
|
|
@@ -224,19 +234,81 @@ def _get_gpu_indices(): | |
| return tuple(map(int, cuda_visible_devices.split(","))) | ||
|
|
||
| if sys.platform == "linux": | ||
| nvidia_gpu_interface_path = "/proc/driver/nvidia/gpus/" | ||
| try: | ||
| return tuple(range(len(os.listdir(nvidia_gpu_interface_path)))) | ||
| except FileNotFoundError: | ||
| warnings.warn( | ||
| f"'{nvidia_gpu_interface_path}' is not available. Multi-GPU support will be disabled. This is expected " | ||
| "on WSL2 where the NVIDIA proc interface is not mounted.", | ||
| stacklevel=2, | ||
| ) | ||
| if NVIDIA_GPU_INTERFACE_PATH.is_dir(): | ||
| return tuple(i for i, _ in enumerate(NVIDIA_GPU_INTERFACE_PATH.iterdir())) | ||
| if KFD_SYSFS_PATH.is_dir(): | ||
| return tuple(i for i, _ in enumerate(get_kfd_gpu_nodes_properties())) | ||
|
|
||
| warnings.warn( | ||
| f"'{NVIDIA_GPU_INTERFACE_PATH!s}' or '{KFD_SYSFS_PATH!s}' is not available. Multi-GPU support will be disabled. This is expected " | ||
| "on WSL2 where the NVIDIA proc interface is not mounted.", | ||
| stacklevel=2, | ||
| ) | ||
|
|
||
| return (0,) | ||
|
|
||
|
|
||
| def parse_kfd_node_properties(kfd_properties_str: str): | ||
| props = {} | ||
| for l in kfd_properties_str.split("\n"): | ||
| l = l.strip() | ||
|
|
||
| if not l: | ||
| continue | ||
|
|
||
| name, value_str = l.split() | ||
| props[name] = int(value_str) | ||
|
|
||
| return props | ||
|
|
||
|
|
||
| KFD_SYSFS_PATH = Path("/sys/devices/virtual/kfd/kfd/topology") | ||
|
|
||
|
|
||
| def get_kfd_gpu_nodes_properties(): | ||
| kfd_sysfs_path_nodes = Path(KFD_SYSFS_PATH) / "nodes" | ||
|
|
||
| gpu_nodes_properties = {} | ||
|
|
||
| for node_path in kfd_sysfs_path_nodes.iterdir(): | ||
| with (node_path / "properties").open() as node_properties_f: | ||
| properties_str = node_properties_f.read() | ||
| node_props = parse_kfd_node_properties(properties_str) | ||
|
|
||
| if node_props["cpu_cores_count"] == 0: | ||
| gpu_nodes_properties[int(node_path.name)] = node_props | ||
|
|
||
| return gpu_nodes_properties | ||
|
|
||
|
|
||
| def amdgpu_get_node_rank_from_uuid(device_uuid): | ||
| device_uuid = device_uuid.replace("-", "") | ||
| hip_uuid = "".join([chr(int(device_uuid[i : i + 2], 16)) for i in range(0, len(device_uuid), 2)]) | ||
| unique_id = int(hip_uuid, 16) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, because the HIP UUID is actually already a 16-byte hex representation of an int64 integer. PyTorch converts this 16-byte hex representation into a 32-byte hex representation (so it is a hex representation of a hex representation). |
||
|
|
||
| UNIQUE_ID = "unique_id" | ||
| gpu_nodes_properties = get_kfd_gpu_nodes_properties() | ||
|
|
||
| for node_rank, gpu_props in enumerate(gpu_nodes_properties.values()): | ||
| if gpu_props["unique_id"] == unique_id: | ||
| return node_rank | ||
|
|
||
| return -1 | ||
|
|
||
|
|
||
| NVIDIA_GPU_INTERFACE_PATH = Path("/proc/driver/nvidia/gpus/") | ||
|
|
||
|
|
||
| def nvidia_get_node_rank_from_uuid(device_uuid): | ||
| for device_idx, device_path in enumerate(NVIDIA_GPU_INTERFACE_PATH.iterdir()): | ||
| with (device_path / "information").open() as f: | ||
| device_info = f.read() | ||
| if re.search(rf"GPU UUID:\s+GPU-{device_uuid}", device_info): | ||
| return device_idx | ||
|
|
||
| return -1 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is starting to look very fragile. Is there any library out there that could be used to hide all this logic? |
||
|
|
||
|
|
||
| def _torch_get_gpu_idx(device): | ||
| if sys.platform == "darwin": | ||
| return 0 | ||
|
|
@@ -247,19 +319,16 @@ def _torch_get_gpu_idx(device): | |
| device_property = torch.cuda.get_device_properties(device) | ||
| device_uuid = str(device_property.uuid) | ||
|
|
||
| nvidia_gpu_interface_path = "/proc/driver/nvidia/gpus/" | ||
| try: | ||
| for device_idx, device_path in enumerate(os.listdir(nvidia_gpu_interface_path)): | ||
| with open(os.path.join(nvidia_gpu_interface_path, device_path, "information"), "r") as f: | ||
| device_info = f.read() | ||
| if re.search(rf"GPU UUID:\s+GPU-{device_uuid}", device_info): | ||
| return device_idx | ||
| except FileNotFoundError: | ||
| warnings.warn( | ||
| f"'{nvidia_gpu_interface_path}' is not available. Multi-GPU support will be disabled. This is expected " | ||
| "on WSL2 where the NVIDIA proc interface is not mounted.", | ||
| stacklevel=2, | ||
| ) | ||
| if NVIDIA_GPU_INTERFACE_PATH.is_dir(): | ||
| return nvidia_get_node_rank_from_uuid(device_uuid) | ||
| if KFD_SYSFS_PATH.is_dir(): | ||
| return amdgpu_get_node_rank_from_uuid(device_uuid) | ||
|
|
||
| warnings.warn( | ||
| f"'{NVIDIA_GPU_INTERFACE_PATH!s}' or '{KFD_SYSFS_PATH!s}' is not available. Multi-GPU support will be disabled. This is expected " | ||
| "on WSL2 where the NVIDIA proc interface is not mounted.", | ||
| stacklevel=2, | ||
| ) | ||
|
|
||
| return -1 | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,8 +47,7 @@ def parse_test_name(test_name: str) -> dict[str, str]: | |
| return filtered_params | ||
|
|
||
|
|
||
| def get_cuda_usage() -> dict[int, int]: | ||
| output = subprocess.check_output(["nvidia-smi"]).decode("utf-8") | ||
| def parse_nvidia_smi_output_for_mem_per_process(output: str) -> dict[int, int]: | ||
| section = 0 | ||
| subsec = 0 | ||
| res = {} | ||
|
|
@@ -70,6 +69,67 @@ def get_cuda_usage() -> dict[int, int]: | |
| return res | ||
|
|
||
|
|
||
| def parse_rocm_smi_output_for_mem_per_process(output: str) -> dict[int, int]: | ||
| NO_PIDS = "No KFD PIDs currently running" | ||
|
|
||
| if NO_PIDS in output: | ||
| return {} | ||
|
|
||
| PID = "PID" | ||
| VRAM_USED = "VRAM USED" | ||
| TABLE_BORDER_STR = "====" | ||
|
|
||
| pid_idx = output.find(PID) | ||
| output = output[pid_idx:] | ||
|
|
||
| lines = output.split("\n") | ||
| header = lines[0] | ||
|
|
||
| # the header line looks like: | ||
| # | ||
| # PID PROCESS NAME GPU(s) VRAM USED SDMA USED CU OCCUPANCY | ||
| # | ||
| # we extract the span from each of them (it assumes they end with at least | ||
| # 2 whitespaces) | ||
| header_to_span = {header_m.group(0).strip(): header_m.span() for header_m in re.finditer(r"(\S+( |$))+ *", header)} | ||
|
|
||
| assert PID in header_to_span and VRAM_USED in header_to_span, ( | ||
| "The way rocm-smi presents information has changed and could not be parsed anymore" | ||
| ) | ||
|
|
||
| pid_to_mem_use = {} | ||
| for l in lines[1:]: | ||
| l = l.strip() | ||
| if not l or l.startswith(TABLE_BORDER_STR): | ||
| continue | ||
|
|
||
| header_to_info_str = {h: l[start:end].strip() for h, (start, end) in header_to_span.items()} | ||
|
|
||
| pid = int(header_to_info_str[PID]) | ||
| mem_bytes = int(header_to_info_str[VRAM_USED]) | ||
|
|
||
| # Use MB as does nvidia-smi | ||
| pid_to_mem_use[pid] = mem_bytes >> 20 | ||
|
|
||
| return pid_to_mem_use | ||
|
|
||
|
|
||
| def get_cuda_usage() -> dict[int, int]: | ||
| try: | ||
| output = subprocess.check_output(["nvidia-smi"]).decode("utf-8") | ||
| return parse_nvidia_smi_output_for_mem_per_process(output) | ||
| except FileNotFoundError: | ||
| pass | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume that on a ROCm machine there is no NVIDIA tooling; otherwise it would be hard to distinguish between the two. |
||
|
|
||
| try: | ||
| output = subprocess.check_output(["rocm-smi", "--showpids"]).decode("utf-8") | ||
| return parse_rocm_smi_output_for_mem_per_process(output) | ||
| except FileNotFoundError: | ||
| pass | ||
|
|
||
| raise RuntimeError("Neither nvidia-smi nor rocm-smi available.") | ||
|
|
||
|
|
||
| def get_test_name_by_pid() -> dict[int, str]: | ||
| test_by_psid = {} | ||
| for proc in psutil.process_iter(["pid", "cmdline"]): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Avoid catching a broad `Exception` unless it is absolutely necessary: