-
Notifications
You must be signed in to change notification settings - Fork 172
refactor(agent): Pin container memory to NUMA-local node via CpusetMems #11222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7d460fc
b52b7f5
e4ba8ee
64529b2
0eda461
ba650bf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Pin container memory to the CPU's NUMA node via `CpusetMems` when the allocation is node-local. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -433,12 +433,29 @@ async def generate_docker_args( | |
| ) -> Mapping[str, Any]: | ||
| cores = [*map(int, device_alloc[SlotName("cpu")].keys())] | ||
| sorted_core_ids = [*map(str, sorted(cores))] | ||
| host_config: dict[str, Any] = { | ||
| "Cpus": len(cores), | ||
| "CpusetCpus": ",".join(sorted_core_ids), | ||
| } | ||
| # Skip CpusetMems entirely when NUMA is unsupported (non-Linux hosts, | ||
| # Linux without libnuma.so, Docker Desktop, WSL, etc.) or when the host | ||
| # exposes a single node; libnuma.node_of_cpu would otherwise fall back | ||
| # to 0 and cause every container to be pinned to "CpusetMems": "0". | ||
| if libnuma.num_nodes() > 1: | ||
| allocated_nodes: set[int] = set() | ||
| for core in cores: | ||
| node = libnuma.node_of_cpu(core) | ||
| if node < 0: | ||
| allocated_nodes.clear() | ||
| break | ||
| allocated_nodes.add(node) | ||
| # Pin memory only when the CPU allocation is fully node-local. | ||
| # For multi-node CPU allocations, intentionally leave CpusetMems unset | ||
| # so Docker/kernel default NUMA memory placement policy can apply. | ||
| if len(allocated_nodes) == 1: | ||
| host_config["CpusetMems"] = str(next(iter(allocated_nodes))) | ||
| return { | ||
| "HostConfig": { | ||
| "Cpus": len(cores), | ||
| "CpusetCpus": ",".join(sorted_core_ids), | ||
| # 'CpusetMems': f'{resource_spec.numa_node}', | ||
| }, | ||
| "HostConfig": host_config, | ||
| } | ||
|
Comment on lines
+436
to
459
|
||
|
|
||
| async def restore_from_container( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ | |
| from collections.abc import Generator | ||
| from contextlib import contextmanager | ||
| from dataclasses import dataclass | ||
| from decimal import Decimal | ||
| from pathlib import Path | ||
| from typing import Any | ||
| from unittest.mock import AsyncMock, MagicMock, patch | ||
|
|
@@ -17,6 +18,7 @@ | |
| read_proc_net_dev, | ||
| ) | ||
| from ai.backend.agent.stats import StatModes | ||
| from ai.backend.common.types import DeviceId, SlotName | ||
|
|
||
|
|
||
| class BaseDockerIntrinsicTest: | ||
|
|
@@ -610,3 +612,137 @@ def test_raises_oserror_for_nonexistent_pid(self) -> None: | |
| """Raises OSError when /proc/[pid]/net/dev does not exist.""" | ||
| with pytest.raises(OSError): | ||
| read_proc_net_dev(999999999) | ||
|
|
||
|
|
||
| class TestCPUPluginGenerateDockerArgsNumaLocality: | ||
| """Tests for CPUPlugin.generate_docker_args() NUMA-locality CpusetMems logic.""" | ||
|
|
||
| @pytest.fixture | ||
| def cpu_plugin(self) -> CPUPlugin: | ||
| return CPUPlugin.__new__(CPUPlugin) | ||
|
|
||
| @staticmethod | ||
| def _device_alloc(core_ids: list[int]) -> dict[SlotName, dict[DeviceId, Decimal]]: | ||
| return { | ||
| SlotName("cpu"): {DeviceId(str(cid)): Decimal("1") for cid in core_ids}, | ||
| } | ||
|
|
||
| @staticmethod | ||
| @contextmanager | ||
| def _patch_node_of_cpu( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this method is intended to reproduce specific test scenarios, it would be more readable to inject fixtures for each scenario and apply parametrization. |
||
| core_to_node: dict[int, int], | ||
| *, | ||
| num_nodes: int = 2, | ||
| ) -> Generator[None, None, None]: | ||
| """Patch libnuma.node_of_cpu; return -1 for any core missing from the map | ||
| (matches real libnuma's behavior for unknown cores when NUMA info is | ||
| unavailable). | ||
|
|
||
| Also patches libnuma.num_nodes to report a multi-node host by default | ||
| so the NUMA-aware branch is exercised. Tests covering the non-NUMA | ||
| short-circuit can pass ``num_nodes=1``. | ||
| """ | ||
| with ( | ||
| patch( | ||
| "ai.backend.agent.docker.intrinsic.libnuma.num_nodes", | ||
| return_value=num_nodes, | ||
| ), | ||
| patch( | ||
| "ai.backend.agent.docker.intrinsic.libnuma.node_of_cpu", | ||
| side_effect=lambda core: core_to_node.get(core, -1), | ||
| ), | ||
| ): | ||
| yield | ||
|
|
||
| async def test_single_node_allocation_sets_cpuset_mems( | ||
| self, | ||
| cpu_plugin: CPUPlugin, | ||
| ) -> None: | ||
| """When all allocated cores are on the same NUMA node, CpusetMems is pinned | ||
| to that node as a string.""" | ||
| with self._patch_node_of_cpu({0: 0, 1: 0, 2: 1, 3: 1}): | ||
| result = await cpu_plugin.generate_docker_args( | ||
| AsyncMock(), | ||
| self._device_alloc([0, 1]), | ||
| ) | ||
|
|
||
| host_config = result["HostConfig"] | ||
| assert host_config["CpusetMems"] == "0" | ||
| # Sanity: core-list plumbing still works. | ||
| assert host_config["Cpus"] == 2 | ||
| assert host_config["CpusetCpus"] == "0,1" | ||
|
|
||
| async def test_multi_node_allocation_omits_cpuset_mems( | ||
| self, | ||
| cpu_plugin: CPUPlugin, | ||
| ) -> None: | ||
| """When cores span multiple NUMA nodes, CpusetMems must be omitted so that | ||
| the Docker/kernel default NUMA memory placement policy can apply.""" | ||
| with self._patch_node_of_cpu({0: 0, 1: 0, 2: 1, 3: 1}): | ||
| result = await cpu_plugin.generate_docker_args( | ||
| AsyncMock(), | ||
| self._device_alloc([0, 2]), | ||
| ) | ||
|
|
||
| host_config = result["HostConfig"] | ||
| assert "CpusetMems" not in host_config | ||
| # Sanity: core-list plumbing still works. | ||
| assert host_config["Cpus"] == 2 | ||
| assert host_config["CpusetCpus"] == "0,2" | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "core_to_node", | ||
| [ | ||
| {0: 0}, | ||
| {0: 0, 1: -1}, | ||
| ], | ||
| ids=["unknown_node", "negative_node"], | ||
| ) | ||
| async def test_unknown_or_negative_node_omits_cpuset_mems( | ||
| self, | ||
| cpu_plugin: CPUPlugin, | ||
| core_to_node: dict[int, int], | ||
| ) -> None: | ||
| """When any allocated core maps to an unknown (libnuma returns -1) or | ||
| explicitly negative NUMA node, CpusetMems must be omitted. | ||
|
|
||
| `unknown_node` covers the case where libnuma cannot resolve a core (the | ||
| patched side_effect returns -1 for unmapped cores). `negative_node` covers | ||
| the case where libnuma reports -1 for a known core (NUMA info unavailable). | ||
| Both collapse to the same `node < 0` branch in the SUT. | ||
| """ | ||
| with self._patch_node_of_cpu(core_to_node): | ||
| result = await cpu_plugin.generate_docker_args( | ||
| AsyncMock(), | ||
| self._device_alloc([0, 1]), | ||
| ) | ||
|
|
||
| host_config = result["HostConfig"] | ||
| assert "CpusetMems" not in host_config | ||
| # Sanity: core-list plumbing still works. | ||
| assert host_config["Cpus"] == 2 | ||
| assert host_config["CpusetCpus"] == "0,1" | ||
|
|
||
| async def test_non_numa_host_omits_cpuset_mems( | ||
| self, | ||
| cpu_plugin: CPUPlugin, | ||
| ) -> None: | ||
| """On non-NUMA / non-Linux hosts (macOS, Docker Desktop, WSL, Linux | ||
| without libnuma.so) libnuma.num_nodes() reports 1 and node_of_cpu() | ||
| hardcodes 0. The plugin must short-circuit before inspecting per-core | ||
| nodes so containers are not unconditionally pinned to CpusetMems="0". | ||
| """ | ||
| # node_of_cpu would return 0 for every core on a non-NUMA host; assert | ||
| # we never reach that branch by mapping cores to a bogus node that | ||
| # would otherwise produce a stale CpusetMems assignment. | ||
| with self._patch_node_of_cpu({0: 0, 1: 0}, num_nodes=1): | ||
| result = await cpu_plugin.generate_docker_args( | ||
| AsyncMock(), | ||
| self._device_alloc([0, 1]), | ||
| ) | ||
|
|
||
| host_config = result["HostConfig"] | ||
| assert "CpusetMems" not in host_config | ||
| # Sanity: core-list plumbing still works. | ||
| assert host_config["Cpus"] == 2 | ||
| assert host_config["CpusetCpus"] == "0,1" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about extracting this logic into a separate private method?