Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions lisa/microsoft/testsuites/core/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from lisa.sut_orchestrator.azure.common import AzureNodeSchema
from lisa.sut_orchestrator.azure.platform_ import AzurePlatform
from lisa.tools import Cat, InterruptInspector, Lscpu, TaskSet, Uname
from lisa.tools.lscpu import UNKNOWN_CACHE_ID

hyperv_interrupt_substr = ["hyperv", "hypervsum", "Hypervisor", "Hyper-V"]

Expand Down Expand Up @@ -121,6 +122,15 @@ def verify_l3_cache(
# For all other cases, check L3 cache mapping with socket awareness
cpu_info = lscpu.get_cpu_info()

self._check_cache_topology_exposed(
cpu_info,
skip_message=(
"Cache topology is not exposed on this VM. "
"lscpu reports no cache information (likely a confidential VM "
"or a VM size that does not expose cache topology to the guest)."
),
)

# Build a mapping of socket -> NUMA nodes and socket -> L3 caches
socket_to_numa_nodes: dict[int, set[int]] = {}
socket_to_l3_caches: dict[int, set[int]] = {}
Expand Down Expand Up @@ -299,6 +309,13 @@ def _create_stimer_interrupts(self, node: Node, cpu_count: int) -> None:

def _verify_node_mapping(self, node: Node, numa_node_size: int) -> None:
cpu_info = node.tools[Lscpu].get_cpu_info()
self._check_cache_topology_exposed(
cpu_info,
skip_message=(
"Cache topology is not exposed on this VM. "
"lscpu reports no cache information."
),
)
cpu_info.sort(key=lambda cpu: cpu.cpu)
for i, cpu in enumerate(cpu_info):
numa_node_id = i // numa_node_size
Expand All @@ -308,6 +325,28 @@ def _verify_node_mapping(self, node: Node, numa_node_size: int) -> None:
"associated with the core.",
).is_equal_to(numa_node_id)

def _check_cache_topology_exposed(
    self,
    cpu_info: list[Any],
    skip_message: str,
) -> None:
    """
    Ensure lscpu reported usable L3 cache IDs before a test relies on them.

    Args:
        cpu_info: per-CPU records to inspect; only the ``l3_cache``
            attribute of each record is read here.
        skip_message: message for the SkippedException raised when no CPU
            has a known L3 cache ID.

    Raises:
        SkippedException: every CPU reports UNKNOWN_CACHE_ID for L3.
        LisaException: cpu_info is empty, or only some of the CPUs report
            UNKNOWN_CACHE_ID for L3.
    """
    total_cpu_count = len(cpu_info)
    if not total_cpu_count:
        # With no CPUs, "all CPUs lack cache info" is vacuously true
        # (0 == 0) and the test would be skipped with a misleading
        # "cache topology is not exposed" message. An empty result means
        # lscpu output was missing, which is a failure, not a skip.
        raise LisaException(
            "lscpu returned no CPU entries, so cache topology cannot be "
            "checked. Verify the output of "
            "'lscpu --extended=cpu,node,socket,cache' on the target node."
        )

    # On some VMs (e.g. confidential VMs), cache topology is not exposed
    # by the hypervisor, so lscpu reports "-" for all cache values.
    # If all CPUs lack cache info, skip the test. If only some do, treat it
    # as a real failure so a partial/mixed state isn't silently masked.
    unknown_l3_cache_count = sum(
        1 for cpu in cpu_info if cpu.l3_cache == UNKNOWN_CACHE_ID
    )
    if unknown_l3_cache_count == total_cpu_count:
        raise SkippedException(skip_message)
    if unknown_l3_cache_count:
        raise LisaException(
            "Inconsistent L3 cache topology reported by lscpu: "
            f"{unknown_l3_cache_count} of {total_cpu_count} CPUs have unknown "
            "L3 cache IDs while others have valid values. Investigate lscpu "
            "parsing or host cache-topology exposure on this VM."
        )
Comment on lines 311 to +348

def _is_one_to_one_mapping(
self,
socket_to_numa_nodes: dict[int, set[int]],
Expand Down
63 changes: 49 additions & 14 deletions lisa/tools/lscpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
)


# Sentinel value used in CPUInfo cache fields (l1_data_cache,
# l1_instruction_cache, l2_cache, l3_cache) when lscpu does not expose
# cache topology (e.g. confidential VMs where lscpu outputs "-" for the
# CACHE column instead of "L1d:L1i:L2:L3").
# Parsed cache IDs come from \d+ regex groups converted with int(), so they
# are never negative and -1 cannot collide with a real cache ID.
UNKNOWN_CACHE_ID = -1


class CPUInfo:
def __init__(
self,
Expand Down Expand Up @@ -91,6 +97,12 @@ class Lscpu(Tool):
r"(?P<l1_data_cache>\d+):(?P<l1_instruction_cache>\d+):"
r"(?P<l2_cache>\d+):(?P<l3_cache>\d+)$"
)
# On some VMs (e.g. confidential VMs), cache topology is not exposed
# and lscpu outputs "-" instead of cache IDs:
# 0 0 0 -
# get_cpu_info() tries this pattern with fullmatch() when a line does not
# match _core_numa_mappings. It captures only cpu, numa_node and socket;
# the cache fields of the resulting CPUInfo are set to UNKNOWN_CACHE_ID.
_core_numa_no_cache = re.compile(
r"\s*(?P<cpu>\d+)\s+(?P<numa_node>\d+)\s+(?P<socket>\d+)\s+-$"
)
# Model name: Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz
# Model name: AMD EPYC 7763 64-Core Processor
# Model name: AMD EPYC 7763 64-Core Processor
Expand Down Expand Up @@ -267,6 +279,10 @@ def get_cpu_info(self) -> List[CPUInfo]:
# CPU NODE SOCKET L1d:L1i:L2:L3
# 0 0 0 0:0:0:0
# 1 0 0 0:0:0:0
#
# On some VMs (e.g. confidential VMs), cache topology is not exposed:
# CPU NODE SOCKET CACHE
# 0 0 0 -
result = self.run(
"--extended=cpu,node,socket,cache", expected_exit_code=0
).stdout
Expand All @@ -278,21 +294,40 @@ def get_cpu_info(self) -> List[CPUInfo]:
output: List[CPUInfo] = []
for item in mappings:
match_result = self._core_numa_mappings.fullmatch(item)
assert (
match_result
), f"lscpu NUMA node mapping is not in expected format: {item}"
output.append(
CPUInfo(
cpu=int(match_result.group("cpu")),
numa_node=int(match_result.group("numa_node")),
socket=int(match_result.group("socket")),
l1_data_cache=int(match_result.group("l1_data_cache")),
l1_instruction_cache=int(
match_result.group("l1_instruction_cache")
),
l2_cache=int(match_result.group("l2_cache")),
l3_cache=int(match_result.group("l3_cache")),
if match_result:
output.append(
CPUInfo(
cpu=int(match_result.group("cpu")),
numa_node=int(match_result.group("numa_node")),
socket=int(match_result.group("socket")),
l1_data_cache=int(match_result.group("l1_data_cache")),
l1_instruction_cache=int(
match_result.group("l1_instruction_cache")
),
l2_cache=int(match_result.group("l2_cache")),
l3_cache=int(match_result.group("l3_cache")),
)
)
continue
no_cache_match = self._core_numa_no_cache.fullmatch(item)
if no_cache_match:
output.append(
CPUInfo(
cpu=int(no_cache_match.group("cpu")),
numa_node=int(no_cache_match.group("numa_node")),
socket=int(no_cache_match.group("socket")),
l1_data_cache=UNKNOWN_CACHE_ID,
l1_instruction_cache=UNKNOWN_CACHE_ID,
l2_cache=UNKNOWN_CACHE_ID,
l3_cache=UNKNOWN_CACHE_ID,
)
)
continue
raise LisaException(
"lscpu NUMA node mapping is not in the expected format: "
f"{item}. Verify the output of "
"'lscpu --extended=cpu,node,socket,cache' on the target node "
"and update the parser if the format has changed."
)
return output

Expand Down
Loading