Skip to content

Commit 67e1b27

Browse files
authored
DAOS-18881 test: fix the csum_error_logging test (#18455)
Since #17828, faults injected by this test do not increase the NVMe device error counter. Instead, a RAS event is emitted. The test has to be adjusted accordingly. Signed-off-by: Jan Michalski <jan-marian.michalski@hpe.com>
1 parent fbf70d6 commit 67e1b27

1 file changed

Lines changed: 29 additions & 70 deletions

File tree

Lines changed: 29 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,102 +1,61 @@
11
"""
22
(C) Copyright 2020-2024 Intel Corporation.
3+
(C) Copyright 2026 Hewlett Packard Enterprise Development LP
34
45
SPDX-License-Identifier: BSD-2-Clause-Patent
56
"""
67

78
from avocado import fail_on
89
from daos_core_base import DaosCoreBase
9-
from dmg_utils import get_dmg_smd_info
1010
from exception_utils import CommandFailure
11-
from general_utils import get_log_file
11+
from general_utils import get_host_data, get_journalctl_command, journalctl_time
1212

1313

1414
class CsumErrorLog(DaosCoreBase):
1515
"""
16-
Test Class Description: This test runs
17-
daos_test -z (Checksum tests) and verifies
18-
whether Checksum Error Counters are incremented
19-
in the NVME device due to checksum fault injection.
16+
Test Class Description: Test checksum error logging.
17+
2018
:avocado: recursive
2119
"""
22-
# pylint: disable=too-many-instance-attributes
2320

2421
@fail_on(CommandFailure)
25-
def get_checksum_error_value(self, dmg, device_id):
26-
"""Get checksum error value from dmg storage_query_list_devices with health.
22+
def get_checksum_error_value(self, t_start, t_end):
23+
"""Query journalctl logs and count checksum error occurrences.
2724
2825
Args:
29-
dmg (DmgCommand): the DmgCommand object used to call storage_query_list_devices()
30-
device_id (str): Device UUID.
26+
t_start (str): The start time for the journalctl query.
27+
t_end (str): The end time for the journalctl query.
3128
3229
Returns:
33-
int: the number of checksum errors on the device
30+
int: the number of checksum errors found
3431
"""
35-
info = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices', uuid=device_id,
36-
health=True)
37-
for devices in info.values():
38-
for device in devices:
39-
try:
40-
if device['uuid'] == device_id:
41-
return device['ctrlr']['health_stats']['checksum_errs']
42-
except KeyError as error:
43-
self.fail(
44-
'Error parsing dmg storage query list-devices --health output: {}'.format(
45-
error))
46-
return 0
32+
cmd = get_journalctl_command(t_start, t_end, system=True, units="daos_server")
33+
results = get_host_data(self.hostlist_servers, cmd, text="journalctl",
34+
error="Error gathering system log events")
35+
self.log.debug(results)
36+
str_to_match = "CSUM error"
37+
occurrence = 0
38+
for host_result in results:
39+
occurrence += host_result["data"].count(str_to_match)
40+
return occurrence
4741

4842
@fail_on(CommandFailure)
4943
def test_csum_error_logging(self):
50-
"""Jira ID: DAOS-3927.
44+
"""Jira ID: DAOS-3927, DAOS-18881.
5145
52-
Test Description: Write Avocado Test to verify single data after
53-
pool/container disconnect/reconnect.
46+
Test Description: Inject checksum errors using daos_test -z and verify that the errors are
47+
logged in the system journal.
5448
5549
:avocado: tags=all,daily_regression
5650
:avocado: tags=hw,medium
5751
:avocado: tags=checksum,faults,daos_test
5852
:avocado: tags=CsumErrorLog,test_csum_error_logging
5953
"""
60-
self.log_step('Detecting server devices (dmg storage query list-devices)')
61-
test_run = False
62-
dmg = self.get_dmg_command()
63-
dmg.hostlist = self.hostlist_servers[0]
64-
host_devices = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices')
65-
for host, devices in host_devices.items():
66-
for device in devices:
67-
for entry in ('uuid', 'tgt_ids', 'role_bits'):
68-
if entry not in device:
69-
self.fail(
70-
'Missing {} info from dmg storage query list devices'.format(entry))
71-
self.log.info(
72-
'Host %s device: uuid=%s, targets=%s, role_bits=%s',
73-
host, device['uuid'], device['tgt_ids'], device['role_bits'])
74-
if not device['tgt_ids']:
75-
self.log_step('Skipping device without targets on {}'.format(device['uuid']))
76-
continue
77-
if (int(device['role_bits']) > 0) and not int(device['role_bits']) & 1:
78-
self.log_step(
79-
'Skipping {} device without data on {}'.format(
80-
device['role_bits'], device['uuid']))
81-
continue
82-
if not device['uuid']:
83-
self.fail('Device uuid undefined')
84-
self.log_step(
85-
'Get checksum errors before running the test (dmg storage query list-devices '
86-
'--health)')
87-
check_sum = self.get_checksum_error_value(dmg, device['uuid'])
88-
dmg.copy_certificates(get_log_file("daosCA/certs"), self.hostlist_clients)
89-
dmg.copy_configuration(self.hostlist_clients)
90-
self.log.info("Checksum Errors before: %d", check_sum)
91-
self.log_step('Run the test (daos_test -z)')
92-
self.run_subtest()
93-
test_run = True
94-
self.log_step(
95-
'Get checksum errors after running the test (dmg storage query list-devices '
96-
'--health)')
97-
check_sum_latest = self.get_checksum_error_value(dmg, device['uuid'])
98-
self.log.info('Checksum Errors after: %d', check_sum_latest)
99-
self.assertTrue(check_sum_latest > check_sum, 'Checksum Error Log not incremented')
100-
if not test_run:
101-
self.fail('No tests run for the devices found')
102-
self.log_step('Test Passed')
54+
t_start = journalctl_time()
55+
self.log_step('Run the test (daos_test -z)')
56+
self.run_subtest()
57+
t_end = journalctl_time()
58+
self.log_step('Check checksum error logs')
59+
checksum_errs = self.get_checksum_error_value(t_start, t_end)
60+
self.log.info('Checksum Errors reported: %d', checksum_errs)
61+
self.assertGreater(checksum_errs, 0, 'Checksum Errors not detected')

0 commit comments

Comments
 (0)