Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 34 additions & 12 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -288,14 +288,14 @@ class ModuleUpdater(logger.Logger):

self.chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
if self._is_supervisor():
self.asic_table = swsscommon.Table(self.chassis_state_db,
self.asic_table = swsscommon.Table(self.chassis_state_db,
CHASSIS_FABRIC_ASIC_INFO_TABLE)
else:
self.asic_table = swsscommon.Table(self.chassis_state_db,
self.asic_table = swsscommon.Table(self.chassis_state_db,
CHASSIS_ASIC_INFO_TABLE)

self.hostname_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_HOSTNAME_TABLE)
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.down_modules = {}
self.chassis_app_db_clean_sha = None

Expand All @@ -306,7 +306,7 @@ class ModuleUpdater(logger.Logger):
field = line.split('=')[0].strip()
if field == "linecard_reboot_timeout":
self.linecard_reboot_timeout = int(line.split('=')[1].strip())

self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
if not self.midplane_initialized:
self.log_error("Chassisd midplane intialization failed")
Expand Down Expand Up @@ -408,7 +408,7 @@ class ModuleUpdater(logger.Logger):
else:
if self.phy_entity_table.get(key) is not None:
self.phy_entity_table._del(key)

# Construct key for down_modules dict. Example down_modules key format: LINE-CARD0|<hostname>
fvs = self.hostname_table.get(key)
if isinstance(fvs, list) and fvs[0] is True:
Expand All @@ -422,7 +422,7 @@ class ModuleUpdater(logger.Logger):
if prev_status == ModuleBase.MODULE_STATUS_ONLINE:
notOnlineModules.append(key)
# Record the time when the module down was detected to track the
# module down time. Used for chassis db cleanup for all asics of the module if the module is down for a
# module down time. Used for chassis db cleanup for all asics of the module if the module is down for a
# long time like 30 mins.
# All down modules including supervisor are added to the down modules dictionary. This is to help
# identifying module operational status change. But the clean up will not be attempted for supervisor
Expand Down Expand Up @@ -458,12 +458,12 @@ class ModuleUpdater(logger.Logger):
self.asic_table.set(asic_key, asic_fvs)

# In line card push the hostname of the module and num_asics to the chassis state db.
# The hostname is used as key to access chassis app db entries
# The hostname is used as key to access chassis app db entries
if not self._is_supervisor():
module_info_dict = self._get_module_info(my_index)
hostname_key = "{}{}".format(ModuleBase.MODULE_TYPE_LINE, int(self.my_slot) - 1)
hostname = try_get(device_info.get_hostname, default="None")
hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(self.my_slot)),
hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(self.my_slot)),
(CHASSIS_MODULE_INFO_HOSTNAME_FIELD, hostname),
(CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(len(module_info_dict[CHASSIS_MODULE_INFO_ASICS])))])
self.hostname_table.set(hostname_key, hostname_fvs)
Expand Down Expand Up @@ -521,12 +521,12 @@ class ModuleUpdater(logger.Logger):
if fvs[CHASSIS_MODULE_REBOOT_REBOOT_FIELD] == "expected":
return True
return False

def module_reboot_set_time(self, key):
    """Stamp ``key`` in the module reboot table with the current time.

    Writes a single field, CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, whose
    value is ``str(time.time())`` captured when this method is called.

    Args:
        key: Row key in ``self.module_reboot_table`` to write to.
    """
    timestamp = str(time.time())
    reboot_fvs = swsscommon.FieldValuePairs(
        [(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, timestamp)])
    self.module_reboot_table.set(key, reboot_fvs)

def is_module_reboot_system_up_expired(self, key):
fvs = self.module_reboot_table.get(key)
if isinstance(fvs, list) and fvs[0] is True:
Expand All @@ -538,7 +538,7 @@ class ModuleUpdater(logger.Logger):
self.module_reboot_table._del(key)
return True
return False

def check_midplane_reachability(self):
if not self.midplane_initialized:
return
Expand Down Expand Up @@ -585,7 +585,7 @@ class ModuleUpdater(logger.Logger):
elif midplane_access is False and current_midplane_state == 'False':
if self.is_module_reboot_system_up_expired(module_key):
self.log_warning("Unexpected: Module {} (Slot {}) midplane connectivity is not restored in {} seconds".format(module_key, module.get_slot(), self.linecard_reboot_timeout))

# Update db with midplane information
fvs = swsscommon.FieldValuePairs([(CHASSIS_MIDPLANE_INFO_IP_FIELD, midplane_ip),
(CHASSIS_MIDPLANE_INFO_ACCESS_FIELD, str(midplane_access))])
Expand Down Expand Up @@ -810,6 +810,28 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
# publish reboot cause to db
self.update_dpu_reboot_cause_to_db(key)

# DPU discovered offline after chassisd restart (e.g. config reload
# during an ongoing DPU reboot). prev_status is EMPTY because
# STATE_DB was flushed. Check whether the reboot cause changed
# since the last persisted one and, if so, record it.
elif prev_status == ModuleBase.MODULE_STATUS_EMPTY and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE):
reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
Comment thread
vvolam marked this conversation as resolved.
Outdated
stored_cause, _ = self.retrieve_dpu_reboot_info(key)

if stored_cause is not None:
if isinstance(reboot_cause, (tuple, list)):
current_cause = reboot_cause[0]
else:
current_cause = reboot_cause

if current_cause and current_cause != stored_cause:
self.log_notice(
f"{key}: Reboot cause changed while chassisd was down "
f"(stored: {stored_cause}, current: {current_cause})")
self.persist_dpu_reboot_time(key)
self.persist_dpu_reboot_cause(reboot_cause, key)
self.update_dpu_reboot_cause_to_db(key)

elif (prev_status == ModuleBase.MODULE_STATUS_EMPTY or prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE):
self.log_notice(f"{key} operational status transitioning to online")

Expand Down
123 changes: 104 additions & 19 deletions sonic-chassisd/tests/test_chassisd.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_moduleupdater_check_phyentity_entry_after_fabric_removal():
module_updater.module_db_update()
fvs = module_updater.phy_entity_table.get(name)
assert fvs == None

def test_smartswitch_moduleupdater_check_valid_fields():
chassis = MockSmartSwitchChassis()
index = 0
Expand Down Expand Up @@ -284,6 +284,91 @@ def test_online_transition_skips_reboot_update():
mock_persist.assert_not_called()
mock_update.assert_not_called()

def test_empty_to_offline_persists_changed_reboot_cause():
    """
    If chassisd restarts (e.g. config reload) while a DPU is offline,
    prev_status will be EMPTY (STATE_DB flushed) and current status OFFLINE.
    If the reboot cause changed since the last persisted one, the new cause
    must be recorded. Regression test for issue #24275.
    """
    chassis = MockSmartSwitchChassis()
    name = "DPU0"
    module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0")
    # DPU is currently offline
    module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE)
    chassis.module_list.append(module)

    updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis)
    # NOTE(review): nothing is written for DPU0 before module_db_update(), so
    # this presumably relies on a freshly constructed updater yielding
    # prev_status == EMPTY (simulates STATE_DB flush) - confirm.

    # Cause reported by the platform now vs. the one persisted earlier.
    new_cause = ("Power Loss", "power auxiliary outage or reload")
    stored_info = ("Reboot", "2025_10_10_07_36_12")

    with patch.object(module, 'get_reboot_cause', return_value=new_cause), \
         patch.object(updater, 'retrieve_dpu_reboot_info',
                      return_value=stored_info), \
         patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \
         patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \
         patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db:

        updater.module_db_update()

        # Reboot cause changed ("Reboot" → "Power Loss") so it must be persisted
        mock_persist_time.assert_called_once_with(name)
        # Pin the arguments too: the freshly read cause must be what gets
        # persisted, and it must be persisted under this DPU's key.
        mock_persist_cause.assert_called_once_with(new_cause, name)
        mock_update_db.assert_called_once_with(name)

def test_empty_to_offline_skips_same_reboot_cause():
    """
    Chassisd restart with the DPU offline and an unchanged reboot cause:
    no duplicate reboot-cause entry may be written.
    """
    name = "DPU0"
    chassis = MockSmartSwitchChassis()
    dpu = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0")
    dpu.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE)
    chassis.module_list.append(dpu)

    updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis)

    # The current cause and the stored cause share the same first element.
    current_cause = ("Reboot", "Reset from Main board")
    stored_info = ("Reboot", "2025_10_10_07_36_12")

    with patch.object(dpu, 'get_reboot_cause', return_value=current_cause), \
         patch.object(updater, 'retrieve_dpu_reboot_info',
                      return_value=stored_info), \
         patch.object(updater, 'persist_dpu_reboot_time') as persist_time_mock, \
         patch.object(updater, 'persist_dpu_reboot_cause') as persist_cause_mock, \
         patch.object(updater, 'update_dpu_reboot_cause_to_db') as update_db_mock:

        updater.module_db_update()

        # Same reboot cause — none of the persistence hooks may fire
        for untouched in (persist_time_mock, persist_cause_mock, update_db_mock):
            untouched.assert_not_called()

def test_empty_to_offline_skips_first_boot():
    """
    First-boot scenario: retrieve_dpu_reboot_info is mocked to return
    (None, None), i.e. there is no previously stored reboot cause.
    The EMPTY→OFFLINE path must NOT persist anything in that case.
    """
    name = "DPU0"
    chassis = MockSmartSwitchChassis()
    dpu = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0")
    dpu.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE)
    chassis.module_list.append(dpu)

    updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis)

    current_cause = ("Power Loss", "N/A")
    nothing_stored = (None, None)

    with patch.object(dpu, 'get_reboot_cause', return_value=current_cause), \
         patch.object(updater, 'retrieve_dpu_reboot_info',
                      return_value=nothing_stored), \
         patch.object(updater, 'persist_dpu_reboot_time') as persist_time_mock, \
         patch.object(updater, 'persist_dpu_reboot_cause') as persist_cause_mock, \
         patch.object(updater, 'update_dpu_reboot_cause_to_db') as update_db_mock:

        updater.module_db_update()

        # No stored cause (first boot) — none of the persistence hooks may fire
        for untouched in (persist_time_mock, persist_cause_mock, update_db_mock):
            untouched.assert_not_called()

def test_retrieve_dpu_reboot_info_success():
class DummyChassis:
def get_num_modules(self): return 0
Expand Down Expand Up @@ -833,7 +918,7 @@ def test_moduleupdater_check_string_slot():
midplane_table = module_updater.midplane_table
#Check only one entry in database
assert 1 == midplane_table.size()

def test_midplane_presence_modules():
chassis = MockChassis()

Expand Down Expand Up @@ -1039,7 +1124,7 @@ def lc_mock_open(*args, **kwargs):
@patch('os.path.isfile', MagicMock(return_value=True))
def test_midplane_presence_modules_linecard_reboot():
chassis = MockChassis()

#Supervisor
index = 0
name = "SUPERVISOR0"
Expand Down Expand Up @@ -1104,7 +1189,7 @@ def test_midplane_presence_modules_linecard_reboot():
assert module.get_midplane_ip() == fvs[CHASSIS_MIDPLANE_INFO_IP_FIELD]
assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]


#Set access of line-card to Down (to mock midplane connectivity state change)
module.set_midplane_reachable(False)
# set expected reboot of linecard
Expand Down Expand Up @@ -1144,9 +1229,9 @@ def test_midplane_presence_modules_linecard_reboot():
if isinstance(fvs, list):
fvs = dict(fvs[-1])
assert module.get_midplane_ip() == fvs[CHASSIS_MIDPLANE_INFO_IP_FIELD]
assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]
assert module_updater.linecard_reboot_timeout == 240
assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]
assert module_updater.linecard_reboot_timeout == 240

def test_midplane_presence_supervisor():
chassis = MockChassis()

Expand Down Expand Up @@ -1454,7 +1539,7 @@ def test_daemon_run_smartswitch():
def test_set_initial_dpu_admin_state_up():
"""Test set_initial_dpu_admin_state when admin state is up"""
chassis = MockSmartSwitchChassis()

# DPU0 details
index = 0
name = "DPU0"
Expand All @@ -1469,12 +1554,12 @@ def test_set_initial_dpu_admin_state_up():
status = ModuleBase.MODULE_STATUS_ONLINE
module.set_oper_status(status)
chassis.module_list.append(module)

# Supervisor ModuleUpdater
module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis)
module_updater.module_db_update()
module_updater.modules_num_update()

# ChassisdDaemon setup
daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER, chassis)
daemon_chassisd.module_updater = module_updater
Expand Down Expand Up @@ -1505,7 +1590,7 @@ def test_set_initial_dpu_admin_state_up():
def test_set_initial_dpu_admin_state_empty_offline():
"""Test set_initial_dpu_admin_state when admin state is empty and operational state is offline"""
chassis = MockSmartSwitchChassis()

# DPU0 details
index = 0
name = "DPU0"
Expand All @@ -1520,12 +1605,12 @@ def test_set_initial_dpu_admin_state_empty_offline():
status = ModuleBase.MODULE_STATUS_OFFLINE
module.set_oper_status(status)
chassis.module_list.append(module)

# Supervisor ModuleUpdater
module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis)
module_updater.module_db_update()
module_updater.modules_num_update()

# ChassisdDaemon setup
daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER, chassis)
daemon_chassisd.module_updater = module_updater
Expand Down Expand Up @@ -1607,7 +1692,7 @@ def test_set_initial_dpu_admin_state_empty_not_offline():
def test_set_initial_dpu_admin_state_exception():
"""Test set_initial_dpu_admin_state handles exceptions gracefully"""
chassis = MockSmartSwitchChassis()

# DPU0 details
index = 0
name = "DPU0"
Expand Down Expand Up @@ -1816,7 +1901,7 @@ def test_chassis_db_cleanup():
# Mock hostname table update for the line card LINE-CARD0
hostname = "lc1-host-00"
num_asics = 1
hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(lc_slot)),
hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(lc_slot)),
(CHASSIS_MODULE_INFO_HOSTNAME_FIELD, hostname),
(CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(num_asics))])
sup_module_updater.hostname_table.set(lc_name, hostname_fvs)
Expand Down Expand Up @@ -1849,7 +1934,7 @@ def test_chassis_db_cleanup():
# Mock >= CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD module down period for LINE-CARD1
down_module_key = lc2_name+"|"
assert down_module_key not in sup_module_updater.down_modules.keys()

sup_module_updater.module_down_chassis_db_cleanup()

def test_chassis_db_bootup_with_empty_slot():
Expand Down Expand Up @@ -1895,7 +1980,7 @@ def test_chassis_db_bootup_with_empty_slot():
# Supervisor ModuleUpdater
sup_module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis, sup_slot, sup_slot)
sup_module_updater.modules_num_update()

sup_module_updater.module_db_update()

# check LC1 STATUS ONLINE in module table
Expand All @@ -1904,14 +1989,14 @@ def test_chassis_db_bootup_with_empty_slot():
fvs = dict(fvs[-1])
assert ModuleBase.MODULE_STATUS_ONLINE == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]

# check LC2 STATUS EMPTY in module table
# check LC2 STATUS EMPTY in module table
fvs = sup_module_updater.module_table.get(lc2_name)
if isinstance(fvs, list):
fvs = dict(fvs[-1])
assert ModuleBase.MODULE_STATUS_EMPTY == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]

# Both should not be in down_module keys.

down_module_lc1_key = lc_name+"|"
assert down_module_lc1_key not in sup_module_updater.down_modules.keys()
down_module_lc2_key = lc_name+"|"
Expand Down
Loading