Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 146 additions & 75 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ CHASSIS_MODULE_INFO_HOSTNAME_FIELD = 'hostname'
CHASSIS_MODULE_REBOOT_INFO_TABLE = 'CHASSIS_MODULE_REBOOT_INFO_TABLE'
CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp'
CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot'

# Deferred reboot-cause check reasons stored in _pending_reboot_check
DEFERRED_DPU_REBOOT = 'dpu_reboot' # chassisd witnessed Online→Offline
DEFERRED_CHASSISD_RESTART = 'chassisd_restart' # chassisd found DPU already offline after restart

DEFAULT_LINECARD_REBOOT_TIMEOUT = 180
DEFAULT_DPU_REBOOT_TIMEOUT = 360
MAX_DPU_REBOOT_DURATION = 800
Expand Down Expand Up @@ -287,14 +292,14 @@ class ModuleUpdater(logger.Logger):

self.chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
if self._is_supervisor():
self.asic_table = swsscommon.Table(self.chassis_state_db,
self.asic_table = swsscommon.Table(self.chassis_state_db,
CHASSIS_FABRIC_ASIC_INFO_TABLE)
else:
self.asic_table = swsscommon.Table(self.chassis_state_db,
self.asic_table = swsscommon.Table(self.chassis_state_db,
CHASSIS_ASIC_INFO_TABLE)

self.hostname_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_HOSTNAME_TABLE)
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.down_modules = {}
self.chassis_app_db_clean_sha = None

Expand All @@ -305,7 +310,7 @@ class ModuleUpdater(logger.Logger):
field = line.split('=')[0].strip()
if field == "linecard_reboot_timeout":
self.linecard_reboot_timeout = int(line.split('=')[1].strip())

self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
if not self.midplane_initialized:
self.log_error("Chassisd midplane intialization failed")
Expand Down Expand Up @@ -407,7 +412,7 @@ class ModuleUpdater(logger.Logger):
else:
if self.phy_entity_table.get(key) is not None:
self.phy_entity_table._del(key)

# Construct key for down_modules dict. Example down_modules key format: LINE-CARD0|<hostname>
fvs = self.hostname_table.get(key)
if isinstance(fvs, list) and fvs[0] is True:
Expand All @@ -421,7 +426,7 @@ class ModuleUpdater(logger.Logger):
if prev_status == ModuleBase.MODULE_STATUS_ONLINE:
notOnlineModules.append(key)
# Record the time when the module down was detected to track the
# module down time. Used for chassis db cleanup for all asics of the module if the module is down for a
# module down time. Used for chassis db cleanup for all asics of the module if the module is down for a
# long time like 30 mins.
# All down modules, including the supervisor, are added to the down modules dictionary. This helps
# identify module operational status changes. But the clean up will not be attempted for supervisor
Expand Down Expand Up @@ -457,12 +462,12 @@ class ModuleUpdater(logger.Logger):
self.asic_table.set(asic_key, asic_fvs)

# In line card push the hostname of the module and num_asics to the chassis state db.
# The hostname is used as key to access chassis app db entries
# The hostname is used as key to access chassis app db entries
if not self._is_supervisor():
module_info_dict = self._get_module_info(my_index)
hostname_key = "{}{}".format(ModuleBase.MODULE_TYPE_LINE, int(self.my_slot) - 1)
hostname = try_get(device_info.get_hostname, default="None")
hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(self.my_slot)),
hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(self.my_slot)),
(CHASSIS_MODULE_INFO_HOSTNAME_FIELD, hostname),
(CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(len(module_info_dict[CHASSIS_MODULE_INFO_ASICS])))])
self.hostname_table.set(hostname_key, hostname_fvs)
Expand Down Expand Up @@ -520,12 +525,12 @@ class ModuleUpdater(logger.Logger):
if fvs[CHASSIS_MODULE_REBOOT_REBOOT_FIELD] == "expected":
return True
return False

def module_reboot_set_time(self, key):
    """Write the current time as the reboot timestamp for ``key``.

    The value of ``time.time()`` is stringified and stored under
    CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD in ``self.module_reboot_table``.
    """
    timestamp = str(time.time())
    stamp_fvs = swsscommon.FieldValuePairs(
        [(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, timestamp)])
    self.module_reboot_table.set(key, stamp_fvs)

def is_module_reboot_system_up_expired(self, key):
fvs = self.module_reboot_table.get(key)
if isinstance(fvs, list) and fvs[0] is True:
Expand All @@ -537,7 +542,7 @@ class ModuleUpdater(logger.Logger):
self.module_reboot_table._del(key)
return True
return False

def check_midplane_reachability(self):
if not self.midplane_initialized:
return
Expand Down Expand Up @@ -584,7 +589,7 @@ class ModuleUpdater(logger.Logger):
elif midplane_access is False and current_midplane_state == 'False':
if self.is_module_reboot_system_up_expired(module_key):
self.log_warning("Unexpected: Module {} (Slot {}) midplane connectivity is not restored in {} seconds".format(module_key, module.get_slot(), self.linecard_reboot_timeout))

# Update db with midplane information
fvs = swsscommon.FieldValuePairs([(CHASSIS_MIDPLANE_INFO_IP_FIELD, midplane_ip),
(CHASSIS_MIDPLANE_INFO_ACCESS_FIELD, str(midplane_access))])
Expand Down Expand Up @@ -718,6 +723,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
if not self.midplane_initialized:
self.log_error("Chassisd midplane intialization failed")

self._pending_reboot_check = {}
self.dpu_reboot_timeout = DEFAULT_DPU_REBOOT_TIMEOUT
if os.path.isfile(PLATFORM_JSON_FILE):
try:
Expand Down Expand Up @@ -774,69 +780,143 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
self.log_error(f"{module}: Failed to read previous-reboot-cause.json: {e}")
return None, None

def module_db_update(self):
for module_index in range(0, self.num_modules):
module_info_dict = self._get_module_info(module_index)
if module_info_dict is not None:
key = module_info_dict[CHASSIS_MODULE_INFO_NAME_FIELD]

if not key.startswith(ModuleBase.MODULE_TYPE_DPU):
self.log_error("Incorrect module-name {}. Should start with {} ".format(key,
ModuleBase.MODULE_TYPE_DPU))
continue

fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_DESC_FIELD, module_info_dict[CHASSIS_MODULE_INFO_DESC_FIELD]),
(CHASSIS_MODULE_INFO_SLOT_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SLOT_FIELD]),
(CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]),
(CHASSIS_MODULE_INFO_SERIAL_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SERIAL_FIELD])])
def _extract_cause(self, reboot_cause):
"""Extract the scalar cause string from a reboot_cause that may be a tuple, list, or string."""
if isinstance(reboot_cause, (tuple, list)):
return reboot_cause[0]
return reboot_cause

def _is_known_to_offline(self, prev_status, current_status):
    """Return True when a module moved into OFFLINE from a known, non-offline state.

    "Known" means the previous status is neither MODULE_STATUS_EMPTY nor
    the string form of MODULE_STATUS_OFFLINE.
    """
    offline = str(ModuleBase.MODULE_STATUS_OFFLINE)
    if current_status != offline:
        return False
    # The previous state must have been something other than EMPTY/OFFLINE.
    return prev_status not in (ModuleBase.MODULE_STATUS_EMPTY, offline)

def _is_empty_to_offline(self, prev_status, current_status):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vvolum IMO this can be handled with a minimal addition in the existing module_db_update() logic itself.

Today, module_db_update() already:

  • reads prev_status using get_module_current_status(),
  • updates the module table,
  • and compares previous/current oper-status for transition handling.

The new EMPTY→Offline case is simply another transition condition in that same flow.

The helper functions introduced in this PR ultimately still reduce to checking:

prev_status == EMPTY and current_status == Offline

That does not require introducing a new transition framework, deferred reboot-cause processing, or restructuring the existing reboot-cause handling model.

A couple of targeted lines in the existing transition handling path should be sufficient while preserving the current Online→Offline reboot-cause persistence semantics and overall design behavior.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rameshraghupathy after our discussion in the meeting offline, do you have any other suggestions?

"""EMPTY → OFFLINE — chassisd just (re)started, found DPU already offline."""
return (prev_status == ModuleBase.MODULE_STATUS_EMPTY
and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE))

def _is_back_to_online(self, prev_status, current_status):
    """Return True when a module left EMPTY/OFFLINE for any non-offline status.

    The previous status must be MODULE_STATUS_EMPTY or the string form of
    MODULE_STATUS_OFFLINE, and the current status must differ from OFFLINE.
    """
    offline = str(ModuleBase.MODULE_STATUS_OFFLINE)
    was_down = (prev_status == ModuleBase.MODULE_STATUS_EMPTY
                or prev_status == offline)
    return was_down and current_status != offline

def _handle_module_status_change(self, key, module_index, prev_status, current_status):
"""Handle DPU operational-status transitions and reboot-cause tracking."""
prev_status_empty = prev_status == ModuleBase.MODULE_STATUS_EMPTY

# Transition to offline from a known non-offline state.
# Defer get_reboot_cause until the DPU is back online —
# querying a powered-off DPU is unreliable.
if self._is_known_to_offline(prev_status, current_status):
self.log_notice(f"{key} operational status transitioning to offline")
self._pending_reboot_check[key] = DEFERRED_DPU_REBOOT
return

# Get a copy of the previous operational status of the module
prev_status = self.get_module_current_status(key)
self.module_table.set(key, fvs)
# DPU discovered offline after chassisd restart (e.g. config reload
# during an ongoing DPU reboot). prev_status is EMPTY because
# STATE_DB was flushed. Defer reboot-cause check until DPU is online.
if self._is_empty_to_offline(prev_status, current_status):
self._pending_reboot_check[key] = DEFERRED_CHASSISD_RESTART
return

# Get a copy of the current operational status of the module
current_status = module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]
# Transition to online from EMPTY or OFFLINE
if self._is_back_to_online(prev_status, current_status):
self.log_notice(f"{key} operational status transitioning to online")

# Operational status transitioning to offline
if prev_status != ModuleBase.MODULE_STATUS_EMPTY and prev_status != str(ModuleBase.MODULE_STATUS_OFFLINE) and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE):
self.log_notice("{} operational status transitioning to offline".format(key))
deferred = self._pending_reboot_check.pop(key, None)

# Persist dpu down time
self.persist_dpu_reboot_time(key)
# persist reboot cause
# EMPTY→ONLINE with no deferred: chassisd restarted while the
# DPU was already online — no reboot happened during this cycle.
# On first boot persist the initial cause; otherwise just
# repopulate CHASSIS_STATE_DB from on-disk data.
if deferred is None and prev_status_empty:
if self._is_first_boot(key):
reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
self.persist_dpu_reboot_cause(reboot_cause, key)
# publish reboot cause to db
self.update_dpu_reboot_cause_to_db(key)
return

reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
current_cause = self._extract_cause(reboot_cause)
stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key)

# Deferred from Online→Offline: the DPU went down and is now back.
# Reboot time was already persisted. Skip persisting if the
# cause was already recorded after this reboot's execution time.
if deferred == DEFERRED_DPU_REBOOT:
reboot_time = self.retrieve_dpu_reboot_time(key)
if stored_time_str and reboot_time and stored_time_str >= reboot_time:
self.log_notice(f"{key}: Reboot cause already persisted after reboot at {reboot_time}")
self.update_dpu_reboot_cause_to_db(key)
return
self.persist_dpu_reboot_cause(reboot_cause, key)
self.update_dpu_reboot_cause_to_db(key)
return

# Deferred from EMPTY→Offline: chassisd restarted while DPU was
# offline, now it is online so we can read the real reboot cause.
# Persist if: (a) a reboot execution time exists that is newer than
# the stored cause timestamp (same logic as DEFERRED_DPU_REBOOT), or
# (b) the cause string itself changed. This handles back-to-back
# reboots that produce the same cause type.
if deferred == DEFERRED_CHASSISD_RESTART:
reboot_time = self.retrieve_dpu_reboot_time(key)
should_persist = False
if reboot_time and stored_time_str and stored_time_str < reboot_time:
self.log_notice(
f"{key}: New reboot detected (stored time {stored_time_str} "
f"< reboot time {reboot_time})")
should_persist = True
elif stored_cause is not None and current_cause and current_cause != stored_cause:
self.log_notice(
f"{key}: Reboot cause changed while chassisd was down "
f"(stored: {stored_cause}, current: {current_cause})")
should_persist = True
elif stored_cause is None and current_cause:
# First reboot ever — no stored cause yet
should_persist = True

if should_persist:
self.persist_dpu_reboot_cause(reboot_cause, key)
self.update_dpu_reboot_cause_to_db(key)
return

elif (prev_status == ModuleBase.MODULE_STATUS_EMPTY or prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE):
self.log_notice(f"{key} operational status transitioning to online")
# OFFLINE→ONLINE with no deferred (edge case).
already_persisted = False
reboot_time = self.retrieve_dpu_reboot_time(key)
if stored_time_str and reboot_time and stored_time_str >= reboot_time:
self.log_notice(f"{key}: Reboot cause already persisted after reboot at {reboot_time}")
already_persisted = True

reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
if isinstance(reboot_cause, (tuple, list)):
current_cause = reboot_cause[0]
else:
current_cause = reboot_cause

stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key)

is_reboot = False
if current_cause and stored_cause and stored_time_str:
try:
stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc)
now = datetime.now(timezone.utc)
delta_sec = (now - stored_dt).total_seconds()

if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION:
self.log_info(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s")
is_reboot = True
except Exception as e:
self.log_error(f"{key}: Reboot cause/time comparison failed: {e}")

if not is_reboot and (stored_time_str is not None or self._is_first_boot(key)):
# persist reboot cause and publish to db
self.persist_dpu_reboot_cause(reboot_cause, key)
self.update_dpu_reboot_cause_to_db(key)
if not already_persisted and (stored_time_str is not None or self._is_first_boot(key)):
self.persist_dpu_reboot_cause(reboot_cause, key)
self.update_dpu_reboot_cause_to_db(key)

def module_db_update(self):
    """Refresh every DPU module's table entry and process status transitions.

    For each module index below ``self.num_modules``:
      * skip modules for which ``_get_module_info`` returns None;
      * log an error and skip modules whose name does not start with
        ``ModuleBase.MODULE_TYPE_DPU``;
      * read the previously stored status, write the description, slot,
        oper-status and serial fields to ``self.module_table``, then hand
        the (previous, current) status pair to
        ``_handle_module_status_change``.
    """
    published_fields = (
        CHASSIS_MODULE_INFO_DESC_FIELD,
        CHASSIS_MODULE_INFO_SLOT_FIELD,
        CHASSIS_MODULE_INFO_OPERSTATUS_FIELD,
        CHASSIS_MODULE_INFO_SERIAL_FIELD,
    )

    for module_index in range(self.num_modules):
        info = self._get_module_info(module_index)
        if info is None:
            continue

        key = info[CHASSIS_MODULE_INFO_NAME_FIELD]
        if not key.startswith(ModuleBase.MODULE_TYPE_DPU):
            self.log_error("Incorrect module-name {}. Should start with {} ".format(
                key, ModuleBase.MODULE_TYPE_DPU))
            continue

        fvs = swsscommon.FieldValuePairs(
            [(field, info[field]) for field in published_fields])

        # The stored status must be captured before the table is overwritten,
        # otherwise the previous value is lost.
        prev_status = self.get_module_current_status(key)
        self.module_table.set(key, fvs)
        current_status = info[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]

        self._handle_module_status_change(key, module_index, prev_status, current_status)

def _get_module_info(self, module_index):
"""
Expand Down Expand Up @@ -933,15 +1013,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
except FileNotFoundError:
return False

def persist_dpu_reboot_time(self, module):
    """Write the current time string to the module's ``prev_reboot_time.txt``.

    The file lives under MODULE_REBOOT_CAUSE_DIR/<module name, lower-cased>/;
    the directory is created if it does not exist yet.
    """
    stamp = self._get_current_time_str()
    out_file = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "prev_reboot_time.txt")
    parent_dir = os.path.dirname(out_file)

    os.makedirs(parent_dir, exist_ok=True)
    with open(out_file, 'w') as handle:
        handle.write(stamp)

def retrieve_dpu_reboot_time(self, module):
"""Retrieve the persisted reboot time from a file."""
path = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "prev_reboot_time.txt")
Expand Down
Loading
Loading