Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 32 additions & 4 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -776,12 +776,17 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
def update_dpu_state(self, key, state):
"""
Update specific DPU state fields in chassisStateDB using the given key.
If state is 'down', delete the table first before setting new values.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gpunathilell @vvolam
The original intent is to "Update specific DPU state fields in chassisStateDB using the given key." and not to delete the table. Doing so, will wipe out the "CP, DP" details that was there around the time of midplane failure. The intent was to preserved that data which has important information that will help debug easily.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gpunathilell I think you are deleting only the entry in the table right?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The state is only changed to down at this point, We are not deleting any entries

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gpunathilell Then can you fix the comment accordingly? It says you are deleting the table first.

"""
try:
# Connect to the CHASSIS_STATE_DB using daemon_base
if not self.chassis_state_db:
self.chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")

# If state is down, delete the table first

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gpunathilell @vvolam We shouldn't delete the entire table. If there is a need to delete the table (I need to understand the requirement fully), save the CP/DP values and restore them.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rameshraghupathy I guess Gagan is deleting the table entry here and not entire table as per my understanding. I think entry is deleted here and added at line 799?

@gpunathilell, But why do we have to delete entry and updating entry is not helping here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The state is only changed to down at this point, We are not deleting any entries

if state == "down":
self.chassis_state_db.delete(key)

# Prepare the fields to update
updates = {
"dpu_midplane_link_state": state,
Expand Down Expand Up @@ -1367,12 +1372,14 @@ class DpuStateManagerTask(ProcessTaskBase):
self.dpu_state_updater = dpu_state_updater
self.state_db = daemon_base.db_connect('STATE_DB')
self.app_db = daemon_base.db_connect('APPL_DB')
self.chassis_state_db = daemon_base.db_connect('CHASSIS_STATE_DB')

def task_worker(self):
sel = swsscommon.Select()
selectable = [
swsscommon.SubscriberStateTable(self.app_db, 'PORT_TABLE'),
swsscommon.SubscriberStateTable(self.state_db, 'SYSTEM_READY')
swsscommon.SubscriberStateTable(self.state_db, 'SYSTEM_READY'),
swsscommon.SubscriberStateTable(self.chassis_state_db, 'DPU_STATE')
]

for s in selectable:
Expand All @@ -1388,10 +1395,31 @@ class DpuStateManagerTask(ProcessTaskBase):
if state != swsscommon.Select.OBJECT:
continue

for s in selectable:
s.pops()
update_required = False

self.dpu_state_updater.update_state()
for s in selectable:
result = s.pop()
update_required = True # If there is any selectable object, we need to update the state
Comment on lines +1439 to +1443

Copilot AI May 20, 2025

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 'update_required' flag is being reinitialized for each selectable within the loop instead of accumulating conditions from all objects. Consider initializing 'update_required' to false before the loop and then OR-ing or accumulating the condition for each selectable to ensure proper state update decision.

Copilot uses AI. Check for mistakes.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The update_required is set this way, because it shows a clear distinction on when we perform the actual update, and when the update is disabled, I think implementing it this way is more clearer to understand

if result is None:
print("Here with None for {}".format(type(s)))
continue
key, op, fvp = result # Changed from _ to fvp to match what we use below
# Check if this is the DPU_STATE table
if s.getDbConnector().getDbName() == 'CHASSIS_STATE_DB':
# Don't update if this is a change for another DPU
if key != self.dpu_state_updater.name:
update_required = False
continue
# Don't update if this is our own dataplane/control plane state update to avoid recursion
if op == 'SET' and isinstance(fvp, tuple):
fvs = dict(fvp)
if 'dpu_data_plane_state' in fvs and 'dpu_control_plane_state' in fvs:
update_required = False
continue
self.logger.log_info(f"DPU_STATE change detected: operation={op}, key={key}")

if update_required:
self.dpu_state_updater.update_state()

except KeyboardInterrupt:
pass
Expand Down
9 changes: 9 additions & 0 deletions sonic-chassisd/tests/mock_swsscommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ def pop(self):
def pops(self):
return None

def getDbConnector(self):
return MockDbConnector()


class MockDbConnector:

def getDbName(self):
return 'CHASSIS_STATE_DB'

class RedisPipeline:
def __init__(self, db):
self.db = db
Expand Down
97 changes: 97 additions & 0 deletions sonic-chassisd/tests/test_dpu_chassisd.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,100 @@ def is_valid_date(date_str):
daemon_chassisd.stop.wait.return_value = True

thread.join()


def test_dpu_state_manager_table_deletion():
chassis = MockDpuChassis()
chassis.get_dpu_id = MagicMock(return_value=0)
chassis.get_dataplane_state = MagicMock(return_value=True)
chassis.get_controlplane_state = MagicMock(return_value=True)

chassis_state_db = {}

def hset(key, field, value):
if key not in chassis_state_db:
chassis_state_db[key] = {}
chassis_state_db[key][field] = value

with mock.patch.object(swsscommon.Table, 'hset', side_effect=hset):
with mock.patch.object(swsscommon.SubscriberStateTable, 'pop', return_value=('DPU0', 'DEL', None)):
with mock.patch.object(swsscommon.Select, 'select',
side_effect=[(swsscommon.Select.OBJECT, None), KeyboardInterrupt]):

dpu_updater = DpuStateUpdater(SYSLOG_IDENTIFIER, chassis)
dpu_updater._time_now = MagicMock(return_value='Sat Jan 01 12:00:00 AM UTC 2000')

dpu_state_mng = DpuStateManagerTask(SYSLOG_IDENTIFIER, dpu_updater)
dpu_state_mng.task_worker()

# Verify state was updated since this is a DEL operation
assert chassis_state_db == {'DPU0': {
'dpu_data_plane_state': 'up',
'dpu_data_plane_time': 'Sat Jan 01 12:00:00 AM UTC 2000',
'dpu_control_plane_state': 'up',
'dpu_control_plane_time': 'Sat Jan 01 12:00:00 AM UTC 2000'
}}


def test_dpu_state_manager_specific_key_update():
chassis = MockDpuChassis()
chassis.get_dpu_id = MagicMock(return_value=0)
chassis.get_dataplane_state = MagicMock(return_value=True)
chassis.get_controlplane_state = MagicMock(return_value=True)

chassis_state_db = {}

def hset(key, field, value):
if key not in chassis_state_db:
chassis_state_db[key] = {}
chassis_state_db[key][field] = value

with mock.patch.object(swsscommon.Table, 'hset', side_effect=hset):
with mock.patch.object(swsscommon.SubscriberStateTable, 'pop', return_value=('DPU0', 'SET', (('dpu_control_plane_state', 'up'), ('dpu_data_plane_state', 'up')))):
with mock.patch.object(swsscommon.Select, 'select',
side_effect=[(swsscommon.Select.OBJECT, None), KeyboardInterrupt]):

dpu_updater = DpuStateUpdater(SYSLOG_IDENTIFIER, chassis)
dpu_updater._time_now = MagicMock(return_value='Sat Jan 01 12:00:00 AM UTC 2000')

dpu_state_mng = DpuStateManagerTask(SYSLOG_IDENTIFIER, dpu_updater)
dpu_state_mng.task_worker()

# Verify no state update occurred since this was our own state update
assert chassis_state_db == {}


def test_dpu_state_manager_none_result():
chassis = MockDpuChassis()
chassis.get_dpu_id = MagicMock(return_value=0)
chassis.get_dataplane_state = MagicMock(return_value=True)
chassis.get_controlplane_state = MagicMock(return_value=True)

chassis_state_db = {}

def hset(key, field, value):
if key not in chassis_state_db:
chassis_state_db[key] = {}
chassis_state_db[key][field] = value

def mock_pop():
return None

with mock.patch.object(swsscommon.Table, 'hset', side_effect=hset):
with mock.patch.object(swsscommon.SubscriberStateTable, 'pop', side_effect=mock_pop):
with mock.patch.object(swsscommon.Select, 'select',
side_effect=[(swsscommon.Select.OBJECT, None), KeyboardInterrupt]):

dpu_updater = DpuStateUpdater(SYSLOG_IDENTIFIER, chassis)
dpu_updater._time_now = MagicMock(return_value='Sat Jan 01 12:00:00 AM UTC 2000')

dpu_state_mng = DpuStateManagerTask(SYSLOG_IDENTIFIER, dpu_updater)
dpu_state_mng.task_worker()

# Verify state was updated even with None result
assert chassis_state_db == {'DPU0': {
'dpu_data_plane_state': 'up',
'dpu_data_plane_time': 'Sat Jan 01 12:00:00 AM UTC 2000',
'dpu_control_plane_state': 'up',
'dpu_control_plane_time': 'Sat Jan 01 12:00:00 AM UTC 2000'
}}