Skip to content
Open
206 changes: 206 additions & 0 deletions sonic-xcvrd/tests/test_xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -4138,6 +4138,153 @@ def test_CmisManagerTask_test_is_timer_expired(self, expired_time, current_time,
# Assert the result matches the expected output
assert result == expected_result

def test_CmisManagerTask_handle_cmis_inserted_state_waits_for_dp_settle(self):
"""When mid-transition on this lport's lanes, handle_cmis_inserted_state
short-circuits after computing host_lanes_mask/media_lanes_mask and
gates the rest of the state on should_wait_for_dp_settle."""
port_mapping = PortMapping()
stop_event = threading.Event()
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event, platform_chassis=MagicMock())

port_change_event = PortChangeEvent(
'Ethernet0', 1, 0, PortChangeEvent.PORT_SET,
{'speed': '400000', 'lanes': '1,2,3,4,5,6,7,8'})
task.on_port_update_event(port_change_event)
assert task.port_dict['Ethernet0']['dp_settle_deadline'] is None

# Seed fields the inserted-state handler reads before the wait check.
api = MagicMock()
task.port_dict['Ethernet0']['api'] = api
task.port_dict['Ethernet0']['host_lane_count'] = 8
task.port_dict['Ethernet0']['subport'] = 0

task.is_fast_reboot_enabled = MagicMock(return_value=False)
task.get_cmis_max_host_lanes_mask = MagicMock(return_value=0xff)
task.get_cmis_host_lanes_mask = MagicMock(return_value=0xff)
task.get_cmis_media_lanes_mask = MagicMock(return_value=0xff)
api.get_media_lane_count = MagicMock(return_value=8)
api.get_media_lane_assignment_option = MagicMock(return_value=1)
task.is_decommission_required = MagicMock(return_value=False)
task.is_decomm_lead_lport = MagicMock(return_value=False)
task.is_decomm_pending = MagicMock(return_value=False)
task.should_wait_for_dp_settle = MagicMock(return_value=True)

with patch('xcvrd.cmis.cmis_manager_task.common.get_cmis_application_desired',
MagicMock(return_value=1)):
assert task.handle_cmis_inserted_state('Ethernet0') is False

task.should_wait_for_dp_settle.assert_called_once_with('Ethernet0', api, 0xff)

def test_CmisManagerTask_get_transient_datapath_state(self):
port_mapping = PortMapping()
stop_event = threading.Event()
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event, platform_chassis=MagicMock())

# No transient state on lanes in mask: returns None
api = MagicMock()
api.get_datapath_state = MagicMock(return_value={
'DP1State': 'DataPathDeactivated', 'DP2State': 'DataPathActivated'})
assert task.get_transient_datapath_state(api, 0x03) is None

# DataPathDeinit found on a lane in mask
api.get_datapath_state = MagicMock(return_value={
'DP1State': 'DataPathDeactivated', 'DP2State': 'DataPathDeinit'})
assert task.get_transient_datapath_state(api, 0x03) == 'DataPathDeinit'

# DataPathInit found on a lane in mask
api.get_datapath_state = MagicMock(return_value={
'DP1State': 'DataPathInit', 'DP2State': 'DataPathDeactivated'})
assert task.get_transient_datapath_state(api, 0x03) == 'DataPathInit'

# Transient on a lane outside the mask is ignored (unrelated breakout sibling)
api.get_datapath_state = MagicMock(return_value={
'DP1State': 'DataPathActivated', 'DP2State': 'DataPathActivated',
'DP5State': 'DataPathDeinit', 'DP6State': 'DataPathInit'})
assert task.get_transient_datapath_state(api, 0x0f) is None
# Same state, but with the upper-lane mask: returns the transient state
assert task.get_transient_datapath_state(api, 0xf0) == 'DataPathDeinit'

# Caught exceptions (api missing / not implemented): returns None
api.get_datapath_state = MagicMock(side_effect=AttributeError('missing'))
assert task.get_transient_datapath_state(api, 0xff) is None
api.get_datapath_state = MagicMock(side_effect=NotImplementedError('nyi'))
assert task.get_transient_datapath_state(api, 0xff) is None

# Unexpected exceptions propagate (we no longer swallow all errors)
api.get_datapath_state = MagicMock(side_effect=RuntimeError('boom'))
with pytest.raises(RuntimeError):
task.get_transient_datapath_state(api, 0xff)

def _make_dp_settle_task(self):
"""Helper for the should_wait_for_dp_settle suite below: build a
CmisManagerTask with a single Ethernet0 entry and a dummy api/mask."""
port_mapping = PortMapping()
stop_event = threading.Event()
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event, platform_chassis=MagicMock())
task.port_dict['Ethernet0'] = {'dp_settle_deadline': None}
api = MagicMock()
api.get_datapath_state = MagicMock(return_value={'DP1State': 'DataPathActivated'})
host_lanes_mask = 0x0f
return task, api, host_lanes_mask

def test_CmisManagerTask_should_wait_for_dp_settle_not_transient_clears_deadline(self):
"""Not transient: clears any prior deadline and returns False."""
task, api, host_lanes_mask = self._make_dp_settle_task()
task.port_dict['Ethernet0']['dp_settle_deadline'] = 12345.0
task.get_transient_datapath_state = MagicMock(return_value=None)

assert task.should_wait_for_dp_settle('Ethernet0', api, host_lanes_mask) is False
assert task.port_dict['Ethernet0']['dp_settle_deadline'] is None
task.get_transient_datapath_state.assert_called_once_with(api, host_lanes_mask)

def test_CmisManagerTask_should_wait_for_dp_settle_first_deinit_sets_deadline(self):
"""First call with DataPathDeinit: seeds deadline using deinit duration."""
task, api, host_lanes_mask = self._make_dp_settle_task()
task.get_transient_datapath_state = MagicMock(return_value='DataPathDeinit')
task.get_cmis_dp_deinit_duration_secs = MagicMock(return_value=600)
task.get_cmis_dp_init_duration_secs = MagicMock(return_value=60)

with patch('xcvrd.cmis.cmis_manager_task.time.time', return_value=1000.0):
assert task.should_wait_for_dp_settle('Ethernet0', api, host_lanes_mask) is True
assert task.port_dict['Ethernet0']['dp_settle_deadline'] == 1600.0
task.get_cmis_dp_deinit_duration_secs.assert_called_once_with(api)

def test_CmisManagerTask_should_wait_for_dp_settle_first_init_sets_deadline(self):
"""First call with DataPathInit: seeds deadline using init duration."""
task, api, host_lanes_mask = self._make_dp_settle_task()
task.get_transient_datapath_state = MagicMock(return_value='DataPathInit')
task.get_cmis_dp_deinit_duration_secs = MagicMock(return_value=600)
task.get_cmis_dp_init_duration_secs = MagicMock(return_value=60)

with patch('xcvrd.cmis.cmis_manager_task.time.time', return_value=2000.0):
assert task.should_wait_for_dp_settle('Ethernet0', api, host_lanes_mask) is True
assert task.port_dict['Ethernet0']['dp_settle_deadline'] == 2060.0
task.get_cmis_dp_init_duration_secs.assert_called_once_with(api)

def test_CmisManagerTask_should_wait_for_dp_settle_within_deadline_still_waiting(self):
"""Subsequent call within deadline: still waiting, no force_cmis_reinit."""
task, api, host_lanes_mask = self._make_dp_settle_task()
task.port_dict['Ethernet0']['dp_settle_deadline'] = 5000.0
task.get_transient_datapath_state = MagicMock(return_value='DataPathInit')
task.force_cmis_reinit = MagicMock()

with patch('xcvrd.cmis.cmis_manager_task.time.time', return_value=4999.0):
assert task.should_wait_for_dp_settle('Ethernet0', api, host_lanes_mask) is True
task.force_cmis_reinit.assert_not_called()
assert task.port_dict['Ethernet0']['dp_settle_deadline'] == 5000.0

def test_CmisManagerTask_should_wait_for_dp_settle_past_deadline_forces_reinit(self):
"""Past deadline: timeout, force_cmis_reinit invoked with retries+1."""
task, api, host_lanes_mask = self._make_dp_settle_task()
task.port_dict['Ethernet0']['dp_settle_deadline'] = 5000.0
task.port_dict['Ethernet0']['cmis_retries'] = 2
task.get_transient_datapath_state = MagicMock(return_value='DataPathInit')
task.force_cmis_reinit = MagicMock()

with patch('xcvrd.cmis.cmis_manager_task.time.time', return_value=6000.0):
assert task.should_wait_for_dp_settle('Ethernet0', api, host_lanes_mask) is True
task.force_cmis_reinit.assert_called_once_with('Ethernet0', 3)

@patch('xcvrd.xcvrd.XcvrTableHelper.get_status_sw_tbl')
@patch('xcvrd.xcvrd.platform_chassis')
@patch('xcvrd.xcvrd_utilities.common.is_fast_reboot_enabled', MagicMock(return_value=(False)))
Expand Down Expand Up @@ -4206,6 +4353,9 @@ def test_CmisManagerTask_task_worker(self, mock_chassis, mock_get_status_sw_tbl)
'ConfigStatusLane7': 'ConfigSuccess',
'ConfigStatusLane8': 'ConfigSuccess'
})
# First entry is consumed by the new dp-settle check at INSERTED-state
# entry (should_wait_for_dp_settle); remaining entries feed the rest
# of the CMIS state machine.
mock_xcvr_api.get_datapath_state = MagicMock(side_effect=[
{
'DP1State': 'DataPathDeactivated',
Expand Down Expand Up @@ -4257,6 +4407,16 @@ def test_CmisManagerTask_task_worker(self, mock_chassis, mock_get_status_sw_tbl)
'DP7State': 'DataPathDeactivated',
'DP8State': 'DataPathDeactivated'
},
{
'DP1State': 'DataPathDeactivated',
'DP2State': 'DataPathDeactivated',
'DP3State': 'DataPathDeactivated',
'DP4State': 'DataPathDeactivated',
'DP5State': 'DataPathDeactivated',
'DP6State': 'DataPathDeactivated',
'DP7State': 'DataPathDeactivated',
'DP8State': 'DataPathDeactivated'
},
{
'DP1State': 'DataPathInitialized',
'DP2State': 'DataPathInitialized',
Expand Down Expand Up @@ -4486,7 +4646,20 @@ def test_CmisManagerTask_task_worker_fastboot(self, mock_chassis, mock_get_statu
'ConfigStatusLane7': 'ConfigSuccess',
'ConfigStatusLane8': 'ConfigSuccess'
})
# First entry is consumed by the new dp-settle check at INSERTED-state
# entry (should_wait_for_dp_settle); remaining entries feed the rest
# of the CMIS state machine.
mock_xcvr_api.get_datapath_state = MagicMock(side_effect=[
{
'DP1State': 'DataPathDeactivated',
'DP2State': 'DataPathDeactivated',
'DP3State': 'DataPathDeactivated',
'DP4State': 'DataPathDeactivated',
'DP5State': 'DataPathDeactivated',
'DP6State': 'DataPathDeactivated',
'DP7State': 'DataPathDeactivated',
'DP8State': 'DataPathDeactivated'
},
{
'DP1State': 'DataPathDeactivated',
'DP2State': 'DataPathDeactivated',
Expand Down Expand Up @@ -4626,7 +4799,20 @@ def test_CmisManagerTask_task_worker_host_tx_ready_false_to_true(self, mock_chas
'ConfigStatusLane7': 'ConfigSuccess',
'ConfigStatusLane8': 'ConfigSuccess'
})
# First entry is consumed by the new dp-settle check at INSERTED-state
# entry (should_wait_for_dp_settle); remaining entries feed the rest
# of the CMIS state machine.
mock_xcvr_api.get_datapath_state = MagicMock(side_effect=[
{
'DP1State': 'DataPathDeactivated',
'DP2State': 'DataPathDeactivated',
'DP3State': 'DataPathDeactivated',
'DP4State': 'DataPathDeactivated',
'DP5State': 'DataPathDeactivated',
'DP6State': 'DataPathDeactivated',
'DP7State': 'DataPathDeactivated',
'DP8State': 'DataPathDeactivated'
},
{
'DP1State': 'DataPathDeactivated',
'DP2State': 'DataPathDeactivated',
Expand All @@ -4647,6 +4833,16 @@ def test_CmisManagerTask_task_worker_host_tx_ready_false_to_true(self, mock_chas
'DP7State': 'DataPathInitialized',
'DP8State': 'DataPathInitialized'
},
{
'DP1State': 'DataPathInitialized',
'DP2State': 'DataPathInitialized',
'DP3State': 'DataPathInitialized',
'DP4State': 'DataPathInitialized',
'DP5State': 'DataPathInitialized',
'DP6State': 'DataPathInitialized',
'DP7State': 'DataPathInitialized',
'DP8State': 'DataPathInitialized'
},
{
'DP1State': 'DataPathActivated',
'DP2State': 'DataPathActivated',
Expand Down Expand Up @@ -4697,6 +4893,16 @@ def test_CmisManagerTask_task_worker_host_tx_ready_false_to_true(self, mock_chas
'DP7State': 'DataPathInitialized',
'DP8State': 'DataPathInitialized'
},
{
'DP1State': 'DataPathInitialized',
'DP2State': 'DataPathInitialized',
'DP3State': 'DataPathInitialized',
'DP4State': 'DataPathInitialized',
'DP5State': 'DataPathInitialized',
'DP6State': 'DataPathInitialized',
'DP7State': 'DataPathInitialized',
'DP8State': 'DataPathInitialized'
},
])
mock_sfp = MagicMock()
mock_sfp.get_presence = MagicMock(return_value=True)
Expand Down
83 changes: 82 additions & 1 deletion sonic-xcvrd/xcvrd/cmis/cmis_manager_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def on_port_update_event(self, port_change_event):
if port_change_event.event_type == port_change_event.PORT_SET:
if lport not in self.port_dict:
self.port_dict[lport] = {"asic_id": port_change_event.asic_id,
"forced_tx_disabled": False}
"forced_tx_disabled": False,
"dp_settle_deadline": None}
if pport >= 0:
self.port_dict[lport]['index'] = pport
if 'speed' in port_change_event.port_dict and port_change_event.port_dict['speed'] != 'N/A':
Expand Down Expand Up @@ -582,6 +583,7 @@ def force_cmis_reinit(self, lport, retries=0):
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_INSERTED)
self.port_dict[lport]['cmis_retries'] = retries
self.port_dict[lport]['cmis_expired'] = None # No expiration
self.port_dict[lport]['dp_settle_deadline'] = None

def check_module_state(self, api, states):
"""
Expand Down Expand Up @@ -845,6 +847,77 @@ def is_timer_expired(self, expired_time, current_time=None):

return expired_time <= current_time

def get_transient_datapath_state(self, api, host_lanes_mask):
"""
If any datapath lane owned by host_lanes_mask is in a transient state
(DataPathDeinit or DataPathInit, transitioning to DataPathDeactivated
or DataPathInitialized), returns that transient state name. Otherwise
returns None.

host_lanes_mask is mandatory so an unrelated breakout sibling cannot
block this port's CMIS initialization.
"""
transient = ('DataPathDeinit', 'DataPathInit')
try:
dp_state = api.get_datapath_state() or {}
except (AttributeError, NotImplementedError) as e:
self.log_error("Failed to read datapath state: {}".format(e))
return None
Comment thread
arpit-nexthop marked this conversation as resolved.

for lane in range(self.CMIS_MAX_HOST_LANES):
if ((1 << lane) & host_lanes_mask) == 0:
continue
s = dp_state.get("DP{}State".format(lane + 1))
if s in transient:
return s
return None

def should_wait_for_dp_settle(self, lport, api, host_lanes_mask):
"""
One-shot wait at INSERTED entry: if the module is mid-transition on the
lanes owned by lport (e.g., xcvrd restart caught it during DpInit or
DpDeinit), wait for those lanes to reach a terminal datapath state
before reconfiguring.

The deadline is held in self.port_dict[lport]['dp_settle_deadline']
and lives only for the current entry into CMIS_STATE_INSERTED: it is
seeded on first observation of a transient state and cleared once the
datapath settles, the wait times out, or force_cmis_reinit runs. On
xcvrd restart the deadline is intentionally recomputed from scratch
— that is the scenario this wait was added for, since xcvrd has no
memory of how far the in-flight transition had progressed.

Returns True while the caller should return from the state handler
(still waiting, or timeout triggered force_cmis_reinit). Returns False
once the datapath has settled or no wait is needed.
"""
transient_state = self.get_transient_datapath_state(api, host_lanes_mask)
if transient_state is None:
if self.port_dict[lport].get('dp_settle_deadline') is not None:
self.log_notice("{}: datapath settled, clearing dp_settle_deadline".format(lport))
self.port_dict[lport]['dp_settle_deadline'] = None
return False

deadline = self.port_dict[lport].get('dp_settle_deadline')
if deadline is None:
# Conservative: use the full transition window. The module may
# already be partway through, so we may wait longer than needed
# in the worst case, but never shorter.
if transient_state == 'DataPathDeinit':
duration = self.get_cmis_dp_deinit_duration_secs(api)
else: # DataPathInit
duration = self.get_cmis_dp_init_duration_secs(api)
self.port_dict[lport]['dp_settle_deadline'] = time.time() + duration
self.log_notice("{}: waiting up to {}s for datapath to settle (DP state={})".format(
lport, duration, transient_state))
return True
if time.time() < deadline:
return True
self.log_notice("{}: timeout waiting for datapath to settle, forcing CMIS reinit".format(lport))
retries = self.port_dict[lport].get('cmis_retries', 0)
self.force_cmis_reinit(lport, retries + 1)
return True

def handle_cmis_inserted_state(self, lport):
"""
Handle the CMIS_STATE_INSERTED state for a logical port.
Expand All @@ -858,6 +931,7 @@ def handle_cmis_inserted_state(self, lport):
"""
port_info = self.port_dict[lport]
api = port_info.get('api')

host_lane_count = port_info.get('host_lane_count')
speed = port_info.get('speed')
subport = port_info.get('subport')
Expand Down Expand Up @@ -899,6 +973,13 @@ def handle_cmis_inserted_state(self, lport):
media_lanes_mask = self.port_dict[lport]['media_lanes_mask']
self.log_notice("{}: Setting media_lanemask=0x{:x}".format(lport, media_lanes_mask))

# Once the lanes owned by this lport are known, gate on a one-shot
# wait for any transient DP state on those lanes (e.g. xcvrd restart
# caught the module mid DpInit/DpDeinit). This keeps unrelated breakout
# siblings from blocking this port.
if self.should_wait_for_dp_settle(lport, api, host_lanes_mask):
return False

if self.is_decommission_required(api, lport):
self.set_decomm_pending(lport)

Expand Down
Loading