Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 83 additions & 92 deletions tests/bgp/test_prefix_list_suppress.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import random
import re
import time
import uuid

import pytest
import yaml
Expand Down Expand Up @@ -63,11 +64,6 @@

BGPCFGD_RUNNING_TIMEOUT = 60
BGPCFGD_RUNNING_INTERVAL = 5
# Narrow journald lookback (seconds) used by TC-A5 after a fresh bgpcfgd
# restart. Wide enough to absorb the supervisorctl restart + bgpcfgd init
# on a slow real DUT (~30s p99), tight enough that stale lines from a
# previous image upgrade in the same 24h cannot leak into our assertions.
BGPCFGD_LOG_WINDOW_SECONDS = 180


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -357,40 +353,34 @@ def apply_constants_to_bgpcfgd(duthost):
)


def restart_bgpcfgd_only(duthost):
"""Bounce just the ``bgpcfgd`` supervisord program in every frontend
ASIC's bgp container and wait for it to come back RUNNING.
def place_syslog_marker(duthost):
"""Inject a unique marker into /var/log/syslog and return the marker string.

Cheaper and more deterministic than a full ``systemctl restart bgp``:
it keeps the container, zebra, bgpd and staticd alive and only
re-runs bgpcfgd's ``main()``/``do_work()``. The one-shot
``log_notice`` startup lines (``AsPath Manager is enabled for
<DEVICE_TYPE>``, etc.) are re-emitted so the caller can use a
narrow ``--since`` window for syslog assertions.
Used to window syslog assertions: after placing the marker, only lines
after it are considered by collect_syslog_since_marker().
Waits until the marker is confirmed present in the syslog file to avoid
races with asynchronous rsyslog writes.
"""
for container in bgp_container_names(duthost):
duthost.shell(
"sudo docker exec {} supervisorctl restart bgpcfgd".format(container),
module_ignore_errors=True,
)
wait_for_bgpcfgd(duthost, timeout=180)
marker = "SONIC_TEST_MARKER_{}".format(uuid.uuid4().hex[:12])
duthost.shell("logger -t SONIC_TEST '{}'".format(marker))
duthost.shell(
"timeout 5 bash -c \"while ! grep -q '{}' /var/log/syslog; do sleep 0.1; done\"".format(marker)
)
return marker


def collect_recent_syslog(duthost, pattern, since_seconds=120):
"""Return matching lines from the last *since_seconds* of /var/log/syslog.
def collect_syslog_since_marker(duthost, marker, pattern):
"""Return lines from /var/log/syslog matching *pattern* after *marker*.

The grep filters out ansible's own audit log lines
(``ansible-ansible.legacy.command Invoked with _raw_params=...``), which
echo the journalctl command itself, including the regex pattern,
back into the same log we're searching. Without this filter every call
would appear to match its own argument and produce a false positive."""
# The bgpcfgd warnings flow into the host's syslog; multi-asic containers
# also forward their syslog to the host. We just grep the host log.
Works on both single-ASIC and multi-ASIC platforms because all container
daemons forward their logs to the host's /var/log/syslog via rsyslog.
Filters out ansible audit lines that echo the grep pattern back.
"""
cmd = (
"sudo journalctl --since='{}sec ago' --no-pager 2>/dev/null "
"| grep -v 'ansible-ansible.legacy.command Invoked with _raw_params' "
"| grep -E {!r} || true"
).format(since_seconds, pattern)
"sudo sed -n '/{marker}/,$p' /var/log/syslog "
"| grep -v 'Invoked with _raw_params' "
"| grep -E {pattern!r} || true"
Comment thread
deepak-singhal0408 marked this conversation as resolved.
).format(marker=marker, pattern=pattern)
return duthost.shell(cmd)["stdout"]


Expand Down Expand Up @@ -654,68 +644,69 @@ def test_prefix_list_mgr_running_on_every_device(self, rand_one_frontend_duthost
"""TC-A5: PrefixListMgr is now started unconditionally; old log line
must be gone; bgpcfgd must be healthy.

We deliberately bounce ``bgpcfgd`` at the start of this test and
use a narrow post-restart window (``BGPCFGD_LOG_WINDOW_SECONDS``)
for every syslog assertion. A 24h ``journalctl`` window would
otherwise surface stale lines from a *previous* image (e.g. the
legacy ``Prefix List Manager and AsPath Manager are enabled for
UpperSpineRouter/UpstreamLC`` notice or a bgpcfgd traceback that
happened before today's upgrade), which would make this test
fail spuriously on regression cycles that re-image the DUT.
Bouncing bgpcfgd also forces the one-shot ``log_notice`` startup
lines (in particular ``AsPath Manager is enabled for
<DEVICE_TYPE>`` on spines) to be re-emitted into the narrow
window, so the presence/absence checks below are deterministic.
We restart ``bgpcfgd`` one container at a time, placing a unique
syslog marker before each restart. This scopes every assertion to
exactly the log output from that container's fresh bgpcfgd startup,
eliminating false positives from stale lines in a previous image.
Uses /var/log/syslog (not journalctl) so it works on multi-ASIC
chassis where container daemons log via rsyslog.
"""
duthost = rand_one_frontend_duthost
spine = is_upstream_spine(duthost)

for container in bgp_container_names(duthost):
# Place marker, restart bgpcfgd, wait for it to come back
marker = place_syslog_marker(duthost)
duthost.shell(
"sudo docker exec {} supervisorctl restart bgpcfgd".format(
container),
module_ignore_errors=True,
)
sonichost = getattr(duthost, "sonichost", duthost)
pytest_assert(
wait_until(180, 5, 0,
lambda c=container: sonichost.is_service_running(
"bgpcfgd", c)),
"bgpcfgd not RUNNING in {} within 180s".format(container),
)

restart_bgpcfgd_only(duthost)
# ``wait_for_bgpcfgd`` inside ``restart_bgpcfgd_only`` already
# asserts bgpcfgd is RUNNING on every frontend ASIC.

# The pre-PR notice was tied to UpperSpineRouter/UpstreamLC and
# should no longer appear at all -- even after a fresh bgpcfgd
# startup.
legacy = collect_recent_syslog(
duthost,
"Prefix List Manager and AsPath Manager are enabled for "
"UpperSpineRouter/UpstreamLC",
since_seconds=BGPCFGD_LOG_WINDOW_SECONDS,
)
pytest_assert(
not legacy.strip(),
"Legacy 'Prefix List Manager and AsPath Manager are enabled' log "
"line is still produced after bgpcfgd restart on {}:\n{}".format(
duthost.hostname, legacy),
)

# On spine devices the new "AsPath Manager is enabled for <TYPE>" line
# must appear. On non-spine devices it must not.
new_line = collect_recent_syslog(
duthost,
"AsPath Manager is enabled for",
since_seconds=BGPCFGD_LOG_WINDOW_SECONDS,
)
if is_upstream_spine(duthost):
# The pre-PR notice was tied to UpperSpineRouter/UpstreamLC and
# should no longer appear at all after a fresh bgpcfgd startup.
legacy = collect_syslog_since_marker(
duthost, marker,
"Prefix List Manager and AsPath Manager are enabled for "
"UpperSpineRouter/UpstreamLC",
)
pytest_assert(
new_line.strip(),
"Expected 'AsPath Manager is enabled for <DEVICE_TYPE>' log "
"line on spine device {} after bgpcfgd restart".format(
duthost.hostname),
not legacy.strip(),
"Legacy 'Prefix List Manager and AsPath Manager are enabled'"
" log line still produced after bgpcfgd restart in {} on"
" {}:\n{}".format(container, duthost.hostname, legacy),
)

# bgpcfgd must not have logged a traceback since the restart we
# just performed. We use the same narrow window so a stale
# traceback from a previous run cannot fail this assertion.
tracebacks = collect_recent_syslog(
duthost, "bgpcfgd.*Traceback",
since_seconds=BGPCFGD_LOG_WINDOW_SECONDS,
)
pytest_assert(
not tracebacks.strip(),
"bgpcfgd recorded a traceback after restart on {}:\n{}".format(
duthost.hostname, tracebacks),
)
# On spine devices the new "AsPath Manager is enabled for <TYPE>"
# line must appear. On non-spine devices it must not.
if spine:
new_line = collect_syslog_since_marker(
duthost, marker,
"AsPath Manager is enabled for",
)
pytest_assert(
new_line.strip(),
"Expected 'AsPath Manager is enabled for <DEVICE_TYPE>'"
" log line on spine device {} after bgpcfgd restart in"
" {}".format(duthost.hostname, container),
)

# bgpcfgd must not have logged a traceback since the restart.
tracebacks = collect_syslog_since_marker(
duthost, marker, "bgpcfgd.*Traceback",
)
pytest_assert(
not tracebacks.strip(),
"bgpcfgd recorded a traceback after restart in {} on"
" {}:\n{}".format(container, duthost.hostname, tracebacks),
)


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -826,6 +817,7 @@ def test_suppress_prefix_on_non_spine_device(

# --- Direct CONFIG_DB path ---
try:
marker = place_syslog_marker(duthost)
write_config_db_key_directly(duthost, SUPPRESS_TYPE, prefix)
pytest_assert(
wait_until(
Expand All @@ -835,11 +827,10 @@ def test_suppress_prefix_on_non_spine_device(
),
"SUPPRESS_PREFIX not picked up by PrefixListMgr from direct DB write",
)
unwanted_warn = collect_recent_syslog(
duthost,
unwanted_warn = collect_syslog_since_marker(
duthost, marker,
"PrefixListMgr:: Device type .* not supported for {}".format(
SUPPRESS_TYPE),
since_seconds=60,
)
pytest_assert(
not unwanted_warn.strip(),
Expand Down
Loading