Skip to content

Commit 8679a70

Browse files
Merge pull request #142 from lmiccini/fix-must-gather-resilience
fix: improve must-gather resilience for SOS and container log collection
2 parents: a959f75 + b2ea3c1 · commit 8679a70

3 files changed

Lines changed: 11 additions & 8 deletions

File tree

collection-scripts/gather_ctlplane_resources

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -69,11 +69,11 @@ function gather_ctlplane_resources {
6969
.metadata.name as $pod |
7070
(.status |
7171
(
72-
if has("initContainerStatuses") then .initContainerStatuses[] else empty end |
72+
if has("initContainerStatuses") then .initContainerStatuses[] | select(.state | has("running") or has("terminated")) else empty end |
7373
"\($pod) \(.name) \(has("lastState") and (.lastState | has("terminated"))) true"
7474
),
7575
(
76-
if has("containerStatuses") then .containerStatuses[] else empty end |
76+
if has("containerStatuses") then .containerStatuses[] | select(.state | has("running") or has("terminated")) else empty end |
7777
"\($pod) \(.name) \(has("lastState") and (.lastState | has("terminated"))) false"
7878
)
7979
)

collection-scripts/gather_sos

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,7 @@ gather_node_sos () {
8080
mkdir -p \"${TMPDIR}\" && \
8181
sudo podman rm --force toolbox-osp; \
8282
sudo --preserve-env podman pull --authfile /var/lib/kubelet/config.json $SUPPORT_TOOLS && \
83-
toolbox sos report --batch --all-logs $SOS_LIMIT --tmp-dir=\"${TMPDIR}\" && \
83+
toolbox sos report --batch --all-logs $SOS_LIMIT --tmp-dir=\"/host${TMPDIR}\" && \
8484
if [[ \"\$(ls /var/log/pods/*/{*.log.*,*/*.log.*} 2>/dev/null)\" != '' ]]; then tar --ignore-failed-read --warning=no-file-changed -cJf \"${TMPDIR}/podlogs.tar.xz\" --transform 's,^,podlogs/,' /var/log/pods/*/{*.log.*,*/*.log.*} || true; fi"
8585

8686
# shellcheck disable=SC2181
@@ -118,11 +118,13 @@ gather_node_sos () {
118118

119119
# if we are decompressing the sos report, remove the original sos archive
120120
if [[ ${SOS_DECOMPRESS} -eq 1 ]]; then
121-
mkdir "${SOS_PATH_NODES}/sosreport-$node"
122-
tar -i --one-top-level="${SOS_PATH_NODES}/sosreport-$node" --strip-components=1 --exclude='*/dev/null' -Jxf "${sos_file}"
123-
rm "${sos_file}"
124-
# Ensure write access to the sos reports directories so must-gather rsync doesn't fail
125-
chmod +w -R "${SOS_PATH_NODES}/sosreport-$node/"
121+
if tar -i -C "${SOS_PATH_NODES}" --one-top-level="sosreport-$node" --strip-components=1 --exclude='*/dev/null' -Jxf "${sos_file}"; then
122+
rm "${sos_file}"
123+
# Ensure write access to the sos reports directories so must-gather rsync doesn't fail
124+
chmod +w -R "${SOS_PATH_NODES}/sosreport-$node/"
125+
else
126+
echo "Failed to decompress SOS report for ${node}, keeping compressed archive"
127+
fi
126128
fi
127129

128130
sleep 1

pyscripts/tox.ini

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ envlist =
99
mypy
1010
stestr
1111
pytest
12+
skip_missing_interpreters = true
1213
skipsdist = true
1314
passenv =
1415
LANG

0 commit comments

Comments
 (0)