|
20 | 20 | import logging |
21 | 21 | import time |
22 | 22 | import unittest |
| 23 | +from unittest import mock |
| 24 | + |
| 25 | +import requests |
23 | 26 |
|
24 | 27 | logging.basicConfig(level=logging.INFO) |
25 | 28 |
|
|
42 | 45 | from bqmonitor.pipeline import _parse_table_ref |
43 | 46 | from bqmonitor.pipeline import _parse_webhook_spec |
44 | 47 | from bqmonitor.pipeline import _PostAnomalyToWebhook |
| 48 | +from bqmonitor.pipeline import _WEBHOOK_BASE_BACKOFF_SEC |
| 49 | +from bqmonitor.pipeline import _WEBHOOK_MAX_BACKOFF_SEC |
| 50 | +from bqmonitor.pipeline import _WEBHOOK_MAX_RETRIES |
45 | 51 | from bqmonitor.pipeline import _RateLimitAlerts |
46 | 52 | from bqmonitor.pipeline import _substitute_template_tree |
47 | 53 | from bqmonitor.pipeline import _ThresholdAlert |
@@ -915,9 +921,51 @@ def request(self, method, url, json=None, headers=None, timeout=None): |
915 | 921 | return _StubResponse(status_code=self._status_code) |
916 | 922 |
|
917 | 923 |
|
| 924 | +class _SequenceSession: |
| 925 | + """Session stub that returns a scripted outcome per request() call. |
| 926 | +
|
| 927 | + Each entry in ``outcomes`` is either an int HTTP status (returns a |
| 928 | + ``_StubResponse`` with that code) or an ``Exception`` instance (raised, |
| 929 | + to emulate a network-level failure like a dropped connection). Once the |
| 930 | + script is exhausted the final entry repeats, so an "always failing" |
| 931 | + endpoint is expressed with a single-element script. |
| 932 | + """ |
| 933 | + |
| 934 | + def __init__(self, outcomes): |
| 935 | + self._outcomes = list(outcomes) |
| 936 | + self.calls = [] |
| 937 | + |
| 938 | + def request(self, method, url, json=None, headers=None, timeout=None): |
| 939 | + self.calls.append({ |
| 940 | + 'method': method, 'url': url, 'json': json, |
| 941 | + 'headers': headers, 'timeout': timeout, |
| 942 | + }) |
| 943 | + idx = min(len(self.calls) - 1, len(self._outcomes) - 1) |
| 944 | + outcome = self._outcomes[idx] |
| 945 | + if isinstance(outcome, Exception): |
| 946 | + raise outcome |
| 947 | + return _StubResponse(status_code=outcome) |
| 948 | + |
| 949 | + |
918 | 950 | class PostAnomalyToWebhookTest(unittest.TestCase): |
919 | 951 | """Tests for _PostAnomalyToWebhook DoFn (session stubbed; no network).""" |
920 | 952 |
|
| 953 | + def setUp(self): |
| 954 | + # The DoFn sleeps between transient retries; patch it out so the |
| 955 | + # retry/backoff tests run instantly. Tests that never hit a transient |
| 956 | + # path simply leave this mock uncalled. The captured call args also |
| 957 | + # let the backoff-schedule test assert the exact sleep sequence. |
| 958 | + patcher = mock.patch('bqmonitor.pipeline.time.sleep') |
| 959 | + self.sleep_mock = patcher.start() |
| 960 | + self.addCleanup(patcher.stop) |
| 961 | + |
| 962 | + def _make_dofn_with_outcomes(self, outcomes, body=None): |
| 963 | + """Build a DoFn whose session replays ``outcomes`` (see |
| 964 | + _SequenceSession) across successive request() calls.""" |
| 965 | + dofn = self._make_dofn(body or {'q': '{value}'}) |
| 966 | + dofn._session = _SequenceSession(outcomes) |
| 967 | + return dofn |
| 968 | + |
921 | 969 | def _make_result(self, label, value=42.0, score=5.0, model_id='ZScore'): |
922 | 970 | row = beam.Row( |
923 | 971 | value=value, |
@@ -1041,29 +1089,126 @@ def test_nested_body_substituted(self): |
1041 | 1089 | posted['dataAgentContext']['dataAgent'], |
1042 | 1090 | 'projects/p/dataAgents/a') |
1043 | 1091 |
|
1044 | | - def test_5xx_raises_for_retry(self): |
1045 | | - """5xx → transient; bundle retry covers server-side flapping.""" |
| 1092 | + def test_persistent_5xx_retries_then_drops(self): |
| 1093 | + """A persistently-5xx endpoint is retried inline _WEBHOOK_MAX_RETRIES |
| 1094 | + times (so 1 initial + N retries attempts) and then dropped -- NOT |
| 1095 | + raised. We retry inline rather than via Beam bundle retry because |
| 1096 | + AsyncWrapper does not reliably re-deliver a raising bundle, and for the |
| 1097 | + same reason we drop rather than raise once retries are exhausted |
| 1098 | + (raising would only risk wedging the bundle).""" |
1046 | 1099 | dofn = self._make_dofn({'q': '{value}'}, status_code=500) |
1047 | | - with self.assertRaises(RuntimeError): |
1048 | | - dofn.process(self._make_result(label=1)) |
1049 | | - # The POST was attempted even though it failed. |
1050 | | - self.assertEqual(len(dofn._session.calls), 1) |
| 1100 | + # Must not raise. |
| 1101 | + dofn.process(self._make_result(label=1)) |
| 1102 | + self.assertEqual( |
| 1103 | + len(dofn._session.calls), _WEBHOOK_MAX_RETRIES + 1) |
| 1104 | + self.assertEqual(self.sleep_mock.call_count, _WEBHOOK_MAX_RETRIES) |
1051 | 1105 |
|
1052 | | - def test_503_raises_for_retry(self): |
| 1106 | + def test_503_retries_then_drops(self): |
1053 | 1107 | dofn = self._make_dofn({'q': '{value}'}, status_code=503) |
1054 | | - with self.assertRaises(RuntimeError): |
1055 | | - dofn.process(self._make_result(label=1)) |
| 1108 | + dofn.process(self._make_result(label=1)) # must not raise |
| 1109 | + self.assertEqual( |
| 1110 | + len(dofn._session.calls), _WEBHOOK_MAX_RETRIES + 1) |
1056 | 1111 |
|
1057 | | - def test_429_raises_for_retry(self): |
| 1112 | + def test_429_retries_then_drops(self): |
1058 | 1113 | """429 Too Many Requests → transient; back off and retry.""" |
1059 | 1114 | dofn = self._make_dofn({'q': '{value}'}, status_code=429) |
1060 | | - with self.assertRaises(RuntimeError): |
1061 | | - dofn.process(self._make_result(label=1)) |
| 1115 | + dofn.process(self._make_result(label=1)) # must not raise |
| 1116 | + self.assertEqual( |
| 1117 | + len(dofn._session.calls), _WEBHOOK_MAX_RETRIES + 1) |
1062 | 1118 |
|
1063 | | - def test_408_raises_for_retry(self): |
| 1119 | + def test_408_retries_then_drops(self): |
1064 | 1120 | dofn = self._make_dofn({'q': '{value}'}, status_code=408) |
1065 | | - with self.assertRaises(RuntimeError): |
1066 | | - dofn.process(self._make_result(label=1)) |
| 1121 | + dofn.process(self._make_result(label=1)) # must not raise |
| 1122 | + self.assertEqual( |
| 1123 | + len(dofn._session.calls), _WEBHOOK_MAX_RETRIES + 1) |
| 1124 | + |
| 1125 | + def test_transient_status_then_success_does_not_raise(self): |
| 1126 | + """A few transient 503s followed by a 200 succeeds without raising; |
| 1127 | + the anomaly is delivered on the first non-transient response and no |
| 1128 | + further attempts are made.""" |
| 1129 | + dofn = self._make_dofn_with_outcomes([503, 503, 503, 200]) |
| 1130 | + # Must not raise. |
| 1131 | + dofn.process(self._make_result(label=1, value=99.0)) |
| 1132 | + # 3 failures + 1 success = 4 attempts, then it stops. |
| 1133 | + self.assertEqual(len(dofn._session.calls), 4) |
| 1134 | + # One sleep before each of the 3 retries. |
| 1135 | + self.assertEqual(self.sleep_mock.call_count, 3) |
| 1136 | + # The successful POST carried the substituted body. |
| 1137 | + self.assertEqual(dofn._session.calls[-1]['json'], {'q': '99.0'}) |
| 1138 | + |
| 1139 | + def test_backoff_schedule_is_exponential_and_capped(self): |
| 1140 | + """The sleep between retries grows exponentially from the base delay |
| 1141 | + and saturates at the max-backoff cap: 0.5, 1, 2, 4, 8, 15, 15, ... |
| 1142 | + There is exactly one sleep per retry (none before the first attempt).""" |
| 1143 | + dofn = self._make_dofn({'q': '{value}'}, status_code=500) |
| 1144 | + dofn.process(self._make_result(label=1)) |
| 1145 | + |
| 1146 | + actual_delays = [c.args[0] for c in self.sleep_mock.call_args_list] |
| 1147 | + expected_delays = [ |
| 1148 | + min(_WEBHOOK_MAX_BACKOFF_SEC, |
| 1149 | + _WEBHOOK_BASE_BACKOFF_SEC * (2 ** n)) |
| 1150 | + for n in range(_WEBHOOK_MAX_RETRIES) |
| 1151 | + ] |
| 1152 | + self.assertEqual(actual_delays, expected_delays) |
| 1153 | + # Sanity-check the literal schedule the constants are meant to produce. |
| 1154 | + self.assertEqual( |
| 1155 | + actual_delays[:6], [0.5, 1.0, 2.0, 4.0, 8.0, 15.0]) |
| 1156 | + # Every later delay sits exactly on the cap. |
| 1157 | + for delay in actual_delays[5:]: |
| 1158 | + self.assertEqual(delay, _WEBHOOK_MAX_BACKOFF_SEC) |
| 1159 | + |
| 1160 | + def test_network_error_retried_then_succeeds(self): |
| 1161 | + """A transient network-level error (e.g. dropped connection) is |
| 1162 | + retried with the same backoff as a transient status, and a later |
| 1163 | + success delivers the anomaly without raising.""" |
| 1164 | + outcomes = [ |
| 1165 | + requests.exceptions.ConnectionError('connection reset'), |
| 1166 | + 200, |
| 1167 | + ] |
| 1168 | + dofn = self._make_dofn_with_outcomes(outcomes) |
| 1169 | + dofn.process(self._make_result(label=1, value=7.0)) |
| 1170 | + self.assertEqual(len(dofn._session.calls), 2) |
| 1171 | + self.assertEqual(self.sleep_mock.call_count, 1) |
| 1172 | + |
| 1173 | + def test_network_error_exhausts_retries_and_drops(self): |
| 1174 | + """A persistently failing connection exhausts the retry budget and is |
| 1175 | + then dropped, not raised (a raising bundle is not reliably retried by |
| 1176 | + AsyncWrapper, so raising would only risk wedging the bundle).""" |
| 1177 | + outcomes = [requests.exceptions.ConnectionError('boom')] |
| 1178 | + dofn = self._make_dofn_with_outcomes(outcomes) |
| 1179 | + dofn.process(self._make_result(label=1)) # must not raise |
| 1180 | + self.assertEqual( |
| 1181 | + len(dofn._session.calls), _WEBHOOK_MAX_RETRIES + 1) |
| 1182 | + self.assertEqual(self.sleep_mock.call_count, _WEBHOOK_MAX_RETRIES) |
| 1183 | + |
| 1184 | + def test_non_requests_exception_retried_then_succeeds(self): |
| 1185 | + """Exceptions that are NOT requests.RequestException (e.g. a |
| 1186 | + google.auth RefreshError surfaces as a plain exception, SSL errors, |
| 1187 | + etc.) are also caught and retried -- otherwise they would escape |
| 1188 | + process() and crash-loop the bundle under AsyncWrapper. A later |
| 1189 | + success still delivers the anomaly.""" |
| 1190 | + outcomes = [RuntimeError('auth refresh failed'), 200] |
| 1191 | + dofn = self._make_dofn_with_outcomes(outcomes) |
| 1192 | + dofn.process(self._make_result(label=1, value=5.0)) # must not raise |
| 1193 | + self.assertEqual(len(dofn._session.calls), 2) |
| 1194 | + self.assertEqual(self.sleep_mock.call_count, 1) |
| 1195 | + |
| 1196 | + def test_non_requests_exception_exhausts_retries_and_drops(self): |
| 1197 | + """A persistently raised non-requests exception is retried to the |
| 1198 | + budget and then dropped, never escaping process().""" |
| 1199 | + outcomes = [RuntimeError('persistent boom')] |
| 1200 | + dofn = self._make_dofn_with_outcomes(outcomes) |
| 1201 | + dofn.process(self._make_result(label=1)) # must not raise |
| 1202 | + self.assertEqual( |
| 1203 | + len(dofn._session.calls), _WEBHOOK_MAX_RETRIES + 1) |
| 1204 | + |
| 1205 | + def test_permanent_4xx_not_retried(self): |
| 1206 | + """A permanent 4xx is dropped on the first response with no retries |
| 1207 | + and no sleeps, even if later attempts would have succeeded.""" |
| 1208 | + dofn = self._make_dofn_with_outcomes([400, 200]) |
| 1209 | + dofn.process(self._make_result(label=1)) |
| 1210 | + self.assertEqual(len(dofn._session.calls), 1) |
| 1211 | + self.sleep_mock.assert_not_called() |
1067 | 1212 |
|
1068 | 1213 | def test_permanent_4xx_dropped(self): |
1069 | 1214 | """Permanent 4xx (e.g. 400 bad request) is logged and dropped, not |
|
0 commit comments