Skip to content

Commit c8204c5

Browse files
committed
DAOS-18239 test: DO NOT LAND backport discard retry instrumentation on old base
Choose the PS leader engine rank in the dmg pool exclude command while testing test_osa_online_reintegration_with_multiple_ranks, to see whether that case has an impact on a pool_discard() hang on that engine. Also instrument cont_discard_cb() to report any retries, e.g., ones that might continuously get -DER_INPROGRESS (resulting in an overall hang), as seen in the original observation. The test is based on older master commit 0ff9ca7, where the pool_discard() hang was originally observed.

Test-tag: OSAOnlineReintegration,test_osa_online_reintegration_with_multiple_ranks
Test-Repeat: 10
Skip-unit-tests: true
Skip-fault-injection-test: true
Skip-test-rpms: true
Test-provider-hw-medium: ofi+tcp

Signed-off-by: Kenneth Cain <kenneth.cain@hpe.com>
1 parent eee80bf commit c8204c5

1 file changed

Lines changed: 20 additions & 4 deletions

File tree

src/pool/srv_target.c

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2516,6 +2516,10 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry,
25162516
struct vos_iter_anchors anchor = { 0 };
25172517
daos_handle_t coh;
25182518
struct d_backoff_seq backoff_seq;
2519+
uint64_t start_hlc = d_hlc_get();
2520+
uint64_t last_warn_hlc = start_hlc;
2521+
uint64_t now_hlc;
2522+
unsigned int retry_cnt = 0;
25192523
int rc;
25202524

25212525
D_ASSERT(type == VOS_ITER_COUUID);
@@ -2556,16 +2560,28 @@ cont_discard_cb(daos_handle_t ih, vos_iter_entry_t *entry,
25562560
if (rc != -DER_BUSY && rc != -DER_INPROGRESS)
25572561
break;
25582562

2559-
D_DEBUG(DB_REBUILD, "retry by "DF_RC"/"DF_UUID"\n",
2560-
DP_RC(rc), DP_UUID(entry->ie_couuid));
2563+
retry_cnt++;
2564+
now_hlc = d_hlc_get();
2565+
if (now_hlc - last_warn_hlc >= d_sec2hlc(300)) {
2566+
D_WARN(DF_UUID "/" DF_UUID ": discard still retrying after " DF_U64
2567+
"s, retries=%u: " DF_RC "\n",
2568+
DP_UUID(arg->tgt_discard->pool_uuid), DP_UUID(entry->ie_couuid),
2569+
d_hlc2sec(now_hlc - start_hlc), retry_cnt, DP_RC(rc));
2570+
last_warn_hlc = now_hlc;
2571+
}
2572+
D_DEBUG(DB_REBUILD,
2573+
DF_UUID "/" DF_UUID ": discard retry %u after " DF_U64 "s: " DF_RC "\n",
2574+
DP_UUID(arg->tgt_discard->pool_uuid), DP_UUID(entry->ie_couuid), retry_cnt,
2575+
d_hlc2sec(now_hlc - start_hlc), DP_RC(rc));
25612576
dss_sleep(d_backoff_seq_next(&backoff_seq));
25622577
} while (1);
25632578

25642579
d_backoff_seq_fini(&backoff_seq);
25652580
vos_cont_close(coh);
2566-
D_DEBUG(DB_TRACE, DF_UUID"/"DF_UUID" discard cont done: "DF_RC"\n",
2581+
D_DEBUG(DB_TRACE,
2582+
DF_UUID "/" DF_UUID " discard cont done after " DF_U64 "s, retries=%u: " DF_RC "\n",
25672583
DP_UUID(arg->tgt_discard->pool_uuid), DP_UUID(entry->ie_couuid),
2568-
DP_RC(rc));
2584+
d_hlc2sec(d_hlc_get() - start_hlc), retry_cnt, DP_RC(rc));
25692585

25702586
put:
25712587
ds_cont_child_put(cont);

0 commit comments

Comments
 (0)