Skip to content

Commit d8cf409

Browse files
committed
DAOS-19008 control: Erase formatting after failed format --replace
Signed-off-by: Tom Nabarro <thomas.nabarro@hpe.com>
1 parent 0ef5142 commit d8cf409

3 files changed

Lines changed: 80 additions & 4 deletions

File tree

src/control/cmd/dmg/storage_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//
22
// (C) Copyright 2019-2022 Intel Corporation.
3+
// (C) Copyright 2026 Hewlett Packard Enterprise Development LP
34
//
45
// SPDX-License-Identifier: BSD-2-Clause-Patent
56
//
@@ -154,6 +155,24 @@ func TestStorageCommands(t *testing.T) {
154155
printRequest(t, nvmeAddDeviceReq().WithStorageTierIndex(0)),
155156
nil,
156157
},
158+
{
159+
"Format with replace; no hosts in hostlist",
160+
"storage format --replace",
161+
"",
162+
errors.New("expects a single host"),
163+
},
164+
{
165+
"Format with replace; multiple hosts in hostlist",
166+
"storage format --replace -l foo[1,2].com",
167+
"",
168+
errors.New("expects a single host"),
169+
},
170+
{
171+
"Format with replace and force",
172+
"storage format --replace --force",
173+
"",
174+
errors.New("may not be mixed with --force"),
175+
},
157176
{
158177
"Nonexistent subcommand",
159178
"storage quack",

src/control/server/instance.go

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//
22
// (C) Copyright 2019-2024 Intel Corporation.
3-
// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
//
55
// SPDX-License-Identifier: BSD-2-Clause-Patent
66
//
@@ -216,9 +216,24 @@ func (ei *EngineInstance) determineRank(ctx context.Context, ready *srvpb.Notify
216216
Replace: ei.replaceRank.Load(),
217217
}
218218

219+
// Reset replaceRank state for instance after joinSystem() has been attempted.
220+
defer ei.replaceRank.SetFalse()
221+
219222
resp, err := ei.joinSystem(ctx, joinReq)
220223
if err != nil {
221224
ei.log.Errorf("join failed: %s", err)
225+
226+
// If this is a replace operation and join failed, clean up the formatted storage to
227+
// prevent leaving the rank in a formatted state. This prevents the engine
228+
// inadvertently being joined later with a new rank.
229+
if ei.replaceRank.Load() {
230+
ei.log.Infof("cleaning up after join failure during replace")
231+
if cleanupErr := ei.cleanupAfterFailedJoin(ctx); cleanupErr != nil {
232+
ei.log.Errorf("failed to cleanup after join failure: %v", cleanupErr)
233+
// Don't override the original join error
234+
}
235+
}
236+
222237
return ranklist.NilRank, false, 0, err
223238
}
224239
switch resp.State {
@@ -237,9 +252,6 @@ func (ei *EngineInstance) determineRank(ctx context.Context, ready *srvpb.Notify
237252
}
238253
r = ranklist.Rank(resp.Rank)
239254

240-
// Reset replaceRank state for instance after joinSystem() has returned.
241-
ei.replaceRank.SetFalse()
242-
243255
if !superblock.ValidRank || ready.Uri != superblock.URI {
244256
ei.log.Noticef("updating rank %d URI to %s", resp.Rank, ready.Uri)
245257
superblock.Rank = new(ranklist.Rank)

src/control/server/instance_storage.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/daos-stack/daos/src/control/build"
2020
"github.com/daos-stack/daos/src/control/fault"
2121
"github.com/daos-stack/daos/src/control/fault/code"
22+
"github.com/daos-stack/daos/src/control/lib/ranklist"
2223
"github.com/daos-stack/daos/src/control/server/storage"
2324
)
2425

@@ -76,6 +77,50 @@ func (ei *EngineInstance) NotifyStorageReady(replaceRank bool) {
7677
}()
7778
}
7879

80+
// cleanupAfterFailedJoin cleans up storage after a join failure during a replace operation.
81+
// This is called when format succeeded but the join to the system failed, leaving
82+
// the storage in a partially initialized state.
//
// Behavior, in order:
//   - returns a wrapped error if the SCM config cannot be fetched from the storage provider;
//   - returns nil without touching anything if the provider reports no SCM config;
//   - returns FaultInstancesNotStopped if the instance reports itself as started;
//   - for storage.ClassRam, attempts a tmpfs unmount (a failure is logged, not returned);
//   - removes the superblock (for every SCM class) and returns any error from that step.
//
// NOTE(review): ctx is accepted but not referenced anywhere in the body.
83+
func (ei *EngineInstance) cleanupAfterFailedJoin(ctx context.Context) error {
84+
idx := ei.Index()
85+
ei.log.Infof("instance %d: cleaning up after join failure during replace", idx)
86+
87+
storageProv := ei.GetStorage()
88+
89+
// Get SCM config to access mount point and class.
90+
scmCfg, err := storageProv.GetScmConfig()
91+
if err != nil {
92+
return errors.Wrap(err, "failed to get SCM config")
93+
}
94+
95+
if scmCfg == nil {
96+
ei.log.Debugf("instance %d: no SCM config, nothing to clean", idx)
97+
return nil
98+
}
99+
100+
if ei.IsStarted() {
101+
return FaultInstancesNotStopped("cleanup after failed join", ranklist.NilRank)
102+
}
103+
104+
// For RAM-based SCM (tmpfs), unmount to reset state completely.
105+
// This allows immediate retry of the format --replace operation.
106+
if scmCfg.Class == storage.ClassRam {
107+
ei.log.Debugf("instance %d: unmounting tmpfs at %s", idx, scmCfg.Scm.MountPoint)
108+
if err := storageProv.UnmountTmpfs(); err != nil {
109+
ei.log.Errorf("instance %d: unmount failed: %v", idx, err)
110+
// Continue anyway: unmount is best-effort, so log the error but don't fail the cleanup.
111+
} else {
112+
ei.log.Debugf("instance %d: tmpfs unmounted successfully", idx)
113+
}
114+
}
115+
116+
// NOTE(review): this runs after the tmpfs unmount above; if the superblock file lives on
// that mount, the removal may find nothing to delete. Confirm RemoveSuperblock tolerates that.
if err := ei.RemoveSuperblock(); err != nil {
117+
return err
118+
}
119+
120+
ei.log.Infof("instance %d: cleanup after join failure complete", idx)
121+
return nil
122+
}
123+
79124
func (ei *EngineInstance) checkScmNeedFormat() (bool, error) {
80125
msgIdx := fmt.Sprintf("instance %d", ei.Index())
81126

0 commit comments

Comments
 (0)