@@ -19,6 +19,7 @@ import (
1919 "github.com/daos-stack/daos/src/control/build"
2020 "github.com/daos-stack/daos/src/control/fault"
2121 "github.com/daos-stack/daos/src/control/fault/code"
22+ "github.com/daos-stack/daos/src/control/lib/ranklist"
2223 "github.com/daos-stack/daos/src/control/server/storage"
2324)
2425
@@ -76,6 +77,50 @@ func (ei *EngineInstance) NotifyStorageReady(replaceRank bool) {
7677 }()
7778}
7879
80+ // cleanupAfterFailedJoin cleans up storage after a join failure during replace operation.
81+ // This is called when format succeeded but the join to the system failed, leaving
82+ // the storage in a partially initialized state.
83+ func (ei * EngineInstance ) cleanupAfterFailedJoin (ctx context.Context ) error {
84+ idx := ei .Index ()
85+ ei .log .Infof ("instance %d: cleaning up after join failure during replace" , idx )
86+
87+ storageProv := ei .GetStorage ()
88+
89+ // Get SCM config to access mount point and class
90+ scmCfg , err := storageProv .GetScmConfig ()
91+ if err != nil {
92+ return errors .Wrap (err , "failed to get SCM config" )
93+ }
94+
95+ if scmCfg == nil {
96+ ei .log .Debugf ("instance %d: no SCM config, nothing to clean" , idx )
97+ return nil
98+ }
99+
100+ if ei .IsStarted () {
101+ return FaultInstancesNotStopped ("cleanup after failed join" , ranklist .NilRank )
102+ }
103+
104+ // For RAM-based SCM (tmpfs), unmount to reset state completely
105+ // This allows immediate retry of the format --replace operation
106+ if scmCfg .Class == storage .ClassRam {
107+ ei .log .Debugf ("instance %d: unmounting tmpfs at %s" , idx , scmCfg .Scm .MountPoint )
108+ if err := storageProv .UnmountTmpfs (); err != nil {
109+ ei .log .Errorf ("instance %d: unmount failed: %v" , idx , err )
110+ // Continue anyway - log the error but don't fail the cleanup
111+ } else {
112+ ei .log .Debugf ("instance %d: tmpfs unmounted successfully" , idx )
113+ }
114+ }
115+
116+ if err := ei .RemoveSuperblock (); err != nil {
117+ return err
118+ }
119+
120+ ei .log .Infof ("instance %d: cleanup after join failure complete" , idx )
121+ return nil
122+ }
123+
79124func (ei * EngineInstance ) checkScmNeedFormat () (bool , error ) {
80125 msgIdx := fmt .Sprintf ("instance %d" , ei .Index ())
81126
0 commit comments