@@ -12,6 +12,7 @@ import (
1212 "context"
1313 "fmt"
1414 "os"
15+ "syscall"
1516
1617 "github.com/dustin/go-humanize"
1718 "github.com/pkg/errors"
@@ -76,6 +77,59 @@ func (ei *EngineInstance) NotifyStorageReady(replaceRank bool) {
7677 }()
7778}
7879
80+ // cleanupFailedJoinReplace cleans up storage after a join failure during replace operation.
81+ // This is called when format succeeded but the join to the system failed, leaving
82+ // the storage in a partially initialized state.
83+ func (ei * EngineInstance ) cleanupFailedJoinReplace (ctx context.Context ) error {
84+ idx := ei .Index ()
85+ ei .log .Infof ("instance %d: cleaning up after join failure during replace" , idx )
86+
87+ storageProv := ei .GetStorage ()
88+
89+ // Get SCM config to access mount point and class
90+ scmCfg , err := storageProv .GetScmConfig ()
91+ if err != nil {
92+ return errors .Wrap (err , "failed to get SCM config" )
93+ }
94+
95+ if scmCfg == nil {
96+ ei .log .Debugf ("instance %d: no SCM config, nothing to clean" , idx )
97+ return nil
98+ }
99+
100+ if ei .IsStarted () {
101+ ei .log .Infof ("instance %d: stopping engine before cleanup" , idx )
102+ if err := ei .Stop (syscall .SIGKILL ); err != nil {
103+ return errors .Wrap (err , "failed to stop engine" )
104+ }
105+
106+ pollFn := func (e Engine ) bool { return ! e .IsStarted () }
107+ if err := pollInstanceState (ctx , []Engine {ei }, pollFn ); err != nil {
108+ return errors .Wrap (err , "waiting for engine to stop" )
109+ }
110+ ei .log .Debugf ("instance %d: engine stopped successfully" , idx )
111+ }
112+
113+ // For RAM-based SCM (tmpfs), unmount to reset state
114+ if scmCfg .Class == storage .ClassRam {
115+ ei .log .Debugf ("instance %d: unmounting tmpfs at %s" , idx , scmCfg .Scm .MountPoint )
116+ if err := storageProv .UnmountTmpfs (); err != nil {
117+ ei .log .Errorf ("instance %d: unmount failed: %v" , idx , err )
118+ // Continue anyway - log the error but don't fail the cleanup
119+ } else {
120+ ei .log .Debugf ("instance %d: tmpfs unmounted successfully" , idx )
121+ }
122+ }
123+
124+ // Removing superblock prevents subsequent join without reformat.
125+ if err := ei .RemoveSuperblock (); err != nil {
126+ return err
127+ }
128+
129+ ei .log .Infof ("instance %d: cleanup after join failure complete" , idx )
130+ return nil
131+ }
132+
79133func (ei * EngineInstance ) checkScmNeedFormat () (bool , error ) {
80134 msgIdx := fmt .Sprintf ("instance %d" , ei .Index ())
81135
0 commit comments