-
Notifications
You must be signed in to change notification settings - Fork 11
ENGINE-1383: Handle Windows exit code 3010 and fix reboot after MCR install #624
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
e88d4c1
65d33a4
66dd9ef
5c9d2bf
492b07d
fdea012
ccd441b
c8c44a2
08443f0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,10 +39,6 @@ func (c WindowsConfigurer) MCRConfigPath() string { | |
| return `C:\ProgramData\Docker\config\daemon.json` | ||
| } | ||
|
|
||
| type rebootable interface { | ||
| Reboot() error | ||
| } | ||
|
|
||
| var errRebootRequired = fmt.Errorf("reboot required") | ||
|
|
||
| // InstallMCRLicense for license install.. | ||
|
|
@@ -88,23 +84,93 @@ func (c WindowsConfigurer) InstallMCR(h os.Host, engineConfig commonconfig.MCRCo | |
| log.Infof("%s: running installer", h) | ||
|
|
||
| output, err := h.ExecOutput(installCommand) | ||
|
|
||
| needsReboot := false | ||
| if err != nil { | ||
| return fmt.Errorf("failed to run MCR installer: %w", err) | ||
| if isExitCode3010(err) { | ||
| needsReboot = true | ||
| } else { | ||
| return fmt.Errorf("failed to run MCR installer: %w", err) | ||
| } | ||
| } | ||
|
|
||
| if strings.Contains(output, "Your machine needs to be rebooted") { | ||
| log.Warnf("%s: host needs to be rebooted", h) | ||
| if rh, ok := h.(rebootable); ok { | ||
| if err := rh.Reboot(); err != nil { | ||
| return fmt.Errorf("%s: failed to reboot host: %w", h, err) | ||
| } | ||
| if !needsReboot && strings.Contains(output, "Your machine needs to be rebooted") { | ||
| needsReboot = true | ||
| } | ||
|
|
||
| if needsReboot { | ||
| log.Warnf("%s: host needs to be rebooted after MCR install", h) | ||
| rh, ok := h.(rebootable) | ||
| if !ok { | ||
| return fmt.Errorf("%s: %w: host does not support reboot", h, errRebootRequired) | ||
| } | ||
| if err := rh.Reboot(); err != nil { | ||
| return fmt.Errorf("%s: failed to reboot host: %w", h, err) | ||
| } | ||
| // Machine is back up. Delete the ONSTART scheduled task so it does not | ||
| // trigger another reboot on subsequent startups. | ||
| if err := h.Exec(`schtasks /delete /tn "LaunchpadReboot" /f`); err != nil { | ||
| log.Warnf("%s: failed to clean up LaunchpadReboot task: %s", h, err) | ||
| } | ||
| return fmt.Errorf("%s: %w: host isn't rebootable", h, errRebootRequired) | ||
| return nil | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| // isExitCode3010 checks if the error is a command failure with Windows exit | ||
| // code 3010 (ERROR_SUCCESS_REBOOT_REQUIRED). | ||
| func isExitCode3010(err error) bool { | ||
| return err != nil && strings.Contains(err.Error(), "3010") | ||
| } | ||
|
|
||
| // Reboot triggers an immediate forced restart by scheduling a SYSTEM-context | ||
| // task that runs 'shutdown /r /f /t 0', then immediately triggering it. | ||
| // Running via a scheduled task bypasses the filtered Administrator token used | ||
| // by WinRM sessions on AWS EC2, which lacks SeShutdownPrivilege. The rig | ||
| // implementation uses 'shutdown /r /t 5' directly in the WinRM session, | ||
| // which is silently ignored in that context. | ||
| // | ||
| // /sc onstart is used instead of /sc once to avoid schtasks writing a | ||
| // stderr warning about the start time being in the past, which rig treats | ||
| // as an error. | ||
| // | ||
| // After scheduling the task we sleep briefly so that Windows has time to | ||
| // begin its shutdown sequence before the caller's waitForHost poll loop starts. | ||
| // | ||
| // TODO: move this fix upstream into the k0sproject/rig Windows configurer. | ||
| func (c WindowsConfigurer) Reboot(h os.Host) error { | ||
| const taskName = "LaunchpadReboot" | ||
| // Create a SYSTEM-context ONSTART task that runs 'shutdown /r /f /t 5'. | ||
| // The 5-second delay gives us time to delete the task before the OS | ||
| // actually executes the reboot, preventing it from firing again on the | ||
| // next startup. | ||
| create := fmt.Sprintf(`schtasks /create /tn "%s" /tr "shutdown /r /f /t 5" /sc onstart /f /ru SYSTEM`, taskName) | ||
|
Comment on lines
+127
to
+148
|
||
| if err := h.Exec(create); err != nil { | ||
| return fmt.Errorf("failed to create reboot task: %w", err) | ||
| } | ||
| run := fmt.Sprintf(`schtasks /run /tn "%s"`, taskName) | ||
|
Comment on lines
+127
to
+152
|
||
| if err := h.Exec(run); err != nil { | ||
| // Tolerate connection-level errors; the OS may kill WinRM as it starts | ||
| // rebooting before the run command returns. | ||
| errMsg := err.Error() | ||
| if !strings.Contains(errMsg, "connection") && !strings.Contains(errMsg, "closed") && !strings.Contains(errMsg, "EOF") { | ||
| return fmt.Errorf("failed to run reboot task: %w", err) | ||
| } | ||
| } | ||
| // Delete the task immediately while the 5-second shutdown timer is still | ||
| // counting down. This prevents it from re-firing on subsequent startups. | ||
| del := fmt.Sprintf(`schtasks /delete /tn "%s" /f`, taskName) | ||
| if err := h.Exec(del); err != nil { | ||
| // Best-effort: warn but don't fail — the post-reboot cleanup in | ||
| // InstallMCR will attempt deletion again once the host is back up. | ||
| log.Warnf("%v: failed to pre-delete reboot task (will retry after reboot): %s", h, err) | ||
| } | ||
|
Comment on lines
+144
to
+168
|
||
| // Allow Windows time to complete shutdown before waitForHost begins polling. | ||
| time.Sleep(15 * time.Second) | ||
| return nil | ||
| } | ||
|
|
||
| // UninstallMCR uninstalls docker-ee engine | ||
| // This relies on using the http://get.mirantis.com/install.ps1 script with the '-Uninstall' option, and some cleanup as per | ||
| // https://docs.microsoft.com/en-us/virtualization/windowscontainers/manage-docker/configure-docker-daemon#how-to-uninstall-docker | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -103,16 +103,12 @@ func PullImages(h *mkeconfig.Host, images []*Image) error { | |
| for _, image := range images { | ||
| i := image // So we can safely pass i forward to pool without it getting mutated | ||
| wp.Submit(func() { | ||
| mutex.Lock() | ||
| defer mutex.Unlock() | ||
| if lastError != nil { | ||
| return | ||
| } | ||
|
|
||
| err := i.Pull(h) | ||
| if err != nil { | ||
| if err := i.Pull(h); err != nil { | ||
| mutex.Lock() | ||
|
Comment on lines
103
to
107
|
||
| lastError = err | ||
| if lastError == nil { | ||
| lastError = err | ||
| } | ||
| mutex.Unlock() | ||
| } | ||
| }) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
isExitCode3010currently checksstrings.Contains(err.Error(), "3010"), which is overly broad and can misclassify unrelated failures whose error strings happen to include those digits. Prefer extracting the actual exit code from the underlying error type (e.g., viaerrors.Asto a command/exit error type) or at least match an exact token like"exit code 3010"/"exit status 3010"with a word-boundary regex.