-
Notifications
You must be signed in to change notification settings - Fork 79
write down the reason a VMM moved to Failed
#10285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5061,14 +5061,16 @@ async fn cmd_db_instance_info( | |
| let ctx = || "listing past VMMs"; | ||
| #[derive(Tabled)] | ||
| #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] | ||
| struct VmmRow { | ||
| struct VmmRow<'a> { | ||
| #[tabled(inline)] | ||
| state: VmmStateRow, | ||
| sled_id: SledUuid, | ||
| #[tabled(display_with = "datetime_rfc3339_concise")] | ||
| time_created: chrono::DateTime<Utc>, | ||
| #[tabled(display_with = "datetime_opt_rfc3339_concise")] | ||
| time_deleted: Option<chrono::DateTime<Utc>>, | ||
| #[tabled(display_with = "display_option_blank")] | ||
| failure_reason: Option<&'a str>, | ||
| } | ||
| let vmms = vmm_dsl::vmm | ||
| .filter(vmm_dsl::instance_id.eq(id.into_untyped_uuid())) | ||
|
|
@@ -5097,6 +5099,7 @@ async fn cmd_db_instance_info( | |
| time_state_updated: _, | ||
| generation, | ||
| state, | ||
| ref failure_reason, | ||
| } = vmm; | ||
| VmmRow { | ||
| state: VmmStateRow { | ||
|
|
@@ -5107,6 +5110,7 @@ async fn cmd_db_instance_info( | |
| sled_id: sled_id.into(), | ||
| time_created, | ||
| time_deleted, | ||
| failure_reason: failure_reason.as_deref(), | ||
| } | ||
| })) | ||
| .with(tabled::settings::Style::empty()) | ||
|
|
@@ -7699,13 +7703,14 @@ fn prettyprint_vmm( | |
| const ID: &'static str = "ID"; | ||
| const CREATED: &'static str = "created at"; | ||
| const DELETED: &'static str = "deleted at"; | ||
| const UPDATED: &'static str = "updated at"; | ||
| const UPDATED: &'static str = " updated at"; | ||
| const INSTANCE_ID: &'static str = "instance ID"; | ||
| const SLED_ID: &'static str = "sled ID"; | ||
| const SLED_SERIAL: &'static str = "sled serial"; | ||
| const CPU_PLATFORM: &'static str = "CPU platform"; | ||
| const ADDRESS: &'static str = "propolis address"; | ||
| const STATE: &'static str = "state"; | ||
| const FAILURE_REASON: &'static str = " failed because:"; | ||
|
Comment on lines
+7706
to
+7713
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. dumb question, but aren't we already juggling the |
||
| const WIDTH: usize = const_max_len(&[ | ||
| ID, | ||
| CREATED, | ||
|
|
@@ -7717,6 +7722,7 @@ fn prettyprint_vmm( | |
| CPU_PLATFORM, | ||
| STATE, | ||
| ADDRESS, | ||
| FAILURE_REASON, | ||
| ]); | ||
|
|
||
| let width = std::cmp::max(width, Some(WIDTH)).unwrap_or(WIDTH); | ||
|
|
@@ -7732,6 +7738,7 @@ fn prettyprint_vmm( | |
| state, | ||
| generation, | ||
| time_state_updated, | ||
| failure_reason, | ||
| } = vmm; | ||
|
|
||
| println!("{indent}{ID:>width$}: {id}"); | ||
|
|
@@ -7743,6 +7750,9 @@ fn prettyprint_vmm( | |
| println!("{indent}{DELETED:width$}: {deleted}"); | ||
| } | ||
| println!("{indent}{STATE:>width$}: {state}"); | ||
| if let Some(reason) = failure_reason { | ||
| println!("{indent}{FAILURE_REASON:>width$}: {reason}"); | ||
| } | ||
| let g = u64::from(generation.0); | ||
| println!( | ||
| "{indent}{UPDATED:>width$}: {time_state_updated:?} (generation {g})" | ||
|
|
@@ -7821,6 +7831,8 @@ async fn cmd_db_vmm_list( | |
| #[tabled(inline)] | ||
| state: VmmStateRow, | ||
| sled: &'a str, | ||
| #[tabled(display_with = "display_option_blank")] | ||
| failure_reason: Option<&'a str>, | ||
| } | ||
|
|
||
| impl<'a> From<&'a (Vmm, Option<Sled>)> for VmmRow<'a> { | ||
|
|
@@ -7837,6 +7849,7 @@ async fn cmd_db_vmm_list( | |
| time_state_updated: _, | ||
| generation, | ||
| state, | ||
| ref failure_reason, | ||
| } = vmm; | ||
| let sled = match sled { | ||
| Some(sled) => sled.serial_number(), | ||
|
|
@@ -7853,6 +7866,7 @@ async fn cmd_db_vmm_list( | |
| generation: generation.0.into(), | ||
| }, | ||
| sled, | ||
| failure_reason: failure_reason.as_deref(), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ use crate::typed_uuid::DbTypedUuid; | |
| use crate::{SqlU16, VmmCpuPlatform}; | ||
| use chrono::{DateTime, Utc}; | ||
| use nexus_db_schema::schema::vmm; | ||
| use omicron_common::api::internal::nexus; | ||
| use omicron_uuid_kinds::*; | ||
| use serde::{Deserialize, Serialize}; | ||
| use uuid::Uuid; | ||
|
|
@@ -72,6 +73,12 @@ pub struct Vmm { | |
| /// control plane if this VMM's instance didn't specify a required platform | ||
| /// when it was started. | ||
| pub cpu_platform: VmmCpuPlatform, | ||
|
|
||
| /// A human-readable reason describing why this VMM is in the `Failed` state. | ||
| /// | ||
| /// This is not stable and is intended for debugging purposes only. It | ||
| /// should only be `Some` if the VMM's `state` is `Failed`. | ||
| pub failure_reason: Option<String>, | ||
| } | ||
|
|
||
| impl Vmm { | ||
|
|
@@ -101,17 +108,18 @@ impl Vmm { | |
| propolis_port: SqlU16(propolis_port), | ||
| state: VmmState::Creating, | ||
| cpu_platform, | ||
| failure_reason: None, | ||
| } | ||
| } | ||
|
|
||
| /// Returns the runtime state of this VMM. | ||
| pub fn runtime(&self) -> VmmRuntimeState { | ||
| VmmRuntimeState { | ||
| time_state_updated: self.time_state_updated, | ||
| generation: self.generation, | ||
| state: self.state, | ||
| } | ||
| } | ||
| // /// Returns the runtime state of this VMM. | ||
| // pub fn runtime(&self) -> VmmRuntimeState { | ||
| // VmmRuntimeState { | ||
| // time_state_updated: self.time_state_updated, | ||
| // generation: self.generation, | ||
| // state: self.state, | ||
| // } | ||
| // } | ||
|
Comment on lines
+115
to
+122
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm surprised this isn't used? but also what led you to noticing this, and do you want to just delete it lol especially since it seems like
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. yeah it should be deleted. i was surprised to discover this also |
||
|
|
||
| pub fn sled_id(&self) -> SledUuid { | ||
| self.sled_id.into() | ||
|
|
@@ -143,18 +151,27 @@ pub struct VmmRuntimeState { | |
| /// The state of this VMM. If this VMM is the active VMM for a given | ||
| /// instance, this state is the instance's logical state. | ||
| pub state: VmmState, | ||
|
|
||
| /// A human-readable reason describing why this VMM is in the `Failed` state. | ||
| /// | ||
| /// This is not stable and is intended for debugging purposes only. It | ||
| /// should only be `Some` if the VMM's `state` is `Failed`. | ||
| pub failure_reason: Option<String>, | ||
| } | ||
|
|
||
| impl From<omicron_common::api::internal::nexus::VmmRuntimeState> | ||
| for VmmRuntimeState | ||
| { | ||
| fn from( | ||
| value: omicron_common::api::internal::nexus::VmmRuntimeState, | ||
| ) -> Self { | ||
| impl From<nexus::VmmRuntimeState> for VmmRuntimeState { | ||
| fn from(value: nexus::VmmRuntimeState) -> Self { | ||
| let nexus::VmmRuntimeState { | ||
| state, | ||
| time_updated, | ||
| generation, | ||
| failure_reason, | ||
| } = value; | ||
| Self { | ||
| state: value.state.into(), | ||
| time_state_updated: value.time_updated, | ||
| generation: value.generation.into(), | ||
| state: state.into(), | ||
| time_state_updated: time_updated, | ||
| generation: generation.into(), | ||
| failure_reason, | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fwiw I'm ... not sure what this means. taking a swing: "vmm_register() or vmm_get_state() returned a SledVmmState that was immediately
Failed"? would we expect Propolis to fill in a reason here, or would we expect sled-agent to invent a reason on Propolis' behalf when it sees the failure?