Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clients/sled-agent-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ impl From<types::VmmRuntimeState>
state: s.state.into(),
generation: s.gen_,
time_updated: s.time_updated,
failure_reason: None, // TODO(eliza)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fwiw I'm ... not sure what this means. taking a swing: "vmm_register() or vmm_get_state() returned a SledVmmState that was immediately Failed"? would we expect Propolis to fill in a reason here, or would we expect sled-agent to invent a reason on Propolis' behalf when it sees the failure?

}
}
}
Expand Down
2 changes: 2 additions & 0 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ pub struct VmmRuntimeState {
pub generation: Generation,
/// Timestamp for the VMM's state.
pub time_updated: DateTime<Utc>,
/// A human-readable description of why this VMM is in the 'failed' state.
pub failure_reason: Option<String>,
}

/// A wrapper type containing a sled's total knowledge of the state of a VMM.
Expand Down
18 changes: 16 additions & 2 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5061,14 +5061,16 @@ async fn cmd_db_instance_info(
let ctx = || "listing past VMMs";
#[derive(Tabled)]
#[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
struct VmmRow {
struct VmmRow<'a> {
#[tabled(inline)]
state: VmmStateRow,
sled_id: SledUuid,
#[tabled(display_with = "datetime_rfc3339_concise")]
time_created: chrono::DateTime<Utc>,
#[tabled(display_with = "datetime_opt_rfc3339_concise")]
time_deleted: Option<chrono::DateTime<Utc>>,
#[tabled(display_with = "display_option_blank")]
failure_reason: Option<&'a str>,
}
let vmms = vmm_dsl::vmm
.filter(vmm_dsl::instance_id.eq(id.into_untyped_uuid()))
Expand Down Expand Up @@ -5097,6 +5099,7 @@ async fn cmd_db_instance_info(
time_state_updated: _,
generation,
state,
ref failure_reason,
} = vmm;
VmmRow {
state: VmmStateRow {
Expand All @@ -5107,6 +5110,7 @@ async fn cmd_db_instance_info(
sled_id: sled_id.into(),
time_created,
time_deleted,
failure_reason: failure_reason.as_deref(),
}
}))
.with(tabled::settings::Style::empty())
Expand Down Expand Up @@ -7699,13 +7703,14 @@ fn prettyprint_vmm(
const ID: &'static str = "ID";
const CREATED: &'static str = "created at";
const DELETED: &'static str = "deleted at";
const UPDATED: &'static str = "updated at";
const UPDATED: &'static str = " updated at";
const INSTANCE_ID: &'static str = "instance ID";
const SLED_ID: &'static str = "sled ID";
const SLED_SERIAL: &'static str = "sled serial";
const CPU_PLATFORM: &'static str = "CPU platform";
const ADDRESS: &'static str = "propolis address";
const STATE: &'static str = "state";
const FAILURE_REASON: &'static str = " failed because:";
Comment on lines +7706 to +7713
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dumb question, but aren't we already juggling the ident below? Why add spaces as prefixes here?

const WIDTH: usize = const_max_len(&[
ID,
CREATED,
Expand All @@ -7717,6 +7722,7 @@ fn prettyprint_vmm(
CPU_PLATFORM,
STATE,
ADDRESS,
FAILURE_REASON,
]);

let width = std::cmp::max(width, Some(WIDTH)).unwrap_or(WIDTH);
Expand All @@ -7732,6 +7738,7 @@ fn prettyprint_vmm(
state,
generation,
time_state_updated,
failure_reason,
} = vmm;

println!("{indent}{ID:>width$}: {id}");
Expand All @@ -7743,6 +7750,9 @@ fn prettyprint_vmm(
println!("{indent}{DELETED:width$}: {deleted}");
}
println!("{indent}{STATE:>width$}: {state}");
if let Some(reason) = failure_reason {
println!("{indent}{FAILURE_REASON:>width$}: {reason}");
}
let g = u64::from(generation.0);
println!(
"{indent}{UPDATED:>width$}: {time_state_updated:?} (generation {g})"
Expand Down Expand Up @@ -7821,6 +7831,8 @@ async fn cmd_db_vmm_list(
#[tabled(inline)]
state: VmmStateRow,
sled: &'a str,
#[tabled(display_with = "display_option_blank")]
failure_reason: Option<&'a str>,
}

impl<'a> From<&'a (Vmm, Option<Sled>)> for VmmRow<'a> {
Expand All @@ -7837,6 +7849,7 @@ async fn cmd_db_vmm_list(
time_state_updated: _,
generation,
state,
ref failure_reason,
} = vmm;
let sled = match sled {
Some(sled) => sled.serial_number(),
Expand All @@ -7853,6 +7866,7 @@ async fn cmd_db_vmm_list(
generation: generation.0.into(),
},
sled,
failure_reason: failure_reason.as_deref(),
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: Version = Version::new(251, 0, 0);
pub const SCHEMA_VERSION: Version = Version::new(252, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(252, "vmm-failure-reason"),
KnownVersion::new(251, "fm-sitrep-next-inv-min-time-started"),
KnownVersion::new(250, "inv-svc-enabled-not-online"),
KnownVersion::new(249, "fm-support-bundle-request"),
Expand Down
51 changes: 34 additions & 17 deletions nexus/db-model/src/vmm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::typed_uuid::DbTypedUuid;
use crate::{SqlU16, VmmCpuPlatform};
use chrono::{DateTime, Utc};
use nexus_db_schema::schema::vmm;
use omicron_common::api::internal::nexus;
use omicron_uuid_kinds::*;
use serde::{Deserialize, Serialize};
use uuid::Uuid;
Expand Down Expand Up @@ -72,6 +73,12 @@ pub struct Vmm {
/// control plane if this VMM's instance didn't specify a required platform
/// when it was started.
pub cpu_platform: VmmCpuPlatform,

/// A human-readable reason describing why this VMM is in the `Failed` state.
///
/// This is not stable and is intended for debugging purposes only. It
/// should only be `Some` if the VMM's `state` is `Failed`.
pub failure_reason: Option<String>,
}

impl Vmm {
Expand Down Expand Up @@ -101,17 +108,18 @@ impl Vmm {
propolis_port: SqlU16(propolis_port),
state: VmmState::Creating,
cpu_platform,
failure_reason: None,
}
}

/// Returns the runtime state of this VMM.
pub fn runtime(&self) -> VmmRuntimeState {
VmmRuntimeState {
time_state_updated: self.time_state_updated,
generation: self.generation,
state: self.state,
}
}
// /// Returns the runtime state of this VMM.
// pub fn runtime(&self) -> VmmRuntimeState {
// VmmRuntimeState {
// time_state_updated: self.time_state_updated,
// generation: self.generation,
// state: self.state,
// }
// }
Comment on lines +115 to +122
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised this isn't used? but also what led you to noticing this, and do you want to just delete it lol

especially since it seems like Instance::runtime is the .runtime() that's actually used?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah it should be deleted. i was surprised to discover this also


pub fn sled_id(&self) -> SledUuid {
self.sled_id.into()
Expand Down Expand Up @@ -143,18 +151,27 @@ pub struct VmmRuntimeState {
/// The state of this VMM. If this VMM is the active VMM for a given
/// instance, this state is the instance's logical state.
pub state: VmmState,

/// A human-readable reason describing why this VMM is in the `Failed` state.
///
/// This is not stable and is intended for debugging purposes only. It
/// should only be `Some` if the VMM's `state` is `Failed`.
pub failure_reason: Option<String>,
}

impl From<omicron_common::api::internal::nexus::VmmRuntimeState>
for VmmRuntimeState
{
fn from(
value: omicron_common::api::internal::nexus::VmmRuntimeState,
) -> Self {
impl From<nexus::VmmRuntimeState> for VmmRuntimeState {
fn from(value: nexus::VmmRuntimeState) -> Self {
let nexus::VmmRuntimeState {
state,
time_updated,
generation,
failure_reason,
} = value;
Self {
state: value.state.into(),
time_state_updated: value.time_updated,
generation: value.generation.into(),
state: state.into(),
time_state_updated: time_updated,
generation: generation.into(),
failure_reason,
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2991,6 +2991,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Running,
failure_reason: None,
},
)
.await
Expand Down Expand Up @@ -3052,6 +3053,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Running,
failure_reason: None,
},
)
.await
Expand Down Expand Up @@ -3148,6 +3150,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Stopped,
failure_reason: None,
},
)
.await
Expand Down Expand Up @@ -3187,6 +3190,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Running,
failure_reason: None,
},
)
.await
Expand Down Expand Up @@ -3224,6 +3228,7 @@ mod tests {
&VmmRuntimeState {
time_state_updated: Utc::now(),
generation: Generation(vmm2.generation.0.next()),
failure_reason: None,
state: VmmState::Running,
},
)
Expand Down Expand Up @@ -3288,6 +3293,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Running,
failure_reason: None,
},
)
.await
Expand Down Expand Up @@ -3323,6 +3329,7 @@ mod tests {
&VmmRuntimeState {
time_state_updated: Utc::now(),
generation: Generation(vmm2.generation.0.next().next()),
failure_reason: None,
state: VmmState::SagaUnwound,
},
)
Expand Down Expand Up @@ -3433,6 +3440,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Running,
failure_reason: None,
},
)
.await
Expand Down
7 changes: 7 additions & 0 deletions nexus/db-queries/src/db/datastore/vmm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Running,
failure_reason: None,
},
)
.await
Expand All @@ -496,6 +497,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Running,
failure_reason: None,
},
)
.await
Expand Down Expand Up @@ -531,6 +533,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation(vmm1.generation.0.next()),
state: VmmState::Stopping,
failure_reason: None,
},
Migrations {
migration_in: None,
Expand All @@ -553,6 +556,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation(vmm2.generation.0.next()),
state: VmmState::Running,
failure_reason: None,
},
Migrations {
migration_in: Some(&vmm2_migration_in),
Expand Down Expand Up @@ -605,6 +609,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::Running,
failure_reason: None,
},
)
.await
Expand Down Expand Up @@ -639,6 +644,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation(vmm2.generation.0.next()),
state: VmmState::Destroyed,
failure_reason: None,
},
Migrations {
migration_in: Some(&vmm2_migration_in),
Expand All @@ -663,6 +669,7 @@ mod tests {
time_state_updated: Utc::now(),
generation: Generation(vmm3.generation.0.next()),
state: VmmState::Destroyed,
failure_reason: None,
},
Migrations {
migration_in: Some(&vmm3_migration_in),
Expand Down
1 change: 1 addition & 0 deletions nexus/db-schema/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ table! {
propolis_port -> Int4,
state -> crate::enums::VmmStateEnum,
cpu_platform -> crate::enums::VmmCpuPlatformEnum,
failure_reason -> Nullable<Text>,
}
}
joinable!(vmm -> sled (sled_id));
Expand Down
1 change: 1 addition & 0 deletions nexus/src/app/background/tasks/abandoned_vmm_reaper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ mod tests {
state: VmmState::Destroyed,
time_state_updated: Utc::now(),
generation: Generation::new(),
failure_reason: None,
}),
)
.await
Expand Down
1 change: 1 addition & 0 deletions nexus/src/app/background/tasks/instance_reincarnation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ mod test {
time_state_updated: Utc::now(),
generation: Generation::new(),
state: VmmState::SagaUnwound,
failure_reason: None,
},
)
.await
Expand Down
5 changes: 5 additions & 0 deletions nexus/src/app/background/tasks/instance_watcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ impl InstanceWatcher {
generation: vmm.generation.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
failure_reason: Some("sled expunged".to_string()),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
Expand Down Expand Up @@ -173,6 +174,10 @@ impl InstanceWatcher {
generation: vmm.generation.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
failure_reason: Some(
"VMM no longer known to sled-agent"
.to_string(),
),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
Expand Down
Loading
Loading