Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 200 additions & 4 deletions crates/openshell-bootstrap/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -449,16 +449,20 @@ pub fn generic_failure_diagnosis(gateway_name: &str) -> GatewayFailureDiagnosis
summary: "Gateway failed to start".to_string(),
explanation: "The gateway encountered an unexpected error during startup.".to_string(),
recovery_steps: vec![
RecoveryStep::with_command(
"Check container logs for details",
format!("openshell doctor logs --name {gateway_name}"),
),
RecoveryStep::with_command(
"Run diagnostics",
format!("openshell doctor check --name {gateway_name}"),
),
RecoveryStep::with_command(
"Try destroying and recreating the gateway",
format!(
"openshell gateway destroy --name {gateway_name} && openshell gateway start"
),
),
RecoveryStep::with_command(
"Check container logs for details",
format!("docker logs openshell-cluster-{gateway_name}"),
),
RecoveryStep::new(
"If the issue persists, report it at https://github.com/nvidia/openshell/issues",
),
Expand Down Expand Up @@ -650,4 +654,196 @@ mod tests {
);
assert!(d.retryable);
}

// -- generic_failure_diagnosis tests --

#[test]
fn generic_diagnosis_suggests_doctor_logs() {
let d = generic_failure_diagnosis("my-gw");
let commands: Vec<String> = d
.recovery_steps
.iter()
.filter_map(|s| s.command.clone())
.collect();
assert!(
commands.iter().any(|c| c.contains("openshell doctor logs")),
"expected 'openshell doctor logs' in recovery commands, got: {commands:?}"
);
}

#[test]
fn generic_diagnosis_suggests_doctor_check() {
let d = generic_failure_diagnosis("my-gw");
let commands: Vec<String> = d
.recovery_steps
.iter()
.filter_map(|s| s.command.clone())
.collect();
assert!(
commands
.iter()
.any(|c| c.contains("openshell doctor check")),
"expected 'openshell doctor check' in recovery commands, got: {commands:?}"
);
}

#[test]
fn generic_diagnosis_includes_gateway_name() {
let d = generic_failure_diagnosis("custom-name");
let all_text: String = d
.recovery_steps
.iter()
.filter_map(|s| s.command.clone())
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("custom-name"),
"expected gateway name in recovery commands, got: {all_text}"
);
}

// -- fallback behavior tests --

#[test]
fn namespace_timeout_without_logs_returns_none() {
// This is the most common user-facing error: a plain timeout with only
// kubectl output. It must NOT match any specific pattern so the caller
// can fall back to generic_failure_diagnosis.
let diagnosis = diagnose_failure(
"test",
"K8s namespace not ready\n\nCaused by:\n \
timed out waiting for namespace 'openshell' to exist: \
error: the server doesn't have a resource type \"namespace\"",
None,
);
assert!(
diagnosis.is_none(),
"plain namespace timeout should not match any specific pattern, got: {:?}",
diagnosis.map(|d| d.summary)
);
}

#[test]
fn namespace_timeout_with_pressure_logs_matches() {
// When container logs reveal node pressure, the diagnosis engine
// should detect it even though the error message itself is generic.
let diagnosis = diagnose_failure(
"test",
"K8s namespace not ready\n\nCaused by:\n \
timed out waiting for namespace 'openshell' to exist: <kubectl output>",
Some("HEALTHCHECK_NODE_PRESSURE: DiskPressure"),
);
assert!(diagnosis.is_some(), "expected node pressure diagnosis");
let d = diagnosis.unwrap();
assert!(
d.summary.contains("pressure"),
"expected pressure in summary, got: {}",
d.summary
);
}

#[test]
fn namespace_timeout_with_corrupted_state_logs_matches() {
// Container logs revealing RBAC corruption should be caught.
let diagnosis = diagnose_failure(
"test",
"K8s namespace not ready\n\nCaused by:\n \
timed out waiting for namespace 'openshell' to exist: <output>",
Some(
"configmaps \"extension-apiserver-authentication\" is forbidden: \
User cannot get resource",
),
);
assert!(diagnosis.is_some(), "expected corrupted state diagnosis");
let d = diagnosis.unwrap();
assert!(
d.summary.contains("Corrupted"),
"expected Corrupted in summary, got: {}",
d.summary
);
}

#[test]
fn namespace_timeout_with_no_route_logs_matches() {
let diagnosis = diagnose_failure(
"test",
"K8s namespace not ready",
Some("Error: no default route present before starting k3s"),
);
assert!(diagnosis.is_some(), "expected networking diagnosis");
let d = diagnosis.unwrap();
assert!(
d.summary.contains("networking"),
"expected networking in summary, got: {}",
d.summary
);
}

#[test]
fn diagnose_failure_with_logs_uses_combined_text() {
// Verify that diagnose_failure combines error_message + container_logs
// for pattern matching. The pattern "connection refused" is in logs,
// not in the error message.
let diagnosis = diagnose_failure(
"test",
"K8s namespace not ready",
Some("dial tcp 127.0.0.1:6443: connect: connection refused"),
);
assert!(
diagnosis.is_some(),
"expected diagnosis from container logs pattern"
);
let d = diagnosis.unwrap();
assert!(
d.summary.contains("Network") || d.summary.contains("connectivity"),
"expected network diagnosis, got: {}",
d.summary
);
}

// -- end-to-end fallback pattern (mirrors CLI code) --

#[test]
fn fallback_to_generic_produces_actionable_diagnosis() {
// This mirrors the actual CLI pattern:
// diagnose_failure(...).unwrap_or_else(|| generic_failure_diagnosis(name))
// For a plain namespace timeout with no useful container logs, the
// specific matcher returns None and we must fall back to the generic
// diagnosis that suggests doctor commands.
let err_str = "K8s namespace not ready\n\nCaused by:\n \
timed out waiting for namespace 'openshell' to exist: \
error: the server doesn't have a resource type \"namespace\"";
let container_logs = Some("k3s is starting\nwaiting for kube-apiserver");

let diagnosis = diagnose_failure("my-gw", err_str, container_logs)
.unwrap_or_else(|| generic_failure_diagnosis("my-gw"));

// Should have gotten the generic diagnosis (no specific pattern matched)
assert_eq!(diagnosis.summary, "Gateway failed to start");
// Must contain actionable recovery steps
assert!(
!diagnosis.recovery_steps.is_empty(),
"generic diagnosis should have recovery steps"
);
// Must mention doctor commands
let all_commands: String = diagnosis
.recovery_steps
.iter()
.filter_map(|s| s.command.as_ref())
.cloned()
.collect::<Vec<_>>()
.join("\n");
assert!(
all_commands.contains("doctor logs"),
"should suggest 'doctor logs', got: {all_commands}"
);
assert!(
all_commands.contains("doctor check"),
"should suggest 'doctor check', got: {all_commands}"
);
assert!(
all_commands.contains("my-gw"),
"commands should include gateway name, got: {all_commands}"
);
}
}
24 changes: 22 additions & 2 deletions crates/openshell-bootstrap/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,21 @@ pub async fn gateway_container_logs<W: std::io::Write>(
Ok(())
}

/// Fetch the last `n` lines of container logs for a local gateway as a
/// `String`. This is a convenience wrapper for diagnostic call sites (e.g.
/// failure diagnosis in the CLI) that do not hold a Docker client handle.
///
/// Returns an empty string on any Docker/connection error so callers don't
/// need to worry about error handling.
pub async fn fetch_gateway_logs(name: &str, n: usize) -> String {
let docker = match Docker::connect_with_local_defaults() {
Ok(d) => d,
Err(_) => return String::new(),
};
let container = container_name(name);
fetch_recent_logs(&docker, &container, n).await
}

fn default_gateway_image_ref() -> String {
if let Ok(image) = std::env::var("OPENSHELL_CLUSTER_IMAGE")
&& !image.trim().is_empty()
Expand Down Expand Up @@ -984,7 +999,11 @@ async fn wait_for_namespace(
}

if attempt + 1 == attempts {
return Err(err).wrap_err("K8s namespace not ready");
let logs = fetch_recent_logs(docker, container_name, 40).await;
return Err(miette::miette!(
"exec failed on final attempt while waiting for namespace '{namespace}': {err}\n{logs}"
))
.wrap_err("K8s namespace not ready");
}
tokio::time::sleep(backoff).await;
backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff);
Expand All @@ -997,8 +1016,9 @@ async fn wait_for_namespace(
}

if attempt + 1 == attempts {
let logs = fetch_recent_logs(docker, container_name, 40).await;
return Err(miette::miette!(
"timed out waiting for namespace '{namespace}' to exist: {output}"
"timed out waiting for namespace '{namespace}' to exist: {output}\n{logs}"
))
.wrap_err("K8s namespace not ready");
}
Expand Down
2 changes: 1 addition & 1 deletion crates/openshell-cli/src/bootstrap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ pub async fn run_bootstrap(
eprintln!();
eprintln!(
" Manage it later with: {} or {}",
"openshell gateway status".bold(),
"openshell status".bold(),
"openshell gateway stop".bold(),
);
eprintln!();
Expand Down
52 changes: 43 additions & 9 deletions crates/openshell-cli/src/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1248,19 +1248,27 @@ pub(crate) async fn deploy_gateway_with_panel(
"x".red().bold(),
"Gateway failed:".red().bold(),
);
// Fetch container logs for pattern-based diagnosis
let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await;
let logs_opt = if container_logs.is_empty() {
None
} else {
Some(container_logs.as_str())
};
// Try to diagnose the failure and provide guidance
let err_str = format!("{err:?}");
if let Some(diagnosis) =
openshell_bootstrap::errors::diagnose_failure(name, &err_str, None)
{
print_failure_diagnosis(&diagnosis);
}
let diagnosis =
openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt)
.unwrap_or_else(|| {
openshell_bootstrap::errors::generic_failure_diagnosis(name)
});
print_failure_diagnosis(&diagnosis);
Err(err)
}
}
} else {
eprintln!("Deploying {location} gateway {name}...");
let handle = openshell_bootstrap::deploy_gateway_with_logs(options, |line| {
let result = openshell_bootstrap::deploy_gateway_with_logs(options, |line| {
if let Some(status) = line.strip_prefix("[status] ") {
eprintln!(" {status}");
} else if line.strip_prefix("[progress] ").is_some() {
Expand All @@ -1269,9 +1277,35 @@ pub(crate) async fn deploy_gateway_with_panel(
eprintln!(" {line}");
}
})
.await?;
eprintln!("Gateway {name} ready.");
Ok(handle)
.await;
match result {
Ok(handle) => {
eprintln!("Gateway {name} ready.");
Ok(handle)
}
Err(err) => {
eprintln!(
"{} {} {name}",
"x".red().bold(),
"Gateway failed:".red().bold(),
);
// Fetch container logs for pattern-based diagnosis
let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await;
let logs_opt = if container_logs.is_empty() {
None
} else {
Some(container_logs.as_str())
};
let err_str = format!("{err:?}");
let diagnosis =
openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt)
.unwrap_or_else(|| {
openshell_bootstrap::errors::generic_failure_diagnosis(name)
});
print_failure_diagnosis(&diagnosis);
Err(err)
}
}
}
}

Expand Down
Loading