diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 1511b362..f316b953 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -449,16 +449,20 @@ pub fn generic_failure_diagnosis(gateway_name: &str) -> GatewayFailureDiagnosis summary: "Gateway failed to start".to_string(), explanation: "The gateway encountered an unexpected error during startup.".to_string(), recovery_steps: vec![ + RecoveryStep::with_command( + "Check container logs for details", + format!("openshell doctor logs --name {gateway_name}"), + ), + RecoveryStep::with_command( + "Run diagnostics", + format!("openshell doctor check --name {gateway_name}"), + ), RecoveryStep::with_command( "Try destroying and recreating the gateway", format!( "openshell gateway destroy --name {gateway_name} && openshell gateway start" ), ), - RecoveryStep::with_command( - "Check container logs for details", - format!("docker logs openshell-cluster-{gateway_name}"), - ), RecoveryStep::new( "If the issue persists, report it at https://github.com/nvidia/openshell/issues", ), @@ -650,4 +654,196 @@ mod tests { ); assert!(d.retryable); } + + // -- generic_failure_diagnosis tests -- + + #[test] + fn generic_diagnosis_suggests_doctor_logs() { + let d = generic_failure_diagnosis("my-gw"); + let commands: Vec = d + .recovery_steps + .iter() + .filter_map(|s| s.command.clone()) + .collect(); + assert!( + commands.iter().any(|c| c.contains("openshell doctor logs")), + "expected 'openshell doctor logs' in recovery commands, got: {commands:?}" + ); + } + + #[test] + fn generic_diagnosis_suggests_doctor_check() { + let d = generic_failure_diagnosis("my-gw"); + let commands: Vec = d + .recovery_steps + .iter() + .filter_map(|s| s.command.clone()) + .collect(); + assert!( + commands + .iter() + .any(|c| c.contains("openshell doctor check")), + "expected 'openshell doctor check' in recovery commands, got: {commands:?}" + ); + } + + #[test] + fn generic_diagnosis_includes_gateway_name() { + let d = generic_failure_diagnosis("custom-name"); + let all_text: String = d + .recovery_steps + .iter() + .filter_map(|s| s.command.clone()) + .collect::>() + .join(" "); + assert!( + all_text.contains("custom-name"), + "expected gateway name in recovery commands, got: {all_text}" + ); + } + + // -- fallback behavior tests -- + + #[test] + fn namespace_timeout_without_logs_returns_none() { + // This is the most common user-facing error: a plain timeout with only + // kubectl output. It must NOT match any specific pattern so the caller + // can fall back to generic_failure_diagnosis. + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready\n\nCaused by:\n \ + timed out waiting for namespace 'openshell' to exist: \ + error: the server doesn't have a resource type \"namespace\"", + None, + ); + assert!( + diagnosis.is_none(), + "plain namespace timeout should not match any specific pattern, got: {:?}", + diagnosis.map(|d| d.summary) + ); + } + + #[test] + fn namespace_timeout_with_pressure_logs_matches() { + // When container logs reveal node pressure, the diagnosis engine + // should detect it even though the error message itself is generic. + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready\n\nCaused by:\n \ + timed out waiting for namespace 'openshell' to exist: ", + Some("HEALTHCHECK_NODE_PRESSURE: DiskPressure"), + ); + assert!(diagnosis.is_some(), "expected node pressure diagnosis"); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("pressure"), + "expected pressure in summary, got: {}", + d.summary + ); + } + + #[test] + fn namespace_timeout_with_corrupted_state_logs_matches() { + // Container logs revealing RBAC corruption should be caught. + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready\n\nCaused by:\n \ + timed out waiting for namespace 'openshell' to exist: ", + Some( + "configmaps \"extension-apiserver-authentication\" is forbidden: \ + User cannot get resource", + ), + ); + assert!(diagnosis.is_some(), "expected corrupted state diagnosis"); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("Corrupted"), + "expected Corrupted in summary, got: {}", + d.summary + ); + } + + #[test] + fn namespace_timeout_with_no_route_logs_matches() { + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready", + Some("Error: no default route present before starting k3s"), + ); + assert!(diagnosis.is_some(), "expected networking diagnosis"); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("networking"), + "expected networking in summary, got: {}", + d.summary + ); + } + + #[test] + fn diagnose_failure_with_logs_uses_combined_text() { + // Verify that diagnose_failure combines error_message + container_logs + // for pattern matching. The pattern "connection refused" is in logs, + // not in the error message. + let diagnosis = diagnose_failure( + "test", + "K8s namespace not ready", + Some("dial tcp 127.0.0.1:6443: connect: connection refused"), + ); + assert!( + diagnosis.is_some(), + "expected diagnosis from container logs pattern" + ); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("Network") || d.summary.contains("connectivity"), + "expected network diagnosis, got: {}", + d.summary + ); + } + + // -- end-to-end fallback pattern (mirrors CLI code) -- + + #[test] + fn fallback_to_generic_produces_actionable_diagnosis() { + // This mirrors the actual CLI pattern: + // diagnose_failure(...).unwrap_or_else(|| generic_failure_diagnosis(name)) + // For a plain namespace timeout with no useful container logs, the + // specific matcher returns None and we must fall back to the generic + // diagnosis that suggests doctor commands. + let err_str = "K8s namespace not ready\n\nCaused by:\n \ + timed out waiting for namespace 'openshell' to exist: \ + error: the server doesn't have a resource type \"namespace\""; + let container_logs = Some("k3s is starting\nwaiting for kube-apiserver"); + + let diagnosis = diagnose_failure("my-gw", err_str, container_logs) + .unwrap_or_else(|| generic_failure_diagnosis("my-gw")); + + // Should have gotten the generic diagnosis (no specific pattern matched) + assert_eq!(diagnosis.summary, "Gateway failed to start"); + // Must contain actionable recovery steps + assert!( + !diagnosis.recovery_steps.is_empty(), + "generic diagnosis should have recovery steps" + ); + // Must mention doctor commands + let all_commands: String = diagnosis + .recovery_steps + .iter() + .filter_map(|s| s.command.as_ref()) + .cloned() + .collect::>() + .join("\n"); + assert!( + all_commands.contains("doctor logs"), + "should suggest 'doctor logs', got: {all_commands}" + ); + assert!( + all_commands.contains("doctor check"), + "should suggest 'doctor check', got: {all_commands}" + ); + assert!( + all_commands.contains("my-gw"), + "commands should include gateway name, got: {all_commands}" + ); + } } diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index b6423aae..48f8d74d 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -638,6 +638,21 @@ pub async fn gateway_container_logs( Ok(()) } +/// Fetch the last `n` lines of container logs for a local gateway as a +/// `String`. This is a convenience wrapper for diagnostic call sites (e.g. +/// failure diagnosis in the CLI) that do not hold a Docker client handle. +/// +/// Returns an empty string on any Docker/connection error so callers don't +/// need to worry about error handling. +pub async fn fetch_gateway_logs(name: &str, n: usize) -> String { + let docker = match Docker::connect_with_local_defaults() { + Ok(d) => d, + Err(_) => return String::new(), + }; + let container = container_name(name); + fetch_recent_logs(&docker, &container, n).await +} + fn default_gateway_image_ref() -> String { if let Ok(image) = std::env::var("OPENSHELL_CLUSTER_IMAGE") && !image.trim().is_empty() @@ -984,7 +999,11 @@ async fn wait_for_namespace( } if attempt + 1 == attempts { - return Err(err).wrap_err("K8s namespace not ready"); + let logs = fetch_recent_logs(docker, container_name, 40).await; + return Err(miette::miette!( + "exec failed on final attempt while waiting for namespace '{namespace}': {err}\n{logs}" + )) + .wrap_err("K8s namespace not ready"); } tokio::time::sleep(backoff).await; backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff); @@ -997,8 +1016,9 @@ async fn wait_for_namespace( } if attempt + 1 == attempts { + let logs = fetch_recent_logs(docker, container_name, 40).await; return Err(miette::miette!( - "timed out waiting for namespace '{namespace}' to exist: {output}" + "timed out waiting for namespace '{namespace}' to exist: {output}\n{logs}" )) .wrap_err("K8s namespace not ready"); } diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index eb8f93a3..e976061f 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -139,7 +139,7 @@ pub async fn run_bootstrap( eprintln!(); eprintln!( " Manage it later with: {} or {}", - "openshell gateway status".bold(), + "openshell status".bold(), "openshell gateway stop".bold(), ); eprintln!(); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 37f11fcb..2f9dd2f7 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1248,19 +1248,27 @@ pub(crate) async fn deploy_gateway_with_panel( "x".red().bold(), "Gateway failed:".red().bold(), ); + // Fetch container logs for pattern-based diagnosis + let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await; + let logs_opt = if container_logs.is_empty() { + None + } else { + Some(container_logs.as_str()) + }; // Try to diagnose the failure and provide guidance let err_str = format!("{err:?}"); - if let Some(diagnosis) = - openshell_bootstrap::errors::diagnose_failure(name, &err_str, None) - { - print_failure_diagnosis(&diagnosis); - } + let diagnosis = + openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt) + .unwrap_or_else(|| { + openshell_bootstrap::errors::generic_failure_diagnosis(name) + }); + print_failure_diagnosis(&diagnosis); Err(err) } } } else { eprintln!("Deploying {location} gateway {name}..."); - let handle = openshell_bootstrap::deploy_gateway_with_logs(options, |line| { + let result = openshell_bootstrap::deploy_gateway_with_logs(options, |line| { if let Some(status) = line.strip_prefix("[status] ") { eprintln!(" {status}"); } else if line.strip_prefix("[progress] ").is_some() { @@ -1269,9 +1277,35 @@ pub(crate) async fn deploy_gateway_with_panel( eprintln!(" {line}"); } }) - .await?; - eprintln!("Gateway {name} ready."); - Ok(handle) + .await; + match result { + Ok(handle) => { + eprintln!("Gateway {name} ready."); + Ok(handle) + } + Err(err) => { + eprintln!( + "{} {} {name}", + "x".red().bold(), + "Gateway failed:".red().bold(), + ); + // Fetch container logs for pattern-based diagnosis + let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await; + let logs_opt = if container_logs.is_empty() { + None + } else { + Some(container_logs.as_str()) + }; + let err_str = format!("{err:?}"); + let diagnosis = + openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt) + .unwrap_or_else(|| { + openshell_bootstrap::errors::generic_failure_diagnosis(name) + }); + print_failure_diagnosis(&diagnosis); + Err(err) + } + } } }