From b433d3ef93a2d1b4523f806c656286114f0fcc49 Mon Sep 17 00:00:00 2001
From: robcholz <84130577+robcholz@users.noreply.github.com>
Date: Mon, 9 Feb 2026 01:16:20 -0500
Subject: [PATCH] fix: handled edge network issue when doing provisioning

---
 src/provision.sh |  48 ++++++++++++++++++++-
 src/vm.rs        | 109 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 151 insertions(+), 6 deletions(-)

diff --git a/src/provision.sh b/src/provision.sh
index 3394ffd..fe95476 100644
--- a/src/provision.sh
+++ b/src/provision.sh
@@ -1,12 +1,55 @@
 #!/bin/bash
-set -eux
+set -eEux
+
+trap 'echo "[vibebox][error] provisioning failed"; echo "VIBEBOX_PROVISION_""FAILED"; systemctl poweroff || true; exit 1' ERR  # marker literal is split so the xtrace of this definition cannot be mistaken for a real failure
+
+# Wait for network + DNS before apt-get to avoid early boot flakiness.
+wait_for_network() {
+    echo "[vibebox] waiting for network/DNS readiness"
+    local deadline=$((SECONDS + 60))
+    while [ "$SECONDS" -lt "$deadline" ]; do
+        local has_route=0
+        if [ -n "$(ip -4 route show default 2>/dev/null)" ]; then  # `ip route show` exits 0 even with no route; test its output
+            has_route=1
+        elif [ -n "$(ip -6 route show default 2>/dev/null)" ]; then
+            has_route=1
+        fi
+        if [ "$has_route" -eq 1 ]; then
+            if getent hosts deb.debian.org >/dev/null 2>&1; then
+                return 0
+            fi
+        fi
+        sleep 1
+    done
+    echo "[vibebox][warn] network/DNS still not ready after 60s; continuing" >&2
+    echo "[vibebox][warn] /etc/resolv.conf:" >&2
+    cat /etc/resolv.conf >&2 || true
+    ip -br addr >&2 || true
+    ip route >&2 || true
+    ip -6 route >&2 || true
+    return 0
+}
+
+apt_update_with_retries() {
+    local attempt=1
+    while [ "$attempt" -le 5 ]; do
+        if apt-get update; then
+            return 0
+        fi
+        echo "[vibebox][warn] apt-get update failed (attempt ${attempt}/5); retrying..." >&2
+        attempt=$((attempt + 1))
+        sleep 2
+    done
+    return 1
+}
 
 # Don't wait too long for slow mirrors.
 echo 'Acquire::http::Timeout "2";' | tee /etc/apt/apt.conf.d/99timeout
 echo 'Acquire::https::Timeout "2";' | tee -a /etc/apt/apt.conf.d/99timeout
 echo 'Acquire::Retries "2";' | tee -a /etc/apt/apt.conf.d/99timeout
 
-apt-get update
+wait_for_network
+apt_update_with_retries
 apt-get install -y --no-install-recommends \
     build-essential \
     pkg-config \
@@ -55,4 +98,5 @@ sleep 100 # sleep here so that we don't see the login screen flash up before the
 EOF
 
 # Done provisioning, power off the VM
+echo "VIBEBOX_PROVISION_""OK"  # split literal: only the executed echo emits the full success marker
 systemctl poweroff
diff --git a/src/vm.rs b/src/vm.rs
index 17c5716..37189a9 100644
--- a/src/vm.rs
+++ b/src/vm.rs
@@ -40,6 +40,7 @@ const DEFAULT_RAM_MB: u64 = 2048;
 const DEFAULT_RAM_BYTES: u64 = DEFAULT_RAM_MB * BYTES_PER_MB;
 const START_TIMEOUT: Duration = Duration::from_secs(60);
 const LOGIN_EXPECT_TIMEOUT: Duration = Duration::from_secs(120);
+const PROVISION_EXPECT_TIMEOUT: Duration = Duration::from_secs(900);
 
 struct StatusFile {
     path: PathBuf,
@@ -76,7 +77,15 @@ const BASE_DISK_RAW_NAME: &str = "disk.raw";
 
 #[derive(Clone)]
 pub(crate) enum LoginAction {
-    Expect { text: String, timeout: Duration },
+    Expect {
+        text: String,
+        timeout: Duration,
+    },
+    ExpectEither {
+        success: String,
+        failure: String,
+        timeout: Duration,
+    },
     Send(String),
 }
 use LoginAction::*;
@@ -359,6 +368,12 @@ enum WaitResult {
     Found,
 }
 
+#[derive(PartialEq, Eq)]
+enum WaitAnyResult {
+    Timeout,
+    Found(usize),
+}
+
 pub enum VmInput {
     Bytes(Vec<u8>),
     Shutdown,
@@ -366,6 +381,7 @@ pub enum VmInput {
 
 enum VmOutput {
     LoginActionTimeout { action: String, timeout: Duration },
+    LoginActionFailed { action: String, reason: String },
 }
 
 #[derive(Default)]
@@ -402,6 +418,41 @@ impl OutputMonitor {
             WaitResult::Found
         }
     }
+
+    fn wait_for_any(&self, needles: &[&str], timeout: Duration) -> WaitAnyResult {
+        let mut found: Option<usize> = None;
+        let (_unused, timeout_result) = self
+            .condvar
+            .wait_timeout_while(self.buffer.lock().unwrap(), timeout, |buf| {
+                if let Some((pos, idx, len)) = find_any(buf, needles) {
+                    *buf = buf[(pos + len)..].to_string();
+                    found = Some(idx);
+                    false
+                } else {
+                    true
+                }
+            })
+            .unwrap();
+
+        // Report a match only if one was recorded; never default to index 0 (the success marker).
+        match found {
+            Some(idx) if !timeout_result.timed_out() => WaitAnyResult::Found(idx),
+            _ => WaitAnyResult::Timeout,
+        }
+    }
+}
+
+fn find_any(buf: &str, needles: &[&str]) -> Option<(usize, usize, usize)> {
+    let mut best: Option<(usize, usize, usize)> = None; // (pos, idx, len)
+    for (idx, needle) in needles.iter().enumerate() {
+        if let Some(pos) = buf.find(needle) {
+            let candidate = (pos, idx, needle.len());
+            if best.is_none_or(|b| candidate.0 < b.0) {
+                best = Some(candidate);
+            }
+        }
+    }
+    best
 }
 
 #[derive(Debug)]
@@ -546,14 +597,25 @@ fn ensure_default_image(
     fs::copy(base_raw, default_raw)?;
 
     let provision_command = script_command_from_content(PROVISION_SCRIPT_NAME, PROVISION_SCRIPT)?;
-    run_vm(
+    let provision_actions = [
+        Send(provision_command),
+        ExpectEither {
+            success: "VIBEBOX_PROVISION_OK".to_string(),
+            failure: "VIBEBOX_PROVISION_FAILED".to_string(),
+            timeout: PROVISION_EXPECT_TIMEOUT,
+        },
+    ];
+    if let Err(err) = run_vm(
         default_raw,
-        &[Send(provision_command)],
+        &provision_actions,
         directory_shares,
         DEFAULT_CPU_COUNT,
         DEFAULT_RAM_BYTES,
         None,
-    )?;
+    ) {
+        let _ = fs::remove_file(default_raw);
+        return Err(err);
+    }
 
     Ok(())
 }
@@ -1033,6 +1095,27 @@ fn spawn_login_actions_thread(
                         return;
                     }
                 }
+                ExpectEither {
+                    success,
+                    failure,
+                    timeout,
+                } => match output_monitor.wait_for_any(&[&success, &failure], timeout) {
+                    WaitAnyResult::Found(0) => {}
+                    WaitAnyResult::Found(_) => {
+                        let _ = vm_output_tx.send(VmOutput::LoginActionFailed {
+                            action: format!("expect '{}'", success),
+                            reason: format!("saw failure marker '{}'", failure),
+                        });
+                        return;
+                    }
+                    WaitAnyResult::Timeout => {
+                        let _ = vm_output_tx.send(VmOutput::LoginActionTimeout {
+                            action: format!("expect '{}'", success),
+                            timeout,
+                        });
+                        return;
+                    }
+                },
                 Send(mut text) => {
                     text.push('\n'); // Type the newline so the command is actually submitted.
                     input_tx.send(VmInput::Bytes(text.into_bytes())).unwrap();
@@ -1192,6 +1275,24 @@ where
                 }
                 break;
             }
+            Ok(VmOutput::LoginActionFailed { action, reason }) => {
+                exit_result = Err(format!(
+                    "Login action ({}) failed: {}; shutting down.",
+                    action, reason
+                )
+                .into());
+                unsafe {
+                    if vm.canRequestStop() {
+                        if let Err(err) = vm.requestStopWithError() {
+                            tracing::error!(error = ?err, "failed to request VM stop");
+                        }
+                    } else if vm.canStop() {
+                        let handler = RcBlock::new(|_error: *mut NSError| {});
+                        vm.stopWithCompletionHandler(&handler);
+                    }
+                }
+                break;
+            }
             Err(mpsc::TryRecvError::Empty) => {}
             Err(mpsc::TryRecvError::Disconnected) => {}
         }