fix: handle network edge cases during provisioning

This commit is contained in:
robcholz
2026-02-09 01:16:20 -05:00
parent ecfce7acf7
commit b433d3ef93
2 changed files with 151 additions and 6 deletions

View File

@@ -1,12 +1,55 @@
#!/bin/bash
set -eux
set -eEux
trap 'echo "[vibebox][error] provisioning failed"; echo "VIBEBOX_PROVISION_FAILED"; systemctl poweroff || true; exit 1' ERR
# Wait for network + DNS before apt-get to avoid early boot flakiness.
#
# Polls once per second, for up to 60 seconds, until a default route (IPv4 or
# IPv6) is present and deb.debian.org resolves. Always returns 0: on timeout
# it dumps resolver, address and route state to stderr and lets the caller
# continue.
wait_for_network() {
    echo "[vibebox] waiting for network/DNS readiness"
    local deadline=$((SECONDS + 60))
    while [ "$SECONDS" -lt "$deadline" ]; do
        # `ip route show default` exits 0 even when no default route exists,
        # so check for non-empty output rather than the exit status.
        if [ -n "$(ip -4 route show default 2>/dev/null)" ] ||
            [ -n "$(ip -6 route show default 2>/dev/null)" ]; then
            if getent hosts deb.debian.org >/dev/null 2>&1; then
                return 0
            fi
        fi
        sleep 1
    done
    # Best-effort diagnostics; none of these may abort provisioning.
    echo "[vibebox][warn] network/DNS still not ready after 60s; continuing" >&2
    echo "[vibebox][warn] /etc/resolv.conf:" >&2
    cat /etc/resolv.conf >&2 || true
    ip -br addr >&2 || true
    ip route >&2 || true
    ip -6 route >&2 || true
    return 0
}
# Run `apt-get update`, retrying on failure.
#
# Makes up to 5 attempts with a 2-second pause between attempts. Returns 0 as
# soon as one attempt succeeds and 1 once every attempt has failed.
apt_update_with_retries() {
    local max_attempts=5
    local attempt=1
    while [ "$attempt" -le "$max_attempts" ]; do
        if apt-get update; then
            return 0
        fi
        if [ "$attempt" -lt "$max_attempts" ]; then
            echo "[vibebox][warn] apt-get update failed (attempt ${attempt}/${max_attempts}); retrying..." >&2
            sleep 2
        else
            # Last attempt: nothing follows, so neither claim a retry nor sleep.
            echo "[vibebox][warn] apt-get update failed (attempt ${attempt}/${max_attempts}); giving up" >&2
        fi
        attempt=$((attempt + 1))
    done
    return 1
}
# Don't wait too long for slow mirrors.
echo 'Acquire::http::Timeout "2";' | tee /etc/apt/apt.conf.d/99timeout
echo 'Acquire::https::Timeout "2";' | tee -a /etc/apt/apt.conf.d/99timeout
echo 'Acquire::Retries "2";' | tee -a /etc/apt/apt.conf.d/99timeout
apt-get update
wait_for_network
apt_update_with_retries
apt-get install -y --no-install-recommends \
build-essential \
pkg-config \
@@ -55,4 +98,5 @@ sleep 100 # sleep here so that we don't see the login screen flash up before the
EOF
# Done provisioning, power off the VM
echo "VIBEBOX_PROVISION_OK"
systemctl poweroff

109
src/vm.rs
View File

@@ -40,6 +40,7 @@ const DEFAULT_RAM_MB: u64 = 2048;
const DEFAULT_RAM_BYTES: u64 = DEFAULT_RAM_MB * BYTES_PER_MB;
const START_TIMEOUT: Duration = Duration::from_secs(60);
const LOGIN_EXPECT_TIMEOUT: Duration = Duration::from_secs(120);
/// How long provisioning may run (900 s = 15 min) before one of the
/// `VIBEBOX_PROVISION_OK` / `VIBEBOX_PROVISION_FAILED` markers must have
/// appeared in the VM output.
const PROVISION_EXPECT_TIMEOUT: Duration = Duration::from_secs(900);
struct StatusFile {
path: PathBuf,
@@ -76,7 +77,15 @@ const BASE_DISK_RAW_NAME: &str = "disk.raw";
#[derive(Clone)]
/// A single step executed, in order, by the login-actions thread.
pub(crate) enum LoginAction {
Expect { text: String, timeout: Duration },
Expect {
text: String,
timeout: Duration,
},
/// Wait up to `timeout` for whichever of `success` or `failure` shows up
/// first in the monitored output. Seeing `failure` first, or timing out,
/// stops the remaining actions and sends an error message to the VM loop.
ExpectEither {
success: String,
failure: String,
timeout: Duration,
},
/// Text to send to the VM as input; a newline is appended so the command
/// is actually submitted.
Send(String),
}
use LoginAction::*;
@@ -359,6 +368,12 @@ enum WaitResult {
Found,
}
/// Outcome of `OutputMonitor::wait_for_any`.
#[derive(PartialEq, Eq)]
enum WaitAnyResult {
/// None of the needles appeared before the timeout elapsed.
Timeout,
/// A needle appeared; the payload is its index in the `needles` slice
/// that was passed to `wait_for_any`.
Found(usize),
}
pub enum VmInput {
Bytes(Vec<u8>),
Shutdown,
@@ -366,6 +381,7 @@ pub enum VmInput {
/// Messages sent by the login-actions thread over `vm_output_tx`.
enum VmOutput {
/// The awaited text did not appear within `timeout`; `action` describes
/// the step that was waiting.
LoginActionTimeout { action: String, timeout: Duration },
/// The step described by `action` failed for the given `reason`
/// (e.g. a failure marker was seen instead of the success marker).
LoginActionFailed { action: String, reason: String },
}
#[derive(Default)]
@@ -402,6 +418,41 @@ impl OutputMonitor {
WaitResult::Found
}
}
/// Blocks until any of `needles` appears in the monitor's buffer or
/// `timeout` elapses.
///
/// When a needle is found, everything in the buffer up to and including the
/// match is discarded and `WaitAnyResult::Found(idx)` is returned, where
/// `idx` is the position of the matching needle in `needles`. If several
/// needles are present, the one occurring earliest in the buffer wins (see
/// `find_any`). Returns `WaitAnyResult::Timeout` if nothing matched in time.
///
/// # Panics
///
/// Panics if the buffer mutex is poisoned.
fn wait_for_any(&self, needles: &[&str], timeout: Duration) -> WaitAnyResult {
    let mut matched: Option<usize> = None;
    // The returned guard is kept alive until the end of the function, as before.
    let _guard_and_status = self
        .condvar
        .wait_timeout_while(self.buffer.lock().unwrap(), timeout, |buf| {
            match find_any(buf, needles) {
                Some((pos, idx, len)) => {
                    // Consume the matched prefix in place instead of
                    // reallocating the remainder into a new String.
                    buf.drain(..pos + len);
                    matched = Some(idx);
                    false
                }
                None => true,
            }
        })
        .unwrap();
    // Derive the result from the recorded match itself: a missing index is
    // reported as a timeout rather than being defaulted to needle 0, which
    // callers interpret as success.
    match matched {
        Some(idx) => WaitAnyResult::Found(idx),
        None => WaitAnyResult::Timeout,
    }
}
}
/// Finds the earliest occurrence in `buf` of any string in `needles`.
///
/// Returns `(pos, idx, len)`: the byte offset of the match in `buf`, the
/// index of the matching needle within `needles`, and that needle's length
/// in bytes. When several needles match at the same offset, the one listed
/// first in `needles` is reported. Returns `None` if no needle occurs.
fn find_any(buf: &str, needles: &[&str]) -> Option<(usize, usize, usize)> {
    needles
        .iter()
        .enumerate()
        .filter_map(|(idx, needle)| buf.find(*needle).map(|pos| (pos, idx, needle.len())))
        // `min_by_key` returns the first of equally-minimal elements, so a
        // tie on position resolves to the lowest needle index.
        .min_by_key(|&(pos, _, _)| pos)
}
#[derive(Debug)]
@@ -546,14 +597,25 @@ fn ensure_default_image(
fs::copy(base_raw, default_raw)?;
let provision_command = script_command_from_content(PROVISION_SCRIPT_NAME, PROVISION_SCRIPT)?;
run_vm(
let provision_actions = [
Send(provision_command),
ExpectEither {
success: "VIBEBOX_PROVISION_OK".to_string(),
failure: "VIBEBOX_PROVISION_FAILED".to_string(),
timeout: PROVISION_EXPECT_TIMEOUT,
},
];
if let Err(err) = run_vm(
default_raw,
&[Send(provision_command)],
&provision_actions,
directory_shares,
DEFAULT_CPU_COUNT,
DEFAULT_RAM_BYTES,
None,
)?;
) {
let _ = fs::remove_file(default_raw);
return Err(err);
}
Ok(())
}
@@ -1033,6 +1095,27 @@ fn spawn_login_actions_thread(
return;
}
}
ExpectEither {
success,
failure,
timeout,
} => match output_monitor.wait_for_any(&[&success, &failure], timeout) {
WaitAnyResult::Found(0) => {}
WaitAnyResult::Found(_) => {
let _ = vm_output_tx.send(VmOutput::LoginActionFailed {
action: format!("expect '{}'", success),
reason: format!("saw failure marker '{}'", failure),
});
return;
}
WaitAnyResult::Timeout => {
let _ = vm_output_tx.send(VmOutput::LoginActionTimeout {
action: format!("expect '{}'", success),
timeout,
});
return;
}
},
Send(mut text) => {
text.push('\n'); // Type the newline so the command is actually submitted.
input_tx.send(VmInput::Bytes(text.into_bytes())).unwrap();
@@ -1192,6 +1275,24 @@ where
}
break;
}
Ok(VmOutput::LoginActionFailed { action, reason }) => {
exit_result = Err(format!(
"Login action ({}) failed: {}; shutting down.",
action, reason
)
.into());
unsafe {
if vm.canRequestStop() {
if let Err(err) = vm.requestStopWithError() {
tracing::error!(error = ?err, "failed to request VM stop");
}
} else if vm.canStop() {
let handler = RcBlock::new(|_error: *mut NSError| {});
vm.stopWithCompletionHandler(&handler);
}
}
break;
}
Err(mpsc::TryRecvError::Empty) => {}
Err(mpsc::TryRecvError::Disconnected) => {}
}