dns: recovery race condition fix

Two changes to reduce worst-case recovery from ~30s to <3s:

1. debounceRecovery() for network changes (500ms window): coalesces
   rapid consecutive network changes into a single recovery pass,
   eliminating the cancel-and-restart race. A standalone sketch of
   this pattern follows the list.

2. ForceReBootstrap() on recovery entry: closes dead connections and
   creates fresh transports synchronously before probing, replacing
   the lazy ReBootstrap() flag that left stale connections in place.
   A contrast sketch of the two semantics follows the diff.

Combined effect: recovery probes never inherit dead connections
from a canceled prior recovery attempt.
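
For reference, a minimal standalone sketch of the debounce pattern that
change 1 applies. The debouncer type and the trigger()/fire() names are
illustrative, not from the ctrld codebase; the real method in the diff
below guards the same *time.Timer with a mutex for the same reason, so
Stop and AfterFunc cannot interleave across goroutines.

package main

import (
	"fmt"
	"sync"
	"time"
)

// debouncer coalesces bursts of trigger() calls into a single fire()
// that runs once the window elapses with no further calls.
type debouncer struct {
	mu     sync.Mutex
	timer  *time.Timer
	window time.Duration
	fire   func()
}

func (d *debouncer) trigger() {
	d.mu.Lock()
	defer d.mu.Unlock()
	if d.timer != nil {
		d.timer.Stop() // a newer event supersedes the pending one
	}
	d.timer = time.AfterFunc(d.window, func() {
		d.mu.Lock()
		d.timer = nil
		d.mu.Unlock()
		d.fire()
	})
}

func main() {
	d := &debouncer{window: 500 * time.Millisecond, fire: func() { fmt.Println("recover once") }}
	d.trigger() // e.g., en1 drop
	d.trigger() // en0 pickup
	d.trigger() // en1 re-pickup
	time.Sleep(time.Second) // prints "recover once" exactly once
}
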
@@ -1580,7 +1580,7 @@ func (p *prog) monitorNetworkChanges() error {
 	// we only trigger recovery flow for network changes on non router devices
 	if router.Name() == "" {
-		p.handleRecovery(RecoveryReasonNetworkChange)
+		p.debounceRecovery()
 	}
 	// After network changes, verify our pf anchor is still active and
@@ -1693,6 +1693,35 @@ func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) error {
 	return err
 }
 
+// recoveryDebounceWindow is the time to wait after the last network change
+// before triggering handleRecovery. This coalesces rapid consecutive network
+// changes (e.g., hotspot→LAN causing en1 drop + en0 pickup + en1 re-pickup)
+// into a single recovery pass, avoiding the cancel-and-restart race that
+// leaves DoH transports in a stale state.
+const recoveryDebounceWindow = 500 * time.Millisecond
+
+// debounceRecovery schedules a handleRecovery(NetworkChange) call after a debounce
+// window. If called again before the window expires, the timer is reset so that
+// recovery runs once with the final network state. All other state updates (IP,
+// pf anchor, VPN DNS, tunnel checks) run immediately — only the recovery flow
+// with its upstream probing and DHCP bypass logic is debounced.
+func (p *prog) debounceRecovery() {
+	p.recoveryDebounceMu.Lock()
+	defer p.recoveryDebounceMu.Unlock()
+	if p.recoveryDebounceTimer != nil {
+		p.recoveryDebounceTimer.Stop()
+		mainLog.Load().Debug().Msg("Recovery debounce: resetting timer (rapid network change)")
+	}
+	p.recoveryDebounceTimer = time.AfterFunc(recoveryDebounceWindow, func() {
+		p.recoveryDebounceMu.Lock()
+		p.recoveryDebounceTimer = nil
+		p.recoveryDebounceMu.Unlock()
+		p.handleRecovery(RecoveryReasonNetworkChange)
+	})
+	mainLog.Load().Debug().Msg("Recovery debounce: scheduled (500ms window)")
+}
+
 // handleRecovery performs a unified recovery by removing DNS settings,
 // canceling existing recovery checks for network changes, but coalescing duplicate
 // upstream failure recoveries, waiting for recovery to complete (using a cancellable context without timeout),
@@ -1720,6 +1749,21 @@ func (p *prog) handleRecovery(reason RecoveryReason) {
 		p.recoveryCancelMu.Unlock()
 	}
 
+	// For network changes, force-reset all upstream transports synchronously.
+	// The lazy ReBootstrap() called earlier in the network change callback only
+	// sets a flag — the old transport's dead connections can still be used by
+	// recovery probes, causing context deadline timeouts. ForceReBootstrap()
+	// closes old connections and creates fresh transports so probes succeed on
+	// first attempt.
+	if reason == RecoveryReasonNetworkChange {
+		for _, uc := range p.cfg.Upstream {
+			if uc != nil {
+				uc.ForceReBootstrap()
+			}
+		}
+		mainLog.Load().Info().Msg("Force-reset upstream transports for network change recovery")
+	}
+
 	// Create a new recovery context without a fixed timeout.
 	p.recoveryCancelMu.Lock()
 	recoveryCtx, cancel := context.WithCancel(context.Background())
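
To make the lazy-versus-forced distinction concrete, a hedged sketch of
the semantics described in the comment above. The transport type, its
fields, and the dial target are assumptions for illustration; they are
not the actual ctrld.UpstreamConfig implementation, whose ReBootstrap()
and ForceReBootstrap() are characterized here only by this commit's
comments.

package main

import (
	"fmt"
	"net"
	"sync"
	"time"
)

// transport approximates a DoH transport's bootstrap state (illustrative).
type transport struct {
	mu          sync.Mutex
	conn        net.Conn // may be dead after a network change
	rebootstrap bool     // lazy flag, honored on some future request
}

// reBootstrap (lazy) only marks the transport for rebuild; a recovery
// probe that races in first still uses the dead conn and times out.
func (t *transport) reBootstrap() {
	t.mu.Lock()
	t.rebootstrap = true
	t.mu.Unlock()
}

// forceReBootstrap (synchronous) closes the old connection and redials
// before returning, so the first probe starts on a fresh transport.
func (t *transport) forceReBootstrap(addr string) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.conn != nil {
		t.conn.Close()
	}
	conn, err := net.DialTimeout("tcp", addr, 5*time.Second)
	if err != nil {
		return err
	}
	t.conn = conn
	t.rebootstrap = false
	return nil
}

func main() {
	t := &transport{}
	t.reBootstrap() // flag set; a stale conn would still be used by a probe
	if err := t.forceReBootstrap("1.1.1.1:443"); err != nil {
		fmt.Println("redial failed:", err)
		return
	}
	fmt.Println("fresh transport ready before probing")
}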