From 2b27c148bedb31804680ca3aadc0972a48e35b87 Mon Sep 17 00:00:00 2001 From: Codescribe Date: Wed, 29 Apr 2026 04:01:04 -0400 Subject: [PATCH] dns: recovery race condition fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes to reduce worst-case recovery from ~30s to <3s: 1. debounceRecovery() for network changes (500ms window) — coalesces rapid consecutive network changes into a single recovery pass, eliminating the cancel-and-restart race. 2. ForceReBootstrap() on recovery entry — closes dead connections and creates fresh transports synchronously before probing, replacing the lazy ReBootstrap() flag that left stale connections. 3. Combined effect: recovery probes never inherit dead connections from a canceled prior recovery attempt. --- cmd/cli/dns_proxy.go | 46 +++++++++++++++++++++++++++++++++++++++++++- cmd/cli/prog.go | 6 ++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/cmd/cli/dns_proxy.go b/cmd/cli/dns_proxy.go index f1aa810..d12ab35 100644 --- a/cmd/cli/dns_proxy.go +++ b/cmd/cli/dns_proxy.go @@ -1580,7 +1580,7 @@ func (p *prog) monitorNetworkChanges() error { // we only trigger recovery flow for network changes on non router devices if router.Name() == "" { - p.handleRecovery(RecoveryReasonNetworkChange) + p.debounceRecovery() } // After network changes, verify our pf anchor is still active and @@ -1693,6 +1693,35 @@ func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) erro return err } +// recoveryDebounceWindow is the time to wait after the last network change +// before triggering handleRecovery. This coalesces rapid consecutive network +// changes (e.g., hotspot→LAN causing en1 drop + en0 pickup + en1 re-pickup) +// into a single recovery pass, avoiding the cancel-and-restart race that +// leaves DoH transports in a stale state. +const recoveryDebounceWindow = 500 * time.Millisecond + +// debounceRecovery schedules a handleRecovery(NetworkChange) call after a debounce +// window. If called again before the window expires, the timer is reset so that +// recovery runs once with the final network state. All other state updates (IP, +// pf anchor, VPN DNS, tunnel checks) run immediately — only the recovery flow +// with its upstream probing and DHCP bypass logic is debounced. +func (p *prog) debounceRecovery() { + p.recoveryDebounceMu.Lock() + defer p.recoveryDebounceMu.Unlock() + + if p.recoveryDebounceTimer != nil { + p.recoveryDebounceTimer.Stop() + mainLog.Load().Debug().Msg("Recovery debounce: resetting timer (rapid network change)") + } + p.recoveryDebounceTimer = time.AfterFunc(recoveryDebounceWindow, func() { + p.recoveryDebounceMu.Lock() + p.recoveryDebounceTimer = nil + p.recoveryDebounceMu.Unlock() + p.handleRecovery(RecoveryReasonNetworkChange) + }) + mainLog.Load().Debug().Msg("Recovery debounce: scheduled (500ms window)") +} + // handleRecovery performs a unified recovery by removing DNS settings, // canceling existing recovery checks for network changes, but coalescing duplicate // upstream failure recoveries, waiting for recovery to complete (using a cancellable context without timeout), @@ -1720,6 +1749,21 @@ func (p *prog) handleRecovery(reason RecoveryReason) { p.recoveryCancelMu.Unlock() } + // For network changes, force-reset all upstream transports synchronously. + // The lazy ReBootstrap() called earlier in the network change callback only + // sets a flag — the old transport's dead connections can still be used by + // recovery probes, causing context deadline timeouts. ForceReBootstrap() + // closes old connections and creates fresh transports so probes succeed on + // first attempt. + if reason == RecoveryReasonNetworkChange { + for _, uc := range p.cfg.Upstream { + if uc != nil { + uc.ForceReBootstrap() + } + } + mainLog.Load().Info().Msg("Force-reset upstream transports for network change recovery") + } + // Create a new recovery context without a fixed timeout. p.recoveryCancelMu.Lock() recoveryCtx, cancel := context.WithCancel(context.Background()) diff --git a/cmd/cli/prog.go b/cmd/cli/prog.go index c579a80..96fd2fe 100644 --- a/cmd/cli/prog.go +++ b/cmd/cli/prog.go @@ -146,6 +146,12 @@ type prog struct { recoveryCancel context.CancelFunc recoveryRunning atomic.Bool + // recoveryDebounceTimer coalesces rapid NetworkChange recovery triggers + // into a single handleRecovery call. Only handleRecovery is debounced — + // all other state updates (IP, pf anchor, VPN DNS) run immediately. + recoveryDebounceMu sync.Mutex + recoveryDebounceTimer *time.Timer + // recoveryBypass is set when dns-intercept mode enters recovery. // When true, proxy() forwards all queries to OS/DHCP resolver // instead of using the normal upstream flow.