From 1735d3d55b6664d352c2182ab2f63640b71ed423 Mon Sep 17 00:00:00 2001 From: Codescribe Date: Wed, 6 May 2026 04:47:34 -0400 Subject: [PATCH] cmd/cli: suppress upstream.os healthcheck failures when WFP loopback protect is active When WFP loopback protect is active, the upstream.os healthcheck will always fail because an external WFP block filter is interfering with plain DNS. This demotes those expected failures to debug level and returns errOsHealthcheckSuppressed so the recovery loop treats them as non-fatal, eliminating the log spam described in #526. --- cmd/cli/dns_intercept_darwin.go | 4 ++++ cmd/cli/dns_intercept_others.go | 4 ++++ cmd/cli/dns_intercept_windows.go | 20 ++++++++++++++++++++ cmd/cli/dns_proxy.go | 19 +++++++++++++++---- 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/cmd/cli/dns_intercept_darwin.go b/cmd/cli/dns_intercept_darwin.go index 5740b41..7b80ca0 100644 --- a/cmd/cli/dns_intercept_darwin.go +++ b/cmd/cli/dns_intercept_darwin.go @@ -1742,3 +1742,7 @@ func (p *prog) forceReloadPFMainRuleset() { mainLog.Load().Info().Msg("DNS intercept: force reload — pf ruleset and anchor reloaded successfully") } + +// osHealthcheckSuppressed always returns false on darwin — WFP loopback +// protect (the trigger for suppression) is Windows-only. +func (p *prog) osHealthcheckSuppressed() bool { return false } diff --git a/cmd/cli/dns_intercept_others.go b/cmd/cli/dns_intercept_others.go index 9f3c903..50c7fd0 100644 --- a/cmd/cli/dns_intercept_others.go +++ b/cmd/cli/dns_intercept_others.go @@ -37,3 +37,7 @@ func (p *prog) scheduleDelayedRechecks() {} // pfInterceptMonitor is a no-op on unsupported platforms. func (p *prog) pfInterceptMonitor() {} + +// osHealthcheckSuppressed always returns false on non-Windows platforms — +// WFP loopback protect (the trigger for suppression) is Windows-only. 
+func (p *prog) osHealthcheckSuppressed() bool { return false } diff --git a/cmd/cli/dns_intercept_windows.go b/cmd/cli/dns_intercept_windows.go index 063d4f9..208f0e2 100644 --- a/cmd/cli/dns_intercept_windows.go +++ b/cmd/cli/dns_intercept_windows.go @@ -1260,6 +1260,26 @@ func (p *prog) activateLoopbackWFPProtect(state *wfpState) error { return nil } +// osHealthcheckSuppressed reports whether upstream.os healthcheck failures +// should be suppressed because DNS intercept mode is active and the WFP +// loopback protect has been engaged. Loopback protect is only activated when +// an external WFP block filter (e.g. OpenVPN's block-outside-dns) is +// interfering with DNS, which is the same condition that makes the OS resolver +// healthcheck fail every 2s with i/o timeout — so suppressing those failures +// avoids the log spam described in issue #526. +func (p *prog) osHealthcheckSuppressed() bool { + if !dnsIntercept || p.dnsInterceptState == nil { + return false + } + state, ok := p.dnsInterceptState.(*wfpState) + if !ok || state == nil { + return false + } + state.mu.Lock() + defer state.mu.Unlock() + return state.loopbackProtectActive +} + // deactivateLoopbackWFPProtectLocked is the lock-free inner implementation. // Caller must hold state.mu. func (p *prog) deactivateLoopbackWFPProtectLocked(state *wfpState) { diff --git a/cmd/cli/dns_proxy.go b/cmd/cli/dns_proxy.go index c2c248e..c8a2b74 100644 --- a/cmd/cli/dns_proxy.go +++ b/cmd/cli/dns_proxy.go @@ -1809,6 +1809,8 @@ func interfaceIPsEqual(a, b []netip.Prefix) bool { return true } +var errOsHealthcheckSuppressed = errors.New("upstream os health check suppressed") + // checkUpstreamOnce sends a test query to the specified upstream. // Returns nil if the upstream responds successfully. 
func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) error { @@ -1838,11 +1840,19 @@ func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) erro duration := time.Since(start) if err != nil { + // Demote upstream.os check failures to debug while WFP loopback + // protect is active: an external WFP block filter is interfering + // with plain DNS so repeated failures here are expected. Other + // upstreams keep error level so real outages stay visible. + if upstream == upstreamOS && p.osHealthcheckSuppressed() { + p.Debug().Err(err).Msgf("Upstream %s check failed after %v (WFP loopback protect active)", upstream, duration) + return errOsHealthcheckSuppressed + } p.Error().Err(err).Msgf("Upstream %s check failed after %v", upstream, duration) - } else { - p.Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration) + return err } - return err + p.Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration) + return nil } // handleRecovery orchestrates the recovery process by coordinating multiple smaller methods. @@ -2121,7 +2131,8 @@ func (p *prog) waitForUpstreamRecovery(ctx context.Context, upstreams map[string default: attempts++ // checkUpstreamOnce will reset any failure counters on success. - if err := p.checkUpstreamOnce(name, uc); err == nil { + err := p.checkUpstreamOnce(name, uc) + if err == nil || errors.Is(err, errOsHealthcheckSuppressed) { p.Debug().Msgf("Upstream %s recovered successfully", name) select { case recoveredCh <- name: