cmd/cli: suppress upstream.os healthcheck failures when WFP loopback protect is active

When WFP loopback protect is active, the upstream.os healthcheck is expected to fail, because loopback protect is only engaged when an external WFP block filter (e.g. OpenVPN's block-outside-dns) is interfering with plain DNS. This change demotes those expected failures to debug level and returns errOsHealthcheckSuppressed, which the recovery loop treats the same as a successful check, eliminating the log spam described in #526.
2026-07-16 13:17:19 +02:00 · 2026-05-06 04:47:34 -04:00
parent 81aa6b237b
commit 1735d3d55b
4 changed files with 43 additions and 4 deletions
@@ -1742,3 +1742,7 @@ func (p *prog) forceReloadPFMainRuleset() {

 	mainLog.Load().Info().Msg("DNS intercept: force reload — pf ruleset and anchor reloaded successfully")
 }
+
+// osHealthcheckSuppressed always returns false on darwin: suppression is
+// triggered by WFP loopback protect, which is Windows-only.
+func (p *prog) osHealthcheckSuppressed() bool { return false }
@@ -37,3 +37,7 @@ func (p *prog) scheduleDelayedRechecks() {}

 // pfInterceptMonitor is a no-op on unsupported platforms.
 func (p *prog) pfInterceptMonitor() {}
+
+// osHealthcheckSuppressed always returns false on non-Windows platforms:
+// suppression is triggered by WFP loopback protect, which is Windows-only.
+func (p *prog) osHealthcheckSuppressed() bool { return false }
@@ -1260,6 +1260,26 @@ func (p *prog) activateLoopbackWFPProtect(state *wfpState) error {
 	return nil
 }

+// osHealthcheckSuppressed reports whether upstream.os healthcheck failures
+// should be suppressed. It returns true only when dnsIntercept is set,
+// p.dnsInterceptState holds a non-nil *wfpState, and that state's
+// loopbackProtectActive flag is true. Loopback protect is only activated when
+// an external WFP block filter (e.g. OpenVPN's block-outside-dns) is
+// interfering with DNS, which is the same condition that makes the OS resolver
+// healthcheck fail every 2s with i/o timeout (the log spam in issue #526).
+func (p *prog) osHealthcheckSuppressed() bool {
+	if !dnsIntercept || p.dnsInterceptState == nil {
+		return false
+	}
+	state, ok := p.dnsInterceptState.(*wfpState)
+	if !ok || state == nil { // state == nil catches a typed-nil *wfpState stored in the non-nil interface value
+		return false
+	}
+	state.mu.Lock() // NOTE(review): presumably callers must not already hold state.mu — confirm
+	defer state.mu.Unlock()
+	return state.loopbackProtectActive
+}
+
 // deactivateLoopbackWFPProtectLocked is the lock-free inner implementation.
 // Caller must hold state.mu.
 func (p *prog) deactivateLoopbackWFPProtectLocked(state *wfpState) {
@@ -1809,6 +1809,8 @@ func interfaceIPsEqual(a, b []netip.Prefix) bool {
 	return true
 }

+var errOsHealthcheckSuppressed = errors.New("upstream os health check suppressed") // returned by checkUpstreamOnce for expected upstream.os failures; waitForUpstreamRecovery treats it like a nil error
+
 // checkUpstreamOnce sends a test query to the specified upstream.
 // Returns nil if the upstream responds successfully.
 func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) error {
@@ -1838,11 +1840,19 @@ func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) erro
 	duration := time.Since(start)

 	if err != nil {
+		// Demote upstream.os check failures to debug while WFP loopback
+		// protect is active: an external WFP block filter is interfering
+		// with plain DNS so repeated failures here are expected. Other
+		// upstreams keep error level so real outages stay visible.
+		if upstream == upstreamOS && p.osHealthcheckSuppressed() {
+			p.Debug().Err(err).Msgf("Upstream %s check failed after %v (WFP loopback protect active)", upstream, duration)
+			return errOsHealthcheckSuppressed
+		}
 		p.Error().Err(err).Msgf("Upstream %s check failed after %v", upstream, duration)
-	} else {
-		p.Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration)
+		return err
 	}
-	return err
+	p.Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration)
+	return nil
 }

 // handleRecovery orchestrates the recovery process by coordinating multiple smaller methods.
@@ -2121,7 +2131,8 @@ func (p *prog) waitForUpstreamRecovery(ctx context.Context, upstreams map[string
 				default:
 					attempts++
 					// checkUpstreamOnce will reset any failure counters on success.
-					if err := p.checkUpstreamOnce(name, uc); err == nil {
+					err := p.checkUpstreamOnce(name, uc)
+					if err == nil || errors.Is(err, errOsHealthcheckSuppressed) {
 						p.Debug().Msgf("Upstream %s recovered successfully", name)
 						select {
 						case recoveredCh <- name: