cmd/cli: skip upstream.os healthcheck when WFP loopback protect enabled

When WFP loopback protect is active, the upstream.os healthcheck always
fails because an external WFP block filter interferes with plain DNS.
This commit demotes those expected failures to debug level and returns
errOsHealthcheckSuppressed, which the recovery loop handles the same
way as a successful check, eliminating the log spam described in #526.
This commit is contained in:
Codescribe
2026-05-06 04:47:34 -04:00
committed by Cuong Manh Le
parent 81aa6b237b
commit 1735d3d55b
4 changed files with 43 additions and 4 deletions
+15 -4
View File
@@ -1809,6 +1809,8 @@ func interfaceIPsEqual(a, b []netip.Prefix) bool {
return true
}
var errOsHealthcheckSuppressed = errors.New("upstream os health check suppressed")
// checkUpstreamOnce sends a test query to the specified upstream.
// Returns nil if the upstream responds successfully.
func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) error {
@@ -1838,11 +1840,19 @@ func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) erro
duration := time.Since(start)
if err != nil {
// Demote upstream.os check failures to debug while WFP loopback
// protect is active: an external WFP block filter is interfering
// with plain DNS so repeated failures here are expected. Other
// upstreams keep error level so real outages stay visible.
if upstream == upstreamOS && p.osHealthcheckSuppressed() {
p.Debug().Err(err).Msgf("Upstream %s check failed after %v (WFP loopback protect active)", upstream, duration)
return errOsHealthcheckSuppressed
}
p.Error().Err(err).Msgf("Upstream %s check failed after %v", upstream, duration)
} else {
p.Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration)
return err
}
return err
p.Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration)
return nil
}
// handleRecovery orchestrates the recovery process by coordinating multiple smaller methods.
@@ -2121,7 +2131,8 @@ func (p *prog) waitForUpstreamRecovery(ctx context.Context, upstreams map[string
default:
attempts++
// checkUpstreamOnce will reset any failure counters on success.
if err := p.checkUpstreamOnce(name, uc); err == nil {
err := p.checkUpstreamOnce(name, uc)
if err == nil || errors.Is(err, errOsHealthcheckSuppressed) {
p.Debug().Msgf("Upstream %s recovered successfully", name)
select {
case recoveredCh <- name: