mirror of
https://github.com/Control-D-Inc/ctrld.git
synced 2026-05-27 12:52:27 +02:00
cmd/cli: skip upstream.os healthcheck when WFP loopback protect enabled
When WFP loopback protect is active, the upstream.os healthcheck always fails because an external WFP block filter interferes with plain DNS. This commit demotes those expected failures to debug level and returns errOsHealthcheckSuppressed so the recovery loop treats them as non-fatal (the upstream is reported as recovered instead of being retried), eliminating the log spam described in #526.
This commit is contained in:
committed by
Cuong Manh Le
parent
81aa6b237b
commit
1735d3d55b
+15
-4
@@ -1809,6 +1809,8 @@ func interfaceIPsEqual(a, b []netip.Prefix) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
var errOsHealthcheckSuppressed = errors.New("upstream os health check suppressed")
|
||||
|
||||
// checkUpstreamOnce sends a test query to the specified upstream.
|
||||
// Returns nil if the upstream responds successfully.
|
||||
func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) error {
|
||||
@@ -1838,11 +1840,19 @@ func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) erro
|
||||
duration := time.Since(start)
|
||||
|
||||
if err != nil {
|
||||
// Demote upstream.os check failures to debug while WFP loopback
|
||||
// protect is active: an external WFP block filter is interfering
|
||||
// with plain DNS so repeated failures here are expected. Other
|
||||
// upstreams keep error level so real outages stay visible.
|
||||
if upstream == upstreamOS && p.osHealthcheckSuppressed() {
|
||||
p.Debug().Err(err).Msgf("Upstream %s check failed after %v (WFP loopback protect active)", upstream, duration)
|
||||
return errOsHealthcheckSuppressed
|
||||
}
|
||||
p.Error().Err(err).Msgf("Upstream %s check failed after %v", upstream, duration)
|
||||
} else {
|
||||
p.Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration)
|
||||
return err
|
||||
}
|
||||
return err
|
||||
p.Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration)
|
||||
return nil
|
||||
}
|
||||
|
||||
// handleRecovery orchestrates the recovery process by coordinating multiple smaller methods.
|
||||
@@ -2121,7 +2131,8 @@ func (p *prog) waitForUpstreamRecovery(ctx context.Context, upstreams map[string
|
||||
default:
|
||||
attempts++
|
||||
// checkUpstreamOnce will reset any failure counters on success.
|
||||
if err := p.checkUpstreamOnce(name, uc); err == nil {
|
||||
err := p.checkUpstreamOnce(name, uc)
|
||||
if err == nil || errors.Is(err, errOsHealthcheckSuppressed) {
|
||||
p.Debug().Msgf("Upstream %s recovered successfully", name)
|
||||
select {
|
||||
case recoveredCh <- name:
|
||||
|
||||
Reference in New Issue
Block a user