diff --git a/cmd/cli/dns_proxy.go b/cmd/cli/dns_proxy.go index 31e8aa8..d2065ef 100644 --- a/cmd/cli/dns_proxy.go +++ b/cmd/cli/dns_proxy.go @@ -1429,6 +1429,9 @@ func (p *prog) handleRecovery(reason RecoveryReason) { } mainLog.Load().Info().Msgf("Upstream %q recovered; re-applying DNS settings", recovered) + // reset the upstream failure count and down state + p.um.reset(recovered) + // For network changes we also reinitialize the OS resolver. if reason == RecoveryReasonNetworkChange { ns := ctrld.InitializeOsResolver(true) diff --git a/cmd/cli/upstream_monitor.go b/cmd/cli/upstream_monitor.go index 7489091..507a06f 100644 --- a/cmd/cli/upstream_monitor.go +++ b/cmd/cli/upstream_monitor.go @@ -1,12 +1,9 @@ package cli import ( - "context" "sync" "time" - "github.com/miekg/dns" - "github.com/Control-D-Inc/ctrld" ) @@ -80,11 +77,10 @@ func (um *upstreamMonitor) isDown(upstream string) bool { // reset marks an upstream as up and set failed queries counter to zero. func (um *upstreamMonitor) reset(upstream string) { um.mu.Lock() - defer um.mu.Unlock() - um.failureReq[upstream] = 0 um.down[upstream] = false um.recovered[upstream] = true + um.mu.Unlock() go func() { // debounce the recovery to avoid incrementing failure counts already in flight time.Sleep(1 * time.Second) @@ -94,58 +90,6 @@ func (um *upstreamMonitor) reset(upstream string) { }() } -// checkUpstream checks the given upstream status, periodically sending query to upstream -// until successfully. An upstream status/counter will be reset once it becomes reachable. -func (p *prog) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) { - p.um.mu.Lock() - isChecking := p.um.checking[upstream] - if isChecking { - p.um.mu.Unlock() - return - } - p.um.checking[upstream] = true - p.um.mu.Unlock() - defer func() { - p.um.mu.Lock() - p.um.checking[upstream] = false - p.um.mu.Unlock() - }() - - resolver, err := ctrld.NewResolver(uc) - if err != nil { - mainLog.Load().Warn().Err(err).Msg("could not check upstream") - return - } - msg := new(dns.Msg) - msg.SetQuestion(".", dns.TypeNS) - timeout := 1000 * time.Millisecond - if uc.Timeout > 0 { - timeout = time.Duration(uc.Timeout) * time.Millisecond - } - check := func() error { - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - uc.ReBootstrap() - _, err := resolver.Resolve(ctx, msg) - return err - } - endpoint := uc.Endpoint - if endpoint == "" { - endpoint = uc.Name - } - mainLog.Load().Warn().Msgf("upstream %q is offline", endpoint) - for { - if err := check(); err == nil { - mainLog.Load().Warn().Msgf("upstream %q is online", endpoint) - p.um.reset(upstream) - return - } else { - mainLog.Load().Debug().Msgf("checked upstream %q failed: %v", endpoint, err) - } - time.Sleep(checkUpstreamBackoffSleep) - } -} - // countHealthy returns the number of upstreams in the provided map that are considered healthy. func (um *upstreamMonitor) countHealthy(upstreams []string) int { var count int