Reset the upstream failure count and down state after recovery has finished

2026-02-03 22:18:39 +00:00 · 2025-02-07 16:03:36 -05:00
parent d37d0e942c
commit 60e65a37a6
2 changed files with 4 additions and 57 deletions
--- a/cmd/cli/dns_proxy.go
+++ b/cmd/cli/dns_proxy.go
@@ -1429,6 +1429,9 @@ func (p *prog) handleRecovery(reason RecoveryReason) {
 	}
 	mainLog.Load().Info().Msgf("Upstream %q recovered; re-applying DNS settings", recovered)

+	// reset the upstream failure count and down state
+	p.um.reset(recovered)
+
 	// For network changes we also reinitialize the OS resolver.
 	if reason == RecoveryReasonNetworkChange {
 		ns := ctrld.InitializeOsResolver(true)
--- a/cmd/cli/upstream_monitor.go
+++ b/cmd/cli/upstream_monitor.go
@@ -1,12 +1,9 @@
 package cli

 import (
-	"context"
 	"sync"
 	"time"

-	"github.com/miekg/dns"
-
 	"github.com/Control-D-Inc/ctrld"
 )

@@ -80,11 +77,10 @@ func (um *upstreamMonitor) isDown(upstream string) bool {
 // reset marks an upstream as up and set failed queries counter to zero.
 func (um *upstreamMonitor) reset(upstream string) {
 	um.mu.Lock()
-	defer um.mu.Unlock()
-
 	um.failureReq[upstream] = 0
 	um.down[upstream] = false
 	um.recovered[upstream] = true
+	um.mu.Unlock()
 	go func() {
 		// debounce the recovery to avoid incrementing failure counts already in flight
 		time.Sleep(1 * time.Second)
@@ -94,58 +90,6 @@ func (um *upstreamMonitor) reset(upstream string) {
 	}()
 }

-// checkUpstream checks the given upstream status, periodically sending query to upstream
-// until successfully. An upstream status/counter will be reset once it becomes reachable.
-func (p *prog) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) {
-	p.um.mu.Lock()
-	isChecking := p.um.checking[upstream]
-	if isChecking {
-		p.um.mu.Unlock()
-		return
-	}
-	p.um.checking[upstream] = true
-	p.um.mu.Unlock()
-	defer func() {
-		p.um.mu.Lock()
-		p.um.checking[upstream] = false
-		p.um.mu.Unlock()
-	}()
-
-	resolver, err := ctrld.NewResolver(uc)
-	if err != nil {
-		mainLog.Load().Warn().Err(err).Msg("could not check upstream")
-		return
-	}
-	msg := new(dns.Msg)
-	msg.SetQuestion(".", dns.TypeNS)
-	timeout := 1000 * time.Millisecond
-	if uc.Timeout > 0 {
-		timeout = time.Duration(uc.Timeout) * time.Millisecond
-	}
-	check := func() error {
-		ctx, cancel := context.WithTimeout(context.Background(), timeout)
-		defer cancel()
-		uc.ReBootstrap()
-		_, err := resolver.Resolve(ctx, msg)
-		return err
-	}
-	endpoint := uc.Endpoint
-	if endpoint == "" {
-		endpoint = uc.Name
-	}
-	mainLog.Load().Warn().Msgf("upstream %q is offline", endpoint)
-	for {
-		if err := check(); err == nil {
-			mainLog.Load().Warn().Msgf("upstream %q is online", endpoint)
-			p.um.reset(upstream)
-			return
-		} else {
-			mainLog.Load().Debug().Msgf("checked upstream %q failed: %v", endpoint, err)
-		}
-		time.Sleep(checkUpstreamBackoffSleep)
-	}
-}
-
 // countHealthy returns the number of upstreams in the provided map that are considered healthy.
 func (um *upstreamMonitor) countHealthy(upstreams []string) int {
 	var count int