From d88cf52b4e6ed7510613bae02712b71d64a74a90 Mon Sep 17 00:00:00 2001
From: Cuong Manh Le
Date: Tue, 24 Oct 2023 22:38:50 +0700
Subject: [PATCH] cmd/cli: always rebootstrap when checking upstream

Otherwise, network changes may not be seen on some platforms, leaving
ctrld unable to recover and failing all requests.

While at it, also run the DNS loop check in a separate goroutine, so it
does not block ctrld from notifying others that it "started". The issue
was seen when ctrld was configured as a direct listener: requests
flooded in before ctrld had started, causing the health check process
to fail.
---
 cmd/cli/loop.go             |  5 ++++-
 cmd/cli/prog.go             |  6 +++---
 cmd/cli/upstream_monitor.go | 19 +++++++++++--------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/cmd/cli/loop.go b/cmd/cli/loop.go
index 5e6d911..a9d3972 100644
--- a/cmd/cli/loop.go
+++ b/cmd/cli/loop.go
@@ -56,7 +56,10 @@ func (p *prog) checkDnsLoop() {
 	mainLog.Load().Debug().Msg("start checking DNS loop")
 	upstream := make(map[string]*ctrld.UpstreamConfig)
 	p.loopMu.Lock()
-	for _, uc := range p.cfg.Upstream {
+	for n, uc := range p.cfg.Upstream {
+		if p.um.isDown("upstream." + n) {
+			continue
+		}
 		uid := uc.UID()
 		p.loop[uid] = false
 		upstream[uid] = uc
diff --git a/cmd/cli/prog.go b/cmd/cli/prog.go
index a475a77..d304cce 100644
--- a/cmd/cli/prog.go
+++ b/cmd/cli/prog.go
@@ -282,14 +282,14 @@ func (p *prog) run(reload bool, reloadCh chan struct{}) {
 		}
 	}
 
-	// Check for possible DNS loop.
-	p.checkDnsLoop()
 	close(p.onStartedDone)
 
-	// Start check DNS loop ticker.
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
+		// Check for possible DNS loop.
+		p.checkDnsLoop()
+		// Start check DNS loop ticker.
 		p.checkDnsLoopTicker(ctx)
 	}()
 
diff --git a/cmd/cli/upstream_monitor.go b/cmd/cli/upstream_monitor.go
index 4b3ee69..83087a4 100644
--- a/cmd/cli/upstream_monitor.go
+++ b/cmd/cli/upstream_monitor.go
@@ -7,7 +7,6 @@ import (
 	"time"
 
 	"github.com/miekg/dns"
-	"tailscale.com/logtail/backoff"
 
 	"github.com/Control-D-Inc/ctrld"
 )
@@ -15,8 +14,8 @@ import (
 const (
 	// maxFailureRequest is the maximum failed queries allowed before an upstream is marked as down.
 	maxFailureRequest = 100
-	// checkUpstreamMaxBackoff is the max backoff time when checking upstream status.
-	checkUpstreamMaxBackoff = 2 * time.Minute
+	// checkUpstreamBackoffSleep is the time interval between upstream checks.
+	checkUpstreamBackoffSleep = 2 * time.Second
 )
 
 // upstreamMonitor performs monitoring upstreams health.
@@ -76,7 +75,6 @@ func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConf
 	um.checking[upstream] = true
 	um.mu.Unlock()
 
-	bo := backoff.NewBackoff("checkUpstream", logf, checkUpstreamMaxBackoff)
 	resolver, err := ctrld.NewResolver(uc)
 	if err != nil {
 		mainLog.Load().Warn().Err(err).Msg("could not check upstream")
@@ -84,15 +82,20 @@ func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConf
 	}
 	msg := new(dns.Msg)
 	msg.SetQuestion(".", dns.TypeNS)
-	ctx := context.Background()
-	for {
+	check := func() error {
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+		defer cancel()
+		uc.ReBootstrap()
 		_, err := resolver.Resolve(ctx, msg)
-		if err == nil {
+		return err
+	}
+	for {
+		if err := check(); err == nil {
			mainLog.Load().Debug().Msgf("upstream %q is online", uc.Endpoint)
 			um.reset(upstream)
 			return
 		}
-		bo.BackOff(ctx, err)
+		time.Sleep(checkUpstreamBackoffSleep)
 	}
 }
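
For reviewers, the probe loop after this patch boils down to: re-resolve
the bootstrap address before every query, bound each query with a
one-second timeout, and retry on a fixed two-second interval instead of
an ever-growing backoff. Below is a minimal standalone sketch of that
pattern using github.com/miekg/dns directly; reBootstrap and
waitUntilOnline are hypothetical stand-ins, not ctrld's API:

    package main

    import (
        "context"
        "fmt"
        "time"

        "github.com/miekg/dns"
    )

    const probeInterval = 2 * time.Second // mirrors checkUpstreamBackoffSleep

    // reBootstrap is a hypothetical stand-in for ctrld's
    // UpstreamConfig.ReBootstrap, which re-resolves the upstream's
    // bootstrap IP so a network change does not leave the prober
    // pinned to a dead address.
    func reBootstrap() {}

    // waitUntilOnline blocks until the upstream at addr answers a ". NS" query.
    func waitUntilOnline(addr string) {
        client := new(dns.Client)
        msg := new(dns.Msg)
        msg.SetQuestion(".", dns.TypeNS)

        probe := func() error {
            ctx, cancel := context.WithTimeout(context.Background(), time.Second)
            defer cancel()
            reBootstrap() // always refresh bootstrap state before probing
            _, _, err := client.ExchangeContext(ctx, msg, addr)
            return err
        }
        for {
            if err := probe(); err == nil {
                fmt.Printf("upstream %q is online\n", addr)
                return
            }
            time.Sleep(probeInterval)
        }
    }

    func main() { waitUntilOnline("8.8.8.8:53") }

Dropping tailscale.com/logtail/backoff for a fixed sleep is deliberate:
with backoff capped at two minutes, a recovered network could go
unnoticed for up to that long, while a constant two-second retry keeps
recovery latency bounded.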
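
The prog.go change follows from the same goal: close(p.onStartedDone)
must happen before any potentially slow health probing, or
direct-listener clients flood a process that has not yet announced
itself as started. A rough stdlib-only sketch of the ordering, with
hypothetical names (startedDone, checkDNSLoop):

    package main

    import (
        "fmt"
        "sync"
        "time"
    )

    // checkDNSLoop is a stand-in for a health probe that may block for a while.
    func checkDNSLoop() { time.Sleep(2 * time.Second) }

    func main() {
        startedDone := make(chan struct{})
        var wg sync.WaitGroup

        // Signal "started" first, so listeners can begin serving immediately...
        close(startedDone)

        // ...then run the potentially slow check in the background.
        wg.Add(1)
        go func() {
            defer wg.Done()
            checkDNSLoop()
        }()

        <-startedDone // returns at once: the channel is already closed
        fmt.Println("started; DNS loop check runs in background")
        wg.Wait()
    }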
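
Finally, the loop.go hunk keeps down upstreams out of DNS-loop
detection, since probing an unreachable upstream would stall the check.
A small sketch of that filter, assuming a stand-in down-tracker and the
"upstream.<name>" key convention from the hunk:

    package main

    import "fmt"

    // monitor is a stand-in for ctrld's upstream monitor down-tracking.
    type monitor struct{ down map[string]bool }

    func (m *monitor) isDown(name string) bool { return m.down[name] }

    func main() {
        upstreams := map[string]string{"0": "doh-upstream", "1": "udp-upstream"}
        um := &monitor{down: map[string]bool{"upstream.1": true}}

        // Mirror the `for n, uc := range p.cfg.Upstream` filter: only
        // upstreams believed to be up take part in loop detection.
        for n, name := range upstreams {
            if um.isDown("upstream." + n) {
                continue // a down upstream cannot answer the loop probe anyway
            }
            fmt.Println("loop-check candidate:", name)
        }
    }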