mirror of
https://github.com/Control-D-Inc/ctrld.git
synced 2026-02-03 22:18:39 +00:00
cmd/cli: always rebootstrap when checking upstream
Otherwise, network changes may not be seen on some platforms, causing ctrld to fail to recover and to fail all requests. While at it, also do the DNS loop check in a separate goroutine, to prevent it from blocking ctrld from notifying others that it has "started". The issue was seen when ctrld is configured as a direct listener: requests flood in before ctrld has started, causing the health check process to fail.
This commit is contained in:
committed by
Cuong Manh Le
parent
58a00ea24a
commit
d88cf52b4e
@@ -56,7 +56,10 @@ func (p *prog) checkDnsLoop() {
|
||||
mainLog.Load().Debug().Msg("start checking DNS loop")
|
||||
upstream := make(map[string]*ctrld.UpstreamConfig)
|
||||
p.loopMu.Lock()
|
||||
for _, uc := range p.cfg.Upstream {
|
||||
for n, uc := range p.cfg.Upstream {
|
||||
if p.um.isDown("upstream." + n) {
|
||||
continue
|
||||
}
|
||||
uid := uc.UID()
|
||||
p.loop[uid] = false
|
||||
upstream[uid] = uc
|
||||
|
||||
@@ -282,14 +282,14 @@ func (p *prog) run(reload bool, reloadCh chan struct{}) {
|
||||
}
|
||||
}
|
||||
|
||||
// Check for possible DNS loop.
|
||||
p.checkDnsLoop()
|
||||
close(p.onStartedDone)
|
||||
|
||||
// Start check DNS loop ticker.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
// Check for possible DNS loop.
|
||||
p.checkDnsLoop()
|
||||
// Start check DNS loop ticker.
|
||||
p.checkDnsLoopTicker(ctx)
|
||||
}()
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/miekg/dns"
|
||||
"tailscale.com/logtail/backoff"
|
||||
|
||||
"github.com/Control-D-Inc/ctrld"
|
||||
)
|
||||
@@ -15,8 +14,8 @@ import (
|
||||
const (
|
||||
// maxFailureRequest is the maximum failed queries allowed before an upstream is marked as down.
|
||||
maxFailureRequest = 100
|
||||
// checkUpstreamMaxBackoff is the max backoff time when checking upstream status.
|
||||
checkUpstreamMaxBackoff = 2 * time.Minute
|
||||
// checkUpstreamBackoffSleep is the time interval between each upstream checks.
|
||||
checkUpstreamBackoffSleep = 2 * time.Second
|
||||
)
|
||||
|
||||
// upstreamMonitor performs monitoring upstreams health.
|
||||
@@ -76,7 +75,6 @@ func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConf
|
||||
um.checking[upstream] = true
|
||||
um.mu.Unlock()
|
||||
|
||||
bo := backoff.NewBackoff("checkUpstream", logf, checkUpstreamMaxBackoff)
|
||||
resolver, err := ctrld.NewResolver(uc)
|
||||
if err != nil {
|
||||
mainLog.Load().Warn().Err(err).Msg("could not check upstream")
|
||||
@@ -84,15 +82,20 @@ func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConf
|
||||
}
|
||||
msg := new(dns.Msg)
|
||||
msg.SetQuestion(".", dns.TypeNS)
|
||||
ctx := context.Background()
|
||||
|
||||
for {
|
||||
check := func() error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||
defer cancel()
|
||||
uc.ReBootstrap()
|
||||
_, err := resolver.Resolve(ctx, msg)
|
||||
if err == nil {
|
||||
return err
|
||||
}
|
||||
for {
|
||||
if err := check(); err == nil {
|
||||
mainLog.Load().Debug().Msgf("upstream %q is online", uc.Endpoint)
|
||||
um.reset(upstream)
|
||||
return
|
||||
}
|
||||
bo.BackOff(ctx, err)
|
||||
time.Sleep(checkUpstreamBackoffSleep)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user