mirror of
https://github.com/Control-D-Inc/ctrld.git
synced 2026-02-03 22:18:39 +00:00
cmd/cli: add upstream monitor
Some users mentioned that when there is an Internet outage, ctrld fails to recover, crashing or locks up the router. When requests start failing, this results in the clients emitting more queries, creating a resource spiral of death that can brick the device entirely. To guard against this case, this commit implement an upstream monitor approach: - Marking upstream as down after 100 consecutive failed queries. - Start a goroutine to check when the upstream is back again. - When upstream is down, answer all queries with SERVFAIL. - The checking process uses backoff retry to reduce high requests rate. - As long as the query succeeded, marking the upstream as alive then start operate normally.
This commit is contained in:
committed by
Cuong Manh Le
parent
bed7435b0c
commit
511c4e696f
@@ -145,7 +145,7 @@ func (p *prog) serveDNS(listenerNum string) error {
|
||||
// processed later, because policy logging want to know whether a network rule
|
||||
// is disregarded in favor of the domain level rule.
|
||||
func (p *prog) upstreamFor(ctx context.Context, defaultUpstreamNum string, lc *ctrld.ListenerConfig, addr net.Addr, domain string) ([]string, bool) {
|
||||
upstreams := []string{"upstream." + defaultUpstreamNum}
|
||||
upstreams := []string{upstreamPrefix + defaultUpstreamNum}
|
||||
matchedPolicy := "no policy"
|
||||
matchedNetwork := "no network"
|
||||
matchedRule := "no rule"
|
||||
@@ -229,7 +229,7 @@ func (p *prog) proxy(ctx context.Context, upstreams []string, failoverRcodes []i
|
||||
upstreamConfigs := p.upstreamConfigsFromUpstreamNumbers(upstreams)
|
||||
if len(upstreamConfigs) == 0 {
|
||||
upstreamConfigs = []*ctrld.UpstreamConfig{osUpstreamConfig}
|
||||
upstreams = []string{"upstream.os"}
|
||||
upstreams = []string{upstreamOS}
|
||||
}
|
||||
// Inverse query should not be cached: https://www.rfc-editor.org/rfc/rfc1035#section-7.4
|
||||
if p.cache != nil && msg.Question[0].Qtype != dns.TypePTR {
|
||||
@@ -273,6 +273,12 @@ func (p *prog) proxy(ctx context.Context, upstreams []string, failoverRcodes []i
|
||||
answer, err := resolve1(n, upstreamConfig, msg)
|
||||
if err != nil {
|
||||
ctrld.Log(ctx, mainLog.Load().Error().Err(err), "failed to resolve query")
|
||||
if errNetworkError(err) {
|
||||
p.um.increaseFailureCount(upstreams[n])
|
||||
if p.um.isDown(upstreams[n]) {
|
||||
go p.um.checkUpstream(upstreams[n], upstreamConfig)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return answer
|
||||
@@ -281,6 +287,10 @@ func (p *prog) proxy(ctx context.Context, upstreams []string, failoverRcodes []i
|
||||
if upstreamConfig == nil {
|
||||
continue
|
||||
}
|
||||
if p.um.isDown(upstreams[n]) {
|
||||
ctrld.Log(ctx, mainLog.Load().Warn(), "%s is down", upstreams[n])
|
||||
continue
|
||||
}
|
||||
answer := resolve(n, upstreamConfig, msg)
|
||||
if answer == nil {
|
||||
if serveStaleCache && staleAnswer != nil {
|
||||
@@ -312,7 +322,7 @@ func (p *prog) proxy(ctx context.Context, upstreams []string, failoverRcodes []i
|
||||
}
|
||||
return answer
|
||||
}
|
||||
ctrld.Log(ctx, mainLog.Load().Error(), "all upstreams failed")
|
||||
ctrld.Log(ctx, mainLog.Load().Error(), "all %v endpoints failed", upstreams)
|
||||
answer := new(dns.Msg)
|
||||
answer.SetRcode(msg, dns.RcodeServerFailure)
|
||||
return answer
|
||||
@@ -321,7 +331,7 @@ func (p *prog) proxy(ctx context.Context, upstreams []string, failoverRcodes []i
|
||||
func (p *prog) upstreamConfigsFromUpstreamNumbers(upstreams []string) []*ctrld.UpstreamConfig {
|
||||
upstreamConfigs := make([]*ctrld.UpstreamConfig, 0, len(upstreams))
|
||||
for _, upstream := range upstreams {
|
||||
upstreamNum := strings.TrimPrefix(upstream, "upstream.")
|
||||
upstreamNum := strings.TrimPrefix(upstream, upstreamPrefix)
|
||||
upstreamConfigs = append(upstreamConfigs, p.cfg.Upstream[upstreamNum])
|
||||
}
|
||||
return upstreamConfigs
|
||||
|
||||
Reference in New Issue
Block a user