cmd/cli: add upstream monitor

Some users mentioned that when there is an Internet outage, ctrld fails
to recover, crashing or locks up the router. When requests start
failing, this results in the clients emitting more queries, creating a
resource spiral of death that can brick the device entirely.

To guard against this case, this commit implement an upstream monitor
approach:

 - Marking upstream as down after 100 consecutive failed queries.
 - Start a goroutine to check when the upstream is back again.
 - When upstream is down, answer all queries with SERVFAIL.
 - The checking process uses backoff retry to reduce high requests rate.
 - As long as the query succeeded, marking the upstream as alive then
   start operate normally.
This commit is contained in:
Cuong Manh Le
2023-09-21 06:06:09 +00:00
committed by Cuong Manh Le
parent bed7435b0c
commit 511c4e696f
4 changed files with 141 additions and 19 deletions

View File

@@ -0,0 +1,98 @@
package cli
import (
"context"
"sync"
"sync/atomic"
"time"
"github.com/miekg/dns"
"tailscale.com/logtail/backoff"
"github.com/Control-D-Inc/ctrld"
)
const (
// maxFailureRequest is the maximum failed queries allowed before an upstream is marked as down.
maxFailureRequest = 100
// checkUpstreamMaxBackoff is the max backoff time when checking upstream status.
checkUpstreamMaxBackoff = 2 * time.Minute
)
// upstreamMonitor performs monitoring upstreams health.
type upstreamMonitor struct {
cfg *ctrld.Config
down map[string]*atomic.Bool
failureReq map[string]*atomic.Uint64
mu sync.Mutex
checking map[string]bool
}
func newUpstreamMonitor(cfg *ctrld.Config) *upstreamMonitor {
um := &upstreamMonitor{
cfg: cfg,
down: make(map[string]*atomic.Bool),
failureReq: make(map[string]*atomic.Uint64),
checking: make(map[string]bool),
}
for n := range cfg.Upstream {
upstream := upstreamPrefix + n
um.down[upstream] = new(atomic.Bool)
um.failureReq[upstream] = new(atomic.Uint64)
}
um.down[upstreamOS] = new(atomic.Bool)
um.failureReq[upstreamOS] = new(atomic.Uint64)
return um
}
// increaseFailureCount increase failed queries count for an upstream by 1.
func (um *upstreamMonitor) increaseFailureCount(upstream string) {
failedCount := um.failureReq[upstream].Add(1)
um.down[upstream].Store(failedCount >= maxFailureRequest)
}
// isDown reports whether the given upstream is being marked as down.
func (um *upstreamMonitor) isDown(upstream string) bool {
return um.down[upstream].Load()
}
// reset marks an upstream as up and set failed queries counter to zero.
func (um *upstreamMonitor) reset(upstream string) {
um.failureReq[upstream].Store(0)
um.down[upstream].Store(false)
}
// checkUpstream checks the given upstream status, periodically sending query to upstream
// until successfully. An upstream status/counter will be reset once it becomes reachable.
func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) {
um.mu.Lock()
isChecking := um.checking[upstream]
if isChecking {
um.mu.Unlock()
return
}
um.checking[upstream] = true
um.mu.Unlock()
bo := backoff.NewBackoff("checkUpstream", logf, checkUpstreamMaxBackoff)
resolver, err := ctrld.NewResolver(uc)
if err != nil {
mainLog.Load().Warn().Err(err).Msg("could not check upstream")
return
}
msg := new(dns.Msg)
msg.SetQuestion(".", dns.TypeNS)
ctx := context.Background()
for {
_, err := resolver.Resolve(ctx, msg)
if err == nil {
mainLog.Load().Debug().Msgf("upstream %q is online", uc.Endpoint)
um.reset(upstream)
return
}
bo.BackOff(ctx, err)
}
}