all: leak queries to OS resolver instead of returning SERVFAIL

This makes the leaking behavior work in more general cases than just captive
portal networks, which ctrld has supported recently.

Users who do not want this leaking behavior can use a config option to turn
off this feature.
This commit is contained in:
Cuong Manh Le
2024-09-23 18:27:14 +07:00
committed by Cuong Manh Le
parent cfe1209d61
commit 3e388c2857
6 changed files with 69 additions and 53 deletions
+28 -38
View File
@@ -6,7 +6,6 @@ import (
"encoding/hex"
"errors"
"fmt"
"io"
"net"
"net/netip"
"runtime"
@@ -17,7 +16,6 @@ import (
"github.com/miekg/dns"
"golang.org/x/sync/errgroup"
"tailscale.com/net/captivedetection"
"tailscale.com/net/netaddr"
"tailscale.com/net/netmon"
"tailscale.com/net/tsaddr"
@@ -412,6 +410,16 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
upstreams := req.ufr.upstreams
serveStaleCache := p.cache != nil && p.cfg.Service.CacheServeStale
upstreamConfigs := p.upstreamConfigsFromUpstreamNumbers(upstreams)
// If ctrld is going to leak query to OS resolver, check remote upstream in background,
// so ctrld could be back to normal operation as long as the network is back online.
if len(upstreamConfigs) > 0 && p.leakingQuery.Load() {
for n, uc := range upstreamConfigs {
go p.checkUpstream(upstreams[n], uc)
}
upstreamConfigs = nil
}
if len(upstreamConfigs) == 0 {
upstreamConfigs = []*ctrld.UpstreamConfig{osUpstreamConfig}
upstreams = []string{upstreamOS}
@@ -501,17 +509,9 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
if isNetworkErr {
p.um.increaseFailureCount(upstreams[n])
if p.um.isDown(upstreams[n]) {
go p.um.checkUpstream(upstreams[n], upstreamConfig)
go p.checkUpstream(upstreams[n], upstreamConfig)
}
}
if cdUID != "" && (isNetworkErr || err == io.EOF) {
p.captivePortalMu.Lock()
if !p.captivePortalCheckWasRun {
p.captivePortalCheckWasRun = true
go p.performCaptivePortalDetection()
}
p.captivePortalMu.Unlock()
}
// For timeout error (i.e: context deadline exceed), force re-bootstrapping.
var e net.Error
if errors.As(err, &e) && e.Timeout() {
@@ -580,6 +580,14 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
return res
}
ctrld.Log(ctx, mainLog.Load().Error(), "all %v endpoints failed", upstreams)
if cdUID != "" && p.leakOnUpstreamFailure() {
p.leakingQueryMu.Lock()
if !p.leakingQueryWasRun {
p.leakingQueryWasRun = true
go p.performLeakingQuery()
}
p.leakingQueryMu.Unlock()
}
answer := new(dns.Msg)
answer.SetRcode(req.msg, dns.RcodeServerFailure)
res.answer = answer
@@ -597,9 +605,6 @@ func (p *prog) upstreamsAndUpstreamConfigForLanAndPtr(upstreams []string, upstre
}
func (p *prog) upstreamConfigsFromUpstreamNumbers(upstreams []string) []*ctrld.UpstreamConfig {
if p.captivePortalDetected.Load() {
return nil // always use OS resolver if behind captive portal.
}
upstreamConfigs := make([]*ctrld.UpstreamConfig, 0, len(upstreams))
for _, upstream := range upstreams {
upstreamNum := strings.TrimPrefix(upstream, upstreamPrefix)
@@ -903,31 +908,16 @@ func (p *prog) selfUninstallCoolOfPeriod() {
p.selfUninstallMu.Unlock()
}
// performCaptivePortalDetection check if ctrld is running behind a captive portal.
func (p *prog) performCaptivePortalDetection() {
mainLog.Load().Warn().Msg("Performing captive portal detection")
d := captivedetection.NewDetector(logf)
found := true
var resetDnsOnce sync.Once
for found {
time.Sleep(2 * time.Second)
found = d.Detect(context.Background(), netmon.NewStatic(), nil, 0)
if found {
resetDnsOnce.Do(func() {
mainLog.Load().Warn().Msg("found captive portal, leaking query to OS resolver")
// Store the result once here, so changes made below won't be reverted by DNS watchers.
p.captivePortalDetected.Store(found)
p.resetDNS()
})
}
p.captivePortalDetected.Store(found)
}
p.captivePortalMu.Lock()
p.captivePortalCheckWasRun = false
p.captivePortalMu.Unlock()
// performLeakingQuery performs necessary works to leak queries to OS resolver.
func (p *prog) performLeakingQuery() {
mainLog.Load().Warn().Msg("leaking query to OS resolver")
// Signal dns watchers to stop, so changes made below won't be reverted.
p.leakingQuery.Store(true)
p.resetDNS()
ns := ctrld.InitializeOsResolver()
mainLog.Load().Debug().Msgf("re-initialized OS resolver with nameservers: %v", ns)
p.dnsWg.Wait()
p.setDNS()
mainLog.Load().Warn().Msg("captive portal login finished, stop leaking query")
}
// forceFetchingAPI sends signal to force syncing API config if run in cd mode,
+16 -4
View File
@@ -107,9 +107,9 @@ type prog struct {
loopMu sync.Mutex
loop map[string]bool
captivePortalMu sync.Mutex
captivePortalCheckWasRun bool
captivePortalDetected atomic.Bool
leakingQueryMu sync.Mutex
leakingQueryWasRun bool
leakingQuery atomic.Bool
started chan struct{}
onStartedDone chan struct{}
@@ -685,7 +685,7 @@ func (p *prog) dnsWatchdog(iface *net.Interface, nameservers []string, allIfaces
mainLog.Load().Debug().Msg("stop dns watchdog")
return
case <-ticker.C:
if p.captivePortalDetected.Load() {
if p.leakingQuery.Load() {
return
}
if dnsChanged(iface, ns) {
@@ -742,6 +742,18 @@ func (p *prog) resetDNS() {
}
}
// leakOnUpstreamFailure reports whether ctrld should leak queries to the OS
// resolver when it failed to connect to all upstreams.
//
// An explicit Service.LeakOnUpstreamFailure config value always wins. When it
// is unset, the result depends on whether router.Name() is empty.
func (p *prog) leakOnUpstreamFailure() bool {
	if override := p.cfg.Service.LeakOnUpstreamFailure; override != nil {
		return *override
	}
	// Default is false on routers (non-empty router name), since this leaking
	// is only useful for devices that move between networks.
	return router.Name() == ""
}
func randomLocalIP() string {
n := rand.Intn(254-2) + 2
return fmt.Sprintf("127.0.0.%d", n)
+1 -1
View File
@@ -40,7 +40,7 @@ func (p *prog) watchResolvConf(iface *net.Interface, ns []netip.Addr, setDnsFn f
mainLog.Load().Debug().Msgf("stopping watcher for %s", resolvConfPath)
return
case event, ok := <-watcher.Events:
if p.captivePortalDetected.Load() {
if p.leakingQuery.Load() {
return
}
if !ok {
+16 -10
View File
@@ -71,19 +71,19 @@ func (um *upstreamMonitor) reset(upstream string) {
// checkUpstream checks the given upstream status, periodically sending query to upstream
// until successfully. An upstream status/counter will be reset once it becomes reachable.
func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) {
um.mu.Lock()
isChecking := um.checking[upstream]
func (p *prog) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) {
p.um.mu.Lock()
isChecking := p.um.checking[upstream]
if isChecking {
um.mu.Unlock()
p.um.mu.Unlock()
return
}
um.checking[upstream] = true
um.mu.Unlock()
p.um.checking[upstream] = true
p.um.mu.Unlock()
defer func() {
um.mu.Lock()
um.checking[upstream] = false
um.mu.Unlock()
p.um.mu.Lock()
p.um.checking[upstream] = false
p.um.mu.Unlock()
}()
resolver, err := ctrld.NewResolver(uc)
@@ -104,7 +104,13 @@ func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConf
for {
if err := check(); err == nil {
mainLog.Load().Debug().Msgf("upstream %q is online", uc.Endpoint)
um.reset(upstream)
p.um.reset(upstream)
if p.leakingQuery.CompareAndSwap(true, false) {
p.leakingQueryMu.Lock()
p.leakingQueryWasRun = false
p.leakingQueryMu.Unlock()
mainLog.Load().Warn().Msg("stop leaking query")
}
return
}
time.Sleep(checkUpstreamBackoffSleep)