all: leak queries to OS resolver instead of returning SERVFAIL

This works in more general cases than just captive portal networks,
which ctrld has supported recently.

Users who do not want this leaking behavior can turn it off via a
config option.
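
For example (a sketch; `leak_on_upstream_failure` is the new
ServiceConfig key, and the [service] table is assumed to be where the
other service options live):

    [service]
    leak_on_upstream_failure = false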
Cuong Manh Le
2024-09-23 18:27:14 +07:00
committed by Cuong Manh Le
parent cfe1209d61
commit 3e388c2857
6 changed files with 69 additions and 53 deletions

@@ -6,7 +6,6 @@ import (
"encoding/hex"
"errors"
"fmt"
"io"
"net"
"net/netip"
"runtime"
@@ -17,7 +16,6 @@ import (
"github.com/miekg/dns"
"golang.org/x/sync/errgroup"
"tailscale.com/net/captivedetection"
"tailscale.com/net/netaddr"
"tailscale.com/net/netmon"
"tailscale.com/net/tsaddr"
@@ -412,6 +410,16 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
 	upstreams := req.ufr.upstreams
 	serveStaleCache := p.cache != nil && p.cfg.Service.CacheServeStale
 	upstreamConfigs := p.upstreamConfigsFromUpstreamNumbers(upstreams)
+	// If ctrld is going to leak query to OS resolver, check remote upstream in background,
+	// so ctrld could be back to normal operation as long as the network is back online.
+	if len(upstreamConfigs) > 0 && p.leakingQuery.Load() {
+		for n, uc := range upstreamConfigs {
+			go p.checkUpstream(upstreams[n], uc)
+		}
+		upstreamConfigs = nil
+	}
 	if len(upstreamConfigs) == 0 {
 		upstreamConfigs = []*ctrld.UpstreamConfig{osUpstreamConfig}
 		upstreams = []string{upstreamOS}
@@ -501,17 +509,9 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
 		if isNetworkErr {
 			p.um.increaseFailureCount(upstreams[n])
 			if p.um.isDown(upstreams[n]) {
-				go p.um.checkUpstream(upstreams[n], upstreamConfig)
+				go p.checkUpstream(upstreams[n], upstreamConfig)
 			}
 		}
-		if cdUID != "" && (isNetworkErr || err == io.EOF) {
-			p.captivePortalMu.Lock()
-			if !p.captivePortalCheckWasRun {
-				p.captivePortalCheckWasRun = true
-				go p.performCaptivePortalDetection()
-			}
-			p.captivePortalMu.Unlock()
-		}
 		// For timeout error (i.e: context deadline exceed), force re-bootstrapping.
 		var e net.Error
 		if errors.As(err, &e) && e.Timeout() {
@@ -580,6 +580,14 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
 		return res
 	}
 	ctrld.Log(ctx, mainLog.Load().Error(), "all %v endpoints failed", upstreams)
+	if cdUID != "" && p.leakOnUpstreamFailure() {
+		p.leakingQueryMu.Lock()
+		if !p.leakingQueryWasRun {
+			p.leakingQueryWasRun = true
+			go p.performLeakingQuery()
+		}
+		p.leakingQueryMu.Unlock()
+	}
 	answer := new(dns.Msg)
 	answer.SetRcode(req.msg, dns.RcodeServerFailure)
 	res.answer = answer
@@ -597,9 +605,6 @@ func (p *prog) upstreamsAndUpstreamConfigForLanAndPtr(upstreams []string, upstre
 }
 func (p *prog) upstreamConfigsFromUpstreamNumbers(upstreams []string) []*ctrld.UpstreamConfig {
-	if p.captivePortalDetected.Load() {
-		return nil // always use OS resolver if behind captive portal.
-	}
 	upstreamConfigs := make([]*ctrld.UpstreamConfig, 0, len(upstreams))
 	for _, upstream := range upstreams {
 		upstreamNum := strings.TrimPrefix(upstream, upstreamPrefix)
@@ -903,31 +908,16 @@ func (p *prog) selfUninstallCoolOfPeriod() {
 	p.selfUninstallMu.Unlock()
 }
-// performCaptivePortalDetection check if ctrld is running behind a captive portal.
-func (p *prog) performCaptivePortalDetection() {
-	mainLog.Load().Warn().Msg("Performing captive portal detection")
-	d := captivedetection.NewDetector(logf)
-	found := true
-	var resetDnsOnce sync.Once
-	for found {
-		time.Sleep(2 * time.Second)
-		found = d.Detect(context.Background(), netmon.NewStatic(), nil, 0)
-		if found {
-			resetDnsOnce.Do(func() {
-				mainLog.Load().Warn().Msg("found captive portal, leaking query to OS resolver")
-				// Store the result once here, so changes made below won't be reverted by DNS watchers.
-				p.captivePortalDetected.Store(found)
-				p.resetDNS()
-			})
-		}
-		p.captivePortalDetected.Store(found)
-	}
-	p.captivePortalMu.Lock()
-	p.captivePortalCheckWasRun = false
-	p.captivePortalMu.Unlock()
+// performLeakingQuery performs necessary works to leak queries to OS resolver.
+func (p *prog) performLeakingQuery() {
+	mainLog.Load().Warn().Msg("leaking query to OS resolver")
+	// Signal dns watchers to stop, so changes made below won't be reverted.
+	p.leakingQuery.Store(true)
+	p.resetDNS()
+	ns := ctrld.InitializeOsResolver()
+	mainLog.Load().Debug().Msgf("re-initialized OS resolver with nameservers: %v", ns)
 	p.dnsWg.Wait()
 	p.setDNS()
 	mainLog.Load().Warn().Msg("captive portal login finished, stop leaking query")
 }
 // forceFetchingAPI sends signal to force syncing API config if run in cd mode,

@@ -107,9 +107,9 @@ type prog struct {
 	loopMu sync.Mutex
 	loop   map[string]bool
-	captivePortalMu          sync.Mutex
-	captivePortalCheckWasRun bool
-	captivePortalDetected    atomic.Bool
+	leakingQueryMu     sync.Mutex
+	leakingQueryWasRun bool
+	leakingQuery       atomic.Bool
 	started       chan struct{}
 	onStartedDone chan struct{}
@@ -685,7 +685,7 @@ func (p *prog) dnsWatchdog(iface *net.Interface, nameservers []string, allIfaces
 			mainLog.Load().Debug().Msg("stop dns watchdog")
 			return
 		case <-ticker.C:
-			if p.captivePortalDetected.Load() {
+			if p.leakingQuery.Load() {
 				return
 			}
 			if dnsChanged(iface, ns) {
@@ -742,6 +742,18 @@ func (p *prog) resetDNS() {
 	}
 }
+// leakOnUpstreamFailure reports whether ctrld should leak query to OS resolver when failed to connect all upstreams.
+func (p *prog) leakOnUpstreamFailure() bool {
+	if ptr := p.cfg.Service.LeakOnUpstreamFailure; ptr != nil {
+		return *ptr
+	}
+	// Default is false on routers, since this leaking is only useful for devices that move between networks.
+	if router.Name() != "" {
+		return false
+	}
+	return true
+}
 func randomLocalIP() string {
 	n := rand.Intn(254-2) + 2
 	return fmt.Sprintf("127.0.0.%d", n)

@@ -40,7 +40,7 @@ func (p *prog) watchResolvConf(iface *net.Interface, ns []netip.Addr, setDnsFn f
 			mainLog.Load().Debug().Msgf("stopping watcher for %s", resolvConfPath)
 			return
 		case event, ok := <-watcher.Events:
-			if p.captivePortalDetected.Load() {
+			if p.leakingQuery.Load() {
 				return
 			}
 			if !ok {

@@ -71,19 +71,19 @@ func (um *upstreamMonitor) reset(upstream string) {
 // checkUpstream checks the given upstream status, periodically sending query to upstream
 // until successfully. An upstream status/counter will be reset once it becomes reachable.
-func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) {
-	um.mu.Lock()
-	isChecking := um.checking[upstream]
+func (p *prog) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) {
+	p.um.mu.Lock()
+	isChecking := p.um.checking[upstream]
 	if isChecking {
-		um.mu.Unlock()
+		p.um.mu.Unlock()
 		return
 	}
-	um.checking[upstream] = true
-	um.mu.Unlock()
+	p.um.checking[upstream] = true
+	p.um.mu.Unlock()
 	defer func() {
-		um.mu.Lock()
-		um.checking[upstream] = false
-		um.mu.Unlock()
+		p.um.mu.Lock()
+		p.um.checking[upstream] = false
+		p.um.mu.Unlock()
 	}()
 	resolver, err := ctrld.NewResolver(uc)
@@ -104,7 +104,13 @@ func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConf
 	for {
 		if err := check(); err == nil {
 			mainLog.Load().Debug().Msgf("upstream %q is online", uc.Endpoint)
-			um.reset(upstream)
+			p.um.reset(upstream)
+			if p.leakingQuery.CompareAndSwap(true, false) {
+				p.leakingQueryMu.Lock()
+				p.leakingQueryWasRun = false
+				p.leakingQueryMu.Unlock()
+				mainLog.Load().Warn().Msg("stop leaking query")
+			}
 			return
 		}
 		time.Sleep(checkUpstreamBackoffSleep)

@@ -218,6 +218,7 @@ type ServiceConfig struct {
 	DnsWatchdogInvterval *time.Duration `mapstructure:"dns_watchdog_interval" toml:"dns_watchdog_interval,omitempty"`
 	RefetchTime          *int           `mapstructure:"refetch_time" toml:"refetch_time,omitempty"`
 	ForceRefetchWaitTime *int           `mapstructure:"force_refetch_wait_time" toml:"force_refetch_wait_time,omitempty"`
+	LeakOnUpstreamFailure *bool         `mapstructure:"leak_on_upstream_failure" toml:"leak_on_upstream_failure,omitempty"`
 	Daemon     bool `mapstructure:"-" toml:"-"`
 	AllocateIP bool `mapstructure:"-" toml:"-"`
 }

@@ -281,6 +281,13 @@ The value must be a positive number, any invalid value will be ignored and defau
 - Required: no
 - Default: 3600
+### leak_on_upstream_failure
+Once ctrld is "offline", meaning it could not connect to any upstream, subsequent queries will be leaked to the OS resolver.
+- Type: boolean
+- Required: no
+- Default: true on Windows, macOS, and non-router Linux; false on routers.
 ## Upstream
 The `[upstream]` section specifies the DNS upstream servers that `ctrld` will forward DNS requests to.
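
For illustration, a minimal sketch of the new option in a ctrld TOML config (assuming it sits under the `[service]` table alongside the other service settings):

```toml
[service]
# Opt out of leaking: keep answering SERVFAIL when every upstream is unreachable.
leak_on_upstream_failure = false
```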