all: leaking queries to OS resolver instead of SERVFAIL

This makes it work in the more general case, rather than only on captive portal networks, which ctrld started supporting recently. Users who do not want the leaking behavior can use a config option to turn this feature off.
2026-07-04 01:07:49 +02:00 · 2024-09-23 18:27:14 +07:00
parent cfe1209d61
commit 3e388c2857
6 changed files with 69 additions and 53 deletions
@@ -6,7 +6,6 @@ import (
 	"encoding/hex"
 	"errors"
 	"fmt"
-	"io"
 	"net"
 	"net/netip"
 	"runtime"
@@ -17,7 +16,6 @@ import (

 	"github.com/miekg/dns"
 	"golang.org/x/sync/errgroup"
-	"tailscale.com/net/captivedetection"
 	"tailscale.com/net/netaddr"
 	"tailscale.com/net/netmon"
 	"tailscale.com/net/tsaddr"
@@ -412,6 +410,16 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
 	upstreams := req.ufr.upstreams
 	serveStaleCache := p.cache != nil && p.cfg.Service.CacheServeStale
 	upstreamConfigs := p.upstreamConfigsFromUpstreamNumbers(upstreams)
+
+	// If ctrld is going to leak query to OS resolver, check remote upstream in background,
+	// so ctrld could be back to normal operation as long as the network is back online.
+	if len(upstreamConfigs) > 0 && p.leakingQuery.Load() {
+		for n, uc := range upstreamConfigs {
+			go p.checkUpstream(upstreams[n], uc)
+		}
+		upstreamConfigs = nil
+	}
+
 	if len(upstreamConfigs) == 0 {
 		upstreamConfigs = []*ctrld.UpstreamConfig{osUpstreamConfig}
 		upstreams = []string{upstreamOS}
@@ -501,17 +509,9 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
 			if isNetworkErr {
 				p.um.increaseFailureCount(upstreams[n])
 				if p.um.isDown(upstreams[n]) {
-					go p.um.checkUpstream(upstreams[n], upstreamConfig)
+					go p.checkUpstream(upstreams[n], upstreamConfig)
 				}
 			}
-			if cdUID != "" && (isNetworkErr || err == io.EOF) {
-				p.captivePortalMu.Lock()
-				if !p.captivePortalCheckWasRun {
-					p.captivePortalCheckWasRun = true
-					go p.performCaptivePortalDetection()
-				}
-				p.captivePortalMu.Unlock()
-			}
 			// For timeout error (i.e: context deadline exceed), force re-bootstrapping.
 			var e net.Error
 			if errors.As(err, &e) && e.Timeout() {
@@ -580,6 +580,14 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
 		return res
 	}
 	ctrld.Log(ctx, mainLog.Load().Error(), "all %v endpoints failed", upstreams)
+	if cdUID != "" && p.leakOnUpstreamFailure() {
+		p.leakingQueryMu.Lock()
+		if !p.leakingQueryWasRun {
+			p.leakingQueryWasRun = true
+			go p.performLeakingQuery()
+		}
+		p.leakingQueryMu.Unlock()
+	}
 	answer := new(dns.Msg)
 	answer.SetRcode(req.msg, dns.RcodeServerFailure)
 	res.answer = answer
@@ -597,9 +605,6 @@ func (p *prog) upstreamsAndUpstreamConfigForLanAndPtr(upstreams []string, upstre
 }

 func (p *prog) upstreamConfigsFromUpstreamNumbers(upstreams []string) []*ctrld.UpstreamConfig {
-	if p.captivePortalDetected.Load() {
-		return nil // always use OS resolver if behind captive portal.
-	}
 	upstreamConfigs := make([]*ctrld.UpstreamConfig, 0, len(upstreams))
 	for _, upstream := range upstreams {
 		upstreamNum := strings.TrimPrefix(upstream, upstreamPrefix)
@@ -903,31 +908,16 @@ func (p *prog) selfUninstallCoolOfPeriod() {
 	p.selfUninstallMu.Unlock()
 }

-// performCaptivePortalDetection check if ctrld is running behind a captive portal.
-func (p *prog) performCaptivePortalDetection() {
-	mainLog.Load().Warn().Msg("Performing captive portal detection")
-	d := captivedetection.NewDetector(logf)
-	found := true
-	var resetDnsOnce sync.Once
-	for found {
-		time.Sleep(2 * time.Second)
-		found = d.Detect(context.Background(), netmon.NewStatic(), nil, 0)
-		if found {
-			resetDnsOnce.Do(func() {
-				mainLog.Load().Warn().Msg("found captive portal, leaking query to OS resolver")
-				// Store the result once here, so changes made below won't be reverted by DNS watchers.
-				p.captivePortalDetected.Store(found)
-				p.resetDNS()
-			})
-		}
-		p.captivePortalDetected.Store(found)
-	}
-
-	p.captivePortalMu.Lock()
-	p.captivePortalCheckWasRun = false
-	p.captivePortalMu.Unlock()
+// performLeakingQuery performs necessary works to leak queries to OS resolver.
+func (p *prog) performLeakingQuery() {
+	mainLog.Load().Warn().Msg("leaking query to OS resolver")
+	// Signal dns watchers to stop, so changes made below won't be reverted.
+	p.leakingQuery.Store(true)
+	p.resetDNS()
+	ns := ctrld.InitializeOsResolver()
+	mainLog.Load().Debug().Msgf("re-initialized OS resolver with nameservers: %v", ns)
+	p.dnsWg.Wait()
 	p.setDNS()
-	mainLog.Load().Warn().Msg("captive portal login finished, stop leaking query")
 }

 // forceFetchingAPI sends signal to force syncing API config if run in cd mode,
@@ -107,9 +107,9 @@ type prog struct {
 	loopMu sync.Mutex
 	loop   map[string]bool

-	captivePortalMu          sync.Mutex
-	captivePortalCheckWasRun bool
-	captivePortalDetected    atomic.Bool
+	leakingQueryMu     sync.Mutex
+	leakingQueryWasRun bool
+	leakingQuery       atomic.Bool

 	started       chan struct{}
 	onStartedDone chan struct{}
@@ -685,7 +685,7 @@ func (p *prog) dnsWatchdog(iface *net.Interface, nameservers []string, allIfaces
 			mainLog.Load().Debug().Msg("stop dns watchdog")
 			return
 		case <-ticker.C:
-			if p.captivePortalDetected.Load() {
+			if p.leakingQuery.Load() {
 				return
 			}
 			if dnsChanged(iface, ns) {
@@ -742,6 +742,18 @@ func (p *prog) resetDNS() {
 	}
 }

+// leakOnUpstreamFailure reports whether ctrld should leak query to OS resolver when failed to connect all upstreams.
+func (p *prog) leakOnUpstreamFailure() bool {
+	if ptr := p.cfg.Service.LeakOnUpstreamFailure; ptr != nil {
+		return *ptr
+	}
+	// Default is false on routers, since this leaking is only useful for devices that move between networks.
+	if router.Name() != "" {
+		return false
+	}
+	return true
+}
+
 func randomLocalIP() string {
 	n := rand.Intn(254-2) + 2
 	return fmt.Sprintf("127.0.0.%d", n)
@@ -40,7 +40,7 @@ func (p *prog) watchResolvConf(iface *net.Interface, ns []netip.Addr, setDnsFn f
 			mainLog.Load().Debug().Msgf("stopping watcher for %s", resolvConfPath)
 			return
 		case event, ok := <-watcher.Events:
-			if p.captivePortalDetected.Load() {
+			if p.leakingQuery.Load() {
 				return
 			}
 			if !ok {
@@ -71,19 +71,19 @@ func (um *upstreamMonitor) reset(upstream string) {

 // checkUpstream checks the given upstream status, periodically sending query to upstream
 // until successfully. An upstream status/counter will be reset once it becomes reachable.
-func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) {
-	um.mu.Lock()
-	isChecking := um.checking[upstream]
+func (p *prog) checkUpstream(upstream string, uc *ctrld.UpstreamConfig) {
+	p.um.mu.Lock()
+	isChecking := p.um.checking[upstream]
 	if isChecking {
-		um.mu.Unlock()
+		p.um.mu.Unlock()
 		return
 	}
-	um.checking[upstream] = true
-	um.mu.Unlock()
+	p.um.checking[upstream] = true
+	p.um.mu.Unlock()
 	defer func() {
-		um.mu.Lock()
-		um.checking[upstream] = false
-		um.mu.Unlock()
+		p.um.mu.Lock()
+		p.um.checking[upstream] = false
+		p.um.mu.Unlock()
 	}()

 	resolver, err := ctrld.NewResolver(uc)
@@ -104,7 +104,13 @@ func (um *upstreamMonitor) checkUpstream(upstream string, uc *ctrld.UpstreamConf
 	for {
 		if err := check(); err == nil {
 			mainLog.Load().Debug().Msgf("upstream %q is online", uc.Endpoint)
-			um.reset(upstream)
+			p.um.reset(upstream)
+			if p.leakingQuery.CompareAndSwap(true, false) {
+				p.leakingQueryMu.Lock()
+				p.leakingQueryWasRun = false
+				p.leakingQueryMu.Unlock()
+				mainLog.Load().Warn().Msg("stop leaking query")
+			}
 			return
 		}
 		time.Sleep(checkUpstreamBackoffSleep)