increase OS resolver timeout, fix debug log statements

flush DNS cache, manually hit captive portal on macOS

fix real ip in debug log

treat all upstreams as down upon network change

delay upstream checks when leaking queries on network changes
This commit is contained in:
Alex
2025-02-03 21:19:03 -05:00
committed by Cuong Manh Le
parent 1560455ca3
commit 168eaf538b
2 changed files with 87 additions and 11 deletions

View File

@@ -7,7 +7,9 @@ import (
"errors"
"fmt"
"net"
"net/http"
"net/netip"
"os/exec"
"runtime"
"slices"
"strconv"
@@ -42,7 +44,7 @@ const (
var osUpstreamConfig = &ctrld.UpstreamConfig{
Name: "OS resolver",
Type: ctrld.ResolverTypeOS,
Timeout: 2000,
Timeout: 3000,
}
var privateUpstreamConfig = &ctrld.UpstreamConfig{
@@ -436,10 +438,14 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
leaked := false
if len(upstreamConfigs) > 0 {
p.leakingQueryMu.Lock()
if p.leakingQueryRunning[upstreamMapKey] {
if p.leakingQueryRunning[upstreamMapKey] || p.leakingQueryRunning["all"] {
upstreamConfigs = nil
leaked = true
ctrld.Log(ctx, mainLog.Load().Debug(), "%v is down, leaking query to OS resolver", upstreams)
if p.leakingQueryRunning["all"] {
ctrld.Log(ctx, mainLog.Load().Debug(), "all upstreams marked down for network change, leaking query to OS resolver")
} else {
ctrld.Log(ctx, mainLog.Load().Debug(), "%v is down, leaking query to OS resolver", upstreams)
}
}
p.leakingQueryMu.Unlock()
}
@@ -576,13 +582,13 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
Bool("is_lan_query", isLanOrPtrQuery)
if p.isLoop(upstreamConfig) {
logger.Msg("DNS loop detected")
ctrld.Log(ctx, logger, "DNS loop detected")
continue
}
if p.um.isDown(upstreams[n]) {
logger.
Bool("is_os_resolver", upstreams[n] == upstreamOS).
Msg("Upstream is down")
Bool("is_os_resolver", upstreams[n] == upstreamOS)
ctrld.Log(ctx, logger, "Upstream is down")
continue
}
answer := resolve(n, upstreamConfig, req.msg)
@@ -995,10 +1001,11 @@ func (p *prog) performLeakingQuery(failedUpstreams map[string]*ctrld.UpstreamCon
// we only want to reset DNS when our resolver is broken
// this allows us to find the new OS resolver nameservers
if p.um.isDown(upstreamOS) {
// we skip the "all" upstream lock key to prevent duplicate reinitialization calls
if p.um.isDown(upstreamOS) && upstreamMapKey != "all" {
mainLog.Load().Debug().Msg("OS resolver is down, reinitializing")
p.reinitializeOSResolver()
p.reinitializeOSResolver(false)
}
@@ -1006,6 +1013,15 @@ func (p *prog) performLeakingQuery(failedUpstreams map[string]*ctrld.UpstreamCon
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// If this run was triggered by a network change, delay upstream checks by 1s.
// This is to ensure we actually leak queries to the OS resolver.
// We have observed queries leaking to public upstreams behind some captive portals,
// which can prevent the captive portal popup on macOS from triggering.
// NOTE(review): the network-change path uses the "all" map key, but the guard
// below fires when upstreamMapKey != "all" — confirm the condition is not inverted.
if upstreamMapKey != "all" {
mainLog.Load().Debug().Msg("network change leaking queries, delaying upstream checks by 1s")
time.Sleep(1 * time.Second)
}
upstreamCh := make(chan string, len(failedUpstreams))
for name, uc := range failedUpstreams {
go func(name string, uc *ctrld.UpstreamConfig) {
@@ -1213,7 +1229,7 @@ func resolveInternalDomainTestQuery(ctx context.Context, domain string, m *dns.M
// by removing the ctrld listener from the interface, collecting the network nameservers,
// re-initializing the OS resolver with those nameservers,
// and then applying the listener back to the interface
func (p *prog) reinitializeOSResolver() {
func (p *prog) reinitializeOSResolver(networkChange bool) {
// Cancel any existing operations
p.resetCtxMu.Lock()
if p.resetCancel != nil {
@@ -1232,6 +1248,21 @@ func (p *prog) reinitializeOSResolver() {
p.leakingQueryReset.Store(true)
defer p.leakingQueryReset.Store(false)
defer func() {
// start leaking queries immediately
if networkChange {
// set all upstreams to failed and provide them to performLeakingQuery
failedUpstreams := make(map[string]*ctrld.UpstreamConfig)
for _, upstream := range p.cfg.Upstream {
failedUpstreams[upstream.Name] = upstream
}
go p.performLeakingQuery(failedUpstreams, "all")
}
if err := FlushDNSCache(); err != nil {
mainLog.Load().Warn().Err(err).Msg("failed to flush DNS cache")
}
}()
select {
case <-ctx.Done():
mainLog.Load().Debug().Msg("DNS reset cancelled by new network change")
@@ -1264,6 +1295,51 @@ func (p *prog) reinitializeOSResolver() {
}
}
// triggerCaptiveCheck nudges macOS's captive portal detection by fetching
// Apple's well-known probe URL, mimicking what CaptiveNetworkAssistant does.
// It is a no-op on all other platforms.
func triggerCaptiveCheck() {
	// Bail out before sleeping so non-macOS platforms do not pay the
	// settle delay for nothing.
	if runtime.GOOS != "darwin" {
		return
	}
	// Wait for a short period to ensure DNS reinitialization is complete.
	time.Sleep(2 * time.Second)
	// Trigger a lookup for captive.apple.com.
	// This can be done either via a DNS query or an HTTP GET.
	// Here we use a simple HTTP GET which is what macOS CaptiveNetworkAssistant uses.
	client := &http.Client{
		Timeout: 5 * time.Second,
	}
	resp, err := client.Get("http://captive.apple.com/generate_204")
	if err != nil {
		// Attach the error so failures are diagnosable from the debug log.
		mainLog.Load().Debug().Err(err).Msg("failed to trigger captive portal check")
		return
	}
	// Close immediately via defer so the connection can be reused/cleaned up.
	defer resp.Body.Close()
	mainLog.Load().Debug().Msg("triggered captive portal check by querying captive.apple.com")
}
// FlushDNSCache flushes the DNS cache on macOS.
func FlushDNSCache() error {
// if not Mac OS, return
if runtime.GOOS != "darwin" {
return nil
}
// Flush the DNS cache via mDNSResponder.
// This is typically needed on modern macOS systems.
if err := exec.Command("sudo", "killall", "-HUP", "mDNSResponder").Run(); err != nil {
return fmt.Errorf("failed to flush mDNSResponder: %w", err)
}
// Optionally, flush the directory services cache.
if err := exec.Command("sudo", "dscacheutil", "-flushcache").Run(); err != nil {
return fmt.Errorf("failed to flush dscacheutil: %w", err)
}
return nil
}
// monitorNetworkChanges starts monitoring for network interface changes
func (p *prog) monitorNetworkChanges() error {
mon, err := netmon.New(logger.WithPrefix(mainLog.Load().Printf, "netmon: "))
@@ -1321,7 +1397,7 @@ func (p *prog) monitorNetworkChanges() error {
}
if activeInterfaceExists {
p.reinitializeOSResolver()
p.reinitializeOSResolver(true)
} else {
mainLog.Load().Debug().Msg("No active interfaces found, skipping reinitialization")
}

View File

@@ -237,7 +237,7 @@ func (o *osResolver) Resolve(ctx context.Context, msg *dns.Msg) (*dns.Msg, error
ctx, cancel := context.WithCancel(ctx)
defer cancel()
dnsClient := &dns.Client{Net: "udp", Timeout: 2 * time.Second}
dnsClient := &dns.Client{Net: "udp", Timeout: 3 * time.Second}
ch := make(chan *osResolverResult, numServers)
wg := &sync.WaitGroup{}
wg.Add(numServers)