From 3442331695434d0ef0c5a7a626774b56e9926dd1 Mon Sep 17 00:00:00 2001 From: Codescribe Date: Thu, 5 Mar 2026 04:50:12 -0500 Subject: [PATCH] feat: add macOS pf DNS interception --- cmd/cli/dns_intercept_darwin.go | 1744 +++++++++++++++++ cmd/cli/dns_intercept_darwin_test.go | 127 ++ docs/pf-dns-intercept.md | 298 +++ test-scripts/README.md | 44 + test-scripts/darwin/test-dns-intercept.sh | 556 ++++++ .../darwin/test-pf-group-exemption.sh | 147 ++ test-scripts/darwin/test-recovery-bypass.sh | 301 +++ test-scripts/darwin/validate-pf-rules.sh | 272 +++ test-scripts/macos/diag-lo0-capture.sh | 40 + test-scripts/macos/diag-pf-poll.sh | 62 + test-scripts/macos/diag-windscribe-connect.sh | 183 ++ test-scripts/windows/diag-intercept.ps1 | 131 ++ test-scripts/windows/test-dns-intercept.ps1 | 544 +++++ test-scripts/windows/test-recovery-bypass.ps1 | 289 +++ 14 files changed, 4738 insertions(+) create mode 100644 cmd/cli/dns_intercept_darwin.go create mode 100644 cmd/cli/dns_intercept_darwin_test.go create mode 100644 docs/pf-dns-intercept.md create mode 100644 test-scripts/README.md create mode 100755 test-scripts/darwin/test-dns-intercept.sh create mode 100755 test-scripts/darwin/test-pf-group-exemption.sh create mode 100755 test-scripts/darwin/test-recovery-bypass.sh create mode 100755 test-scripts/darwin/validate-pf-rules.sh create mode 100644 test-scripts/macos/diag-lo0-capture.sh create mode 100644 test-scripts/macos/diag-pf-poll.sh create mode 100644 test-scripts/macos/diag-windscribe-connect.sh create mode 100644 test-scripts/windows/diag-intercept.ps1 create mode 100644 test-scripts/windows/test-dns-intercept.ps1 create mode 100644 test-scripts/windows/test-recovery-bypass.ps1 diff --git a/cmd/cli/dns_intercept_darwin.go b/cmd/cli/dns_intercept_darwin.go new file mode 100644 index 0000000..c5461d3 --- /dev/null +++ b/cmd/cli/dns_intercept_darwin.go @@ -0,0 +1,1744 @@ +//go:build darwin + +package cli + +import ( + "context" + "crypto/sha256" + "fmt" + "net" + "os" + 
"os/exec" + "path/filepath" + "strconv" + "strings" + "sync/atomic" + "syscall" + "time" + + "github.com/Control-D-Inc/ctrld" +) + +const ( + // pfWatchdogInterval is how often the periodic pf watchdog checks + // that our anchor references are still present in the running ruleset. + pfWatchdogInterval = 30 * time.Second + + // pfConsecutiveMissThreshold is the number of consecutive watchdog cycles + // where the anchor was found missing before escalating to ERROR level. + // This indicates something is persistently fighting our pf rules. + pfConsecutiveMissThreshold = 3 + + // pfAnchorRecheckDelay is how long to wait after a network change before + // performing a second pf anchor check. This catches race conditions where + // another program (e.g., Windscribe desktop) clears pf rules slightly + // after our network change handler runs. + pfAnchorRecheckDelay = 2 * time.Second + + // pfAnchorRecheckDelayLong is a second, longer delayed re-check after network + // changes. Some VPNs (e.g., Windscribe) take 3-4s to fully tear down their pf + // rules and DNS settings on disconnect. This catches slower teardowns that the + // 2s re-check misses. + pfAnchorRecheckDelayLong = 4 * time.Second + + // pfVPNInterfacePrefixes lists interface name prefixes that indicate VPN/tunnel + // interfaces on macOS. Used to add interface-specific DNS intercept rules so that + // VPN software with "pass out quick on " rules cannot bypass our intercept. + // Common prefixes: + // ipsec* - IKEv2/IPsec VPNs (Windscribe, macOS built-in) + // utun* - TUN interfaces (WireGuard, Tailscale, OpenVPN, etc.) + // ppp* - PPTP/L2TP VPNs + // tap* - TAP interfaces (OpenVPN in bridge mode) + // tun* - Legacy TUN interfaces + // lo0 is excluded since our rules already handle loopback. + pfVPNInterfacePrefixes = "ipsec,utun,ppp,tap,tun" +) + +const ( + // pfProbeDomain is the suffix used for pf interception probe queries. + // No trailing dot — canonicalName() in the DNS handler strips trailing dots. 
+ pfProbeDomain = "pf-probe.ctrld.test" + + // pfProbeTimeout is how long to wait for a probe query to arrive at ctrld. + pfProbeTimeout = 1 * time.Second + + // pfGroupName is the macOS system group used to scope pf exemption rules. + // Only processes running with this effective GID can bypass the DNS redirect, + // preventing other applications from circumventing ctrld by querying exempted IPs directly. + pfGroupName = "_ctrld" + + // pfAnchorName is the pf anchor name used by ctrld for DNS interception. + // Using reverse-DNS convention to avoid conflicts with other software. + pfAnchorName = "com.controld.ctrld" + + // pfAnchorDir is the directory where pf anchor files are stored on macOS. + pfAnchorDir = "/etc/pf.anchors" + + // pfAnchorFile is the full path to ctrld's pf anchor configuration file. + pfAnchorFile = "/etc/pf.anchors/com.controld.ctrld" +) + +// pfState holds the state of the pf DNS interception on macOS. +type pfState struct { + anchorFile string + anchorName string +} + +// ensureCtrldGroup creates the _ctrld system group if it doesn't exist and returns its GID. +// Uses dscl (macOS Directory Services) to manage the group. This function is idempotent — +// safe to call multiple times across restarts. The group is intentionally never removed +// on shutdown to avoid race conditions during rapid restart cycles. +func ensureCtrldGroup() (int, error) { + // Check if the group already exists. + out, err := exec.Command("dscl", ".", "-read", "/Groups/"+pfGroupName, "PrimaryGroupID").CombinedOutput() + if err == nil { + // Group exists — parse and return its GID. 
+ // Output format: "PrimaryGroupID: 350" + line := strings.TrimSpace(string(out)) + parts := strings.SplitN(line, ":", 2) + if len(parts) == 2 { + gid, err := strconv.Atoi(strings.TrimSpace(parts[1])) + if err != nil { + return 0, fmt.Errorf("failed to parse existing group GID from %q: %w", line, err) + } + mainLog.Load().Debug().Msgf("DNS intercept: group %s already exists with GID %d", pfGroupName, gid) + return gid, nil + } + return 0, fmt.Errorf("unexpected dscl output for existing group: %q", line) + } + + // Group doesn't exist — find an unused GID in the 350-450 range (system group range on macOS, + // above Apple's reserved range but below typical user groups). + listOut, err := exec.Command("dscl", ".", "-list", "/Groups", "PrimaryGroupID").CombinedOutput() + if err != nil { + return 0, fmt.Errorf("failed to list existing groups: %w (output: %s)", err, strings.TrimSpace(string(listOut))) + } + + usedGIDs := make(map[int]bool) + for _, line := range strings.Split(string(listOut), "\n") { + fields := strings.Fields(line) + if len(fields) >= 2 { + if gid, err := strconv.Atoi(fields[len(fields)-1]); err == nil { + usedGIDs[gid] = true + } + } + } + + chosenGID := 0 + for gid := 350; gid <= 450; gid++ { + if !usedGIDs[gid] { + chosenGID = gid + break + } + } + if chosenGID == 0 { + return 0, fmt.Errorf("no unused GID found in range 350-450") + } + + // Create the group record. Handle eDSRecordAlreadyExists gracefully in case of a + // race with another ctrld instance. + createOut, err := exec.Command("dscl", ".", "-create", "/Groups/"+pfGroupName).CombinedOutput() + if err != nil { + outStr := strings.TrimSpace(string(createOut)) + if !strings.Contains(outStr, "eDSRecordAlreadyExists") { + return 0, fmt.Errorf("failed to create group record: %w (output: %s)", err, outStr) + } + } + + // Set the GID. This is idempotent — dscl overwrites the attribute if it already exists. 
+ if out, err := exec.Command("dscl", ".", "-create", "/Groups/"+pfGroupName, "PrimaryGroupID", strconv.Itoa(chosenGID)).CombinedOutput(); err != nil { + return 0, fmt.Errorf("failed to set group GID: %w (output: %s)", err, strings.TrimSpace(string(out))) + } + + if out, err := exec.Command("dscl", ".", "-create", "/Groups/"+pfGroupName, "RealName", "ctrld DNS Intercept Group").CombinedOutput(); err != nil { + return 0, fmt.Errorf("failed to set group RealName: %w (output: %s)", err, strings.TrimSpace(string(out))) + } + + mainLog.Load().Info().Msgf("DNS intercept: created system group %s with GID %d", pfGroupName, chosenGID) + return chosenGID, nil +} + +// setCtrldGroupID sets the process's effective GID to the _ctrld group. +// This must be called before any outbound DNS sockets are created so that +// pf's "group _ctrld" matching applies to ctrld's own DNS queries. +// Only ctrld (running as root with this effective GID) will match the exemption rules, +// preventing other processes from bypassing the DNS redirect. +func setCtrldGroupID(gid int) error { + if err := syscall.Setegid(gid); err != nil { + return fmt.Errorf("syscall.Setegid(%d) failed: %w", gid, err) + } + mainLog.Load().Info().Msgf("DNS intercept: set process effective GID to %d (%s)", gid, pfGroupName) + return nil +} + +// startDNSIntercept activates pf-based DNS interception on macOS. +// It creates a pf anchor that redirects all outbound DNS (port 53) traffic +// to ctrld's local listener at 127.0.0.1:53. This eliminates the race condition +// with VPN software that overwrites interface DNS settings. +// +// The approach: +// 1. Write a pf anchor file with redirect rules for all non-loopback interfaces +// 2. Load the anchor into pf +// 3. Ensure pf is enabled +// +// ctrld's own upstream queries use DoH (port 443), so they are NOT affected +// by the port 53 redirect. If an "os" upstream is configured (which uses port 53), +// we skip the redirect for traffic from the ctrld process itself. 
+func (p *prog) startDNSIntercept() error { + mainLog.Load().Info().Msg("DNS intercept: initializing macOS packet filter (pf) redirect") + + if err := p.validateDNSIntercept(); err != nil { + return err + } + + // Set up _ctrld group for pf exemption scoping. This ensures that only ctrld's + // own DNS queries (matching "group _ctrld" in pf rules) can bypass the redirect. + // Must happen BEFORE loading pf rules so the effective GID is set when sockets are created. + gid, err := ensureCtrldGroup() + if err != nil { + return fmt.Errorf("dns intercept: failed to create %s group: %w", pfGroupName, err) + } + if err := setCtrldGroupID(gid); err != nil { + return fmt.Errorf("dns intercept: failed to set process GID to %s: %w", pfGroupName, err) + } + + // Clean up any stale state from a previous crash. + if _, err := os.Stat(pfAnchorFile); err == nil { + mainLog.Load().Warn().Msg("DNS intercept: found stale pf anchor file from previous run — cleaning up") + exec.Command("pfctl", "-a", pfAnchorName, "-F", "all").CombinedOutput() + os.Remove(pfAnchorFile) + } + + // Pre-discover VPN DNS configurations before building initial rules. + // Without this, there's a startup gap where the initial anchor has no VPN DNS + // exemptions, causing queries to be intercepted and routed to ctrld. Stale pf + // state entries from the gap persist even after vpnDNS.Refresh() adds exemptions. 
+ var initialExemptions []vpnDNSExemption + if !hardIntercept { + initialConfigs := ctrld.DiscoverVPNDNS(context.Background()) + type key struct{ server, iface string } + seen := make(map[key]bool) + for _, config := range initialConfigs { + for _, server := range config.Servers { + k := key{server, config.InterfaceName} + if !seen[k] { + seen[k] = true + initialExemptions = append(initialExemptions, vpnDNSExemption{ + Server: server, + Interface: config.InterfaceName, + }) + } + } + } + if len(initialExemptions) > 0 { + mainLog.Load().Info().Msgf("DNS intercept: pre-discovered %d VPN DNS exemptions for initial rules", len(initialExemptions)) + } + } + + rules := p.buildPFAnchorRules(initialExemptions) + + if err := os.MkdirAll(pfAnchorDir, 0755); err != nil { + return fmt.Errorf("dns intercept: failed to create pf anchor directory %s: %w", pfAnchorDir, err) + } + if err := os.WriteFile(pfAnchorFile, []byte(rules), 0644); err != nil { + return fmt.Errorf("dns intercept: failed to write pf anchor file %s: %w", pfAnchorFile, err) + } + mainLog.Load().Debug().Msgf("DNS intercept: wrote pf anchor file: %s", pfAnchorFile) + + out, err := exec.Command("pfctl", "-a", pfAnchorName, "-f", pfAnchorFile).CombinedOutput() + if err != nil { + os.Remove(pfAnchorFile) + return fmt.Errorf("dns intercept: failed to load pf anchor: %w (output: %s)", err, strings.TrimSpace(string(out))) + } + mainLog.Load().Debug().Msgf("DNS intercept: loaded pf anchor %q from %s", pfAnchorName, pfAnchorFile) + + if err := p.ensurePFAnchorReference(); err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept: could not add anchor references to running pf ruleset — anchor may not be active") + } + + out, err = exec.Command("pfctl", "-e").CombinedOutput() + if err != nil { + outStr := strings.TrimSpace(string(out)) + if !strings.Contains(outStr, "already enabled") { + mainLog.Load().Warn().Msgf("DNS intercept: pfctl -e returned: %s (err: %v) — pf may not be enabled", outStr, err) + } + } + + 
out, err = exec.Command("pfctl", "-a", pfAnchorName, "-sr").CombinedOutput() + if err != nil { + mainLog.Load().Warn().Msgf("DNS intercept: could not verify anchor rules: %v", err) + } else { + ruleCount := strings.Count(strings.TrimSpace(string(out)), "\n") + 1 + mainLog.Load().Info().Msgf("DNS intercept: pf anchor %q active with %d rules", pfAnchorName, ruleCount) + mainLog.Load().Debug().Msgf("DNS intercept: active pf rules:\n%s", strings.TrimSpace(string(out))) + } + + out, err = exec.Command("pfctl", "-a", pfAnchorName, "-sn").CombinedOutput() + if err == nil && len(strings.TrimSpace(string(out))) > 0 { + mainLog.Load().Debug().Msgf("DNS intercept: active pf NAT/redirect rules:\n%s", strings.TrimSpace(string(out))) + } + + // Post-load verification: confirm everything actually took effect. + p.verifyPFState() + + p.dnsInterceptState = &pfState{ + anchorFile: pfAnchorFile, + anchorName: pfAnchorName, + } + + // Store the initial set of tunnel interfaces so we can detect changes later. + p.mu.Lock() + p.lastTunnelIfaces = discoverTunnelInterfaces() + p.mu.Unlock() + + mainLog.Load().Info().Msgf("DNS intercept: pf redirect active — all outbound DNS (port 53) redirected to 127.0.0.1:53 via anchor %q", pfAnchorName) + + // Start the pf watchdog to detect and restore rules if another program + // (e.g., Windscribe desktop, macOS configd) replaces the pf ruleset. + go p.pfWatchdog() + + return nil +} + +// ensurePFAnchorReference ensures the running pf ruleset includes our anchor +// declarations. We dump the RUNNING ruleset via "pfctl -sr" (filter+scrub rules) +// and "pfctl -sn" (NAT/rdr rules), check if our references exist, and if not, +// inject them and reload the combined ruleset via stdin. +// +// pf enforces strict rule ordering: +// +// options → normalization (scrub) → queueing → translation (nat/rdr) → filtering (pass/block/anchor) +// +// "pfctl -sr" returns BOTH scrub-anchor (normalization) AND anchor/pass/block (filter) rules. 
+// "pfctl -sn" returns nat-anchor AND rdr-anchor (translation) rules. +// Both commands emit "No ALTQ support in kernel" warnings on stderr. +// +// We must reassemble in correct order: scrub → nat/rdr → filter. +// +// The anchor reference does not survive a reboot, but ctrld re-adds it on every start. +func (p *prog) ensurePFAnchorReference() error { + rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName) + anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName) + + // Dump running rules. Use CombinedOutput but filter out stderr warnings. + natOut, err := exec.Command("pfctl", "-sn").CombinedOutput() + if err != nil { + return fmt.Errorf("failed to dump running NAT rules: %w (output: %s)", err, strings.TrimSpace(string(natOut))) + } + + filterOut, err := exec.Command("pfctl", "-sr").CombinedOutput() + if err != nil { + return fmt.Errorf("failed to dump running filter rules: %w (output: %s)", err, strings.TrimSpace(string(filterOut))) + } + + // Filter pfctl output into actual pf rules, stripping stderr warnings. + natLines := pfFilterRuleLines(string(natOut)) + filterLines := pfFilterRuleLines(string(filterOut)) + + hasRdrAnchor := pfContainsRule(natLines, rdrAnchorRef) + hasAnchor := pfContainsRule(filterLines, anchorRef) + + if hasRdrAnchor && hasAnchor { + // Verify anchor ordering: our anchor should appear before other anchors + // for reliable DNS interception priority. Log a warning if out of order, + // but don't force a reload (the interface-specific rules in our anchor + // provide a secondary safety net even if ordering is suboptimal). + p.checkAnchorOrdering(filterLines, anchorRef) + mainLog.Load().Debug().Msg("DNS intercept: anchor references already present in running ruleset") + return nil + } + + mainLog.Load().Info().Msg("DNS intercept: injecting anchor references into running pf ruleset") + + // Separate scrub rules from filter rules (pfctl -sr returns both). + // scrub/scrub-anchor = normalization, must come BEFORE translation. 
+ var scrubLines, pureFilterLines []string + for _, line := range filterLines { + if strings.HasPrefix(line, "scrub") { + scrubLines = append(scrubLines, line) + } else { + pureFilterLines = append(pureFilterLines, line) + } + } + + // Inject our references if missing. PREPEND both references to ensure our + // anchor is evaluated BEFORE any other anchors (e.g., Windscribe's + // "windscribe_vpn_traffic"). pf evaluates rules top-to-bottom, so "quick" + // rules in whichever anchor appears first win. By prepending, our DNS + // intercept rules match port 53 traffic before a VPN app's broader + // "pass out quick on all" rules in their anchor. + if !hasRdrAnchor { + natLines = append([]string{rdrAnchorRef}, natLines...) + } + if !hasAnchor { + pureFilterLines = append([]string{anchorRef}, pureFilterLines...) + } + + // Dump and clean pf options. VPN apps (e.g., Windscribe) set "set skip on { lo0 }" + // which disables pf processing on loopback, breaking our route-to + rdr mechanism. + // We strip lo0 and tunnel interfaces from the skip list before reloading. + cleanedOptions, hadLoopbackSkip := pfGetCleanedOptions() + if hadLoopbackSkip { + mainLog.Load().Info().Msg("DNS intercept: will reload pf options without lo0 in skip list") + } + + // Reassemble in pf's required order: options → scrub → translation → filtering. 
+ var combined strings.Builder + if cleanedOptions != "" { + combined.WriteString(cleanedOptions) + } + for _, line := range scrubLines { + combined.WriteString(line + "\n") + } + for _, line := range natLines { + combined.WriteString(line + "\n") + } + for _, line := range pureFilterLines { + combined.WriteString(line + "\n") + } + + cmd := exec.Command("pfctl", "-f", "-") + cmd.Stdin = strings.NewReader(combined.String()) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to load pf ruleset with anchor references: %w (output: %s)", err, strings.TrimSpace(string(out))) + } + + mainLog.Load().Info().Msg("DNS intercept: anchor references active in running pf ruleset") + return nil +} + +// checkAnchorOrdering logs a warning if our anchor reference is not the first +// anchor in the filter ruleset. When another anchor (e.g., Windscribe's +// "windscribe_vpn_traffic") appears before ours, its "quick" rules may match +// DNS traffic first. The interface-specific tunnel rules in our anchor provide +// a secondary defense, but first position is still preferred. +func (p *prog) checkAnchorOrdering(filterLines []string, ourAnchorRef string) { + for _, line := range filterLines { + if strings.HasPrefix(line, "anchor ") { + if strings.Contains(line, ourAnchorRef) { + // Our anchor is first — ideal ordering. + return + } + // Another anchor appears before ours. + mainLog.Load().Warn().Msgf("DNS intercept: anchor ordering suboptimal — %q appears before our anchor %q. "+ + "Interface-specific rules provide fallback protection, but prepending is preferred.", line, pfAnchorName) + return + } + } +} + +// pfGetCleanedOptions dumps the running pf options via "pfctl -sO" and returns +// them with lo0 removed from any "set skip on" directive. VPN apps like Windscribe +// set "set skip on { lo0 }" which tells pf to bypass ALL processing on +// loopback — this breaks our route-to + rdr interception mechanism which depends on +// lo0. 
We strip lo0 (and any known VPN tunnel interfaces) from the skip list so our +// rdr rules on lo0 can fire. Other options (timeouts, limits, etc.) are preserved. +// +// Returns the cleaned options as a string suitable for prepending to a pfctl -f reload, +// and a boolean indicating whether lo0 was found in the skip list (i.e., we needed to fix it). +func pfGetCleanedOptions() (string, bool) { + out, err := exec.Command("pfctl", "-sO").CombinedOutput() + if err != nil { + mainLog.Load().Debug().Err(err).Msg("DNS intercept: could not dump pf options") + return "", false + } + + var cleaned strings.Builder + hadLoopbackSkip := false + + for _, line := range strings.Split(string(out), "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.Contains(line, "ALTQ") { + continue + } + + // Parse "set skip on { lo0 ipsec0 }" or "set skip on lo0" + if strings.HasPrefix(line, "set skip on") { + // Extract interface list from the skip directive. + skipPart := strings.TrimPrefix(line, "set skip on") + skipPart = strings.TrimSpace(skipPart) + skipPart = strings.Trim(skipPart, "{}") + skipPart = strings.TrimSpace(skipPart) + + ifaces := strings.Fields(skipPart) + var kept []string + for _, iface := range ifaces { + if iface == "lo0" { + hadLoopbackSkip = true + continue // Remove lo0 — we need pf to process lo0 for our rdr rules. + } + // Also remove VPN tunnel interfaces — we have explicit intercept + // rules for them in our anchor, so skipping defeats the purpose. 
+ isTunnel := false + for _, prefix := range strings.Split(pfVPNInterfacePrefixes, ",") { + if strings.HasPrefix(iface, strings.TrimSpace(prefix)) { + isTunnel = true + break + } + } + if isTunnel { + mainLog.Load().Debug().Msgf("DNS intercept: removing tunnel interface %q from pf skip list", iface) + continue + } + kept = append(kept, iface) + } + + if len(kept) > 0 { + cleaned.WriteString(fmt.Sprintf("set skip on { %s }\n", strings.Join(kept, " "))) + } + // If no interfaces left, omit the skip directive entirely. + continue + } + + // Preserve all other options (timeouts, limits, etc.). + cleaned.WriteString(line + "\n") + } + + if hadLoopbackSkip { + mainLog.Load().Warn().Msg("DNS intercept: detected 'set skip on lo0' — another program (likely VPN software) " + + "disabled pf processing on loopback, which breaks our DNS interception. Removing lo0 from skip list.") + } + + return cleaned.String(), hadLoopbackSkip +} + +// pfFilterRuleLines filters pfctl output into actual pf rule lines, +// stripping stderr warnings (e.g., "No ALTQ support in kernel") and empty lines. +func pfFilterRuleLines(output string) []string { + var rules []string + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + // Skip pfctl stderr warnings that appear in CombinedOutput. + if strings.Contains(line, "ALTQ") { + continue + } + rules = append(rules, line) + } + return rules +} + +// pfContainsRule checks if any line in the slice contains the given rule string. +// Uses substring matching because pfctl may append extra tokens like " all" to rules +// (e.g., `rdr-anchor "com.controld.ctrld" all`), which would fail exact matching. +func pfContainsRule(lines []string, rule string) bool { + for _, line := range lines { + if strings.Contains(line, rule) { + return true + } + } + return false +} + +// stopDNSIntercept removes all pf rules and cleans up the DNS interception. 
+func (p *prog) stopDNSIntercept() error { + if p.dnsInterceptState == nil { + mainLog.Load().Debug().Msg("DNS intercept: no pf state to clean up") + return nil + } + + mainLog.Load().Info().Msg("DNS intercept: shutting down pf redirect") + + out, err := exec.Command("pfctl", "-a", p.dnsInterceptState.(*pfState).anchorName, "-F", "all").CombinedOutput() + if err != nil { + mainLog.Load().Warn().Msgf("DNS intercept: failed to flush pf anchor %q: %v (output: %s)", + p.dnsInterceptState.(*pfState).anchorName, err, strings.TrimSpace(string(out))) + } else { + mainLog.Load().Debug().Msgf("DNS intercept: flushed pf anchor %q", p.dnsInterceptState.(*pfState).anchorName) + } + + if err := os.Remove(p.dnsInterceptState.(*pfState).anchorFile); err != nil && !os.IsNotExist(err) { + mainLog.Load().Warn().Msgf("DNS intercept: failed to remove anchor file %s: %v", p.dnsInterceptState.(*pfState).anchorFile, err) + } else { + mainLog.Load().Debug().Msgf("DNS intercept: removed anchor file %s", p.dnsInterceptState.(*pfState).anchorFile) + } + + if err := p.removePFAnchorReference(); err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept: failed to remove anchor references from running pf ruleset") + } + + p.dnsInterceptState = nil + mainLog.Load().Info().Msg("DNS intercept: pf shutdown complete") + return nil +} + +// removePFAnchorReference removes our anchor references from the running pf ruleset. +// Uses the same dump → filter → reassemble approach as ensurePFAnchorReference. +// The anchor itself is already flushed by stopDNSIntercept, so even if removal +// fails, the empty anchor is a no-op. 
+func (p *prog) removePFAnchorReference() error { + rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName) + anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName) + + natOut, err := exec.Command("pfctl", "-sn").CombinedOutput() + if err != nil { + return fmt.Errorf("failed to dump running NAT rules: %w (output: %s)", err, strings.TrimSpace(string(natOut))) + } + filterOut, err := exec.Command("pfctl", "-sr").CombinedOutput() + if err != nil { + return fmt.Errorf("failed to dump running filter rules: %w (output: %s)", err, strings.TrimSpace(string(filterOut))) + } + + // Filter and remove our lines. + natLines := pfFilterRuleLines(string(natOut)) + filterLines := pfFilterRuleLines(string(filterOut)) + + var cleanNat []string + for _, line := range natLines { + if !strings.Contains(line, rdrAnchorRef) { + cleanNat = append(cleanNat, line) + } + } + + // Separate scrub from filter, remove our anchor ref. + var scrubLines, cleanFilter []string + for _, line := range filterLines { + if strings.Contains(line, anchorRef) { + continue + } + if strings.HasPrefix(line, "scrub") { + scrubLines = append(scrubLines, line) + } else { + cleanFilter = append(cleanFilter, line) + } + } + + // Reassemble in correct order: scrub → translation → filtering. 
+ var combined strings.Builder + for _, line := range scrubLines { + combined.WriteString(line + "\n") + } + for _, line := range cleanNat { + combined.WriteString(line + "\n") + } + for _, line := range cleanFilter { + combined.WriteString(line + "\n") + } + + cmd := exec.Command("pfctl", "-f", "-") + cmd.Stdin = strings.NewReader(combined.String()) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to reload pf ruleset without anchor references: %w (output: %s)", err, strings.TrimSpace(string(out))) + } + + mainLog.Load().Debug().Msg("DNS intercept: removed anchor references from running pf ruleset") + return nil +} + +// pfAddressFamily returns "inet" for IPv4 addresses and "inet6" for IPv6 addresses. +// Used to generate pf rules with the correct address family for each IP. +// flushPFStates flushes ALL pf state entries after anchor reloads. +// pf checks state table BEFORE rules — stale entries from old rules keep routing +// packets via route-to even after interface-scoped exemptions are added. +func flushPFStates() { + if out, err := exec.Command("pfctl", "-F", "states").CombinedOutput(); err != nil { + mainLog.Load().Warn().Err(err).Msgf("DNS intercept: failed to flush pf states (output: %s)", strings.TrimSpace(string(out))) + } else { + mainLog.Load().Debug().Msg("DNS intercept: flushed pf states after anchor reload") + } +} + +func pfAddressFamily(ip string) string { + if addr := net.ParseIP(ip); addr != nil && addr.To4() == nil { + return "inet6" + } + return "inet" +} + +// discoverTunnelInterfaces returns the names of active VPN/tunnel network interfaces. +// These interfaces may have pf rules from VPN software (e.g., Windscribe's "pass out quick +// on ipsec0") that would match DNS traffic before our anchor rules. By discovering them, +// we can add interface-specific intercept rules that take priority. 
+func discoverTunnelInterfaces() []string { + ifaces, err := net.Interfaces() + if err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept: failed to enumerate network interfaces") + return nil + } + + prefixes := strings.Split(pfVPNInterfacePrefixes, ",") + var tunnels []string + + for _, iface := range ifaces { + // Only consider interfaces that are up — down interfaces can't carry DNS traffic. + if iface.Flags&net.FlagUp == 0 { + continue + } + for _, prefix := range prefixes { + if strings.HasPrefix(iface.Name, strings.TrimSpace(prefix)) { + tunnels = append(tunnels, iface.Name) + break + } + } + } + + if len(tunnels) > 0 { + mainLog.Load().Debug().Msgf("DNS intercept: discovered active tunnel interfaces: %v", tunnels) + } + return tunnels +} + +// dnsInterceptSupported reports whether DNS intercept mode is supported on this platform. +func dnsInterceptSupported() bool { + _, err := exec.LookPath("pfctl") + return err == nil +} + +// validateDNSIntercept checks that the system meets requirements for DNS intercept mode. +func (p *prog) validateDNSIntercept() error { + if !dnsInterceptSupported() { + return fmt.Errorf("dns intercept: pfctl not found — pf is required for DNS intercept on macOS") + } + + if os.Geteuid() != 0 { + return fmt.Errorf("dns intercept: root privileges required for pf filter management") + } + + if err := os.MkdirAll(filepath.Dir(pfAnchorFile), 0755); err != nil { + return fmt.Errorf("dns intercept: cannot create anchor directory: %w", err) + } + + if p.cfg != nil { + for name, uc := range p.cfg.Upstream { + if uc.Type == "os" || uc.Type == "" { + return fmt.Errorf("dns intercept: upstream %q uses OS resolver (port 53) which would create "+ + "a redirect loop with pf. Use DoH upstreams (--proto doh) with dns-intercept mode", name) + } + } + } + + return nil +} + +// buildPFAnchorRules generates the pf anchor rules for DNS interception. +// vpnExemptions are VPN DNS server+interface pairs to exempt from interception. 
+// +// macOS pf "rdr" rules only apply to forwarded traffic, NOT locally-originated +// packets. To intercept DNS from the machine itself, we use a two-step approach: +// 1. "pass out route-to lo0" forces outbound DNS through the loopback interface +// 2. "rdr on lo0" catches it on loopback and redirects to our listener +// +// STATE AND ROUTING (critical for VPN firewall coexistence): +// - route-to rules: keep state (default). State is floating (matches on any interface), +// but "pass out on lo0 no state" ensures no state exists on the lo0 outbound path, +// so rdr still fires on the lo0 inbound pass. +// - pass out on lo0: NO STATE — prevents state from being created on lo0 outbound, +// which would match inbound and bypass rdr. +// - rdr: no "pass" keyword — packet goes through filter so "pass in" creates state. +// - pass in on lo0: keep state + REPLY-TO lo0 — creates state for response routing +// AND forces the response back through lo0. Without reply-to, the response to a +// VPN client IP gets routed through the VPN tunnel and is lost. +// +// ctrld's own OS resolver nameservers (used for bootstrap DNS) must be exempted +// from the redirect to prevent ctrld from querying itself in a loop. +// +// pf requires strict rule ordering: translation (rdr) BEFORE filtering (pass). +func (p *prog) buildPFAnchorRules(vpnExemptions []vpnDNSExemption) string { + var rules strings.Builder + rules.WriteString("# ctrld DNS Intercept Mode\n") + rules.WriteString("# Intercepts locally-originated DNS (port 53) via route-to + rdr on lo0.\n") + rules.WriteString("#\n") + rules.WriteString("# How it works:\n") + rules.WriteString("# 1. \"pass out route-to lo0\" forces outbound DNS through the loopback interface\n") + rules.WriteString("# 2. 
\"rdr on lo0\" catches it on loopback and redirects to ctrld at 127.0.0.1:53\n") + rules.WriteString("#\n") + rules.WriteString("# All ctrld traffic is blanket-exempted via \"pass out quick group " + pfGroupName + "\",\n") + rules.WriteString("# ensuring ctrld's DoH/DoT upstream connections and DNS queries are never\n") + rules.WriteString("# blocked by VPN firewalls (e.g., Windscribe's \"block drop all\").\n") + rules.WriteString("#\n") + rules.WriteString("# pf requires strict rule ordering: translation (rdr) BEFORE filtering (pass).\n\n") + + // --- Translation rules (must come first per pf ordering) --- + // Uses "rdr" without "pass" so the redirected packet continues to filter evaluation. + // The filter rule "pass in on lo0 ... to 127.0.0.1 port 53 keep state" then creates + // a stateful entry that handles response routing. Using "rdr pass" would skip filter + // evaluation, and its implicit state alone is insufficient for response delivery — + // proven by commit 51cf029 where responses were silently dropped. + rules.WriteString("# --- Translation rules (rdr) ---\n") + rules.WriteString("# Redirect DNS traffic arriving on loopback (from route-to) to ctrld's listener.\n") + rules.WriteString("# Uses rdr (not rdr pass) — filter rules must evaluate to create response state.\n") + rules.WriteString("rdr on lo0 inet proto udp from any to ! 127.0.0.1 port 53 -> 127.0.0.1 port 53\n") + rules.WriteString("rdr on lo0 inet proto tcp from any to ! 127.0.0.1 port 53 -> 127.0.0.1 port 53\n\n") + + // --- Filtering rules --- + rules.WriteString("# --- Filtering rules (pass) ---\n\n") + + // Blanket exemption: allow ALL outbound traffic from ctrld (group _ctrld) through + // without any pf filtering or redirection. This is critical for VPN coexistence — + // VPN apps like Windscribe load "block drop all" rulesets that would otherwise block + // ctrld's DoH connections (TCP 443) to upstream DNS servers (e.g., 76.76.2.22). 
+ // Because our anchor is prepended before other anchors, this rule evaluates first, + // ensuring ctrld's traffic is never blocked by downstream firewall rules. + // + // The per-IP exemptions below (OS resolver, VPN DNS) remain as defense-in-depth: + // they prevent DNS redirect loops for ctrld's own port-53 queries specifically, + // while this rule handles everything else (DoH, DoT, health checks, etc.). + rules.WriteString("# Blanket exemption: let all ctrld traffic through regardless of other pf rules.\n") + rules.WriteString("# VPN firewalls (e.g., Windscribe's \"block drop all\") would otherwise block\n") + rules.WriteString("# ctrld's DoH (TCP 443) connections to upstream DNS servers.\n") + rules.WriteString(fmt.Sprintf("pass out quick group %s\n\n", pfGroupName)) + + // Exempt OS resolver nameservers (read live from the global OS resolver) + // so ctrld's bootstrap DNS queries don't get redirected back to itself. + // IPv4 addresses use "inet", IPv6 addresses use "inet6". + osNS := ctrld.OsResolverNameservers() + if len(osNS) > 0 { + rules.WriteString("# Exempt OS resolver nameservers (ctrld bootstrap DNS) from redirect.\n") + rules.WriteString("# Scoped to group " + pfGroupName + " so only ctrld's own queries are exempted,\n") + rules.WriteString("# preventing other processes from bypassing the redirect by querying these IPs.\n") + for _, ns := range osNS { + host, _, _ := net.SplitHostPort(ns) + if host == "" { + host = ns + } + af := pfAddressFamily(host) + rules.WriteString(fmt.Sprintf("pass out quick on ! lo0 %s proto { udp, tcp } from any to %s port 53 group %s\n", af, host, pfGroupName)) + } + rules.WriteString("\n") + } + + // Build sets of VPN DNS interfaces and server IPs for exclusion from intercept rules. + // + // EXIT MODE EXCEPTION: When a VPN is in exit/full-tunnel mode (VPN DNS server is + // also the system default resolver), we do NOT exempt the interface. 
In exit mode, + // all traffic routes through the VPN, so exempting the interface would bypass ctrld + // for ALL DNS — losing profile enforcement (blocking, filtering). Instead, we keep + // intercepting and let ctrld's VPN DNS split routing + group exemption handle it. + vpnDNSIfaces := make(map[string]bool) // non-exit interfaces to skip in tunnel intercept + vpnDNSIfacePassthrough := make(map[string]bool) // non-exit interfaces needing passthrough rules + vpnDNSServerIPs := make(map[string]bool) // IPs for group exemptions and table + for _, ex := range vpnExemptions { + if ex.Interface != "" && !ex.IsExitMode { + vpnDNSIfaces[ex.Interface] = true + vpnDNSIfacePassthrough[ex.Interface] = true + } + vpnDNSServerIPs[ex.Server] = true + } + + // Group-scoped exemptions for ctrld's own VPN DNS queries. + // When ctrld's proxy() VPN DNS split routing sends queries to VPN DNS servers, + // these rules let ctrld's traffic through without being intercepted by the + // generic route-to rule. Scoped to group _ctrld so only ctrld benefits. + if len(vpnExemptions) > 0 { + rules.WriteString("# Exempt VPN DNS servers: ctrld's own queries (group-scoped).\n") + seen := make(map[string]bool) + for _, ex := range vpnExemptions { + if !seen[ex.Server] { + seen[ex.Server] = true + af := pfAddressFamily(ex.Server) + rules.WriteString(fmt.Sprintf("pass out quick on ! lo0 %s proto { udp, tcp } from any to %s port 53 group %s\n", af, ex.Server, pfGroupName)) + } + } + rules.WriteString("\n") + } + + // Block all outbound IPv6 DNS. ctrld only listens on 0.0.0.0:53 (IPv4), so we cannot + // redirect IPv6 DNS to our listener. Without this rule, macOS may use IPv6 link-local + // DNS servers (e.g., fe80::...%en0) assigned by the router, completely bypassing the + // IPv4 pf intercept. Blocking forces macOS to fall back to IPv4 DNS, which is intercepted. 
+ // This rule must come BEFORE the IPv4 route-to rules (pf evaluates last match by default, + // but "quick" makes first-match — and exemptions above are already "quick"). + rules.WriteString("# Block outbound IPv6 DNS — ctrld listens on IPv4 only (0.0.0.0:53).\n") + rules.WriteString("# Without this, macOS may use IPv6 link-local DNS servers from the router,\n") + rules.WriteString("# bypassing the IPv4 intercept entirely.\n") + rules.WriteString("block out quick on ! lo0 inet6 proto { udp, tcp } from any to any port 53\n\n") + + // --- VPN DNS interface passthrough (split DNS mode only) --- + // + // In split DNS mode, the VPN's DNS handler (e.g., Tailscale MagicDNS) runs as a + // Network Extension that intercepts packets on its tunnel interface. MagicDNS then + // forwards queries to its own upstream nameservers (e.g., 10.3.112.11) — IPs we + // can't know in advance. Without these rules, pf's generic "on !lo0" intercept + // catches MagicDNS's upstream queries, routing them back to ctrld in a loop. + // + // These "pass" rules (no route-to) let MagicDNS's upstream queries pass through. + // Traffic TO the VPN DNS server (e.g., 100.100.100.100) is excluded via + // so those queries get intercepted → ctrld enforces its profile on non-search-domain queries. + // + // NOT applied in exit mode — in exit mode, all traffic routes through the VPN + // interface, so exempting it would bypass ctrld's profile enforcement entirely. + if len(vpnDNSIfacePassthrough) > 0 { + // Build table of VPN DNS server IPs to exclude from passthrough. 
+ var vpnDNSTableMembers []string + for ip := range vpnDNSServerIPs { + if net.ParseIP(ip) != nil && net.ParseIP(ip).To4() != nil { + vpnDNSTableMembers = append(vpnDNSTableMembers, ip) + } + } + if len(vpnDNSTableMembers) > 0 { + rules.WriteString("# Table of VPN DNS server IPs — queries to these must be intercepted.\n") + rules.WriteString(fmt.Sprintf("table { %s }\n", strings.Join(vpnDNSTableMembers, ", "))) + } + rules.WriteString("# --- VPN DNS interface passthrough (split DNS mode) ---\n") + rules.WriteString("# Pass MagicDNS upstream queries; intercept queries TO MagicDNS itself.\n") + for iface := range vpnDNSIfacePassthrough { + if len(vpnDNSTableMembers) > 0 { + rules.WriteString(fmt.Sprintf("pass out quick on %s inet proto udp from any to ! port 53\n", iface)) + rules.WriteString(fmt.Sprintf("pass out quick on %s inet proto tcp from any to ! port 53\n", iface)) + } else { + rules.WriteString(fmt.Sprintf("pass out quick on %s inet proto udp from any to any port 53\n", iface)) + rules.WriteString(fmt.Sprintf("pass out quick on %s inet proto tcp from any to any port 53\n", iface)) + } + } + rules.WriteString("\n") + } + + // --- Interface-specific VPN/tunnel intercept rules --- + tunnelIfaces := discoverTunnelInterfaces() + if len(tunnelIfaces) > 0 { + rules.WriteString("# --- VPN/tunnel interface intercept rules ---\n") + rules.WriteString("# Explicit intercept on tunnel interfaces prevents VPN apps from capturing\n") + rules.WriteString("# DNS traffic with their own broad \"pass out quick on \" rules.\n") + rules.WriteString("# VPN DNS interfaces (split DNS mode) are excluded — passthrough rules above handle them.\n") + for _, iface := range tunnelIfaces { + if vpnDNSIfaces[iface] { + rules.WriteString(fmt.Sprintf("# Skipped %s — VPN DNS interface (passthrough rules handle this)\n", iface)) + continue + } + rules.WriteString(fmt.Sprintf("pass out quick on %s route-to lo0 inet proto udp from any to ! 
127.0.0.1 port 53\n", iface)) + rules.WriteString(fmt.Sprintf("pass out quick on %s route-to lo0 inet proto tcp from any to ! 127.0.0.1 port 53\n", iface)) + } + rules.WriteString("\n") + } + + // Force all remaining outbound IPv4 DNS through loopback for interception. + // route-to rules use stateful tracking (keep state, the default). State is floating + // (matches on any interface), but "pass out on lo0 no state" below ensures no state + // is created on the lo0 outbound path, allowing rdr to fire on lo0 inbound. + rules.WriteString("# Force remaining outbound IPv4 DNS through loopback for interception.\n") + rules.WriteString("pass out quick on ! lo0 route-to lo0 inet proto udp from any to ! 127.0.0.1 port 53\n") + rules.WriteString("pass out quick on ! lo0 route-to lo0 inet proto tcp from any to ! 127.0.0.1 port 53\n\n") + + // Allow route-to'd DNS packets to pass outbound on lo0. + // Without this, VPN firewalls with "block drop all" (e.g., Windscribe) drop the packet + // after route-to redirects it to lo0 but before it can reflect inbound for rdr processing. + // + // CRITICAL: This rule MUST use "no state". If it created state, that state would match + // the packet when it reflects inbound on lo0, causing pf to fast-path it and bypass + // rdr entirely. With "no state", the inbound packet gets fresh evaluation and rdr fires. + rules.WriteString("# Pass route-to'd DNS outbound on lo0 — no state to avoid bypassing rdr inbound.\n") + rules.WriteString("pass out quick on lo0 inet proto udp from any to ! 127.0.0.1 port 53 no state\n") + rules.WriteString("pass out quick on lo0 inet proto tcp from any to ! 127.0.0.1 port 53 no state\n\n") + + // Allow the redirected traffic through on loopback (inbound after rdr). + // + // "reply-to lo0" is CRITICAL for VPN coexistence. 
Without it, ctrld's response to a + // VPN client IP (e.g., 100.94.163.168) gets routed via the VPN tunnel interface + // (utun420) by the kernel routing table — the response enters the tunnel and is lost. + // "reply-to lo0" forces pf to route the response back through lo0 regardless of the + // kernel routing table, ensuring it stays local and reaches the client process. + // + // "keep state" (the default) creates the stateful entry used by reply-to to route + // the response. The rdr NAT state handles the address rewrite on the response + // (source 127.0.0.1 → original DNS server IP, e.g., 10.255.255.3). + rules.WriteString("# Accept redirected DNS — reply-to lo0 forces response through loopback.\n") + rules.WriteString("pass in quick on lo0 reply-to lo0 inet proto { udp, tcp } from any to 127.0.0.1 port 53\n") + + return rules.String() +} + +// verifyPFState checks that the pf ruleset is correctly configured after loading. +// It verifies both the anchor references in the main ruleset and the rules within +// our anchor. Failures are logged at ERROR level to make them impossible to miss. +func (p *prog) verifyPFState() { + rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName) + anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName) + verified := true + + // Check main ruleset for anchor references. 
+ natOut, err := exec.Command("pfctl", "-sn").CombinedOutput() + if err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: VERIFICATION FAILED — could not dump NAT rules") + verified = false + } else if !strings.Contains(string(natOut), rdrAnchorRef) { + mainLog.Load().Error().Msg("DNS intercept: VERIFICATION FAILED — rdr-anchor reference missing from running NAT rules") + verified = false + } + + filterOut, err := exec.Command("pfctl", "-sr").CombinedOutput() + if err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: VERIFICATION FAILED — could not dump filter rules") + verified = false + } else if !strings.Contains(string(filterOut), anchorRef) { + mainLog.Load().Error().Msg("DNS intercept: VERIFICATION FAILED — anchor reference missing from running filter rules") + verified = false + } + + // Check our anchor has rules loaded. + anchorFilter, err := exec.Command("pfctl", "-a", pfAnchorName, "-sr").CombinedOutput() + if err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: VERIFICATION FAILED — could not dump anchor filter rules") + verified = false + } else if len(strings.TrimSpace(string(anchorFilter))) == 0 { + mainLog.Load().Error().Msg("DNS intercept: VERIFICATION FAILED — anchor has no filter rules loaded") + verified = false + } + + anchorNat, err := exec.Command("pfctl", "-a", pfAnchorName, "-sn").CombinedOutput() + if err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: VERIFICATION FAILED — could not dump anchor NAT rules") + verified = false + } else if len(strings.TrimSpace(string(anchorNat))) == 0 { + mainLog.Load().Error().Msg("DNS intercept: VERIFICATION FAILED — anchor has no NAT/redirect rules loaded") + verified = false + } + + // Check that lo0 is not in the skip list — if it is, our rdr rules are dead. 
+ optOut, err := exec.Command("pfctl", "-sO").CombinedOutput() + if err == nil { + for _, line := range strings.Split(string(optOut), "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "set skip on") && strings.Contains(line, "lo0") { + mainLog.Load().Error().Msg("DNS intercept: VERIFICATION FAILED — 'set skip on lo0' is active, rdr rules on loopback will not fire") + verified = false + break + } + } + } + + if verified { + mainLog.Load().Info().Msg("DNS intercept: post-load verification passed — all pf rules confirmed active") + } +} + +// resetUpstreamTransports forces all DoH/DoT/DoQ upstreams to re-bootstrap their +// network transports. This is called when the pf watchdog detects that the pf state +// table was flushed (e.g., by Windscribe running "pfctl -f"), which kills all existing +// TCP connections including ctrld's DoH connections to upstream DNS servers. +// +// Without this, Go's http.Transport keeps trying to use dead connections until each +// request hits its 5s context deadline — causing a ~5s DNS blackout. +// +// ForceReBootstrap() immediately creates a new transport (closing old idle +// connections), so new queries use fresh connections without waiting for the +// lazy re-bootstrap flag. This reduces the blackout from ~5s to ~100ms. +func (p *prog) resetUpstreamTransports() { + if p.cfg == nil { + return + } + count := 0 + for _, uc := range p.cfg.Upstream { + if uc == nil { + continue + } + uc.ForceReBootstrap(ctrld.LoggerCtx(context.Background(), p.logger.Load())) + count++ + } + if count > 0 { + mainLog.Load().Info().Msgf("DNS intercept watchdog: force-reset %d upstream transport(s) — pf state flush likely killed existing DoH connections", count) + } +} + +// checkTunnelInterfaceChanges compares the current set of active tunnel interfaces +// against the last known set. 
If they differ (e.g., a VPN connected and created utun420), +// it rebuilds and reloads the pf anchor rules to include interface-specific intercept +// rules for the new interface. +// +// Returns true if the anchor was rebuilt, false if no changes detected. +// This is called from the network change callback even when validInterfacesMap() +// reports no changes — because validInterfacesMap() only tracks physical hardware +// ports (en0, bridge0, etc.) and ignores tunnel interfaces (utun*, ipsec*, etc.). +func (p *prog) checkTunnelInterfaceChanges() bool { + if p.dnsInterceptState == nil { + return false + } + + current := discoverTunnelInterfaces() + + p.mu.Lock() + prev := p.lastTunnelIfaces + changed := !stringSlicesEqual(prev, current) + if changed { + p.lastTunnelIfaces = current + } + p.mu.Unlock() + + if !changed { + return false + } + + // Detect NEW tunnel interfaces (not just any change). + prevSet := make(map[string]bool, len(prev)) + for _, iface := range prev { + prevSet[iface] = true + } + hasNewTunnel := false + for _, iface := range current { + if !prevSet[iface] { + hasNewTunnel = true + mainLog.Load().Info().Msgf("DNS intercept: new tunnel interface detected: %s", iface) + break + } + } + + if hasNewTunnel { + // A new VPN tunnel appeared. Enter stabilization mode — the VPN may be + // about to wipe our pf rules (Windscribe does this ~500ms after tunnel creation). + // We can't check pfAnchorIsWiped() here because the wipe hasn't happened yet. + // The stabilization loop will detect whether pf actually gets wiped: + // - If rules change (VPN touches pf): wait for stability, then restore. + // - If rules stay stable for the full wait (Tailscale): exit early and rebuild immediately. + p.pfStartStabilization() + return true + } + + mainLog.Load().Info().Msgf("DNS intercept: tunnel interfaces changed (was %v, now %v) — rebuilding pf anchor rules", prev, current) + + // Rebuild anchor rules with the updated tunnel interface list. 
+ // Pass current VPN DNS exemptions so they are preserved for still-active VPNs. + var vpnExemptions []vpnDNSExemption + if p.vpnDNS != nil { + vpnExemptions = p.vpnDNS.CurrentExemptions() + } + rulesStr := p.buildPFAnchorRules(vpnExemptions) + if err := os.WriteFile(pfAnchorFile, []byte(rulesStr), 0644); err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: failed to write rebuilt anchor file") + return true + } + out, err := exec.Command("pfctl", "-a", pfAnchorName, "-f", pfAnchorFile).CombinedOutput() + if err != nil { + mainLog.Load().Error().Err(err).Msgf("DNS intercept: failed to reload rebuilt anchor (output: %s)", strings.TrimSpace(string(out))) + return true + } + + flushPFStates() + mainLog.Load().Info().Msgf("DNS intercept: rebuilt pf anchor with %d tunnel interfaces", len(current)) + return true +} + +// stringSlicesEqual reports whether two string slices have the same elements in the same order. +func stringSlicesEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +// pfAnchorIsWiped checks if our pf anchor references have been removed from the +// running ruleset. This is a read-only check — it does NOT attempt to restore. +// Used to distinguish VPNs that wipe pf (Windscribe) from those that don't (Tailscale). 
+func (p *prog) pfAnchorIsWiped() bool { + rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName) + anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName) + + natOut, err := exec.Command("pfctl", "-sn").CombinedOutput() + if err != nil { + return true // Can't check — assume wiped (safer) + } + if !strings.Contains(string(natOut), rdrAnchorRef) { + return true + } + + filterOut, err := exec.Command("pfctl", "-sr").CombinedOutput() + if err != nil { + return true + } + return !strings.Contains(string(filterOut), anchorRef) +} + +// pfStartStabilization enters stabilization mode, suppressing all pf restores +// until the VPN's ruleset stops changing. This prevents a death spiral where +// ctrld and the VPN repeatedly overwrite each other's pf rules. +func (p *prog) pfStartStabilization() { + if p.pfStabilizing.Load() { + // Already stabilizing — extending is handled by backoff. + return + } + p.pfStabilizing.Store(true) + + multiplier := max(int(p.pfBackoffMultiplier.Load()), 1) + baseStableTime := 6000 * time.Millisecond // 4 polls at 1.5s + stableRequired := time.Duration(multiplier) * baseStableTime + if stableRequired > 45*time.Second { + stableRequired = 45 * time.Second + } + + mainLog.Load().Info().Msgf("DNS intercept: VPN connecting — entering stabilization mode (waiting %s for pf to settle)", stableRequired) + + ctx, cancel := context.WithCancel(context.Background()) + p.mu.Lock() + if p.pfStabilizeCancel != nil { + p.pfStabilizeCancel() // Cancel any previous stabilization + } + p.pfStabilizeCancel = cancel + p.mu.Unlock() + + go p.pfStabilizationLoop(ctx, stableRequired) +} + +// pfStabilizationLoop polls pfctl -sr hash until the ruleset is stable for the +// required duration, then restores our anchor rules. 
+func (p *prog) pfStabilizationLoop(ctx context.Context, stableRequired time.Duration) { + defer p.pfStabilizing.Store(false) + + pollInterval := 1500 * time.Millisecond + var lastHash string + stableSince := time.Time{} + + for { + select { + case <-ctx.Done(): + mainLog.Load().Debug().Msg("DNS intercept: stabilization cancelled") + return + case <-p.stopCh: + return + case <-time.After(pollInterval): + } + + // Hash the current filter ruleset. + out, err := exec.Command("pfctl", "-sr").CombinedOutput() + if err != nil { + continue + } + hash := fmt.Sprintf("%x", sha256.Sum256(out)) + + if hash != lastHash { + // Rules changed — reset stability timer + lastHash = hash + stableSince = time.Now() + mainLog.Load().Debug().Msg("DNS intercept: pf rules changed during stabilization — resetting timer") + continue + } + + if stableSince.IsZero() { + stableSince = time.Now() + continue + } + + if time.Since(stableSince) >= stableRequired { + // Stable long enough — restore our rules. + // Clear stabilizing flag BEFORE calling ensurePFAnchorActive so + // the guard inside that function doesn't suppress our restore. + p.pfStabilizing.Store(false) + mainLog.Load().Info().Msgf("DNS intercept: pf stable for %s — restoring anchor rules", stableRequired) + p.ensurePFAnchorActive() + p.pfLastRestoreTime.Store(time.Now().UnixMilli()) + return + } + } +} + +// ensurePFAnchorActive checks that our pf anchor references and rules are still +// present in the running ruleset. If anything is missing (e.g., another program +// like Windscribe desktop or macOS itself reloaded pf.conf), it restores them. +// +// Returns true if restoration was needed, false if everything was already intact. +// Called both on network changes (immediate) and by the periodic pfWatchdog. +func (p *prog) ensurePFAnchorActive() bool { + if p.dnsInterceptState == nil { + return false + } + + // While stabilizing (VPN connecting), suppress all restores. + // The stabilization loop will restore once pf settles. 
+ if p.pfStabilizing.Load() { + mainLog.Load().Debug().Msg("DNS intercept watchdog: suppressed — VPN stabilization in progress") + return false + } + + // Check if our last restore was very recent and got wiped again. + // This indicates a VPN reconnect cycle — enter stabilization with backoff. + if lastRestore := p.pfLastRestoreTime.Load(); lastRestore > 0 { + elapsed := time.Since(time.UnixMilli(lastRestore)) + if elapsed < 10*time.Second { + // Rules were wiped within 10s of our last restore — VPN is fighting us. + p.pfBackoffMultiplier.Add(1) + mainLog.Load().Warn().Msgf("DNS intercept: rules wiped %s after restore — entering stabilization (backoff multiplier: %d)", + elapsed, p.pfBackoffMultiplier.Load()) + p.pfStartStabilization() + return false + } + // Rules survived >10s — reset backoff + if p.pfBackoffMultiplier.Load() > 0 { + p.pfBackoffMultiplier.Store(0) + } + } + + rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName) + anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName) + needsRestore := false + + // Check 1: anchor references in the main ruleset. + natOut, err := exec.Command("pfctl", "-sn").CombinedOutput() + if err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept watchdog: could not dump NAT rules") + return false + } + if !strings.Contains(string(natOut), rdrAnchorRef) { + mainLog.Load().Warn().Msg("DNS intercept watchdog: rdr-anchor reference missing from running ruleset") + needsRestore = true + } + + if !needsRestore { + filterOut, err := exec.Command("pfctl", "-sr").CombinedOutput() + if err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept watchdog: could not dump filter rules") + return false + } + if !strings.Contains(string(filterOut), anchorRef) { + mainLog.Load().Warn().Msg("DNS intercept watchdog: anchor reference missing from running filter rules") + needsRestore = true + } + } + + // Check 2: anchor content (rules inside our anchor). + // Verify BOTH filter rules (-sr) AND rdr/NAT rules (-sn). 
Programs like Parallels' + // internet-sharing can flush our anchor's rdr rules while leaving filter rules intact. + // Without rdr, route-to sends packets to lo0 but they never get redirected to 127.0.0.1:53, + // causing an infinite packet loop on lo0 and complete DNS failure. + if !needsRestore { + anchorFilter, err := exec.Command("pfctl", "-a", pfAnchorName, "-sr").CombinedOutput() + if err != nil || len(strings.TrimSpace(string(anchorFilter))) == 0 { + mainLog.Load().Warn().Msg("DNS intercept watchdog: anchor has no filter rules — content was flushed") + needsRestore = true + } + } + if !needsRestore { + anchorNat, err := exec.Command("pfctl", "-a", pfAnchorName, "-sn").CombinedOutput() + if err != nil || len(strings.TrimSpace(string(anchorNat))) == 0 { + mainLog.Load().Warn().Msg("DNS intercept watchdog: anchor has no rdr rules — translation was flushed (will cause packet loop on lo0)") + needsRestore = true + } + } + + // Check 3: "set skip on lo0" — VPN apps (e.g., Windscribe) load a complete pf.conf + // with "set skip on { lo0 }" which disables ALL pf processing on loopback. + // Our entire interception mechanism (route-to lo0 + rdr on lo0) depends on lo0 being + // processed by pf. This check detects the skip and triggers a restore that removes it. + if !needsRestore { + optOut, err := exec.Command("pfctl", "-sO").CombinedOutput() + if err == nil { + optStr := string(optOut) + // Check if lo0 appears in any "set skip on" directive. + for _, line := range strings.Split(optStr, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "set skip on") && strings.Contains(line, "lo0") { + mainLog.Load().Warn().Msg("DNS intercept watchdog: 'set skip on lo0' detected — loopback bypass breaks our rdr rules") + needsRestore = true + break + } + } + } + } + + if !needsRestore { + mainLog.Load().Debug().Msg("DNS intercept watchdog: pf anchor intact") + return false + } + + // Restore: re-inject anchor references into the main ruleset. 
+ mainLog.Load().Info().Msg("DNS intercept watchdog: restoring pf anchor references") + if err := p.ensurePFAnchorReference(); err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept watchdog: failed to restore anchor references") + return true + } + + // Restore: always rebuild anchor rules from scratch to ensure tunnel interface + // rules are up-to-date (VPN interfaces may have appeared/disappeared since the + // anchor file was last written). + mainLog.Load().Info().Msg("DNS intercept watchdog: rebuilding anchor rules with current network state") + var vpnExemptions []vpnDNSExemption + if p.vpnDNS != nil { + vpnExemptions = p.vpnDNS.CurrentExemptions() + } + rulesStr := p.buildPFAnchorRules(vpnExemptions) + if err := os.WriteFile(pfAnchorFile, []byte(rulesStr), 0644); err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept watchdog: failed to write anchor file") + } else if out, err := exec.Command("pfctl", "-a", pfAnchorName, "-f", pfAnchorFile).CombinedOutput(); err != nil { + mainLog.Load().Error().Err(err).Msgf("DNS intercept watchdog: failed to load rebuilt anchor (output: %s)", strings.TrimSpace(string(out))) + } else { + flushPFStates() + mainLog.Load().Info().Msg("DNS intercept watchdog: rebuilt and loaded anchor rules") + } + + // Update tracked tunnel interfaces after rebuild so checkTunnelInterfaceChanges() + // has an accurate baseline for subsequent comparisons. + p.mu.Lock() + p.lastTunnelIfaces = discoverTunnelInterfaces() + p.mu.Unlock() + + // Verify the restoration worked. + p.verifyPFState() + + // Proactively reset upstream transports. When another program replaces the pf + // ruleset with "pfctl -f", it flushes the entire state table — killing all + // existing TCP connections including our DoH connections to upstream DNS servers. + // Without this reset, Go's http.Transport keeps trying dead connections until + // the 5s context deadline, causing a DNS blackout. 
Re-bootstrapping forces fresh + // TLS handshakes on the next query (~200ms vs ~5s recovery). + p.resetUpstreamTransports() + + p.pfLastRestoreTime.Store(time.Now().UnixMilli()) + mainLog.Load().Info().Msg("DNS intercept watchdog: pf anchor restored successfully") + return true +} + +// pfWatchdog periodically checks that our pf anchor is still active. +// Other programs (e.g., Windscribe desktop app, macOS configd) can replace +// scheduleDelayedRechecks schedules delayed re-checks after a network change event. +// VPN apps often modify pf rules and DNS settings asynchronously after the network +// change that triggered our handler. These delayed checks catch: +// - pf anchor wipes by VPN disconnect (Windscribe's firewallOff) +// - Stale OS resolver nameservers (VPN DNS not yet cleaned from scutil) +// - Stale VPN DNS routes in vpnDNSManager +// - Tunnel interface additions/removals not yet visible +// +// Two delays (2s and 4s) cover both fast and slow VPN teardowns. +func (p *prog) scheduleDelayedRechecks() { + for _, delay := range []time.Duration{pfAnchorRecheckDelay, pfAnchorRecheckDelayLong} { + time.AfterFunc(delay, func() { + if p.dnsInterceptState == nil || p.pfStabilizing.Load() { + return + } + p.ensurePFAnchorActive() + p.checkTunnelInterfaceChanges() + // Refresh OS resolver — VPN may have finished DNS cleanup since the + // immediate handler ran. This clears stale LAN nameservers (e.g., + // Windscribe's 10.255.255.3 lingering in scutil --dns). + ctx := ctrld.LoggerCtx(context.Background(), p.logger.Load()) + ctrld.InitializeOsResolver(ctx, true) + if p.vpnDNS != nil { + p.vpnDNS.Refresh(ctx) + } + }) + } +} + +// the entire pf ruleset with pfctl -f, which wipes our anchor references. +// This watchdog detects and restores them. 
+func (p *prog) pfWatchdog() { + mainLog.Load().Info().Msgf("DNS intercept: starting pf watchdog (interval: %s)", pfWatchdogInterval) + + var consecutiveMisses atomic.Int32 + ticker := time.NewTicker(pfWatchdogInterval) + defer ticker.Stop() + + for { + select { + case <-p.stopCh: + mainLog.Load().Debug().Msg("DNS intercept: pf watchdog stopped") + return + case <-ticker.C: + if p.dnsInterceptState == nil { + mainLog.Load().Debug().Msg("DNS intercept: pf watchdog exiting — intercept state is nil") + return + } + + restored := p.ensurePFAnchorActive() + if !restored { + // Rules are intact in text form — also probe actual interception. + if !p.pfStabilizing.Load() && !p.pfMonitorRunning.Load() { + if !p.probePFIntercept() { + mainLog.Load().Warn().Msg("DNS intercept watchdog: rules intact but probe FAILED — forcing full reload") + p.forceReloadPFMainRuleset() + restored = true + } + } + + // Check if backoff should be reset. + if p.pfBackoffMultiplier.Load() > 0 && p.pfLastRestoreTime.Load() > 0 { + elapsed := time.Since(time.UnixMilli(p.pfLastRestoreTime.Load())) + if elapsed > 60*time.Second { + p.pfBackoffMultiplier.Store(0) + mainLog.Load().Info().Msg("DNS intercept watchdog: rules stable for >60s — reset backoff") + } + } + } + if restored { + misses := consecutiveMisses.Add(1) + if misses >= pfConsecutiveMissThreshold { + mainLog.Load().Error().Msgf("DNS intercept watchdog: pf anchor has been missing for %d consecutive checks — something is persistently overwriting pf rules", misses) + } else { + mainLog.Load().Warn().Msgf("DNS intercept watchdog: pf anchor was missing and restored (consecutive misses: %d)", misses) + } + } else { + if old := consecutiveMisses.Swap(0); old > 0 { + mainLog.Load().Info().Msgf("DNS intercept watchdog: pf anchor stable again after %d consecutive restores", old) + } + } + } + } +} + +// exemptVPNDNSServers updates the pf anchor rules with interface-scoped exemptions +// for VPN DNS servers, allowing VPN local DNS handlers (e.g., 
Tailscale MagicDNS +// via Network Extension) to receive DNS queries from all processes on their interface. +// +// Called by vpnDNSManager.Refresh() whenever VPN DNS servers change. +func (p *prog) exemptVPNDNSServers(exemptions []vpnDNSExemption) error { + if p.dnsInterceptState == nil { + return fmt.Errorf("pf state not available") + } + + rulesStr := p.buildPFAnchorRules(exemptions) + + if err := os.WriteFile(pfAnchorFile, []byte(rulesStr), 0644); err != nil { + return fmt.Errorf("dns intercept: failed to rewrite pf anchor: %w", err) + } + + out, err := exec.Command("pfctl", "-a", pfAnchorName, "-f", pfAnchorFile).CombinedOutput() + if err != nil { + return fmt.Errorf("dns intercept: failed to reload pf anchor: %w (output: %s)", err, strings.TrimSpace(string(out))) + } + + // Flush stale pf states so packets are re-evaluated against new rules. + flushPFStates() + + // Ensure the anchor reference still exists in the main ruleset. + // Another program may have replaced the ruleset since we last checked. + if err := p.ensurePFAnchorReference(); err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept: failed to verify anchor reference during VPN DNS update") + } + + mainLog.Load().Info().Msgf("DNS intercept: updated pf rules — exempted %d VPN DNS + %d OS resolver servers", + len(exemptions), len(ctrld.OsResolverNameservers())) + return nil +} + +// probePFIntercept tests whether pf's rdr translation is actually working by +// sending a DNS query through the interception path from a subprocess that does +// NOT have the _ctrld group GID. If pf interception is working, the query gets +// redirected to 127.0.0.1:53 (ctrld), and the DNS handler signals us. If broken +// (rdr rules present but not evaluating), the query goes to the real DNS server +// and we time out. +// +// Returns true if interception is working, false if broken or indeterminate. 
+func (p *prog) probePFIntercept() bool { + if p.dnsInterceptState == nil { + return true + } + + nsIPs := ctrld.OsResolverNameservers() + if len(nsIPs) == 0 { + mainLog.Load().Debug().Msg("DNS intercept probe: no OS resolver nameservers available") + return true // can't probe without a target + } + host, _, _ := net.SplitHostPort(nsIPs[0]) + if host == "" || host == "127.0.0.1" || host == "::1" { + mainLog.Load().Debug().Msg("DNS intercept probe: OS resolver is localhost, skipping probe") + return true // can't probe through localhost + } + + // Generate unique probe domain + probeID := fmt.Sprintf("_pf-probe-%x.%s", time.Now().UnixNano()&0xFFFFFFFF, pfProbeDomain) + + // Register probe so DNS handler can detect and signal it + probeCh := make(chan struct{}, 1) + p.pfProbeExpected.Store(probeID) + p.pfProbeCh.Store(&probeCh) + defer func() { + p.pfProbeExpected.Store("") + p.pfProbeCh.Store((*chan struct{})(nil)) + }() + + // Build a minimal DNS query packet for the probe domain. + // We use exec.Command to send from a subprocess with GID=0 (wheel), + // so pf's _ctrld group exemption does NOT apply and the query gets intercepted. + dnsPacket := buildDNSQueryPacket(probeID) + + // Send via a helper subprocess that drops the _ctrld group + cmd := exec.Command(os.Args[0], "pf-probe-send", host, fmt.Sprintf("%x", dnsPacket)) + cmd.SysProcAttr = &syscall.SysProcAttr{ + Credential: &syscall.Credential{ + Uid: 0, + Gid: 0, // wheel group — NOT _ctrld, so pf intercepts it + }, + } + + if err := cmd.Start(); err != nil { + mainLog.Load().Debug().Err(err).Msg("DNS intercept probe: failed to start probe subprocess") + return true // can't probe, assume OK + } + + // Don't leak the subprocess + go func() { + _ = cmd.Wait() + }() + + select { + case <-probeCh: + return true + case <-time.After(pfProbeTimeout): + return false + } +} + +// buildDNSQueryPacket constructs a minimal DNS query packet (wire format) for the given domain. 
+func buildDNSQueryPacket(domain string) []byte { + // DNS header: ID=0x1234, QR=0, OPCODE=0, RD=1, QDCOUNT=1 + header := []byte{ + 0x12, 0x34, // ID + 0x01, 0x00, // Flags: RD=1 + 0x00, 0x01, // QDCOUNT=1 + 0x00, 0x00, // ANCOUNT=0 + 0x00, 0x00, // NSCOUNT=0 + 0x00, 0x00, // ARCOUNT=0 + } + + // Encode domain name in DNS wire format (label-length encoding) + // Remove trailing dot if present + d := strings.TrimSuffix(domain, ".") + var qname []byte + for _, label := range strings.Split(d, ".") { + qname = append(qname, byte(len(label))) + qname = append(qname, []byte(label)...) + } + qname = append(qname, 0x00) // root label + + // QTYPE=A (1), QCLASS=IN (1) + question := append(qname, 0x00, 0x01, 0x00, 0x01) + + return append(header, question...) +} + +// pfInterceptMonitor runs asynchronously after interface changes are detected. +// It probes pf interception with exponential backoff and forces a full pf reload +// if the probe fails. Only one instance runs at a time (singleton via atomic.Bool). +// +// The backoff schedule provides both fast detection (immediate + 500ms) and extended +// coverage (up to ~8s) to win the race against async pf reloads by hypervisors. +func (p *prog) pfInterceptMonitor() { + if !p.pfMonitorRunning.CompareAndSwap(false, true) { + mainLog.Load().Debug().Msg("DNS intercept monitor: already running, skipping") + return + } + defer p.pfMonitorRunning.Store(false) + + mainLog.Load().Info().Msg("DNS intercept monitor: starting interception probe sequence") + + // Backoff schedule: probe quickly first, then space out. 
+ // Total monitoring window: ~0 + 0.5 + 1 + 2 + 4 = ~7.5s + delays := []time.Duration{0, 500 * time.Millisecond, time.Second, 2 * time.Second, 4 * time.Second} + + for i, delay := range delays { + if delay > 0 { + time.Sleep(delay) + } + if p.dnsInterceptState == nil || p.pfStabilizing.Load() { + mainLog.Load().Debug().Msg("DNS intercept monitor: aborting — intercept disabled or stabilizing") + return + } + + if p.probePFIntercept() { + mainLog.Load().Debug().Msgf("DNS intercept monitor: probe %d/%d passed", i+1, len(delays)) + continue // working now — keep monitoring in case it breaks later in the window + } + + // Probe failed — pf translation is broken. Force full reload. + mainLog.Load().Warn().Msgf("DNS intercept monitor: probe %d/%d FAILED — pf translation broken, forcing full ruleset reload", i+1, len(delays)) + p.forceReloadPFMainRuleset() + + // Verify the reload fixed it + time.Sleep(200 * time.Millisecond) + if p.probePFIntercept() { + mainLog.Load().Info().Msg("DNS intercept monitor: probe passed after reload — interception restored") + // Continue monitoring in case the hypervisor reloads pf again + } else { + mainLog.Load().Error().Msg("DNS intercept monitor: probe still failing after reload — pf may need manual intervention") + } + } + + mainLog.Load().Info().Msg("DNS intercept monitor: probe sequence completed") +} + +// forceReloadPFMainRuleset unconditionally reloads the entire pf ruleset via +// "pfctl -f -". This resets pf's internal translation engine, fixing cases where +// rdr rules exist in text form but aren't being evaluated (e.g., after a hypervisor +// like Parallels reloads /etc/pf.conf as a side effect of creating/destroying +// virtual network interfaces). +// +// Unlike ensurePFAnchorReference() which returns early when anchor references are +// already present, this function always performs the full reload. 
+// +// The reload is safe for VPN interop because it reassembles from the current running +// ruleset (pfctl -sr/-sn), preserving all existing anchors and rules. +func (p *prog) forceReloadPFMainRuleset() { + rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName) + anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName) + + // Dump running rules. + natOut, err := exec.Command("pfctl", "-sn").CombinedOutput() + if err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: force reload — failed to dump NAT rules") + return + } + + filterOut, err := exec.Command("pfctl", "-sr").CombinedOutput() + if err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: force reload — failed to dump filter rules") + return + } + + natLines := pfFilterRuleLines(string(natOut)) + filterLines := pfFilterRuleLines(string(filterOut)) + + // Separate scrub rules from filter rules. + var scrubLines, pureFilterLines []string + for _, line := range filterLines { + if strings.HasPrefix(line, "scrub") { + scrubLines = append(scrubLines, line) + } else { + pureFilterLines = append(pureFilterLines, line) + } + } + + // Ensure our anchor references are present (they may have been wiped). + if !pfContainsRule(natLines, rdrAnchorRef) { + natLines = append([]string{rdrAnchorRef}, natLines...) + } + if !pfContainsRule(pureFilterLines, anchorRef) { + pureFilterLines = append([]string{anchorRef}, pureFilterLines...) + } + + // Clean pf options (remove "set skip on lo0" if present). + cleanedOptions, _ := pfGetCleanedOptions() + + // Reassemble in pf's required order: options → scrub → translation → filtering. 
+ var combined strings.Builder + if cleanedOptions != "" { + combined.WriteString(cleanedOptions) + } + for _, line := range scrubLines { + combined.WriteString(line + "\n") + } + for _, line := range natLines { + combined.WriteString(line + "\n") + } + for _, line := range pureFilterLines { + combined.WriteString(line + "\n") + } + + cmd := exec.Command("pfctl", "-f", "-") + cmd.Stdin = strings.NewReader(combined.String()) + out, err := cmd.CombinedOutput() + if err != nil { + mainLog.Load().Error().Err(err).Msgf("DNS intercept: force reload — pfctl -f - failed (output: %s)", strings.TrimSpace(string(out))) + return + } + + // Also reload the anchor rules to ensure they're fresh. + var vpnExemptions []vpnDNSExemption + if p.vpnDNS != nil { + vpnExemptions = p.vpnDNS.CurrentExemptions() + } + rulesStr := p.buildPFAnchorRules(vpnExemptions) + if err := os.WriteFile(pfAnchorFile, []byte(rulesStr), 0644); err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: force reload — failed to write anchor file") + } else if out, err := exec.Command("pfctl", "-a", pfAnchorName, "-f", pfAnchorFile).CombinedOutput(); err != nil { + mainLog.Load().Error().Err(err).Msgf("DNS intercept: force reload — failed to load anchor (output: %s)", strings.TrimSpace(string(out))) + } + + // Reset upstream transports — pf reload flushes state table, killing DoH connections. 
+ p.resetUpstreamTransports() + + mainLog.Load().Info().Msg("DNS intercept: force reload — pf ruleset and anchor reloaded successfully") +} diff --git a/cmd/cli/dns_intercept_darwin_test.go b/cmd/cli/dns_intercept_darwin_test.go new file mode 100644 index 0000000..822f2c5 --- /dev/null +++ b/cmd/cli/dns_intercept_darwin_test.go @@ -0,0 +1,127 @@ +//go:build darwin + +package cli + +import ( + "strings" + "testing" +) + +// ============================================================================= +// buildPFAnchorRules tests +// ============================================================================= + +func TestPFBuildAnchorRules_Basic(t *testing.T) { + p := &prog{} + rules := p.buildPFAnchorRules(nil) + + // rdr (translation) must come before pass (filtering) + rdrIdx := strings.Index(rules, "rdr pass on lo0") + passRouteIdx := strings.Index(rules, "pass out quick on ! lo0 route-to lo0") + passInIdx := strings.Index(rules, "pass in quick on lo0") + + if rdrIdx < 0 { + t.Fatal("missing rdr rule") + } + if passRouteIdx < 0 { + t.Fatal("missing pass out route-to rule") + } + if passInIdx < 0 { + t.Fatal("missing pass in on lo0 rule") + } + if rdrIdx >= passRouteIdx { + t.Error("rdr rules must come before pass out route-to rules") + } + if passRouteIdx >= passInIdx { + t.Error("pass out route-to must come before pass in on lo0") + } + + // Both UDP and TCP rdr rules + if !strings.Contains(rules, "proto udp") || !strings.Contains(rules, "proto tcp") { + t.Error("must have both UDP and TCP rdr rules") + } +} + +func TestPFBuildAnchorRules_WithVPNServers(t *testing.T) { + p := &prog{} + vpnServers := []string{"10.8.0.1", "10.8.0.2"} + rules := p.buildPFAnchorRules(vpnServers) + + // VPN exemption rules must appear + for _, s := range vpnServers { + if !strings.Contains(rules, s) { + t.Errorf("missing VPN exemption for %s", s) + } + } + + // VPN exemptions must come before route-to + exemptIdx := strings.Index(rules, "10.8.0.1") + routeIdx := strings.Index(rules, 
"route-to lo0") + if exemptIdx >= routeIdx { + t.Error("VPN exemptions must come before route-to rules") + } +} +

// TestPFBuildAnchorRules_IPv4AndIPv6VPN checks that every generated rule line
// mentioning an IPv4 VPN DNS server uses the "inet" family (and not "inet6"),
// and that every line mentioning an IPv6 server uses "inet6".
//
// NOTE(review): the non-test code in this patch passes []vpnDNSExemption to
// buildPFAnchorRules, while this test passes []string — confirm this compiles
// against the current signature.
// NOTE(review): the loop only asserts on lines that contain the addresses; if
// no line mentions them the test passes without checking anything.
func TestPFBuildAnchorRules_IPv4AndIPv6VPN(t *testing.T) {
	p := &prog{}
	vpnServers := []string{"10.8.0.1", "fd00::1"}
	rules := p.buildPFAnchorRules(vpnServers)

	// IPv4 server should use "inet"
	lines := strings.Split(rules, "\n")
	for _, line := range lines {
		if strings.Contains(line, "10.8.0.1") {
			// The trailing space in "inet " keeps "inet6" from satisfying this check.
			if !strings.Contains(line, "inet ") {
				t.Error("IPv4 VPN server rule should contain 'inet'")
			}
			if strings.Contains(line, "inet6") {
				t.Error("IPv4 VPN server rule should not contain 'inet6'")
			}
		}
		if strings.Contains(line, "fd00::1") {
			if !strings.Contains(line, "inet6") {
				t.Error("IPv6 VPN server rule should contain 'inet6'")
			}
		}
	}
}

// TestPFBuildAnchorRules_Ordering checks the relative position of the four rule
// groups in the generated anchor text by comparing the byte offset of the first
// occurrence of each expected rule prefix.
//
// NOTE(review): same []string vs []vpnDNSExemption question as above.
// NOTE(review): this expects the literal "rdr pass on lo0", whereas
// docs/pf-dns-intercept.md in this patch describes rdr rules without "pass" —
// confirm which form buildPFAnchorRules actually emits.
func TestPFBuildAnchorRules_Ordering(t *testing.T) {
	p := &prog{}
	vpnServers := []string{"10.8.0.1"}
	rules := p.buildPFAnchorRules(vpnServers)

	// Verify ordering: rdr → exemptions → route-to → pass in on lo0
	rdrIdx := strings.Index(rules, "rdr pass on lo0")
	exemptIdx := strings.Index(rules, "pass out quick on ! lo0 inet proto { udp, tcp } from any to 10.8.0.1")
	routeIdx := strings.Index(rules, "pass out quick on ! lo0 route-to lo0")
	passInIdx := strings.Index(rules, "pass in quick on lo0")

	// strings.Index returns -1 for a missing rule; fail before comparing offsets.
	if rdrIdx < 0 || exemptIdx < 0 || routeIdx < 0 || passInIdx < 0 {
		t.Fatalf("missing expected rules: rdr=%d exempt=%d route=%d passIn=%d", rdrIdx, exemptIdx, routeIdx, passInIdx)
	}

	if !(rdrIdx < exemptIdx && exemptIdx < routeIdx && routeIdx < passInIdx) {
		t.Errorf("incorrect rule ordering: rdr(%d) < exempt(%d) < route(%d) < passIn(%d)", rdrIdx, exemptIdx, routeIdx, passInIdx)
	}
}

// TestPFAddressFamily tests the pfAddressFamily helper.
+func TestPFAddressFamily(t *testing.T) { + tests := []struct { + ip string + want string + }{ + {"10.0.0.1", "inet"}, + {"192.168.1.1", "inet"}, + {"127.0.0.1", "inet"}, + {"::1", "inet6"}, + {"fd00::1", "inet6"}, + {"2001:db8::1", "inet6"}, + } + for _, tt := range tests { + if got := pfAddressFamily(tt.ip); got != tt.want { + t.Errorf("pfAddressFamily(%q) = %q, want %q", tt.ip, got, tt.want) + } + } +} diff --git a/docs/pf-dns-intercept.md b/docs/pf-dns-intercept.md new file mode 100644 index 0000000..9008e04 --- /dev/null +++ b/docs/pf-dns-intercept.md @@ -0,0 +1,298 @@ +# macOS pf DNS Interception — Technical Reference + +## Overview + +ctrld uses macOS's built-in packet filter (pf) to intercept all DNS traffic at the kernel level, redirecting it to ctrld's local listener at `127.0.0.1:53`. This operates below interface DNS settings, making it immune to VPN software (F5, Cisco, GlobalProtect, etc.) that overwrites DNS on network interfaces. + +## How pf Works (Relevant Basics) + +pf is a stateful packet filter built into macOS (and BSD). It processes packets through a pipeline with **strict rule ordering**: + +``` +options (set) → normalization (scrub) → queueing → translation (nat/rdr) → filtering (pass/block) +``` + +**Anchors** are named rule containers that allow programs to manage their own rules without modifying the global ruleset. Each anchor type must appear in the correct section: + +| Anchor Type | Section | Purpose | +|-------------|---------|---------| +| `scrub-anchor` | Normalization | Packet normalization | +| `nat-anchor` | Translation | NAT rules | +| `rdr-anchor` | Translation | Redirect rules | +| `anchor` | Filtering | Pass/block rules | + +**Critical constraint:** If you place a `rdr-anchor` line after an `anchor` line, pf rejects the entire config with "Rules must be in order." + +## Why We Can't Just Use `rdr on ! lo0` + +The obvious approach: +``` +rdr pass on ! 
lo0 proto udp from any to any port 53 -> 127.0.0.1 port 53 +``` + +**This doesn't work.** macOS pf `rdr` rules only apply to *forwarded/routed* traffic — packets passing through the machine to another destination. DNS queries originating from the machine itself (locally-originated) are never matched by `rdr` on non-loopback interfaces. + +This is a well-known pf limitation on macOS/BSD. It means the VPN client's DNS queries would be redirected (if routed through the machine), but the user's own applications querying DNS directly would not. + +## Our Approach: route-to + rdr (Two-Step) + +We use a two-step technique to intercept locally-originated DNS: + +``` +Step 1: Force outbound DNS through loopback + pass out quick on ! lo0 route-to lo0 inet proto udp from any to ! 127.0.0.1 port 53 + +Step 2: Pass the packet outbound on lo0 (needed when VPN firewalls have "block drop all") + pass out quick on lo0 inet proto udp from any to ! 127.0.0.1 port 53 no state + +Step 3: Redirect it on loopback to ctrld's listener + rdr on lo0 inet proto udp from any to ! 127.0.0.1 port 53 -> 127.0.0.1 port 53 + +Step 4: Accept and create state for response routing + pass in quick on lo0 reply-to lo0 inet proto { udp, tcp } from any to 127.0.0.1 port 53 +``` + +> **State handling is critical for VPN firewall coexistence:** +> - **route-to**: `keep state` (default). State is interface-bound on macOS — doesn't match on lo0. +> - **pass out lo0**: `no state`. If this created state, it would match inbound on lo0 and bypass rdr. +> - **rdr**: no `pass` keyword. Packet must go through filter so `pass in` can create response state. +> - **pass in lo0**: `keep state` (default). Creates the ONLY state on lo0 — handles response routing. + +### Packet Flow + +``` +Application queries 10.255.255.3:53 (e.g., VPN DNS server) + ↓ +Kernel: outbound on en0 (or utun420 for VPN) + ↓ +pf filter: "pass out route-to lo0 ... 
port 53" → redirects to lo0, creates state on en0 + ↓ +pf filter (outbound lo0): "pass out on lo0 ... no state" → passes, NO state created + ↓ +Loopback reflects packet inbound on lo0 + ↓ +pf rdr (inbound lo0): "rdr on lo0 ... port 53 -> 127.0.0.1:53" → rewrites destination + ↓ +pf filter (inbound lo0): "pass in reply-to lo0 ... to 127.0.0.1:53" → creates state + reply route + ↓ +ctrld receives query on 127.0.0.1:53 + ↓ +ctrld resolves via DoH (port 443, exempted by group _ctrld) + ↓ +Response from ctrld: 127.0.0.1:53 → 100.94.163.168:54851 + ↓ +reply-to lo0: forces response through lo0 (without this, kernel routes via utun420 → lost in VPN tunnel) + ↓ +pf applies rdr reverse NAT: src 127.0.0.1 → 10.255.255.3 + ↓ +Application receives response from 10.255.255.3:53 ✓ +``` + +### Why This Works + +1. `route-to lo0` forces the packet onto loopback at the filter stage +2. `pass out on lo0 no state` gets past VPN "block drop all" without creating state +3. No state on lo0 means rdr gets fresh evaluation on the inbound pass +4. `reply-to lo0` on `pass in` forces the response through lo0 — without it, the kernel routes the response to VPN tunnel IPs via the VPN interface and it's lost +4. `rdr` (without `pass`) redirects then hands off to filter rules +5. `pass in keep state` creates the response state — the only state on the lo0 path +6. Traffic already destined for `127.0.0.1` is excluded (`to ! 127.0.0.1`) to prevent loops +7. ctrld's own upstream queries use DoH (port 443), bypassing port 53 rules entirely + +### Why Each State Decision Matters + +| Rule | State | Why | +|------|-------|-----| +| route-to on en0/utun | keep state | Needed for return routing. Interface-bound, won't match on lo0. | +| pass out on lo0 | **no state** | If stateful, it would match inbound lo0 → bypass rdr → DNS broken | +| rdr on lo0 | N/A (no pass) | Must go through filter so pass-in creates response state | +| pass in on lo0 | keep state + reply-to lo0 | Creates lo0 state. 
`reply-to` forces response through lo0 (not VPN tunnel). | + +## Rule Ordering Within the Anchor + +pf requires translation rules before filter rules, even within an anchor: + +```pf +# === Translation rules (MUST come first) === +rdr on lo0 inet proto udp from any to ! 127.0.0.1 port 53 -> 127.0.0.1 port 53 +rdr on lo0 inet proto tcp from any to ! 127.0.0.1 port 53 -> 127.0.0.1 port 53 + +# === Exemptions (filter phase, scoped to _ctrld group) === +pass out quick on ! lo0 inet proto { udp, tcp } from any to OS_RESOLVER_IP port 53 group _ctrld +pass out quick on ! lo0 inet proto { udp, tcp } from any to VPN_DNS_IP port 53 group _ctrld + +# === Main intercept (filter phase) === +pass out quick on ! lo0 route-to lo0 inet proto udp from any to ! 127.0.0.1 port 53 +pass out quick on ! lo0 route-to lo0 inet proto tcp from any to ! 127.0.0.1 port 53 + +# === Allow redirected traffic on loopback === +pass in quick on lo0 reply-to lo0 inet proto { udp, tcp } from any to 127.0.0.1 port 53 +``` + +### Exemption Mechanism (Group-Scoped) + +Some IPs must bypass the redirect: + +- **OS resolver nameservers** (e.g., DHCP-assigned DNS): ctrld's recovery/bootstrap path may query these on port 53. Without exemption, these queries loop back to ctrld. +- **VPN DNS servers**: When ctrld forwards VPN-specific domains (split DNS) to the VPN's internal DNS, those queries must reach the VPN DNS server directly. + +Exemptions use `pass out quick` with `group _ctrld` **before** the `route-to` rule. The `group _ctrld` constraint ensures that **only ctrld's own process** can bypass the redirect — other applications cannot circumvent DNS interception by querying the exempted IPs directly. Because pf evaluates filter rules in order and `quick` terminates evaluation, the exempted packet goes directly out the real interface and never hits the `route-to` or `rdr`. + +### The `_ctrld` Group + +To scope pf exemptions to ctrld's process only, we use a dedicated macOS system group: + +1. 
**Creation**: On startup, `ensureCtrldGroup()` creates a `_ctrld` system group via `dscl` (macOS Directory Services) if it doesn't already exist. The GID is chosen from the 350-450 range to avoid conflicts with Apple's reserved ranges. The function is idempotent. + +2. **Process GID**: Before loading pf rules, ctrld sets its effective GID to `_ctrld` via `syscall.Setegid()`. All sockets created by ctrld after this point are tagged with this GID. + +3. **pf matching**: Exemption rules include `group _ctrld`, so pf only allows bypass for packets from processes with this effective GID. Other processes querying the same exempt IPs are still redirected to ctrld. + +4. **Lifecycle**: The group is **never removed** on shutdown or uninstall. It's a harmless system group, and leaving it avoids race conditions during rapid restart cycles. It is recreated (no-op if exists) on every start. + +## Anchor Injection into pf.conf + +The trickiest part. macOS only processes anchors declared in the active pf ruleset. We must inject our anchor references into the running config. + +### What We Do + +1. Read `/etc/pf.conf` +2. If our anchor reference already exists, reload as-is +3. Otherwise, inject `rdr-anchor "com.controld.ctrld"` in the translation section and `anchor "com.controld.ctrld"` in the filter section +4. Write to a **temp file** and load it with `pfctl -f TEMP_FILE` +5. **We never modify `/etc/pf.conf` on disk** — changes are runtime-only and don't survive reboot (ctrld re-injects on every start) + +### Injection Logic + +Finding the right insertion point requires understanding the existing pf.conf structure. The algorithm: + +1. **Scan** for existing `rdr-anchor`/`nat-anchor`/`binat-anchor` lines (translation section) and `anchor` lines (filter section) +2. 
**Insert `rdr-anchor`**: + - Before the first existing `rdr-anchor` line (if any exist) + - Else before the first `anchor` line (translation must come before filtering) + - Else before the first `pass`/`block` line + - Last resort: append (but this should never happen with a valid pf.conf) +3. **Insert `anchor`**: + - Before the first existing `anchor` line (if any) + - Else before the first `pass`/`block` line + - Last resort: append + +### Real-World pf.conf Scenarios + +We test against these configurations: + +#### Default macOS (Sequoia/Sonoma) +``` +scrub-anchor "com.apple/*" +nat-anchor "com.apple/*" +rdr-anchor "com.apple/*" +anchor "com.apple/*" +load anchor "com.apple" from "/etc/pf.anchors/com.apple" +``` +Our `rdr-anchor` goes before `rdr-anchor "com.apple/*"`, our `anchor` goes before `anchor "com.apple/*"`. + +#### Little Snitch +Adds `rdr-anchor "com.obdev.littlesnitch"` and `anchor "com.obdev.littlesnitch"` in the appropriate sections. Our anchors coexist — pf processes multiple anchors in order. + +#### Lulu Firewall (Objective-See) +Adds `anchor "com.objective-see.lulu"`. We insert `rdr-anchor` before it (translation before filtering) and `anchor` before it. + +#### Cisco AnyConnect +Adds `nat-anchor "com.cisco.anyconnect"`, `rdr-anchor "com.cisco.anyconnect"`, `anchor "com.cisco.anyconnect"`. Our anchors insert alongside Cisco's in their respective sections. + +#### Minimal pf.conf (no anchors) +Just `set skip on lo0` and `pass all`. We insert `rdr-anchor` and `anchor` before the `pass` line. + +#### Empty pf.conf +Both anchors appended. This is a degenerate case that shouldn't occur in practice. + +## Failure Modes and Safety + +### What happens if our injection fails? 
+- `ensurePFAnchorReference` returns an error, logged as a warning +- ctrld continues running but DNS interception may not work +- The anchor file and rules are cleaned up on shutdown +- **No damage to existing pf config** — we never modify files on disk + +### What happens if ctrld crashes (SIGKILL)? +- pf anchor rules persist in kernel memory +- DNS is redirected to 127.0.0.1:53 but nothing is listening → DNS breaks +- On next `ctrld start`, we detect the stale anchor file, flush the anchor, and start fresh +- Without ctrld restart: `sudo pfctl -a com.controld.ctrld -F all` manually clears it + +### What if another program flushes all pf rules? +- Our anchor references are removed from the running config +- DNS interception stops (traffic goes direct again — fails open, not closed) +- The periodic watchdog (30s) detects missing rules and restores them +- ctrld continues working for queries sent to 127.0.0.1 directly + +### What if another program reloads pf.conf (corrupting translation state)? +Programs like Parallels Desktop reload `/etc/pf.conf` when creating or destroying +virtual network interfaces (bridge100, vmenet0). This can corrupt pf's internal +translation engine — **rdr rules survive in text form but stop evaluating**. +The watchdog's rule-text checks say "intact" while DNS is silently broken. + +**Detection:** ctrld detects interface appearance/disappearance in the network +change handler and spawns an asynchronous interception probe monitor: + +1. A subprocess sends a DNS query WITHOUT the `_ctrld` group GID, so pf + intercept rules apply to it +2. If ctrld receives the query → pf interception is working +3. If the query times out (1s) → pf translation is broken +4. On failure: `forceReloadPFMainRuleset()` does `pfctl -f -` with the current + running ruleset, resetting pf's translation engine + +The monitor probes with exponential backoff (0, 0.5, 1, 2, 4s) to win the race +against async pf reloads. Only one monitor runs at a time (singleton). 
The +watchdog also runs the probe every 30s as a safety net. + +The full pf reload is VPN-safe: it reassembles from `pfctl -sr` + `pfctl -sn` +(the current running state), preserving all existing anchors and rules. + +### What if another program adds conflicting rdr rules? +- pf processes anchors in declaration order +- If another program redirects port 53 before our anchor, their redirect wins +- If after, ours wins (first match with `quick` or `rdr pass`) +- Our maximum-weight sublayer approach on Windows (WFP) doesn't apply to pf — pf uses rule ordering, not weights + +### What about `set skip on lo0`? +Some pf.conf files include `set skip on lo0` which tells pf to skip ALL processing on loopback. **This would break our approach** since both the `rdr on lo0` and `pass in on lo0` rules would be skipped. + +**Mitigation:** When injecting anchor references via `ensurePFAnchorReference()`, +we strip `lo0` from any `set skip on` directives before reloading. The watchdog +also checks for `set skip on lo0` and triggers a restore if detected. The +interception probe provides an additional safety net — if `set skip on lo0` gets +re-applied by another program, the probe will fail and trigger a full reload. + +## Cleanup + +On shutdown (`stopDNSIntercept`): +1. `pfctl -a com.controld.ctrld -F all` — flush all rules from our anchor +2. Remove `/etc/pf.anchors/com.controld.ctrld` anchor file +3. `pfctl -f /etc/pf.conf` — reload original pf.conf, removing our injected anchor references from the running config + +This is clean: no files modified on disk, no residual rules. + +## Comparison with Other Approaches + +| Approach | Intercepts local DNS? | Survives VPN DNS override? | Risk of loops? | Complexity | +|----------|----------------------|---------------------------|----------------|------------| +| `rdr on ! 
lo0` | ❌ No | Yes | Low | Low | +| `route-to lo0` + `rdr on lo0` | ✅ Yes | Yes | Medium (need exemptions) | Medium | +| `/etc/resolver/` | Partial (per-domain only) | No (VPN can overwrite) | Low | Low | +| `NEDNSProxyProvider` | ✅ Yes | Yes | Low | High (needs app bundle) | +| NRPT (Windows only) | N/A | Partial | Low | Medium | + +We chose `route-to + rdr` as the best balance of effectiveness and deployability (no app bundle needed, no kernel extension, works with existing ctrld binary). + +## Key pf Nuances Learned + +1. **`rdr` doesn't match locally-originated traffic** — this is the biggest gotcha +2. **Rule ordering is enforced** — translation before filtering, always +3. **Anchors must be declared in the main ruleset** — just loading an anchor file isn't enough +4. **`rdr` without `pass`** — redirected packets must go through filter rules so `pass in keep state` can create response state. `rdr pass` alone is insufficient for response delivery. +5. **State handling is nuanced** — route-to uses `keep state` (state is interface-bound on macOS, so it does not match on lo0). `pass out on lo0` must use `no state` (prevents rdr bypass). `pass in on lo0` uses `keep state` + `reply-to lo0` (creates response state AND forces response through loopback instead of VPN tunnel). Getting any of these wrong breaks either the forward or return path. +6. **`quick` terminates evaluation** — exemption rules must use `quick` and appear before the route-to rule +7. **Piping to `pfctl -f -` can fail** — special characters in pf.conf content cause issues; use temp files +8. **`set skip on lo0` would break us** — but it's not in default macOS pf.conf +9. 
**`pass out quick` exemptions work with route-to** — they fire in the same phase (filter), so `quick` + rule ordering means exempted packets never hit the route-to rule diff --git a/test-scripts/README.md b/test-scripts/README.md new file mode 100644 index 0000000..7ae5fd6 --- /dev/null +++ b/test-scripts/README.md @@ -0,0 +1,44 @@ +# DNS Intercept Test Scripts + +Manual test scripts for verifying DNS intercept mode behavior. These require root/admin privileges and a running ctrld instance. + +## Structure + +``` +test-scripts/ +├── darwin/ +│ ├── test-recovery-bypass.sh # Captive portal recovery simulation +│ ├── test-dns-intercept.sh # Basic pf intercept verification +│ ├── test-pf-group-exemption.sh # Group-based pf exemption test +│ └── validate-pf-rules.sh # Dry-run pf rule validation +└── windows/ + ├── test-recovery-bypass.ps1 # Captive portal recovery simulation + └── test-dns-intercept.ps1 # Basic WFP intercept verification +``` + +## Prerequisites + +- ctrld running with `--intercept-mode dns` (or `--intercept-mode hard`) +- Verbose logging: `-v 1 --log /tmp/dns.log` (macOS) or `--log C:\temp\dns.log` (Windows) +- Root (macOS) or Administrator (Windows) +- For recovery tests: disconnect VPNs (e.g., Tailscale) that provide alternative routes + +## Recovery Bypass Test + +Simulates a captive portal by blackholing ctrld's upstream DoH IPs and cycling wifi. Verifies that ctrld's recovery bypass activates, discovers DHCP nameservers, and forwards queries to them until the upstream recovers. 
+ +### macOS +```bash +sudo bash test-scripts/darwin/test-recovery-bypass.sh en0 +``` + +### Windows (PowerShell as Administrator) +```powershell +.\test-scripts\windows\test-recovery-bypass.ps1 -WifiAdapter "Wi-Fi" +``` + +## Safety + +All scripts clean up on exit (including Ctrl+C): +- **macOS**: Removes route blackholes, re-enables wifi +- **Windows**: Removes firewall rules, re-enables adapter diff --git a/test-scripts/darwin/test-dns-intercept.sh b/test-scripts/darwin/test-dns-intercept.sh new file mode 100755 index 0000000..b54e9c1 --- /dev/null +++ b/test-scripts/darwin/test-dns-intercept.sh @@ -0,0 +1,556 @@ +#!/bin/bash +# ============================================================================= +# DNS Intercept Mode Test Script — macOS (pf) +# ============================================================================= +# Run as root: sudo bash test-dns-intercept.sh +# +# Tests the dns-intercept feature end-to-end with validation at each step. +# Logs are read from /tmp/dns.log (ctrld log location on test machine). +# +# Manual steps marked with [MANUAL] require human interaction. 
+# ============================================================================= + +set -euo pipefail + +CTRLD_LOG="/tmp/dns.log" +PF_ANCHOR="com.controld.ctrld" +PASS=0 +FAIL=0 +WARN=0 +RESULTS=() + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' + +header() { echo -e "\n${CYAN}${BOLD}━━━ $1 ━━━${NC}"; } +info() { echo -e " ${BOLD}ℹ${NC} $1"; } +pass() { echo -e " ${GREEN}✅ PASS${NC}: $1"; PASS=$((PASS+1)); RESULTS+=("PASS: $1"); } +fail() { echo -e " ${RED}❌ FAIL${NC}: $1"; FAIL=$((FAIL+1)); RESULTS+=("FAIL: $1"); } +warn() { echo -e " ${YELLOW}⚠️ WARN${NC}: $1"; WARN=$((WARN+1)); RESULTS+=("WARN: $1"); } +manual() { echo -e " ${YELLOW}[MANUAL]${NC} $1"; } +separator() { echo -e "${CYAN}─────────────────────────────────────────────────────${NC}"; } + +check_root() { + if [[ $EUID -ne 0 ]]; then + echo -e "${RED}This script must be run as root (sudo).${NC}" + exit 1 + fi +} + +wait_for_key() { + echo -e "\n Press ${BOLD}Enter${NC} to continue..." + read -r +} + +# Grep recent log entries (last N lines) +log_grep() { + local pattern="$1" + local lines="${2:-200}" + tail -n "$lines" "$CTRLD_LOG" 2>/dev/null | grep -i "$pattern" 2>/dev/null || true +} + +log_grep_count() { + local pattern="$1" + local lines="${2:-200}" + tail -n "$lines" "$CTRLD_LOG" 2>/dev/null | grep -ci "$pattern" 2>/dev/null || echo "0" +} + +# ============================================================================= +# TEST SECTIONS +# ============================================================================= + +test_prereqs() { + header "0. 
Prerequisites" + + if command -v pfctl &>/dev/null; then + pass "pfctl available" + else + fail "pfctl not found" + exit 1 + fi + + if [[ -f "$CTRLD_LOG" ]]; then + pass "ctrld log exists at $CTRLD_LOG" + else + warn "ctrld log not found at $CTRLD_LOG — log checks will be skipped" + fi + + if command -v dig &>/dev/null; then + pass "dig available" + else + fail "dig not found — install bind tools" + exit 1 + fi + + info "Default route interface: $(route -n get default 2>/dev/null | grep interface | awk '{print $2}' || echo 'unknown')" + info "Current DNS servers:" + scutil --dns | grep "nameserver\[" | head -5 | sed 's/^/ /' +} + +test_pf_state() { + header "1. PF State Validation" + + # Is pf enabled? + local pf_status + pf_status=$(pfctl -si 2>&1 | grep "Status:" || true) + if echo "$pf_status" | grep -q "Enabled"; then + pass "pf is enabled" + else + fail "pf is NOT enabled (status: $pf_status)" + fi + + # Is our anchor referenced in the running ruleset? + local sr_match sn_match + sr_match=$(pfctl -sr 2>&1 | grep "$PF_ANCHOR" || true) + sn_match=$(pfctl -sn 2>&1 | grep "$PF_ANCHOR" || true) + + if [[ -n "$sr_match" ]]; then + pass "anchor '$PF_ANCHOR' found in filter rules (pfctl -sr)" + info " $sr_match" + else + fail "anchor '$PF_ANCHOR' NOT in filter rules — main ruleset doesn't reference it" + fi + + if [[ -n "$sn_match" ]]; then + pass "rdr-anchor '$PF_ANCHOR' found in NAT rules (pfctl -sn)" + info " $sn_match" + else + fail "rdr-anchor '$PF_ANCHOR' NOT in NAT rules — redirect won't work" + fi + + # Check anchor rules + separator + info "Anchor filter rules (pfctl -a '$PF_ANCHOR' -sr):" + local anchor_sr + anchor_sr=$(pfctl -a "$PF_ANCHOR" -sr 2>&1 | grep -v "ALTQ" || true) + if [[ -n "$anchor_sr" ]]; then + echo "$anchor_sr" | sed 's/^/ /' + # Check for route-to rules + if echo "$anchor_sr" | grep -q "route-to"; then + pass "route-to lo0 rules present (needed for local traffic interception)" + else + warn "No route-to rules found — local DNS may not be 
intercepted" + fi + else + fail "No filter rules in anchor" + fi + + info "Anchor redirect rules (pfctl -a '$PF_ANCHOR' -sn):" + local anchor_sn + anchor_sn=$(pfctl -a "$PF_ANCHOR" -sn 2>&1 | grep -v "ALTQ" || true) + if [[ -n "$anchor_sn" ]]; then + echo "$anchor_sn" | sed 's/^/ /' + if echo "$anchor_sn" | grep -q "rdr.*lo0.*port = 53"; then + pass "rdr rules on lo0 present (redirect DNS to ctrld)" + else + warn "rdr rules don't match expected pattern" + fi + else + fail "No redirect rules in anchor" + fi + + # Check anchor file exists + if [[ -f "/etc/pf.anchors/$PF_ANCHOR" ]]; then + pass "Anchor file exists: /etc/pf.anchors/$PF_ANCHOR" + else + fail "Anchor file missing: /etc/pf.anchors/$PF_ANCHOR" + fi + + # Check pf.conf was NOT modified + if grep -q "$PF_ANCHOR" /etc/pf.conf 2>/dev/null; then + warn "pf.conf contains '$PF_ANCHOR' reference — should NOT be modified on disk" + else + pass "pf.conf NOT modified on disk (anchor injected at runtime only)" + fi +} + +test_dns_interception() { + header "2. DNS Interception Tests" + + # Mark position in log + local log_lines_before=0 + if [[ -f "$CTRLD_LOG" ]]; then + log_lines_before=$(wc -l < "$CTRLD_LOG") + fi + + # Test 1: Query to external resolver should be intercepted + info "Test: dig @8.8.8.8 example.com (should be intercepted by ctrld)" + local dig_result + dig_result=$(dig @8.8.8.8 example.com +short +timeout=5 2>&1 || true) + + if [[ -n "$dig_result" ]] && ! 
echo "$dig_result" | grep -q "timed out"; then + pass "dig @8.8.8.8 returned result: $dig_result" + else + fail "dig @8.8.8.8 failed or timed out" + fi + + # Check if ctrld logged the query (grep -c already prints 0 on no match; '|| true' only absorbs its non-zero exit) + sleep 1 + if [[ -f "$CTRLD_LOG" ]]; then + local intercepted + intercepted=$(tail -n +$((log_lines_before+1)) "$CTRLD_LOG" | grep -c "example.com" || true) + if [[ "$intercepted" -gt 0 ]]; then + pass "ctrld logged the intercepted query for example.com" + else + fail "ctrld did NOT log query for example.com — interception may not be working" + fi + fi + + # Check dig reports ctrld answered (not 8.8.8.8) + local full_dig + full_dig=$(dig @8.8.8.8 example.com +timeout=5 2>&1 || true) + local server_line + server_line=$(echo "$full_dig" | grep "SERVER:" || true) + info "dig SERVER line: $server_line" + if echo "$server_line" | grep -q "127.0.0.1"; then + pass "Response came from 127.0.0.1 (ctrld intercepted)" + elif echo "$server_line" | grep -q "8.8.8.8"; then + fail "Response came from 8.8.8.8 directly — NOT intercepted" + else + warn "Could not determine response server from dig output" + fi + + separator + + # Test 2: Query to another external resolver + info "Test: dig @1.1.1.1 cloudflare.com (should also be intercepted)" + local dig2 + dig2=$(dig @1.1.1.1 cloudflare.com +short +timeout=5 2>&1 || true) + if [[ -n "$dig2" ]] && ! echo "$dig2" | grep -q "timed out"; then + pass "dig @1.1.1.1 returned result" + else + fail "dig @1.1.1.1 failed or timed out" + fi + + separator + + # Test 3: Query to localhost should work (not double-redirected) + info "Test: dig @127.0.0.1 example.org (direct to ctrld, should NOT be redirected)" + local dig3 + dig3=$(dig @127.0.0.1 example.org +short +timeout=5 2>&1 || true) + if [[ -n "$dig3" ]] && ! 
echo "$dig3" | grep -q "timed out"; then + pass "dig @127.0.0.1 works (no loop)" + else + fail "dig @127.0.0.1 failed — possible redirect loop" + fi + + separator + + # Test 4: System DNS resolution + info "Test: host example.net (system resolver, should go through ctrld)" + local host_result + host_result=$(host example.net 2>&1 || true) + if echo "$host_result" | grep -q "has address"; then + pass "System DNS resolution works via host command" + else + fail "System DNS resolution failed" + fi + + separator + + # Test 5: TCP DNS query + info "Test: dig @9.9.9.9 example.com +tcp (TCP DNS should also be intercepted)" + local dig_tcp + dig_tcp=$(dig @9.9.9.9 example.com +tcp +short +timeout=5 2>&1 || true) + if [[ -n "$dig_tcp" ]] && ! echo "$dig_tcp" | grep -q "timed out"; then + pass "TCP DNS query intercepted and resolved" + else + warn "TCP DNS query failed (may not be critical if UDP works)" + fi +} + +test_non_dns_unaffected() { + header "3. Non-DNS Traffic Unaffected" + + # HTTPS should work fine + info "Test: curl https://example.com (HTTPS port 443 should NOT be affected)" + local curl_result + curl_result=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 https://example.com 2>&1 || echo "000") + if [[ "$curl_result" == "200" ]] || [[ "$curl_result" == "301" ]] || [[ "$curl_result" == "302" ]]; then + pass "HTTPS works (HTTP $curl_result)" + else + fail "HTTPS failed (HTTP $curl_result) — pf may be affecting non-DNS traffic" + fi + + # SSH-style connection test (port 22 should be unaffected) + info "Test: nc -z -w5 github.com 22 (SSH port should NOT be affected)" + if nc -z -w5 github.com 22 2>/dev/null; then + pass "SSH port reachable (non-DNS traffic unaffected)" + else + warn "SSH port unreachable (may be firewall, not necessarily our fault)" + fi +} + +test_ctrld_log_health() { + header "4. ctrld Log Health Check" + + if [[ ! 
-f "$CTRLD_LOG" ]]; then + warn "Skipping log checks — $CTRLD_LOG not found" + return + fi + + # Check for intercept initialization + if log_grep "DNS intercept.*initializing" 500 | grep -q "."; then + pass "DNS intercept initialization logged" + else + fail "No DNS intercept initialization in recent logs" + fi + + # Check for successful anchor load + if log_grep "pf anchor.*active" 500 | grep -q "."; then + pass "PF anchor reported as active" + else + fail "PF anchor not reported as active" + fi + + # Check for anchor reference injection + if log_grep "anchor reference active" 500 | grep -q "."; then + pass "Anchor reference injected into running ruleset" + else + fail "Anchor reference NOT injected — this is the critical step" + fi + + # Check for errors + separator + info "Recent errors/warnings in ctrld log:" + local errors + errors=$(log_grep '"level":"error"' 500) + if [[ -n "$errors" ]]; then + echo "$errors" | tail -5 | sed 's/^/ /' + warn "Errors found in recent logs (see above)" + else + pass "No errors in recent logs" + fi + + local warnings + warnings=$(log_grep '"level":"warn"' 500 | grep -v "skipping self-upgrade" || true) + if [[ -n "$warnings" ]]; then + echo "$warnings" | tail -5 | sed 's/^/ /' + info "(warnings above may be expected)" + fi + + # Check for recovery bypass state + if log_grep "recoveryBypass\|recovery bypass\|prepareForRecovery" 500 | grep -q "."; then + info "Recovery bypass activity detected in logs" + log_grep "recovery" 500 | tail -3 | sed 's/^/ /' + fi + + # Check for VPN DNS detection + if log_grep "VPN DNS" 500 | grep -q "."; then + info "VPN DNS activity in logs:" + log_grep "VPN DNS" 500 | tail -5 | sed 's/^/ /' + else + info "No VPN DNS activity (expected if no VPN is connected)" + fi +} + +test_pf_counters() { + header "5. PF Statistics & Counters" + + info "PF info (pfctl -si):" + pfctl -si 2>&1 | grep -v "ALTQ" | head -15 | sed 's/^/ /' + + info "PF state table entries:" + pfctl -ss 2>&1 | grep -c "." 
| sed 's/^/ States: /' + + # Count evaluations of our anchor + info "Anchor-specific stats (if available):" + local anchor_info + anchor_info=$(pfctl -a "$PF_ANCHOR" -si 2>&1 | grep -v "ALTQ" || true) + if [[ -n "$anchor_info" ]]; then + echo "$anchor_info" | head -10 | sed 's/^/ /' + else + info " (no per-anchor stats available)" + fi +} + +test_cleanup_on_stop() { + header "6. Cleanup Validation (After ctrld Stop)" + + manual "Stop ctrld now (Ctrl+C or 'ctrld stop'), then press Enter" + wait_for_key + + # Check anchor is flushed + local anchor_rules_after + anchor_rules_after=$(pfctl -a "$PF_ANCHOR" -sr 2>&1 | grep -v "ALTQ" | grep -v "^$" || true) + if [[ -z "$anchor_rules_after" ]]; then + pass "Anchor filter rules flushed after stop" + else + fail "Anchor filter rules still present after stop" + echo "$anchor_rules_after" | sed 's/^/ /' + fi + + local anchor_rdr_after + anchor_rdr_after=$(pfctl -a "$PF_ANCHOR" -sn 2>&1 | grep -v "ALTQ" | grep -v "^$" || true) + if [[ -z "$anchor_rdr_after" ]]; then + pass "Anchor redirect rules flushed after stop" + else + fail "Anchor redirect rules still present after stop" + fi + + # Check anchor file removed + if [[ ! -f "/etc/pf.anchors/$PF_ANCHOR" ]]; then + pass "Anchor file removed after stop" + else + fail "Anchor file still exists: /etc/pf.anchors/$PF_ANCHOR" + fi + + # Check pf.conf is clean + if ! grep -q "$PF_ANCHOR" /etc/pf.conf 2>/dev/null; then + pass "pf.conf is clean (no ctrld references)" + else + fail "pf.conf still has ctrld references after stop" + fi + + # DNS should work normally without ctrld + info "Test: dig example.com (should resolve via system DNS)" + local dig_after + dig_after=$(dig example.com +short +timeout=5 2>&1 || true) + if [[ -n "$dig_after" ]] && ! echo "$dig_after" | grep -q "timed out"; then + pass "DNS works after ctrld stop" + else + fail "DNS broken after ctrld stop — cleanup may have failed" + fi +} + +test_restart_resilience() { + header "7. 
Restart Resilience" + + manual "Start ctrld again with --dns-intercept, then press Enter" + wait_for_key + + sleep 3 + + # Re-run pf state checks + local sr_match sn_match + sr_match=$(pfctl -sr 2>&1 | grep "$PF_ANCHOR" || true) + sn_match=$(pfctl -sn 2>&1 | grep "$PF_ANCHOR" || true) + + if [[ -n "$sr_match" ]] && [[ -n "$sn_match" ]]; then + pass "Anchor references restored after restart" + else + fail "Anchor references NOT restored after restart" + fi + + # Quick interception test + local dig_after_restart + dig_after_restart=$(dig @8.8.8.8 example.com +short +timeout=5 2>&1 || true) + if [[ -n "$dig_after_restart" ]] && ! echo "$dig_after_restart" | grep -q "timed out"; then + pass "DNS interception works after restart" + else + fail "DNS interception broken after restart" + fi +} + +test_network_change() { + header "8. Network Change Recovery" + + info "This test verifies recovery after network changes." + manual "Switch Wi-Fi networks (or disconnect/reconnect Ethernet), then press Enter" + wait_for_key + + sleep 5 + + # Check pf rules still active + local sr_after sn_after + sr_after=$(pfctl -sr 2>&1 | grep "$PF_ANCHOR" || true) + sn_after=$(pfctl -sn 2>&1 | grep "$PF_ANCHOR" || true) + + if [[ -n "$sr_after" ]] && [[ -n "$sn_after" ]]; then + pass "Anchor references survived network change" + else + fail "Anchor references lost after network change" + fi + + # Check interception still works + local dig_after_net + dig_after_net=$(dig @8.8.8.8 example.com +short +timeout=10 2>&1 || true) + if [[ -n "$dig_after_net" ]] && ! 
echo "$dig_after_net" | grep -q "timed out"; then + pass "DNS interception works after network change" + else + fail "DNS interception broken after network change" + fi + + # Check logs for recovery bypass activity + if [[ -f "$CTRLD_LOG" ]]; then + local recovery_logs + recovery_logs=$(log_grep "recovery\|network change\|network monitor" 100) + if [[ -n "$recovery_logs" ]]; then + info "Recovery/network change log entries:" + echo "$recovery_logs" | tail -5 | sed 's/^/ /' + fi + fi +} + +# ============================================================================= +# SUMMARY +# ============================================================================= + +print_summary() { + header "TEST SUMMARY" + echo "" + for r in "${RESULTS[@]}"; do + if [[ "$r" == PASS* ]]; then + echo -e " ${GREEN}✅${NC} ${r#PASS: }" + elif [[ "$r" == FAIL* ]]; then + echo -e " ${RED}❌${NC} ${r#FAIL: }" + elif [[ "$r" == WARN* ]]; then + echo -e " ${YELLOW}⚠️${NC} ${r#WARN: }" + fi + done + echo "" + separator + echo -e " ${GREEN}Passed: $PASS${NC} | ${RED}Failed: $FAIL${NC} | ${YELLOW}Warnings: $WARN${NC}" + separator + + if [[ $FAIL -gt 0 ]]; then + echo -e "\n ${RED}${BOLD}Some tests failed.${NC} Check output above for details." 
+ echo -e " Useful debug commands:" + echo -e " pfctl -a '$PF_ANCHOR' -sr # anchor filter rules" + echo -e " pfctl -a '$PF_ANCHOR' -sn # anchor redirect rules" + echo -e " pfctl -sr | grep controld # main ruleset references" + echo -e " tail -100 $CTRLD_LOG # recent ctrld logs" + else + echo -e "\n ${GREEN}${BOLD}All tests passed!${NC}" + fi +} + +# ============================================================================= +# MAIN +# ============================================================================= + +echo -e "${BOLD}╔═══════════════════════════════════════════════════════╗${NC}" +echo -e "${BOLD}║ ctrld DNS Intercept Mode — macOS Test Suite ║${NC}" +echo -e "${BOLD}║ Tests pf-based DNS interception (route-to + rdr) ║${NC}" +echo -e "${BOLD}╚═══════════════════════════════════════════════════════╝${NC}" + +check_root + +echo "" +echo "Make sure ctrld is running with --dns-intercept before starting." +echo "Log location: $CTRLD_LOG" +wait_for_key + +test_prereqs +test_pf_state +test_dns_interception +test_non_dns_unaffected +test_ctrld_log_health +test_pf_counters + +separator +echo "" +echo "The next tests require manual steps (stop/start ctrld, network changes)." +echo "Press Enter to continue, or Ctrl+C to skip and see results so far." +wait_for_key + +test_cleanup_on_stop +test_restart_resilience +test_network_change + +print_summary diff --git a/test-scripts/darwin/test-pf-group-exemption.sh b/test-scripts/darwin/test-pf-group-exemption.sh new file mode 100755 index 0000000..9f47805 --- /dev/null +++ b/test-scripts/darwin/test-pf-group-exemption.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Test: pf group-based exemption for DNS intercept +# Run as root: sudo bash test-pf-group-exemption.sh + +set -e + +GROUP_NAME="_ctrld" +ANCHOR="com.controld.test" +TEST_DNS="1.1.1.1" + +echo "=== Step 1: Create test group ===" +if dscl . 
-read /Groups/$GROUP_NAME PrimaryGroupID &>/dev/null; then + echo "Group $GROUP_NAME already exists" +else + # Find an unused GID in 350-450 range + USED_GIDS=$(dscl . -list /Groups PrimaryGroupID 2>/dev/null | awk '{print $2}' | sort -n) + GROUP_ID="" + for gid in $(seq 350 450); do + if ! echo "$USED_GIDS" | grep -q "^${gid}$"; then + GROUP_ID=$gid + break + fi + done + if [ -z "$GROUP_ID" ]; then + echo "ERROR: Could not find unused GID in 350-450 range" + exit 1 + fi + dscl . -create /Groups/$GROUP_NAME + dscl . -create /Groups/$GROUP_NAME PrimaryGroupID $GROUP_ID + dscl . -create /Groups/$GROUP_NAME RealName "Control D DNS Intercept" + echo "Created group $GROUP_NAME (GID $GROUP_ID)" +fi + +ACTUAL_GID=$(dscl . -read /Groups/$GROUP_NAME PrimaryGroupID | awk '{print $2}') +echo "GID: $ACTUAL_GID" + +echo "" +echo "=== Step 2: Enable pf ===" +pfctl -e 2>&1 || true + +echo "" +echo "=== Step 3: Set up pf anchor with group exemption ===" + +cat > /tmp/pf-group-test-anchor.conf << RULES +# Translation: redirect DNS on loopback to our listener +rdr pass on lo0 inet proto udp from any to ! 127.0.0.1 port 53 -> 127.0.0.1 port 53 +rdr pass on lo0 inet proto tcp from any to ! 127.0.0.1 port 53 -> 127.0.0.1 port 53 + +# Exemption: only group _ctrld can talk to $TEST_DNS directly +pass out quick on ! lo0 inet proto { udp, tcp } from any to $TEST_DNS port 53 group $GROUP_NAME + +# Intercept everything else +pass out quick on ! lo0 route-to lo0 inet proto udp from any to ! 127.0.0.1 port 53 +pass out quick on ! lo0 route-to lo0 inet proto tcp from any to ! 
127.0.0.1 port 53 +pass in quick on lo0 inet proto { udp, tcp } from any to 127.0.0.1 port 53 +RULES + +pfctl -a $ANCHOR -f /tmp/pf-group-test-anchor.conf 2>/dev/null +echo "Loaded anchor $ANCHOR" + +# Inject anchor refs into running ruleset +NAT_RULES=$(pfctl -sn 2>/dev/null | grep -v "ALTQ" | grep -v "^$") +FILTER_RULES=$(pfctl -sr 2>/dev/null | grep -v "ALTQ" | grep -v "^$") +SCRUB_RULES=$(echo "$FILTER_RULES" | grep "^scrub" || true) +PURE_FILTER=$(echo "$FILTER_RULES" | grep -v "^scrub" | grep -v "com.controld.test" || true) +CLEAN_NAT=$(echo "$NAT_RULES" | grep -v "com.controld.test" || true) + +{ + [ -n "$SCRUB_RULES" ] && echo "$SCRUB_RULES" + [ -n "$CLEAN_NAT" ] && echo "$CLEAN_NAT" + echo "rdr-anchor \"$ANCHOR\"" + echo "anchor \"$ANCHOR\"" + [ -n "$PURE_FILTER" ] && echo "$PURE_FILTER" +} | pfctl -f - 2>/dev/null + +echo "Injected anchor references (no duplicates)" + +echo "" +echo "=== Step 4: Verify rules ===" +echo "NAT rules:" +pfctl -sn 2>/dev/null | grep -v ALTQ +echo "" +echo "Anchor filter rules:" +pfctl -a $ANCHOR -sr 2>/dev/null | grep -v ALTQ +echo "" +echo "Anchor NAT rules:" +pfctl -a $ANCHOR -sn 2>/dev/null | grep -v ALTQ + +echo "" +echo "=== Step 5: Build setgid test binary ===" +# We need a binary that runs with effective group _ctrld. +# sudo -g doesn't work on macOS, so we use a setgid binary. +cat > /tmp/test-dns-group.c << 'EOF' +#include +int main() { + char *args[] = {"dig", "+short", "+timeout=3", "+tries=1", "@1.1.1.1", "popads.net", NULL}; + execvp("dig", args); + return 1; +} +EOF +cc -o /tmp/test-dns-group /tmp/test-dns-group.c +chgrp $GROUP_NAME /tmp/test-dns-group +chmod g+s /tmp/test-dns-group +echo "Built setgid binary /tmp/test-dns-group (group: $GROUP_NAME)" + +echo "" +echo "=== Step 6: Test as regular user (should be INTERCEPTED) ===" +echo "Running: dig @$TEST_DNS popads.net (as root / group wheel — no group exemption)" +echo "If nothing listens on 127.0.0.1:53, this should timeout." 
+DIG_RESULT=$(dig +short +timeout=3 +tries=1 @$TEST_DNS popads.net 2>&1 || true) +echo "Result: ${DIG_RESULT:-TIMEOUT/INTERCEPTED}" + +echo "" +echo "=== Step 7: Test as group _ctrld (should BYPASS) ===" +echo "Running: setgid binary (effective group: $GROUP_NAME)" +BYPASS_RESULT=$(/tmp/test-dns-group 2>&1 || true) +echo "Result: ${BYPASS_RESULT:-TIMEOUT/BLOCKED}" + +echo "" +echo "=== Results ===" +PASS=true +if [[ -z "$DIG_RESULT" || "$DIG_RESULT" == *"timed out"* || "$DIG_RESULT" == *"connection refused"* ]]; then + echo "✅ Regular query INTERCEPTED (redirected away from $TEST_DNS)" +else + echo "❌ Regular query NOT intercepted — got: $DIG_RESULT" + PASS=false +fi + +if [[ -n "$BYPASS_RESULT" && "$BYPASS_RESULT" != *"timed out"* && "$BYPASS_RESULT" != *"connection refused"* && "$BYPASS_RESULT" != *"TIMEOUT"* ]]; then + echo "✅ Group _ctrld query BYPASSED — got: $BYPASS_RESULT" +else + echo "❌ Group _ctrld query was also intercepted — got: ${BYPASS_RESULT:-TIMEOUT}" + PASS=false +fi + +if $PASS; then + echo "" + echo "🎉 GROUP EXEMPTION WORKS — this approach is viable for dns-intercept mode" +fi + +echo "" +echo "=== Cleanup ===" +pfctl -a $ANCHOR -F all 2>/dev/null +pfctl -f /etc/pf.conf 2>/dev/null +rm -f /tmp/pf-group-test-anchor.conf /tmp/test-dns-group /tmp/test-dns-group.c +echo "Cleaned up. Group $GROUP_NAME left in place." +echo "To remove: sudo dscl . -delete /Groups/$GROUP_NAME" diff --git a/test-scripts/darwin/test-recovery-bypass.sh b/test-scripts/darwin/test-recovery-bypass.sh new file mode 100755 index 0000000..f5aad7e --- /dev/null +++ b/test-scripts/darwin/test-recovery-bypass.sh @@ -0,0 +1,301 @@ +#!/bin/bash +# test-recovery-bypass.sh — Test DNS intercept recovery bypass (captive portal simulation) +# +# Simulates a captive portal by: +# 1. Discovering ctrld's upstream IPs from active connections +# 2. Blackholing ALL of them via route table +# 3. Cycling wifi to trigger network change → recovery flow +# 4. 
Verifying recovery bypass forwards to OS/DHCP resolver +# 5. Unblocking and verifying normal operation resumes +# +# SAFE: Uses route add/delete + networksetup — cleaned up on exit (including Ctrl+C). +# +# Usage: sudo bash test-recovery-bypass.sh [wifi_interface] +# wifi_interface defaults to en0 +# +# Prerequisites: +# - ctrld running with --dns-intercept and -v 1 --log /tmp/dns.log +# - Run as root (sudo) + +set -euo pipefail + +WIFI_IFACE="${1:-en0}" +CTRLD_LOG="/tmp/dns.log" +BLOCKED_IPS=() + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; CYAN='\033[0;36m'; NC='\033[0m' +log() { echo -e "${CYAN}[$(date +%H:%M:%S)]${NC} $*"; } +pass() { echo -e "${GREEN}[PASS]${NC} $*"; } +fail() { echo -e "${RED}[FAIL]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } + +# ── Safety: always clean up on exit ────────────────────────────────────────── +cleanup() { + echo "" + log "═══ CLEANUP ═══" + + # Ensure wifi is on + log "Ensuring wifi is on..." + networksetup -setairportpower "$WIFI_IFACE" on 2>/dev/null || true + + # Remove all blackhole routes (guarded expansion: an empty array would otherwise trip 'set -u' on macOS's stock bash 3.2) + for ip in ${BLOCKED_IPS[@]+"${BLOCKED_IPS[@]}"}; do + route delete -host "$ip" 2>/dev/null && log "Removed route for $ip" || true + done + + log "Cleanup complete. Internet should be restored." + log "(If not, run: sudo networksetup -setairportpower $WIFI_IFACE on)" +} +trap cleanup EXIT INT TERM + +# ── Pre-checks ─────────────────────────────────────────────────────────────── +if [[ $EUID -ne 0 ]]; then + echo "Run as root: sudo bash $0 $*" + exit 1 +fi + +if [[ ! -f "$CTRLD_LOG" ]]; then + fail "ctrld log not found at $CTRLD_LOG" + echo "Start ctrld with: ctrld run --dns-intercept --cd -v 1 --log $CTRLD_LOG" + exit 1 +fi + +# Check wifi interface exists +if ! 
networksetup -getairportpower "$WIFI_IFACE" >/dev/null 2>&1; then + fail "Wifi interface $WIFI_IFACE not found" + echo "Try: networksetup -listallhardwareports" + exit 1 +fi + +log "═══════════════════════════════════════════════════════════" +log " Recovery Bypass Test (Captive Portal Simulation)" +log "═══════════════════════════════════════════════════════════" +log "Wifi interface: $WIFI_IFACE" +log "ctrld log: $CTRLD_LOG" +echo "" + +# ── Phase 1: Discover upstream IPs ────────────────────────────────────────── +log "Phase 1: Discovering ctrld upstream IPs from active connections" + +# Find ctrld's established connections (DoH uses port 443) +CTRLD_CONNS=$(lsof -i -n -P 2>/dev/null | grep -i ctrld | grep ESTABLISHED || true) +if [[ -z "$CTRLD_CONNS" ]]; then + warn "No established ctrld connections found via lsof" + warn "Trying: ss/netstat fallback..." + CTRLD_CONNS=$(netstat -an 2>/dev/null | grep "\.443 " | grep ESTABLISHED || true) +fi + +echo "$CTRLD_CONNS" | head -10 | while read -r line; do + log " $line" +done + +# Extract unique remote IPs from ctrld connections +UPSTREAM_IPS=() +while IFS= read -r ip; do + [[ -n "$ip" ]] && UPSTREAM_IPS+=("$ip") +done < <(echo "$CTRLD_CONNS" | grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -u | while read -r ip; do + # Filter out local/private IPs — we only want the upstream DoH server IPs + if [[ ! "$ip" =~ ^(127\.|10\.|192\.168\.|172\.(1[6-9]|2[0-9]|3[01])\.) ]]; then + echo "$ip" + fi +done) + +# Also try to resolve known Control D DoH endpoints +for host in dns.controld.com freedns.controld.com; do + for ip in $(dig +short "$host" 2>/dev/null || true); do + if [[ "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + UPSTREAM_IPS+=("$ip") + fi + done +done + +# Deduplicate (guarded expansion: bash 3.2 treats an empty array as unset under 'set -u', which would skip the check below) +UPSTREAM_IPS=($(printf '%s\n' ${UPSTREAM_IPS[@]+"${UPSTREAM_IPS[@]}"} | sort -u)) + +if [[ ${#UPSTREAM_IPS[@]} -eq 0 ]]; then + fail "Could not discover any upstream IPs!" 
+ echo "Check: lsof -i -n -P | grep ctrld" + exit 1 +fi + +log "Found ${#UPSTREAM_IPS[@]} upstream IP(s):" +for ip in "${UPSTREAM_IPS[@]}"; do + log " $ip" +done +echo "" + +# ── Phase 2: Baseline check ───────────────────────────────────────────────── +log "Phase 2: Baseline — verify DNS works normally" +BASELINE=$(dig +short +timeout=5 example.com @127.0.0.1 2>/dev/null || true) +if [[ -z "$BASELINE" ]]; then + fail "DNS not working before test!" + exit 1 +fi +pass "Baseline: example.com → $BASELINE" + +LOG_LINES_BEFORE=$(wc -l < "$CTRLD_LOG" | tr -d ' ') +log "Log position: line $LOG_LINES_BEFORE" +echo "" + +# ── Phase 3: Block all upstream IPs ───────────────────────────────────────── +log "Phase 3: Blackholing all upstream IPs" +for ip in "${UPSTREAM_IPS[@]}"; do + route delete -host "$ip" 2>/dev/null || true # clean slate + route add -host "$ip" 127.0.0.1 2>/dev/null + BLOCKED_IPS+=("$ip") + log " Blocked: $ip → 127.0.0.1" +done +pass "All ${#UPSTREAM_IPS[@]} upstream IPs blackholed" +echo "" + +# ── Phase 4: Cycle wifi to trigger network change ─────────────────────────── +log "Phase 4: Cycling wifi to trigger network change event" +log " Turning wifi OFF..." +networksetup -setairportpower "$WIFI_IFACE" off +sleep 3 + +log " Turning wifi ON..." +networksetup -setairportpower "$WIFI_IFACE" on + +log " Waiting for wifi to reconnect (up to 15s)..." +WIFI_UP=false +for i in $(seq 1 15); do + # Check if we have an IP on the wifi interface + IF_IP=$(ipconfig getifaddr "$WIFI_IFACE" 2>/dev/null || true) + if [[ -n "$IF_IP" ]]; then + WIFI_UP=true + pass "Wifi reconnected: $WIFI_IFACE → $IF_IP" + break + fi + sleep 1 +done + +if [[ "$WIFI_UP" == "false" ]]; then + fail "Wifi did not reconnect in 15s!" + warn "Cleaning up and exiting..." + exit 1 +fi + +log " Waiting 5s for ctrld network monitor to fire..." 
+sleep 5 +echo "" + +# ── Phase 5: Query and watch for recovery ──────────────────────────────────── +log "Phase 5: Sending queries — upstream is blocked, recovery should activate" +log " (ctrld should detect upstream failure → enable recovery bypass → use DHCP DNS)" +echo "" + +RECOVERY_DETECTED=false +BYPASS_ACTIVE=false +DNS_DURING_BYPASS=false +QUERY_COUNT=0 + +for i in $(seq 1 30); do + QUERY_COUNT=$((QUERY_COUNT + 1)) + RESULT=$(dig +short +timeout=3 "example.com" @127.0.0.1 2>/dev/null || true) + + if [[ -n "$RESULT" ]]; then + log " Query #$QUERY_COUNT: example.com → $RESULT ✓" + else + log " Query #$QUERY_COUNT: example.com → FAIL ✗" + fi + + # Check logs + NEW_LOGS=$(tail -n +$((LOG_LINES_BEFORE + 1)) "$CTRLD_LOG" 2>/dev/null || true) + + if [[ "$RECOVERY_DETECTED" == "false" ]] && echo "$NEW_LOGS" | grep -qiE "enabling DHCP bypass|triggering recovery|No healthy"; then + echo "" + pass "🎯 Recovery flow triggered!" + RECOVERY_DETECTED=true + echo "$NEW_LOGS" | grep -iE "recovery|bypass|DHCP|No healthy|network change" | tail -8 | while read -r line; do + echo " 📋 $line" + done + echo "" + fi + + if [[ "$BYPASS_ACTIVE" == "false" ]] && echo "$NEW_LOGS" | grep -qi "Recovery bypass active"; then + pass "🔄 Recovery bypass is forwarding queries to OS/DHCP resolver" + BYPASS_ACTIVE=true + fi + + if [[ "$RECOVERY_DETECTED" == "true" && -n "$RESULT" ]]; then + pass "✅ DNS resolves during recovery bypass: example.com → $RESULT" + DNS_DURING_BYPASS=true + break + fi + + sleep 2 +done + +# ── Phase 6: Show all recovery-related log entries ────────────────────────── +echo "" +log "Phase 6: All recovery-related ctrld log entries" +log "────────────────────────────────────────────────" +NEW_LOGS=$(tail -n +$((LOG_LINES_BEFORE + 1)) "$CTRLD_LOG" 2>/dev/null || true) +RELEVANT=$(echo "$NEW_LOGS" | grep -iE "recovery|bypass|DHCP|unhealthy|upstream.*fail|No healthy|network change|network monitor|OS resolver" || true) +if [[ -n "$RELEVANT" ]]; then + echo "$RELEVANT" | head 
-40 | while read -r line; do + echo " $line" + done +else + warn "No recovery-related log entries found!" + log "Last 15 lines of ctrld log:" + tail -15 "$CTRLD_LOG" | while read -r line; do + echo " $line" + done +fi + +# ── Phase 7: Unblock and verify full recovery ─────────────────────────────── +echo "" +log "Phase 7: Unblocking upstream IPs" +for ip in "${BLOCKED_IPS[@]}"; do + route delete -host "$ip" 2>/dev/null && log " Unblocked: $ip" || true +done +BLOCKED_IPS=() # clear so cleanup doesn't double-delete +pass "All upstream IPs unblocked" + +log "Waiting for ctrld to recover (up to 30s)..." +LOG_LINES_UNBLOCK=$(wc -l < "$CTRLD_LOG" | tr -d ' ') +RECOVERY_COMPLETE=false + +for i in $(seq 1 15); do + dig +short +timeout=3 example.com @127.0.0.1 >/dev/null 2>&1 || true + POST_LOGS=$(tail -n +$((LOG_LINES_UNBLOCK + 1)) "$CTRLD_LOG" 2>/dev/null || true) + + if echo "$POST_LOGS" | grep -qiE "recovery complete|disabling DHCP bypass|Upstream.*recovered"; then + RECOVERY_COMPLETE=true + pass "ctrld recovered — normal operation resumed" + echo "$POST_LOGS" | grep -iE "recovery|recovered|bypass|disabling" | head -5 | while read -r line; do + echo " 📋 $line" + done + break + fi + sleep 2 +done + +[[ "$RECOVERY_COMPLETE" == "false" ]] && warn "Recovery completion not detected (may need more time)" + +# Final check +echo "" +log "Phase 8: Final DNS verification" +sleep 2 +FINAL=$(dig +short +timeout=5 example.com @127.0.0.1 2>/dev/null || true) +if [[ -n "$FINAL" ]]; then + pass "DNS working: example.com → $FINAL" +else + fail "DNS not resolving" +fi + +# ── Summary ────────────────────────────────────────────────────────────────── +echo "" +log "═══════════════════════════════════════════════════════════" +log " Test Summary" +log "═══════════════════════════════════════════════════════════" +[[ "$RECOVERY_DETECTED" == "true" ]] && pass "Recovery bypass activated" || fail "Recovery bypass NOT activated" +[[ "$BYPASS_ACTIVE" == "true" ]] && pass "Queries forwarded to 
OS/DHCP resolver" || warn "OS resolver forwarding not confirmed" +[[ "$DNS_DURING_BYPASS" == "true" ]] && pass "DNS resolved during bypass (proof of OS resolver leak)" || warn "DNS during bypass not confirmed" +[[ "$RECOVERY_COMPLETE" == "true" ]] && pass "Normal operation resumed after unblock" || warn "Recovery completion not confirmed" +[[ -n "${FINAL:-}" ]] && pass "DNS functional at end of test" || fail "DNS broken at end of test" +echo "" +log "Full log since test: tail -n +$LOG_LINES_BEFORE $CTRLD_LOG" +log "Recovery entries: tail -n +$LOG_LINES_BEFORE $CTRLD_LOG | grep -i recovery" diff --git a/test-scripts/darwin/validate-pf-rules.sh b/test-scripts/darwin/validate-pf-rules.sh new file mode 100755 index 0000000..7cd0d0a --- /dev/null +++ b/test-scripts/darwin/validate-pf-rules.sh @@ -0,0 +1,272 @@ +#!/bin/bash +# validate-pf-rules.sh +# Standalone test of the pf redirect rules for dns-intercept mode. +# Does NOT require ctrld. Loads the pf anchor, validates interception, cleans up. +# Run as root (sudo). 
+ +set -e + +GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; CYAN='\033[0;36m'; NC='\033[0m' +ok() { echo -e "${GREEN}[OK]${NC} $1"; } +fail() { echo -e "${RED}[FAIL]${NC} $1"; FAILURES=$((FAILURES+1)); } +warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +FAILURES=0 + +ANCHOR="com.controld.ctrld.test" +ANCHOR_FILE="/tmp/pf-dns-intercept-test.conf" +# PID of the local test DNS listener (Python UDP proxy started in Step 1); empty if an existing listener is reused +LISTENER_PID="" + +cleanup() { + echo "" + echo -e "${CYAN}--- Cleanup ---${NC}" + # Remove anchor rules + pfctl -a "$ANCHOR" -F all 2>/dev/null && echo " Flushed anchor $ANCHOR" || true + # Remove anchor file + rm -f "$ANCHOR_FILE" "/tmp/pf-combined-test.conf" && echo " Removed temp files" || true + # Reload original pf.conf to remove anchor reference + pfctl -f /etc/pf.conf 2>/dev/null && echo " Reloaded original pf.conf" || true + # Kill test listener + if [ -n "$LISTENER_PID" ]; then + kill "$LISTENER_PID" 2>/dev/null && echo " Stopped test DNS listener" || true + fi + echo " Cleanup complete" +} +trap cleanup EXIT + +resolve() { + dig "@${1}" "$2" A +short +timeout=3 +tries=1 2>/dev/null | grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -1 +} + +echo -e "${CYAN}=== pf DNS Redirect Rule Validation ===${NC}" +echo " This loads the exact pf rules from the dns-intercept MR," +echo " starts a tiny DNS listener on 127.0.0.1:53, and verifies" +echo " that queries to external IPs get redirected." +echo "" + +# 0. Check we're root +if [ "$(id -u)" -ne 0 ]; then + fail "Must run as root (sudo)" + exit 1 +fi + +# 1. Start a minimal DNS listener on 127.0.0.1:53 +# Uses a small Python UDP proxy that forwards queries to 1.1.1.1 — enough to prove redirect works. +# If port 53 is already in use (mDNSResponder), we'll use that instead. +echo "--- Step 1: DNS Listener on 127.0.0.1:53 ---" +if lsof -i :53 -sTCP:LISTEN 2>/dev/null | grep -q "."
|| lsof -i UDP:53 2>/dev/null | grep -q "."; then + ok "Something already listening on port 53 (likely mDNSResponder or ctrld)" + HAVE_LISTENER=true +else + # Start a simple Python DNS proxy that forwards to 1.1.1.1 + python3 -c " +import socket, threading, sys +def proxy(data, addr, sock): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.settimeout(3) + s.sendto(data, ('1.1.1.1', 53)) + resp, _ = s.recvfrom(4096) + sock.sendto(resp, addr) + s.close() + except: pass + +sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) +sock.bind(('127.0.0.1', 53)) +print('READY', flush=True) +while True: + data, addr = sock.recvfrom(4096) + threading.Thread(target=proxy, args=(data, addr, sock), daemon=True).start() +" & + LISTENER_PID=$! + sleep 1 + if kill -0 "$LISTENER_PID" 2>/dev/null; then + ok "Started test DNS proxy on 127.0.0.1:53 (PID $LISTENER_PID, forwards to 1.1.1.1)" + HAVE_LISTENER=true + else + fail "Could not start DNS listener on port 53 — port may be in use" + HAVE_LISTENER=false + fi +fi +echo "" + +# 2. Verify baseline: direct query to 8.8.8.8 works (before pf rules) +echo "--- Step 2: Baseline (before pf rules) ---" +IP=$(resolve "8.8.8.8" "example.com") +if [ -n "$IP" ]; then + ok "Direct DNS to 8.8.8.8 works (baseline): $IP" +else + warn "Direct DNS to 8.8.8.8 failed — may be blocked by existing firewall" +fi +echo "" + +# 3. Write and load the pf anchor (exact rules from MR) +echo "--- Step 3: Load pf Anchor Rules ---" +TEST_UPSTREAM="1.1.1.1" +cat > "$ANCHOR_FILE" << PFRULES +# ctrld DNS Intercept Mode (test anchor) +# Two-step: route-to lo0 + rdr on lo0 +# +# In production, ctrld uses DoH (port 443) for upstreams so they're not +# affected by port 53 rules. For this test, we exempt our upstream ($TEST_UPSTREAM) +# explicitly — same mechanism ctrld uses for OS resolver exemptions. + +# --- Translation rules (rdr) --- +rdr pass on lo0 inet proto udp from any to ! 
127.0.0.1 port 53 -> 127.0.0.1 port 53 +rdr pass on lo0 inet proto tcp from any to ! 127.0.0.1 port 53 -> 127.0.0.1 port 53 + +# --- Filtering rules (pass) --- +# Exempt test upstream (in production: ctrld uses DoH, so this isn't needed). +pass out quick on ! lo0 inet proto { udp, tcp } from any to $TEST_UPSTREAM port 53 + +# Force remaining outbound DNS through loopback for interception. +pass out quick on ! lo0 route-to lo0 inet proto udp from any to ! 127.0.0.1 port 53 no state +pass out quick on ! lo0 route-to lo0 inet proto tcp from any to ! 127.0.0.1 port 53 no state + +# Allow redirected traffic through on loopback. +pass in quick on lo0 inet proto { udp, tcp } from any to 127.0.0.1 port 53 no state +PFRULES + +ok "Wrote anchor file: $ANCHOR_FILE" +cat "$ANCHOR_FILE" | sed 's/^/ /' +echo "" + +# Load anchor +OUTPUT=$(pfctl -a "$ANCHOR" -f "$ANCHOR_FILE" 2>&1) || { + fail "Failed to load anchor: $OUTPUT" + exit 1 +} +ok "Loaded anchor: $ANCHOR" + +# Inject anchor references into running pf config. +# pf enforces strict rule ordering: options, normalization, queueing, translation, filtering. +# We must insert rdr-anchor with other rdr-anchors and anchor with other anchors. 
+TMPCONF="/tmp/pf-combined-test.conf" +python3 -c " +import sys +lines = open('/etc/pf.conf').read().splitlines() +anchor = '$ANCHOR' +rdr_ref = 'rdr-anchor \"' + anchor + '\"' +anchor_ref = 'anchor \"' + anchor + '\"' +out = [] +rdr_done = False +anc_done = False +for line in lines: + s = line.strip() + # Insert our rdr-anchor before the first existing rdr-anchor + if not rdr_done and s.startswith('rdr-anchor'): + out.append(rdr_ref) + rdr_done = True + # Insert our anchor before the first existing anchor (filter-phase) + if not anc_done and s.startswith('anchor') and not s.startswith('anchor \"com.apple'): + out.append(anchor_ref) + anc_done = True + out.append(line) +# Fallback if no existing anchors found +if not rdr_done: + # Insert before first non-comment, non-blank after any 'set' or 'scrub' lines + out.insert(0, rdr_ref) +if not anc_done: + out.append(anchor_ref) +open('$TMPCONF', 'w').write('\n'.join(out) + '\n') +" || { fail "Failed to build combined pf config"; exit 1; } + +INJECT_OUT=$(pfctl -f "$TMPCONF" 2>&1) || { + fail "Failed to inject anchor reference: $INJECT_OUT" + rm -f "$TMPCONF" + exit 1 +} +rm -f "$TMPCONF" +ok "Injected anchor references into running pf ruleset" + +# Enable pf +pfctl -e 2>/dev/null || true + +# Show loaded rules +echo "" +echo " Active NAT rules:" +pfctl -a "$ANCHOR" -sn 2>/dev/null | sed 's/^/ /' +echo " Active filter rules:" +pfctl -a "$ANCHOR" -sr 2>/dev/null | sed 's/^/ /' +echo "" + +# 4. 
Test: DNS to 8.8.8.8 should now be redirected to 127.0.0.1:53 +echo "--- Step 4: Redirect Test ---" +if [ "$HAVE_LISTENER" = true ]; then + IP=$(resolve "8.8.8.8" "example.com" 5) + if [ -n "$IP" ]; then + ok "DNS to 8.8.8.8 redirected through 127.0.0.1:53: $IP" + else + fail "DNS to 8.8.8.8 failed — redirect may not be working" + fi + + # Also test another random IP + IP2=$(resolve "9.9.9.9" "example.com" 5) + if [ -n "$IP2" ]; then + ok "DNS to 9.9.9.9 also redirected: $IP2" + else + fail "DNS to 9.9.9.9 failed" + fi +else + warn "No listener on port 53 — cannot test redirect" +fi +echo "" + +# 5. Test: DNS to 127.0.0.1 still works (not double-redirected) +echo "--- Step 5: Localhost DNS (no loop) ---" +if [ "$HAVE_LISTENER" = true ]; then + IP=$(resolve "127.0.0.1" "example.com" 5) + if [ -n "$IP" ]; then + ok "DNS to 127.0.0.1 works normally (not caught by redirect): $IP" + else + fail "DNS to 127.0.0.1 failed — possible redirect loop" + fi +fi +echo "" + +# 6. Simulate VPN DNS override +echo "--- Step 6: VPN DNS Override Simulation ---" +IFACE=$(route -n get default 2>/dev/null | awk '/interface:/{print $2}') +SVC="" +for try_svc in "Wi-Fi" "Ethernet" "Thunderbolt Ethernet"; do + if networksetup -getdnsservers "$try_svc" 2>/dev/null >/dev/null; then + SVC="$try_svc" + break + fi +done + +if [ -n "$SVC" ] && [ "$HAVE_LISTENER" = true ]; then + ORIG_DNS=$(networksetup -getdnsservers "$SVC" 2>/dev/null || echo "") + echo " Service: $SVC" + echo " Current DNS: $ORIG_DNS" + + networksetup -setdnsservers "$SVC" 10.50.10.77 + dscacheutil -flushcache 2>/dev/null || true + killall -HUP mDNSResponder 2>/dev/null || true + echo " Set DNS to 10.50.10.77 (simulating F5 VPN)" + sleep 2 + + IP=$(resolve "10.50.10.77" "google.com" 5) + if [ -n "$IP" ]; then + ok "Query to fake VPN DNS (10.50.10.77) redirected to ctrld: $IP" + else + fail "Query to fake VPN DNS failed" + fi + + # Restore + if echo "$ORIG_DNS" | grep -q "There aren't any DNS Servers"; then + networksetup 
-setdnsservers "$SVC" Empty + else + networksetup -setdnsservers "$SVC" $ORIG_DNS + fi + echo " Restored DNS" +else + warn "Skipping VPN simulation (no service found or no listener)" +fi + +echo "" +if [ "$FAILURES" -eq 0 ]; then + echo -e "${GREEN}=== All tests passed ===${NC}" +else + echo -e "${RED}=== $FAILURES test(s) failed ===${NC}" +fi diff --git a/test-scripts/macos/diag-lo0-capture.sh b/test-scripts/macos/diag-lo0-capture.sh new file mode 100644 index 0000000..902cafd --- /dev/null +++ b/test-scripts/macos/diag-lo0-capture.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# diag-lo0-capture.sh — Capture DNS on lo0 to see where the pf chain breaks +# Usage: sudo bash diag-lo0-capture.sh +# Run while Windscribe + ctrld are both active, then dig from another terminal + +set -u +PCAP="/tmp/lo0-dns-$(date +%s).pcap" +echo "=== lo0 DNS Packet Capture ===" +echo "Capturing to: $PCAP" +echo "" + +# Show current rules (verify build) +echo "--- ctrld anchor rdr rules ---" +pfctl -a com.controld.ctrld -sn 2>/dev/null +echo "" +echo "--- ctrld anchor filter rules (lo0 only) ---" +pfctl -a com.controld.ctrld -sr 2>/dev/null | grep lo0 +echo "" + +# Check pf state table for port 53 before +echo "--- port 53 states BEFORE dig ---" +pfctl -ss 2>/dev/null | grep ':53' | head -10 +echo "(total: $(pfctl -ss 2>/dev/null | grep -c ':53'))" +echo "" + +# Start capture on lo0 +echo "Starting tcpdump on lo0 port 53..." +echo ">>> In another terminal, run: dig example.com" +echo ">>> Then press Ctrl-C here" +echo "" +tcpdump -i lo0 -n -v port 53 -w "$PCAP" 2>&1 & +TCPDUMP_PID=$! + +# Also show live output +tcpdump -i lo0 -n port 53 2>&1 & +LIVE_PID=$! 
+ +# Wait for Ctrl-C +trap "kill $TCPDUMP_PID $LIVE_PID 2>/dev/null; echo ''; echo '--- port 53 states AFTER dig ---'; pfctl -ss 2>/dev/null | grep ':53' | head -20; echo '(total: '$(pfctl -ss 2>/dev/null | grep -c ':53')')'; echo ''; echo 'Capture saved to: $PCAP'; echo 'Read with: tcpdump -r $PCAP -n -v'; exit 0" INT +wait diff --git a/test-scripts/macos/diag-pf-poll.sh b/test-scripts/macos/diag-pf-poll.sh new file mode 100644 index 0000000..7a7cb63 --- /dev/null +++ b/test-scripts/macos/diag-pf-poll.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# diag-pf-poll.sh — Polls pf rules, options, states, and DNS every 2s +# Usage: sudo bash diag-pf-poll.sh | tee /tmp/pf-poll.log +# Steps: 1) Run script 2) Connect Windscribe 3) Start ctrld 4) Ctrl-C when done + +set -u +LOG="/tmp/pf-poll-$(date +%s).log" +echo "=== PF Poll Diagnostic — logging to $LOG ===" +echo "Press Ctrl-C to stop" +echo "" + +poll() { + local ts=$(date '+%H:%M:%S.%3N') + echo "======== [$ts] POLL ========" + + # 1. pf options — looking for "set skip on lo0" + echo "--- pf options ---" + pfctl -so 2>/dev/null | grep -i skip || echo "(no skip rules)" + + # 2. Main ruleset anchors — where is ctrld relative to block drop all? + echo "--- main filter rules (summary) ---" + pfctl -sr 2>/dev/null | head -30 + + # 3. Main NAT/rdr rules + echo "--- main nat/rdr rules (summary) ---" + pfctl -sn 2>/dev/null | head -20 + + # 4. ctrld anchor content + echo "--- ctrld anchor (filter) ---" + pfctl -a com.apple.internet-sharing/ctrld -sr 2>/dev/null || echo "(no anchor)" + echo "--- ctrld anchor (nat/rdr) ---" + pfctl -a com.apple.internet-sharing/ctrld -sn 2>/dev/null || echo "(no anchor)" + + # 5. 
State count for rdr target (10.255.255.3) and loopback + echo "--- states summary ---" + local total=$(pfctl -ss 2>/dev/null | wc -l | tr -d ' ') + local rdr=$(pfctl -ss 2>/dev/null | grep -c '10\.255\.255\.3' || true) + local lo0=$(pfctl -ss 2>/dev/null | grep -c 'lo0' || true) + echo "total=$total rdr_target=$rdr lo0=$lo0" + + # 6. Quick DNS test (1s timeout) + echo "--- DNS tests ---" + local direct=$(dig +short +time=1 +tries=1 example.com @127.0.0.1 2>&1 | head -1) + local system=$(dig +short +time=1 +tries=1 example.com 2>&1 | head -1) + echo "direct @127.0.0.1: $direct" + echo "system DNS: $system" + + # 7. Windscribe tunnel interface (awk emits the "no ip" fallback itself: a pipeline's exit status is awk's, so a trailing || echo never fires) + echo "--- tunnel interfaces ---" + ifconfig -l | tr ' ' '\n' | grep -E '^utun' | while read iface; do + echo -n "$iface: " + ifconfig "$iface" 2>/dev/null | awk '/inet /{print $2; found=1} END{if (!found) print "no ip"}' + done + + echo "" +} + +# Main loop +while true; do + poll 2>&1 | tee -a "$LOG" + sleep 2 +done diff --git a/test-scripts/macos/diag-windscribe-connect.sh b/test-scripts/macos/diag-windscribe-connect.sh new file mode 100644 index 0000000..176f77f --- /dev/null +++ b/test-scripts/macos/diag-windscribe-connect.sh @@ -0,0 +1,183 @@ +#!/bin/bash +# diag-windscribe-connect.sh — Diagnostic script for testing ctrld dns-intercept +# during Windscribe VPN connection on macOS. +# +# Usage: sudo bash diag-windscribe-connect.sh +# +# Run this BEFORE connecting Windscribe. It polls every 0.5s and captures: +# 1. pf anchor state (are ctrld anchors present?) +# 2. pf state table entries (rdr interception working?) +# 3. ctrld log events (watchdog, rebootstrap, errors) +# 4. scutil DNS resolver state +# 5. Active tunnel interfaces +# 6. dig test query results +# +# Output goes to /tmp/diag-windscribe-<YYYYmmdd-HHMMSS>/ +# Press Ctrl-C to stop. A summary is printed at the end.
+ +set -e + +if [ "$(id -u)" -ne 0 ]; then + echo "ERROR: Must run as root (sudo)" + exit 1 +fi + +CTRLD_LOG="${CTRLD_LOG:-/tmp/dns.log}" +TIMESTAMP=$(date +%Y%m%d-%H%M%S) +OUTDIR="/tmp/diag-windscribe-${TIMESTAMP}" +mkdir -p "$OUTDIR" + +echo "=== Windscribe + ctrld DNS Intercept Diagnostic ===" +echo "Output: $OUTDIR" +echo "ctrld log: $CTRLD_LOG" +echo "" +echo "1. Start this script" +echo "2. Connect Windscribe" +echo "3. Wait ~30 seconds" +echo "4. Try: dig popads.net / dig @127.0.0.1 popads.net" +echo "5. Ctrl-C to stop and see summary" +echo "" +echo "Polling every 0.5s... Press Ctrl-C to stop." +echo "" + +# Track ctrld log position +if [ -f "$CTRLD_LOG" ]; then + LOG_START_LINE=$(wc -l < "$CTRLD_LOG") +else + LOG_START_LINE=0 +fi + +ITER=0 +DIG_FAIL=0 +DIG_OK=0 +ANCHOR_MISSING=0 +ANCHOR_PRESENT=0 +PF_WIPE_COUNT=0 +FORCE_REBOOT_COUNT=0 +LAST_TUNNEL_IFACES="" + +cleanup() { + echo "" + echo "=== Stopping diagnostic ===" + + # Capture final state + echo "--- Final pf state ---" > "$OUTDIR/final-pfctl.txt" + pfctl -sa 2>/dev/null >> "$OUTDIR/final-pfctl.txt" 2>&1 || true + + echo "--- Final scutil ---" > "$OUTDIR/final-scutil.txt" + scutil --dns >> "$OUTDIR/final-scutil.txt" 2>&1 || true + + # Extract ctrld log events since start + if [ -f "$CTRLD_LOG" ]; then + tail -n +$((LOG_START_LINE + 1)) "$CTRLD_LOG" > "$OUTDIR/ctrld-events.log" 2>/dev/null || true + + # Extract key events + echo "--- Watchdog events ---" > "$OUTDIR/summary-watchdog.txt" + grep -i "watchdog\|anchor.*missing\|anchor.*restored\|force-reset\|re-bootstrapping\|force re-bootstrapping" "$OUTDIR/ctrld-events.log" >> "$OUTDIR/summary-watchdog.txt" 2>/dev/null || true + + echo "--- Errors ---" > "$OUTDIR/summary-errors.txt" + grep '"level":"error"' "$OUTDIR/ctrld-events.log" >> "$OUTDIR/summary-errors.txt" 2>/dev/null || true + + echo "--- Network changes ---" > "$OUTDIR/summary-network.txt" + grep -i "Network change\|tunnel interface\|Ignoring interface" "$OUTDIR/ctrld-events.log" >> 
"$OUTDIR/summary-network.txt" 2>/dev/null || true + + echo "--- Transport resets ---" > "$OUTDIR/summary-transport.txt" + grep -i "re-bootstrap\|force.*bootstrap\|dialing to\|connected to" "$OUTDIR/ctrld-events.log" >> "$OUTDIR/summary-transport.txt" 2>/dev/null || true + + # Count key events (grep -c already prints 0 and exits 1 on no match, so the || branch only backfills an empty result instead of appending a second "0") + PF_WIPE_COUNT=$(grep -c "anchor.*missing\|restoring pf" "$OUTDIR/ctrld-events.log" 2>/dev/null) || PF_WIPE_COUNT=${PF_WIPE_COUNT:-0} + FORCE_REBOOT_COUNT=$(grep -c "force re-bootstrapping\|force-reset" "$OUTDIR/ctrld-events.log" 2>/dev/null) || FORCE_REBOOT_COUNT=${FORCE_REBOOT_COUNT:-0} + DEADLINE_COUNT=$(grep -c "context deadline exceeded" "$OUTDIR/ctrld-events.log" 2>/dev/null) || DEADLINE_COUNT=${DEADLINE_COUNT:-0} + FALLBACK_COUNT=$(grep -c "OS resolver retry query successful" "$OUTDIR/ctrld-events.log" 2>/dev/null) || FALLBACK_COUNT=${FALLBACK_COUNT:-0} + fi + + echo "" + echo "=========================================" + echo " DIAGNOSTIC SUMMARY" + echo "=========================================" + echo "Duration: $ITER iterations (~$((ITER / 2))s)" + echo "" + echo "pf Anchor Status:" + echo " Present: $ANCHOR_PRESENT times" + echo " Missing: $ANCHOR_MISSING times" + echo "" + echo "dig Tests (popads.net):" + echo " Success: $DIG_OK" + echo " Failed: $DIG_FAIL" + echo "" + echo "ctrld Log Events:" + echo " pf wipes detected: $PF_WIPE_COUNT" + echo " Force rebootstraps: $FORCE_REBOOT_COUNT" + echo " Context deadline errors: ${DEADLINE_COUNT:-0}" + echo " OS resolver fallbacks: ${FALLBACK_COUNT:-0}" + echo "" + echo "Last tunnel interfaces: ${LAST_TUNNEL_IFACES:-none}" + echo "" + echo "Files saved to: $OUTDIR/" + echo " final-pfctl.txt — full pfctl -sa at exit" + echo " final-scutil.txt — scutil --dns at exit" + echo " ctrld-events.log — ctrld log during test" + echo " summary-watchdog.txt — watchdog events" + echo " summary-errors.txt — errors" + echo " summary-transport.txt — transport reset events" + echo " timeline.log — per-iteration state" + echo "=========================================" + exit 0 +} + +trap cleanup INT TERM + +while true; do + ITER=$((ITER + 1)) + NOW=$(date
'+%H:%M:%S.%3N' 2>/dev/null || date '+%H:%M:%S'); NOW=${NOW%.3N} # BSD date has no %N: it succeeds but prints a literal ".3N", so strip it + + # 1. Check pf anchor presence + ANCHOR_STATUS="MISSING" + if pfctl -sr 2>/dev/null | grep -q "com.controld.ctrld"; then + ANCHOR_STATUS="PRESENT" + ANCHOR_PRESENT=$((ANCHOR_PRESENT + 1)) + else + ANCHOR_MISSING=$((ANCHOR_MISSING + 1)) + fi + + # 2. Check tunnel interfaces + TUNNEL_IFACES=$(ifconfig -l 2>/dev/null | tr ' ' '\n' | grep -E '^(utun|ipsec|ppp|tap|tun)' | \ + while read iface; do + # Only list interfaces that have an IPv4 address assigned + if ifconfig "$iface" 2>/dev/null | grep -q "inet "; then + echo -n "$iface " + fi + done) + TUNNEL_IFACES=$(echo "$TUNNEL_IFACES" | xargs) # trim + if [ -n "$TUNNEL_IFACES" ]; then + LAST_TUNNEL_IFACES="$TUNNEL_IFACES" + fi + + # 3. Count rdr states (three-part = intercepted); grep -c prints 0 itself on no match, so only backfill an empty result + RDR_COUNT=$(pfctl -ss 2>/dev/null | grep -c "127.0.0.1:53 <-") || RDR_COUNT=${RDR_COUNT:-0} + + # 4. Quick dig test (1s timeout, single try) + DIG_RESULT="SKIP" + if [ $((ITER % 4)) -eq 0 ]; then # every 2 seconds + if dig +time=1 +tries=1 popads.net A @127.0.0.1 +short >/dev/null 2>&1; then + DIG_RESULT="OK" + DIG_OK=$((DIG_OK + 1)) + else + DIG_RESULT="FAIL" + DIG_FAIL=$((DIG_FAIL + 1)) + fi + fi + + # 5.
Check latest ctrld log for recent errors + RECENT_ERR="" + if [ -f "$CTRLD_LOG" ]; then + RECENT_ERR=$(tail -5 "$CTRLD_LOG" 2>/dev/null | grep -o '"message":"[^"]*deadline[^"]*"' | tail -1 || true) + fi + + # Output timeline + LINE="[$NOW] anchor=$ANCHOR_STATUS rdr_states=$RDR_COUNT tunnels=[$TUNNEL_IFACES] dig=$DIG_RESULT $RECENT_ERR" + echo "$LINE" + echo "$LINE" >> "$OUTDIR/timeline.log" + + sleep 0.5 +done diff --git a/test-scripts/windows/diag-intercept.ps1 b/test-scripts/windows/diag-intercept.ps1 new file mode 100644 index 0000000..05e0f1e --- /dev/null +++ b/test-scripts/windows/diag-intercept.ps1 @@ -0,0 +1,131 @@ +# diag-intercept.ps1 — Windows DNS Intercept Mode Diagnostic +# Run as Administrator in the same elevated prompt as ctrld +# Usage: .\diag-intercept.ps1 + +Write-Host "=== CTRLD INTERCEPT MODE DIAGNOSTIC ===" -ForegroundColor Cyan +Write-Host "Timestamp: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')" +Write-Host "" + +# 1. Check NRPT rules +Write-Host "--- 1. NRPT Rules ---" -ForegroundColor Yellow +try { + $nrptRules = Get-DnsClientNrptRule -ErrorAction Stop + if ($nrptRules) { + $nrptRules | Format-Table Namespace, NameServers, DisplayName -AutoSize + } else { + Write-Host " NO NRPT RULES FOUND — this is the problem!" -ForegroundColor Red + } +} catch { + Write-Host " Get-DnsClientNrptRule failed: $_" -ForegroundColor Red +} +Write-Host "" + +# 2. Check NRPT registry directly +Write-Host "--- 2. 
NRPT Registry ---" -ForegroundColor Yellow +$regPath = "HKLM:\SOFTWARE\Policies\Microsoft\Windows NT\DNSClient\DnsPolicyConfig\CtrldCatchAll" +if (Test-Path $regPath) { + Write-Host " Registry key EXISTS" -ForegroundColor Green + Get-ItemProperty $regPath | Format-List Name, GenericDNSServers, ConfigOptions, Version +} else { + Write-Host " Registry key MISSING at $regPath" -ForegroundColor Red + # Check parent + $parentPath = "HKLM:\SOFTWARE\Policies\Microsoft\Windows NT\DNSClient\DnsPolicyConfig" + if (Test-Path $parentPath) { + Write-Host " Parent key exists. Children:" + Get-ChildItem $parentPath | ForEach-Object { Write-Host " $($_.PSChildName)" } + } else { + Write-Host " Parent DnsPolicyConfig key also missing" -ForegroundColor Red + } +} +Write-Host "" + +# 3. DNS Client service status +Write-Host "--- 3. DNS Client Service ---" -ForegroundColor Yellow +$dnsSvc = Get-Service Dnscache +Write-Host " Status: $($dnsSvc.Status) StartType: $($dnsSvc.StartType)" +Write-Host "" + +# 4. Interface DNS servers +Write-Host "--- 4. Interface DNS Servers ---" -ForegroundColor Yellow +Get-DnsClientServerAddress | Format-Table InterfaceAlias, InterfaceIndex, AddressFamily, ServerAddresses -AutoSize +Write-Host "" + +# 5. WFP filters check +Write-Host "--- 5. 
WFP Filters (ctrld sublayer) ---" -ForegroundColor Yellow +try { + $wfpOutput = netsh wfp show filters + if (Test-Path "filters.xml") { + $xml = [xml](Get-Content "filters.xml") + $ctrldFilters = $xml.wfpdiag.filters.item | Where-Object { + $_.displayData.name -like "ctrld:*" + } + if ($ctrldFilters) { + Write-Host " Found $($ctrldFilters.Count) ctrld WFP filter(s):" -ForegroundColor Green + $ctrldFilters | ForEach-Object { + Write-Host " $($_.displayData.name) — action: $($_.action.type)" + } + } else { + Write-Host " NO ctrld WFP filters found" -ForegroundColor Red + } + Remove-Item "filters.xml" -ErrorAction SilentlyContinue + } +} catch { + Write-Host " WFP check failed: $_" -ForegroundColor Red +} +Write-Host "" + +# 6. DNS resolution tests +Write-Host "--- 6. DNS Resolution Tests ---" -ForegroundColor Yellow + +# Test A: Resolve-DnsName (uses DNS Client = respects NRPT) +Write-Host " Test A: Resolve-DnsName google.com (DNS Client path)" -ForegroundColor White +try { + $result = Resolve-DnsName google.com -Type A -DnsOnly -ErrorAction Stop + Write-Host " OK: $($result.IPAddress -join ', ')" -ForegroundColor Green +} catch { + Write-Host " FAILED: $_" -ForegroundColor Red +} + +# Test B: Resolve-DnsName to specific server (127.0.0.1) +Write-Host " Test B: Resolve-DnsName google.com -Server 127.0.0.1" -ForegroundColor White +try { + $result = Resolve-DnsName google.com -Type A -Server 127.0.0.1 -DnsOnly -ErrorAction Stop + Write-Host " OK: $($result.IPAddress -join ', ')" -ForegroundColor Green +} catch { + Write-Host " FAILED: $_" -ForegroundColor Red +} + +# Test C: Resolve-DnsName blocked domain (should return 0.0.0.0 or NXDOMAIN via Control D) +Write-Host " Test C: Resolve-DnsName popads.net (should be blocked by Control D)" -ForegroundColor White +try { + $result = Resolve-DnsName popads.net -Type A -DnsOnly -ErrorAction Stop + Write-Host " Result: $($result.IPAddress -join ', ')" -ForegroundColor Yellow +} catch { + Write-Host " FAILED/Blocked: $_" 
-ForegroundColor Yellow +} + +# Test D: nslookup (bypasses NRPT - expected to fail with intercept) +Write-Host " Test D: nslookup google.com 127.0.0.1 (direct, bypasses NRPT)" -ForegroundColor White +$nslookup = & nslookup google.com 127.0.0.1 2>&1 +Write-Host " $($nslookup -join "`n ")" + +Write-Host "" + +# 7. Try forcing NRPT reload +Write-Host "--- 7. Force NRPT Reload ---" -ForegroundColor Yellow +Write-Host " Running: gpupdate /target:computer /force" -ForegroundColor White +& gpupdate /target:computer /force 2>&1 | ForEach-Object { Write-Host " $_" } +Write-Host "" + +# Re-test after gpupdate +Write-Host " Re-test: Resolve-DnsName google.com" -ForegroundColor White +try { + $result = Resolve-DnsName google.com -Type A -DnsOnly -ErrorAction Stop + Write-Host " OK: $($result.IPAddress -join ', ')" -ForegroundColor Green +} catch { + Write-Host " STILL FAILED: $_" -ForegroundColor Red +} + +Write-Host "" +Write-Host "=== DIAGNOSTIC COMPLETE ===" -ForegroundColor Cyan +Write-Host "Copy all output above and send it back." diff --git a/test-scripts/windows/test-dns-intercept.ps1 b/test-scripts/windows/test-dns-intercept.ps1 new file mode 100644 index 0000000..fc4cc3f --- /dev/null +++ b/test-scripts/windows/test-dns-intercept.ps1 @@ -0,0 +1,544 @@ +# ============================================================================= +# DNS Intercept Mode Test Script — Windows (WFP) +# ============================================================================= +# Run as Administrator: powershell -ExecutionPolicy Bypass -File test-dns-intercept.ps1 +# +# Tests the dns-intercept feature end-to-end with validation at each step. +# Logs are read from C:\tmp\dns.log (ctrld log location on test machine). +# +# Manual steps marked with [MANUAL] require human interaction.
+# ============================================================================= + +$ErrorActionPreference = "Continue" + +$CtrldLog = "C:\tmp\dns.log" +$WfpSubLayerName = "ctrld DNS Intercept" +$Pass = 0 +$Fail = 0 +$Warn = 0 +$Results = @() + +# --- Helpers --- + +function Header($text) { Write-Host "`n━━━ $text ━━━" -ForegroundColor Cyan } +function Info($text) { Write-Host " ℹ $text" } +function Manual($text) { Write-Host " [MANUAL] $text" -ForegroundColor Yellow } +function Separator() { Write-Host "─────────────────────────────────────────────────────" -ForegroundColor Cyan } + +function Pass($text) { + Write-Host " ✅ PASS: $text" -ForegroundColor Green + $script:Pass++ + $script:Results += "PASS: $text" +} + +function Fail($text) { + Write-Host " ❌ FAIL: $text" -ForegroundColor Red + $script:Fail++ + $script:Results += "FAIL: $text" +} + +function Warn($text) { + Write-Host " ⚠️ WARN: $text" -ForegroundColor Yellow + $script:Warn++ + $script:Results += "WARN: $text" +} + +function WaitForKey { + Write-Host "`n Press Enter to continue..." -NoNewline + Read-Host +} + +function LogGrep($pattern, $lines = 200) { + if (Test-Path $CtrldLog) { + Get-Content $CtrldLog -Tail $lines -ErrorAction SilentlyContinue | + Select-String -Pattern $pattern -ErrorAction SilentlyContinue + } +} + +function LogGrepCount($pattern, $lines = 200) { + $matches = LogGrep $pattern $lines + if ($matches) { return @($matches).Count } else { return 0 } +} + +# --- Check Admin --- + +function Check-Admin { + $identity = [Security.Principal.WindowsIdentity]::GetCurrent() + $principal = New-Object Security.Principal.WindowsPrincipal($identity) + if (-not $principal.IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)) { + Write-Host "This script must be run as Administrator." 
-ForegroundColor Red + exit 1 + } +} + +# ============================================================================= +# TEST SECTIONS +# ============================================================================= + +function Test-Prereqs { + Header "0. Prerequisites" + + if (Get-Command nslookup -ErrorAction SilentlyContinue) { + Pass "nslookup available" + } else { + Fail "nslookup not found" + } + + if (Get-Command netsh -ErrorAction SilentlyContinue) { + Pass "netsh available" + } else { + Fail "netsh not found" + } + + if (Test-Path $CtrldLog) { + Pass "ctrld log exists at $CtrldLog" + } else { + Warn "ctrld log not found at $CtrldLog — log checks will be skipped" + } + + # Show current DNS config + Info "Current DNS servers:" + Get-DnsClientServerAddress -AddressFamily IPv4 | + Where-Object { $_.ServerAddresses.Count -gt 0 } | + Format-Table InterfaceAlias, ServerAddresses -AutoSize | + Out-String | ForEach-Object { $_.Trim() } | Write-Host +} + +function Test-WfpState { + Header "1. WFP State Validation" + + # Export WFP filters and check for ctrld's sublayer/filters + $wfpExport = "$env:TEMP\wfp_filters.xml" + Info "Exporting WFP filters (this may take a few seconds)..." 
+
+    try {
+        netsh wfp show filters file=$wfpExport 2>$null | Out-Null
+
+        if (Test-Path $wfpExport) {
+            $wfpContent = Get-Content $wfpExport -Raw -ErrorAction SilentlyContinue
+
+            # Check for ctrld sublayer
+            if ($wfpContent -match "ctrld") {
+                Pass "WFP filters contain 'ctrld' references"
+
+                # Count filters
+                $filterMatches = ([regex]::Matches($wfpContent, "ctrld")).Count
+                Info "Found $filterMatches 'ctrld' references in WFP export"
+            } else {
+                Fail "No 'ctrld' references found in WFP filters"
+            }
+
+            # Check for DNS port 53 filters
+            if ($wfpContent -match "port.*53" -or $wfpContent -match "0x0035") {
+                Pass "Port 53 filter conditions found in WFP"
+            } else {
+                Warn "Could not confirm port 53 filters in WFP export"
+            }
+
+            Remove-Item $wfpExport -ErrorAction SilentlyContinue
+        } else {
+            Warn "WFP export file not created"
+        }
+    } catch {
+        Warn "Could not export WFP filters: $_"
+    }
+
+    Separator
+
+    # Alternative: Check via PowerShell WFP cmdlets if available
+    Info "Checking WFP via netsh wfp show state..."
+    $wfpState = netsh wfp show state 2>$null
+    if ($wfpState) {
+        Info "WFP state export completed (check $env:TEMP for details)"
+    }
+
+    # Check Windows Firewall service is running
+    $fwService = Get-Service -Name "mpssvc" -ErrorAction SilentlyContinue
+    if ($fwService -and $fwService.Status -eq "Running") {
+        Pass "Windows Firewall service (BFE/WFP) is running"
+    } else {
+        Fail "Windows Firewall service not running — WFP won't work"
+    }
+
+    # Check BFE (Base Filtering Engine)
+    $bfeService = Get-Service -Name "BFE" -ErrorAction SilentlyContinue
+    if ($bfeService -and $bfeService.Status -eq "Running") {
+        Pass "Base Filtering Engine (BFE) is running"
+    } else {
+        Fail "BFE not running — WFP requires this service"
+    }
+}
+
+function Test-DnsInterception {  # Sends queries at external resolvers, localhost and the system resolver and records whether each one resolved
+    Header "2. DNS Interception Tests"
+
+    # Remember the current log length so only lines written by this test are inspected later
+    $logLinesBefore = 0
+    if (Test-Path $CtrldLog) {
+        $logLinesBefore = @(Get-Content $CtrldLog -ErrorAction SilentlyContinue).Count
+    }
+
+    # Test 1: Query to external resolver should be intercepted
+    Info "Test: nslookup example.com 8.8.8.8 (should be intercepted by ctrld)"
+    $result = $null
+    try {
+        $result = nslookup example.com 8.8.8.8 2>&1 | Out-String
+    } catch { }
+
+    # nslookup echoes the queried resolver's address in its header even on timeout, so strip it before looking for an answer
+    if ($result -and (($result -replace '\b8\.8\.8\.8\b', '') -match "\d+\.\d+\.\d+\.\d+")) {
+        Pass "nslookup @8.8.8.8 returned a result"
+
+        # Check which server answered
+        if ($result -match "Server:\s+(\S+)") {
+            $server = $Matches[1]
+            Info "Answered by server: $server"
+            if ($server -match "127\.0\.0\.1|localhost") {
+                Pass "Response came from localhost (ctrld intercepted)"
+            } elseif ($server -match "8\.8\.8\.8") {
+                Fail "Response came from 8.8.8.8 directly — NOT intercepted"
+            }
+        }
+    } else {
+        Fail "nslookup @8.8.8.8 failed or returned no address"
+    }
+
+    # Check ctrld logged it (short pause gives the log writer time to flush)
+    Start-Sleep -Seconds 1
+    if (Test-Path $CtrldLog) {
+        $newLines = Get-Content $CtrldLog -ErrorAction SilentlyContinue |
+            Select-Object -Skip $logLinesBefore
+        $intercepted = $newLines | Select-String "example.com" -ErrorAction SilentlyContinue
+        if ($intercepted) {
+            Pass "ctrld logged the intercepted query for example.com"
+        } else {
+            Fail "ctrld did NOT log query for example.com"
+        }
+    }
+
+    Separator
+
+    # Test 2: Another external resolver (resolver address stripped before matching, as in Test 1)
+    Info "Test: nslookup cloudflare.com 1.1.1.1 (should also be intercepted)"
+    try {
+        $result2 = nslookup cloudflare.com 1.1.1.1 2>&1 | Out-String
+        if (($result2 -replace '\b1\.1\.1\.1\b', '') -match "\d+\.\d+\.\d+\.\d+") {
+            Pass "nslookup @1.1.1.1 returned result"
+        } else {
+            Fail "nslookup @1.1.1.1 failed"
+        }
+    } catch {
+        Fail "nslookup @1.1.1.1 threw exception"
+    }
+
+    Separator
+
+    # Test 3: Query to localhost should work (no loop); 127.0.0.1 in the header is not counted as an answer
+    Info "Test: nslookup example.org 127.0.0.1 (direct to ctrld, no loop)"
+    try {
+        $result3 = nslookup example.org 127.0.0.1 2>&1 | Out-String
+        if (($result3 -replace '\b127\.0\.0\.1\b', '') -match "\d+\.\d+\.\d+\.\d+") {
+            Pass "nslookup @127.0.0.1 works (no loop)"
+        } else {
+            Fail "nslookup @127.0.0.1 failed — possible loop"
+        }
+    } catch {
+        Fail "nslookup @127.0.0.1 exception — possible loop"
+    }
+
+    Separator
+
+    # Test 4: System DNS via Resolve-DnsName
+    Info "Test: Resolve-DnsName example.net (system resolver)"
+    try {
+        $result4 = Resolve-DnsName example.net -Type A -ErrorAction Stop
+        if ($result4) {
+            Pass "System DNS resolution works (Resolve-DnsName)"
+        }
+    } catch {
+        Fail "System DNS resolution failed: $_"
+    }
+
+    Separator
+
+    # Test 5: TCP DNS (-vc forces a TCP query; resolver address stripped before matching, as in Test 1)
+    Info "Test: nslookup -vc example.com 9.9.9.9 (TCP DNS)"
+    try {
+        $result5 = nslookup -vc example.com 9.9.9.9 2>&1 | Out-String
+        if (($result5 -replace '\b9\.9\.9\.9\b', '') -match "\d+\.\d+\.\d+\.\d+") {
+            Pass "TCP DNS query intercepted and resolved"
+        } else {
+            Warn "TCP DNS query may not have been intercepted"
+        }
+    } catch {
+        Warn "TCP DNS test inconclusive"
+    }
+}
+
+function Test-NonDnsUnaffected {  # Confirms HTTPS and a plain TCP 443 connection still work while interception is active
+    Header "3. Non-DNS Traffic Unaffected"
+
+    # HTTPS: any HTTP status counts as a pass, only a thrown error is a failure
+    Info "Test: Invoke-WebRequest https://example.com (HTTPS should NOT be affected)"
+    try {
+        $web = Invoke-WebRequest -Uri "https://example.com" -UseBasicParsing -TimeoutSec 10 -ErrorAction Stop
+        if ($web.StatusCode -eq 200) {
+            Pass "HTTPS works (HTTP 200)"
+        } else {
+            Pass "HTTPS returned HTTP $($web.StatusCode)"
+        }
+    } catch {
+        Fail "HTTPS failed: $_"
+    }
+
+    # Test non-53 port connectivity
+    Info "Test: Test-NetConnection to github.com:443 (non-DNS port)"
+    try {
+        $nc = Test-NetConnection -ComputerName "github.com" -Port 443 -WarningAction SilentlyContinue
+        if ($nc.TcpTestSucceeded) {
+            Pass "Port 443 reachable (non-DNS traffic unaffected)"
+        } else {
+            Warn "Port 443 unreachable (may be firewall)"
+        }
+    } catch {
+        Warn "Test-NetConnection failed: $_"
+    }
+}
+
+function Test-CtrldLogHealth {  # Scans recent ctrld log lines for WFP setup messages, errors, warnings and VPN DNS activity
+    Header "4. ctrld Log Health Check"
+
+    if (-not (Test-Path $CtrldLog)) {
+        Warn "Skipping log checks — $CtrldLog not found"
+        return
+    }
+
+    # Check for WFP initialization
+    if (LogGrepCount "initializing Windows Filtering Platform" 500) {
+        Pass "WFP initialization logged"
+    } else {
+        Fail "No WFP initialization in recent logs"
+    }
+
+    # Check for successful WFP engine open
+    if (LogGrepCount "WFP engine opened" 500) {
+        Pass "WFP engine opened successfully"
+    } else {
+        Fail "WFP engine open not found in logs"
+    }
+
+    # Check for sublayer creation
+    if (LogGrepCount "WFP sublayer created" 500) {
+        Pass "WFP sublayer created"
+    } else {
+        Fail "WFP sublayer creation not logged"
+    }
+
+    # Check for filter creation
+    $filterCount = LogGrepCount "added WFP.*filter" 500
+    if ($filterCount -gt 0) {
+        Pass "WFP filters added ($filterCount filter log entries)"
+    } else {
+        Fail "No WFP filter creation logged"
+    }
+
+    # Check for permit-localhost filters (plain "|" alternation, matching the other LogGrep patterns in this script)
+    if (LogGrepCount "permit.*localhost|permit.*127\.0\.0\.1" 500) {
+        Pass "Localhost permit filters logged"
+    } else {
+        Warn "Localhost permit filters not explicitly logged"
+    }
+
+    Separator
+
+    # Check for errors
+    Info "Recent errors in ctrld log:"
+    $errors = LogGrep '"level":"error"' 500
+    if ($errors) {
+        $errors | Select-Object -Last 5 | ForEach-Object { Write-Host " $_" }
+        Warn "Errors found in recent logs"
+    } else {
+        Pass "No errors in recent logs"
+    }
+
+    # Warnings (excluding expected ones)
+    $warnings = LogGrep '"level":"warn"' 500 | Where-Object {
+        $_ -notmatch "skipping self-upgrade"
+    }
+    if ($warnings) {
+        Info "Warnings:"
+        $warnings | Select-Object -Last 5 | ForEach-Object { Write-Host " $_" }
+    }
+
+    # VPN DNS detection
+    $vpnLogs = LogGrep "VPN DNS" 500
+    if ($vpnLogs) {
+        Info "VPN DNS activity:"
+        $vpnLogs | Select-Object -Last 5 | ForEach-Object { Write-Host " $_" }
+    } else {
+        Info "No VPN DNS activity (expected if no VPN connected)"
+    }
+}
+
+function Test-CleanupOnStop {  # Manual step: after the operator stops ctrld, checks that WFP filters are gone and DNS still works
+    Header "5. Cleanup Validation (After ctrld Stop)"
+
+    Manual "Stop ctrld now (ctrld stop or Ctrl+C), then press Enter"
+    WaitForKey
+
+    Start-Sleep -Seconds 2
+
+    # Check WFP filters are removed
+    $wfpExport = "$env:TEMP\wfp_after_stop.xml"
+    try {
+        netsh wfp show filters file=$wfpExport 2>$null | Out-Null
+        if (Test-Path $wfpExport) {
+            $content = Get-Content $wfpExport -Raw -ErrorAction SilentlyContinue
+            if ($content -match "ctrld") {
+                Fail "WFP still contains 'ctrld' filters after stop"
+            } else {
+                Pass "WFP filters cleaned up after stop"
+            }
+            Remove-Item $wfpExport -ErrorAction SilentlyContinue
+        }
+    } catch {
+        Warn "Could not verify WFP cleanup"
+    }
+
+    # DNS should work normally. NOTE(review): no resolver is given here, so the header address nslookup prints can satisfy this regex by itself — confirm
+    Info "Test: nslookup example.com (should work via system DNS)"
+    try {
+        $result = nslookup example.com 2>&1 | Out-String
+        if ($result -match "\d+\.\d+\.\d+\.\d+") {
+            Pass "DNS works after ctrld stop"
+        } else {
+            Fail "DNS broken after ctrld stop"
+        }
+    } catch {
+        Fail "DNS exception after ctrld stop"
+    }
+}
+
+function Test-RestartResilience {  # Manual step: after the operator restarts ctrld, checks interception and WFP re-initialisation
+    Header "6. Restart Resilience"
+
+    Manual "Start ctrld again with --dns-intercept, then press Enter"
+    WaitForKey
+
+    Start-Sleep -Seconds 3
+
+    # Quick interception test (resolver address stripped so a timed-out query is not counted as success)
+    Info "Test: nslookup example.com 8.8.8.8 (should be intercepted after restart)"
+    try {
+        $result = nslookup example.com 8.8.8.8 2>&1 | Out-String
+        if (($result -replace '\b8\.8\.8\.8\b', '') -match "\d+\.\d+\.\d+\.\d+") {
+            Pass "DNS interception works after restart"
+        } else {
+            Fail "DNS interception broken after restart"
+        }
+    } catch {
+        Fail "DNS test failed after restart"
+    }
+
+    # Check WFP filters restored; warn instead of staying silent when the log line is missing
+    if (LogGrepCount "WFP engine opened" 100) {
+        Pass "WFP re-initialized after restart"
+    } else { Warn "WFP engine open not found in recent logs after restart" }
+}
+
+function Test-NetworkChange {  # Manual step: after the operator changes networks, checks interception and prints recovery log lines
+    Header "7. Network Change Recovery"
+
+    Info "This test verifies recovery after network changes."
+    Manual "Switch Wi-Fi networks, or disable/re-enable network adapter, then press Enter"
+    WaitForKey
+
+    Start-Sleep -Seconds 5
+
+    # Test interception still works (resolver address stripped so a timed-out query is not counted as success)
+    Info "Test: nslookup example.com 8.8.8.8 (should still be intercepted)"
+    try {
+        $result = nslookup example.com 8.8.8.8 2>&1 | Out-String
+        if (($result -replace '\b8\.8\.8\.8\b', '') -match "\d+\.\d+\.\d+\.\d+") {
+            Pass "DNS interception works after network change"
+        } else {
+            Fail "DNS interception broken after network change"
+        }
+    } catch {
+        Fail "DNS test failed after network change"
+    }
+
+    # Check logs for recovery/network events (informational only, no pass/fail recorded)
+    if (Test-Path $CtrldLog) {
+        $recoveryLogs = LogGrep "recovery|network change|network monitor" 100
+        if ($recoveryLogs) {
+            Info "Recovery/network log entries:"
+            $recoveryLogs | Select-Object -Last 5 | ForEach-Object { Write-Host " $_" }
+        }
+    }
+}
+
+# =============================================================================
+# SUMMARY
+# =============================================================================
+
+function Print-Summary {  # Prints every recorded result, the totals, and debug hints when anything failed
+    Header "TEST SUMMARY"
+    Write-Host ""
+    foreach ($r in $Results) {
+        if ($r.StartsWith("PASS")) {
+            Write-Host " ✅ $($r.Substring(6))" -ForegroundColor Green
+        } elseif ($r.StartsWith("FAIL")) {
+            Write-Host " ❌ $($r.Substring(6))" -ForegroundColor Red
+        } elseif ($r.StartsWith("WARN")) {
+            Write-Host " ⚠️ $($r.Substring(6))" -ForegroundColor Yellow
+        }
+    }
+    Write-Host ""
+    Separator
+    Write-Host " Passed: $Pass | Failed: $Fail | Warnings: $Warn"
+    Separator
+
+    if ($Fail -gt 0) {
+        Write-Host "`n Some tests failed. Debug commands:" -ForegroundColor Red
+        Write-Host " netsh wfp show filters # dump all WFP filters"
+        Write-Host " Get-Content $CtrldLog -Tail 100 # recent ctrld logs"
+        Write-Host " Get-DnsClientServerAddress # current DNS config"
+        Write-Host " netsh wfp show state # WFP state dump"
+    } else {
+        Write-Host "`n All tests passed!" -ForegroundColor Green
+    }
+}
+
+# =============================================================================
+# MAIN
+# =============================================================================
+
+Write-Host "╔═══════════════════════════════════════════════════════╗" -ForegroundColor White
+Write-Host "║ ctrld DNS Intercept Mode — Windows Test Suite ║" -ForegroundColor White
+Write-Host "║ Tests WFP-based DNS interception ║" -ForegroundColor White
+Write-Host "╚═══════════════════════════════════════════════════════╝" -ForegroundColor White
+
+Check-Admin
+
+Write-Host ""
+Write-Host "Make sure ctrld is running with --dns-intercept before starting."
+Write-Host "Log location: $CtrldLog"
+WaitForKey
+
+Test-Prereqs
+Test-WfpState
+Test-DnsInterception
+Test-NonDnsUnaffected
+Test-CtrldLogHealth
+
+Separator
+Write-Host ""
+Write-Host "The next tests require manual steps (stop/start ctrld, network changes)."
+Write-Host "Press Enter to continue, or Ctrl+C to skip and see results so far."
+WaitForKey
+
+Test-CleanupOnStop
+Test-RestartResilience
+Test-NetworkChange
+
+Print-Summary
diff --git a/test-scripts/windows/test-recovery-bypass.ps1 b/test-scripts/windows/test-recovery-bypass.ps1
new file mode 100644
index 0000000..005a7fe
--- /dev/null
+++ b/test-scripts/windows/test-recovery-bypass.ps1
@@ -0,0 +1,289 @@
+# test-recovery-bypass.ps1 — Test DNS intercept recovery bypass (captive portal simulation)
+#
+# Simulates a captive portal by:
+# 1. Discovering ctrld's upstream IPs from active connections
+# 2. Blocking them via Windows Firewall rules
+# 3. Disabling/re-enabling the wifi adapter to trigger network change
+# 4. Verifying recovery bypass forwards to OS/DHCP resolver
+# 5. Removing firewall rules and verifying normal operation resumes
+#
+# SAFE: Uses named firewall rules that are cleaned up on exit.
+#
+# Usage (run as Administrator):
+# .\test-recovery-bypass.ps1 [-WifiAdapter "Wi-Fi"] [-CtrldLog "C:\temp\dns.log"]
+#
+# Prerequisites:
+# - ctrld running with --dns-intercept and -v 1 --log C:\temp\dns.log
+# - Run as Administrator
+
+param(
+    [string]$WifiAdapter = "Wi-Fi",
+    [string]$CtrldLog = "C:\temp\dns.log",
+    [int]$BlockDurationSec = 60  # NOTE(review): not referenced anywhere else in this script
+)
+
+$ErrorActionPreference = "Stop"
+$FwRulePrefix = "ctrld-test-recovery-block"
+$BlockedIPs = @()
+
+function Log($msg) { Write-Host "[$(Get-Date -Format 'HH:mm:ss')] $msg" -ForegroundColor Cyan }
+function Pass($msg) { Write-Host "[PASS] $msg" -ForegroundColor Green }
+function Fail($msg) { Write-Host "[FAIL] $msg" -ForegroundColor Red }
+function Warn($msg) { Write-Host "[WARN] $msg" -ForegroundColor Yellow }
+
+# ── Safety: cleanup function (safe to call more than once) ───────────────────
+function Cleanup {
+    Log "═══ CLEANUP ═══"
+
+    # Ensure wifi is enabled
+    Log "Ensuring wifi adapter is enabled..."
+    try { Enable-NetAdapter -Name $WifiAdapter -Confirm:$false -ErrorAction SilentlyContinue } catch {}
+
+    # Remove all test firewall rules (matched by display-name prefix)
+    Log "Removing test firewall rules..."
+    Get-NetFirewallRule -DisplayName "$FwRulePrefix*" -ErrorAction SilentlyContinue |
+        Remove-NetFirewallRule -ErrorAction SilentlyContinue
+    Log "Cleanup complete."
+}
+
+# Everything below runs inside try/finally so Cleanup also fires on Ctrl+C, `exit` and terminating errors; the engine event is a best-effort extra
+$null = Register-EngineEvent -SourceIdentifier PowerShell.Exiting -Action { Cleanup } -ErrorAction SilentlyContinue
+try {
+
+    # ── Pre-checks ───────────────────────────────────────────────────────────────
+    $isAdmin = ([Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)
+    if (-not $isAdmin) {
+        Fail "Run as Administrator!"
+        exit 1
+    }
+
+    if (-not (Test-Path $CtrldLog)) {
+        Fail "ctrld log not found at $CtrldLog"
+        Write-Host "Start ctrld with: ctrld run --dns-intercept --cd -v 1 --log $CtrldLog"
+        exit 1
+    }
+
+    # Check wifi adapter exists
+    $adapter = Get-NetAdapter -Name $WifiAdapter -ErrorAction SilentlyContinue
+    if (-not $adapter) {
+        Fail "Wifi adapter '$WifiAdapter' not found"
+        Write-Host "Available adapters:"
+        Get-NetAdapter | Format-Table Name, Status, InterfaceDescription
+        exit 1
+    }
+
+    Log "═══════════════════════════════════════════════════════════"
+    Log " Recovery Bypass Test (Captive Portal Simulation)"
+    Log "═══════════════════════════════════════════════════════════"
+    Log "Wifi adapter: $WifiAdapter"
+    Log "ctrld log: $CtrldLog"
+    Write-Host ""
+
+    # ── Phase 1: Discover upstream IPs ──────────────────────────────────────────
+    Log "Phase 1: Discovering ctrld upstream IPs from active connections"
+
+    $ctrldConns = Get-Process ctrld* -ErrorAction SilentlyContinue | ForEach-Object { Get-NetTCPConnection -OwningProcess $_.Id -ErrorAction SilentlyContinue } |
+        Where-Object { $_.State -eq "Established" -and $_.RemotePort -eq 443 }
+
+    $upstreamIPs = @()
+    if ($ctrldConns) {
+        $upstreamIPs = @($ctrldConns | Select-Object -ExpandProperty RemoteAddress -Unique |
+            Where-Object { $_ -notmatch "^(127\.|10\.|192\.168\.|172\.(1[6-9]|2[0-9]|3[01])\.)" })
+
+        foreach ($conn in $ctrldConns) {
+            Log " $($conn.LocalAddress):$($conn.LocalPort) -> $($conn.RemoteAddress):$($conn.RemotePort)"
+        }
+    }
+
+    # Also resolve known Control D endpoints ($upstreamIPs is kept an array above so += appends instead of concatenating strings)
+    foreach ($host_ in @("dns.controld.com", "freedns.controld.com")) {
+        try {
+            $resolved = Resolve-DnsName $host_ -Type A -ErrorAction SilentlyContinue
+            $resolved | ForEach-Object { if ($_.IPAddress) { $upstreamIPs += $_.IPAddress } }
+        } catch {}
+    }
+
+    $upstreamIPs = @($upstreamIPs | Sort-Object -Unique)
+
+    if ($upstreamIPs.Count -eq 0) {
+        Fail "Could not discover any upstream IPs!"
+        exit 1
+    }
+
+    Log "Found $($upstreamIPs.Count) upstream IP(s):"
+    foreach ($ip in $upstreamIPs) { Log " $ip" }
+    Write-Host ""
+
+    # ── Phase 2: Baseline ───────────────────────────────────────────────────────
+    Log "Phase 2: Baseline — verify DNS works normally"
+    $baseline = Resolve-DnsName example.com -Server 127.0.0.1 -Type A -ErrorAction SilentlyContinue
+    if ($baseline) {
+        Pass "Baseline: example.com -> $($baseline[0].IPAddress)"
+    } else {
+        Fail "DNS not working!"
+        exit 1
+    }
+
+    $logLinesBefore = @(Get-Content $CtrldLog).Count
+    Log "Log position: line $logLinesBefore"
+    Write-Host ""
+
+    # ── Phase 3: Block upstream IPs via Windows Firewall ────────────────────────
+    Log "Phase 3: Blocking upstream IPs via Windows Firewall"
+    foreach ($ip in $upstreamIPs) {
+        $ruleName = "$FwRulePrefix-$ip"
+        # Remove existing rule if any
+        Remove-NetFirewallRule -DisplayName $ruleName -ErrorAction SilentlyContinue
+        # Block outbound TCP 443 to this IP
+        New-NetFirewallRule -DisplayName $ruleName -Direction Outbound -Action Block `
+            -RemoteAddress $ip -Protocol TCP -RemotePort 443 `
+            -Description "Temporary test rule for ctrld recovery bypass test" | Out-Null
+        $BlockedIPs += $ip
+        Log " Blocked: $ip (outbound TCP 443)"
+    }
+    Pass "All $($upstreamIPs.Count) upstream IPs blocked"
+    Write-Host ""
+
+    # ── Phase 4: Cycle wifi ─────────────────────────────────────────────────────
+    Log "Phase 4: Cycling wifi to trigger network change event"
+    Log " Disabling $WifiAdapter..."
+    Disable-NetAdapter -Name $WifiAdapter -Confirm:$false
+    Start-Sleep -Seconds 3
+
+    Log " Enabling $WifiAdapter..."
+    Enable-NetAdapter -Name $WifiAdapter -Confirm:$false
+
+    Log " Waiting for wifi to reconnect (up to 20s)..."
+    $wifiUp = $false
+    for ($i = 0; $i -lt 20; $i++) {
+        $status = (Get-NetAdapter -Name $WifiAdapter).Status
+        if ($status -eq "Up") {
+            # Link is up; also require an IPv4 address before continuing
+            $ipAddr = (Get-NetIPAddress -InterfaceAlias $WifiAdapter -AddressFamily IPv4 -ErrorAction SilentlyContinue).IPAddress
+            if ($ipAddr) {
+                $wifiUp = $true
+                Pass "Wifi reconnected: $WifiAdapter -> $ipAddr"
+                break
+            }
+        }
+        Start-Sleep -Seconds 1
+    }
+
+    if (-not $wifiUp) {
+        Fail "Wifi did not reconnect in 20s!"
+        # Cleanup runs in the finally block at the end of the script
+        exit 1
+    }
+
+    Log " Waiting 5s for ctrld network monitor..."
+    Start-Sleep -Seconds 5
+    Write-Host ""
+
+    # ── Phase 5: Query and watch for recovery ────────────────────────────────────
+    Log "Phase 5: Sending queries — upstream blocked, recovery should activate"
+    Write-Host ""
+
+    $recoveryDetected = $false
+    $bypassActive = $false
+    $dnsDuringBypass = $false
+
+    for ($q = 1; $q -le 30; $q++) {
+        $result = $null
+        try {
+            $result = Resolve-DnsName "example.com" -Server 127.0.0.1 -Type A -DnsOnly -ErrorAction SilentlyContinue
+        } catch {}
+
+        if ($result) {
+            Log " Query #$q`: example.com -> $($result[0].IPAddress) ✓"
+        } else {
+            Log " Query #$q`: example.com -> FAIL ✗"
+        }
+
+        # Check only the ctrld log lines written since Phase 2 for recovery messages
+        $newLogs = Get-Content $CtrldLog | Select-Object -Skip $logLinesBefore
+        $logText = $newLogs -join "`n"
+
+        if (-not $recoveryDetected -and ($logText -match "enabling DHCP bypass|triggering recovery|No healthy")) {
+            Write-Host ""
+            Pass "🎯 Recovery flow triggered!"
+            $recoveryDetected = $true
+        }
+
+        if (-not $bypassActive -and ($logText -match "Recovery bypass active")) {
+            Pass "🔄 Recovery bypass forwarding to OS/DHCP resolver"
+            $bypassActive = $true
+        }
+
+        if ($recoveryDetected -and $result) {
+            Pass "✅ DNS resolves during recovery: example.com -> $($result[0].IPAddress)"
+            $dnsDuringBypass = $true
+            break
+        }
+
+        Start-Sleep -Seconds 2
+    }
+
+    # ── Phase 6: Show log entries ────────────────────────────────────────────────
+    Write-Host ""
+    Log "Phase 6: Recovery-related ctrld log entries"
+    Log "────────────────────────────────────────────"
+    $newLogs = Get-Content $CtrldLog | Select-Object -Skip $logLinesBefore
+    $relevant = $newLogs | Where-Object { $_ -match "recovery|bypass|DHCP|unhealthy|upstream.*fail|No healthy|network change|OS resolver" }
+    if ($relevant) {
+        $relevant | Select-Object -First 30 | ForEach-Object { Write-Host " $_" }
+    } else {
+        Warn "No recovery-related log entries found"
+        Get-Content $CtrldLog | Select-Object -Last 10 | ForEach-Object { Write-Host " $_" }
+    }
+
+    # ── Phase 7: Unblock and verify ─────────────────────────────────────────────
+    Write-Host ""
+    Log "Phase 7: Removing firewall blocks"
+    Get-NetFirewallRule -DisplayName "$FwRulePrefix*" -ErrorAction SilentlyContinue |
+        Remove-NetFirewallRule -ErrorAction SilentlyContinue
+    $BlockedIPs = @()
+    Pass "Firewall rules removed"
+
+    Log "Waiting for recovery (up to 30s)..."
+    $logLinesUnblock = @(Get-Content $CtrldLog).Count
+    $recoveryComplete = $false
+
+    for ($i = 0; $i -lt 15; $i++) {
+        try { $null = Resolve-DnsName example.com -Server 127.0.0.1 -Type A -DnsOnly -ErrorAction SilentlyContinue } catch {}
+        $postLogs = (Get-Content $CtrldLog | Select-Object -Skip $logLinesUnblock) -join "`n"
+        if ($postLogs -match "recovery complete|disabling DHCP bypass|Upstream.*recovered") {
+            $recoveryComplete = $true
+            Pass "ctrld recovered — normal operation resumed"
+            break
+        }
+        Start-Sleep -Seconds 2
+    }
+
+    if (-not $recoveryComplete) { Warn "Recovery completion not detected (may need more time)" }
+
+    # ── Phase 8: Final check ────────────────────────────────────────────────────
+    Write-Host ""
+    Log "Phase 8: Final DNS verification"
+    Start-Sleep -Seconds 2
+    $final = Resolve-DnsName example.com -Server 127.0.0.1 -Type A -ErrorAction SilentlyContinue
+    if ($final) {
+        Pass "DNS working: example.com -> $($final[0].IPAddress)"
+    } else {
+        Fail "DNS not resolving"
+    }
+
+    # ── Summary ──────────────────────────────────────────────────────────────────
+    Write-Host ""
+    Log "═══════════════════════════════════════════════════════════"
+    Log " Test Summary"
+    Log "═══════════════════════════════════════════════════════════"
+    if ($recoveryDetected) { Pass "Recovery bypass activated" } else { Fail "Recovery bypass NOT activated" }
+    if ($bypassActive) { Pass "Queries forwarded to OS/DHCP" } else { Warn "OS resolver forwarding not confirmed" }
+    if ($dnsDuringBypass) { Pass "DNS resolved during bypass" } else { Warn "DNS during bypass not confirmed" }
+    if ($recoveryComplete) { Pass "Normal operation resumed" } else { Warn "Recovery completion not confirmed" }
+    if ($final) { Pass "DNS functional at end of test" } else { Fail "DNS broken at end of test" }
+    Write-Host ""
+    Log "Full log: Get-Content $CtrldLog | Select-Object -Skip $logLinesBefore"
+
+# Cleanup always runs here, however the try block above is left (completion, error, exit, Ctrl+C)
+} finally { Cleanup }