Merge pull request #301 from Control-D-Inc/release-branch-v1.5.1

Release branch v1.5.1
2026-07-16 13:17:19 +02:00 · 2026-05-25 07:08:15 -04:00
parent 2926c76b76 5dd5846cca
commit 7e6f88b4ed
23 changed files with 1799 additions and 206 deletions
@@ -100,7 +100,7 @@ docker build -t controldns/ctrld . -f docker/Dockerfile


 # Usage
-The cli is self documenting, so free free to run `--help` on any sub-command to get specific usages. 
+The cli is self documenting, so feel free to run `--help` on any sub-command to get specific usages. 

 ## Arguments
 ```
@@ -266,5 +266,67 @@ The above will start a foreground process and:
 - Excluding `*.company.int` and `very-secure.local` matching queries, that are forwarded to `10.0.10.1:53`
 - Write a debug log to `/path/to/log.log`

+## DNS Intercept Mode
+When running `ctrld` alongside VPN software, DNS conflicts can cause intermittent failures, bypassed filtering, or configuration loops. DNS Intercept Mode prevents these issues by transparently capturing all DNS traffic on the system and routing it through `ctrld`, without modifying network adapter DNS settings.
+
+### When to Use
+Enable DNS Intercept Mode if you:
+- Use corporate VPN software (F5, Cisco AnyConnect, Palo Alto GlobalProtect, Zscaler)
+- Run overlay networks like Tailscale or WireGuard
+- Experience random DNS failures when VPN connects/disconnects
+- See gaps in your Control D analytics when VPN is active
+- Have endpoint security software that also manages DNS
+
+### Command
+
+Windows (Admin Shell)
+```shell
+ctrld.exe start --intercept-mode dns --cd RESOLVER_ID_HERE
+```
+
+macOS
+```shell
+sudo ctrld start --intercept-mode dns --cd RESOLVER_ID_HERE
+```
+
+`--intercept-mode dns` automatically detects VPN internal domains and routes them to the VPN's DNS server, while Control D handles everything else.
+
+To disable intercept mode on a service that already has it enabled:
+
+Windows (Admin Shell)
+```shell
+ctrld.exe start --intercept-mode off
+```
+
+macOS
+```shell
+sudo ctrld start --intercept-mode off
+```
+
+This removes the intercept rules and reverts to standard interface-based DNS configuration.
+
+### Platform Support
+| Platform | Supported | Mechanism |
+|----------|-----------|-----------|
+| Windows  | ✅        | NRPT (Name Resolution Policy Table) |
+| macOS    | ✅        | pf (packet filter) redirect |
+| Linux    | ❌        | Not currently supported |
+
+### Features
+- **VPN split routing** — VPN-specific domains are automatically detected and forwarded to the VPN's DNS server
+- **Captive portal recovery** — Wi-Fi login pages (hotels, airports, coffee shops) work automatically
+- **No network adapter changes** — DNS settings stay untouched, eliminating conflicts entirely
+- **Automatic port 53 conflict resolution** — if another process (e.g., `mDNSResponder` on macOS) is already using port 53, `ctrld` automatically listens on a different port. OS-level packet interception redirects all DNS traffic to `ctrld` transparently, so no manual configuration is needed. This only applies to intercept mode.
+
+### Tested VPN Software
+- F5 BIG-IP APM
+- Cisco AnyConnect
+- Palo Alto GlobalProtect
+- Tailscale (including Exit Nodes)
+- Windscribe
+- WireGuard
+
+For more details, see the [DNS Intercept Mode documentation](https://docs.controld.com/docs/dns-intercept).
+
 ## Contributing
 See [Contribution Guideline](./docs/contributing.md)
@@ -11,12 +11,14 @@ import (
 	"net/http"
 	"os"
 	"os/exec"
+	"os/signal"
 	"path/filepath"
 	"runtime"
 	"slices"
 	"sort"
 	"strconv"
 	"strings"
+	"syscall"
 	"time"

 	"github.com/docker/go-units"
@@ -146,6 +148,88 @@ func initLogCmd() *cobra.Command {
 			fmt.Println(logs.Data)
 		},
 	}
+	var tailLines int
+	logTailCmd := &cobra.Command{
+		Use:   "tail",
+		Short: "Tail live runtime debug logs",
+		Long:  "Stream live runtime debug logs to the terminal, similar to tail -f. Press Ctrl+C to stop.",
+		Args:  cobra.NoArgs,
+		PreRun: func(cmd *cobra.Command, args []string) {
+			checkHasElevatedPrivilege()
+		},
+		Run: func(cmd *cobra.Command, args []string) {
+
+			p := &prog{router: router.New(&cfg, false)}
+			s, _ := newService(p, svcConfig)
+
+			status, err := s.Status()
+			if errors.Is(err, service.ErrNotInstalled) {
+				mainLog.Load().Warn().Msg("service not installed")
+				return
+			}
+			if status == service.StatusStopped {
+				mainLog.Load().Warn().Msg("service is not running")
+				return
+			}
+
+			dir, err := socketDir()
+			if err != nil {
+				mainLog.Load().Fatal().Err(err).Msg("failed to find ctrld home dir")
+			}
+			cc := newControlClient(filepath.Join(dir, ctrldControlUnixSock))
+			tailPath := fmt.Sprintf("%s?lines=%d", tailLogsPath, tailLines)
+			resp, err := cc.postStream(tailPath, nil)
+			if err != nil {
+				mainLog.Load().Fatal().Err(err).Msg("failed to connect for log tailing")
+			}
+			defer resp.Body.Close()
+
+			switch resp.StatusCode {
+			case http.StatusMovedPermanently:
+				warnRuntimeLoggingNotEnabled()
+				return
+			case http.StatusOK:
+			default:
+				mainLog.Load().Fatal().Msgf("unexpected response status: %d", resp.StatusCode)
+				return
+			}
+
+			// Set up signal handling for clean shutdown.
+			ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
+			defer stop()
+
+			done := make(chan struct{})
+			go func() {
+				defer close(done)
+				// Stream output to stdout.
+				buf := make([]byte, 4096)
+				for {
+					n, readErr := resp.Body.Read(buf)
+					if n > 0 {
+						os.Stdout.Write(buf[:n])
+					}
+					if readErr != nil {
+						if readErr != io.EOF {
+							mainLog.Load().Error().Err(readErr).Msg("error reading log stream")
+						}
+						return
+					}
+				}
+			}()
+
+			select {
+			case <-ctx.Done():
+				if errors.Is(ctx.Err(), context.Canceled) {
+					msg := fmt.Sprintf("\nexiting: %s\n", context.Cause(ctx).Error())
+					os.Stdout.WriteString(msg)
+				}
+			case <-done:
+			}
+
+		},
+	}
+	logTailCmd.Flags().IntVarP(&tailLines, "lines", "n", 10, "Number of historical lines to show on connect")
+
 	logCmd := &cobra.Command{
 		Use:   "log",
 		Short: "Manage runtime debug logs",
@@ -156,6 +240,7 @@ func initLogCmd() *cobra.Command {
 	}
 	logCmd.AddCommand(logSendCmd)
 	logCmd.AddCommand(logViewCmd)
+	logCmd.AddCommand(logTailCmd)
 	rootCmd.AddCommand(logCmd)

 	return logCmd
@@ -32,6 +32,12 @@ func (c *controlClient) post(path string, data io.Reader) (*http.Response, error
 	return c.c.Post("http://unix"+path, contentTypeJson, data)
 }

+// postStream sends a POST request with no timeout, suitable for long-lived streaming connections.
+func (c *controlClient) postStream(path string, data io.Reader) (*http.Response, error) {
+	c.c.Timeout = 0
+	return c.c.Post("http://unix"+path, contentTypeJson, data)
+}
+
 // deactivationRequest represents request for validating deactivation pin.
 type deactivationRequest struct {
 	Pin int64 `json:"pin"`
@@ -10,6 +10,7 @@ import (
 	"os"
 	"reflect"
 	"sort"
+	"strconv"
 	"time"

 	"github.com/kardianos/service"
@@ -29,6 +30,7 @@ const (
 	ifacePath        = "/iface"
 	viewLogsPath     = "/log/view"
 	sendLogsPath     = "/log/send"
+	tailLogsPath     = "/log/tail"
 )

 type ifaceResponse struct {
@@ -344,6 +346,170 @@ func (p *prog) registerControlServerHandler() {
 		}
 		p.internalLogSent = time.Now()
 	}))
+	p.cs.register(tailLogsPath, http.HandlerFunc(func(w http.ResponseWriter, request *http.Request) {
+		flusher, ok := w.(http.Flusher)
+		if !ok {
+			http.Error(w, "streaming unsupported", http.StatusInternalServerError)
+			return
+		}
+
+		// Determine logging mode and validate before starting the stream.
+		var lw *logWriter
+		useInternalLog := p.needInternalLogging()
+		if useInternalLog {
+			p.mu.Lock()
+			lw = p.internalLogWriter
+			p.mu.Unlock()
+			if lw == nil {
+				w.WriteHeader(http.StatusMovedPermanently)
+				return
+			}
+		} else if p.cfg.Service.LogPath == "" {
+			// No logging configured at all.
+			w.WriteHeader(http.StatusMovedPermanently)
+			return
+		}
+
+		// Parse optional "lines" query param for initial context.
+		numLines := 10
+		if v := request.URL.Query().Get("lines"); v != "" {
+			if n, err := strconv.Atoi(v); err == nil && n >= 0 {
+				numLines = n
+			}
+		}
+
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		w.Header().Set("Transfer-Encoding", "chunked")
+		w.Header().Set("X-Content-Type-Options", "nosniff")
+		w.WriteHeader(http.StatusOK)
+
+		if useInternalLog {
+			// Internal logging mode: subscribe to the logWriter.
+
+			// Send last N lines as initial context.
+			if numLines > 0 {
+				if tail := lw.tailLastLines(numLines); len(tail) > 0 {
+					w.Write(tail)
+					flusher.Flush()
+				}
+			}
+
+			ch, unsub := lw.Subscribe()
+			defer unsub()
+			for {
+				select {
+				case data, ok := <-ch:
+					if !ok {
+						return
+					}
+					if _, err := w.Write(data); err != nil {
+						return
+					}
+					flusher.Flush()
+				case <-request.Context().Done():
+					return
+				}
+			}
+		} else {
+			// File-based logging mode: tail the log file.
+			logFile := normalizeLogFilePath(p.cfg.Service.LogPath)
+			f, err := os.Open(logFile)
+			if err != nil {
+				// Already committed 200, just return.
+				return
+			}
+			defer f.Close()
+
+			// Seek to show last N lines.
+			if numLines > 0 {
+				if tail := tailFileLastLines(f, numLines); len(tail) > 0 {
+					w.Write(tail)
+					flusher.Flush()
+				}
+			} else {
+				// Seek to end.
+				f.Seek(0, io.SeekEnd)
+			}
+
+			// Poll for new data.
+			buf := make([]byte, 4096)
+			ticker := time.NewTicker(200 * time.Millisecond)
+			defer ticker.Stop()
+			for {
+				select {
+				case <-ticker.C:
+					n, err := f.Read(buf)
+					if n > 0 {
+						if _, werr := w.Write(buf[:n]); werr != nil {
+							return
+						}
+						flusher.Flush()
+					}
+					if err != nil && err != io.EOF {
+						return
+					}
+				case <-request.Context().Done():
+					return
+				}
+			}
+		}
+	}))
+}
+
+// tailFileLastLines reads the last n lines from a file and returns them.
+// The file position is left at the end of the file after this call.
+func tailFileLastLines(f *os.File, n int) []byte {
+	stat, err := f.Stat()
+	if err != nil || stat.Size() == 0 {
+		return nil
+	}
+
+	// Read from the end in chunks to find the last n lines.
+	const chunkSize = 4096
+	fileSize := stat.Size()
+	var lines []byte
+	offset := fileSize
+	count := 0
+
+	for offset > 0 && count <= n {
+		readSize := int64(chunkSize)
+		if readSize > offset {
+			readSize = offset
+		}
+		offset -= readSize
+		buf := make([]byte, readSize)
+		nRead, err := f.ReadAt(buf, offset)
+		if err != nil && err != io.EOF {
+			break
+		}
+		buf = buf[:nRead]
+		lines = append(buf, lines...)
+
+		// Count newlines in this chunk.
+		for _, b := range buf {
+			if b == '\n' {
+				count++
+			}
+		}
+	}
+
+	// Trim to last n lines.
+	idx := 0
+	nlCount := 0
+	for i := len(lines) - 1; i >= 0; i-- {
+		if lines[i] == '\n' {
+			nlCount++
+			if nlCount == n+1 {
+				idx = i + 1
+				break
+			}
+		}
+	}
+	lines = lines[idx:]
+
+	// Seek to end of file for subsequent reads.
+	f.Seek(0, io.SeekEnd)
+	return lines
 }

 func jsonResponse(next http.Handler) http.Handler {
@@ -321,14 +321,13 @@ func (p *prog) startDNSIntercept() error {
 //	options → normalization (scrub) → queueing → translation (nat/rdr) → filtering (pass/block/anchor)
 //
 // "pfctl -sr" returns BOTH scrub-anchor (normalization) AND anchor/pass/block (filter) rules.
-// "pfctl -sn" returns nat-anchor AND rdr-anchor (translation) rules.
+// "pfctl -sn" returns rdr-anchor (translation) rules.
 // Both commands emit "No ALTQ support in kernel" warnings on stderr.
 //
 // We must reassemble in correct order: scrub → nat/rdr → filter.
 //
 // The anchor reference does not survive a reboot, but ctrld re-adds it on every start.
 func (p *prog) ensurePFAnchorReference() error {
-	natAnchorRef := fmt.Sprintf("nat-anchor \"%s\"", pfAnchorName)
 	rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName)
 	anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName)

@@ -347,11 +346,10 @@ func (p *prog) ensurePFAnchorReference() error {
 	natLines := pfFilterRuleLines(string(natOut))
 	filterLines := pfFilterRuleLines(string(filterOut))

-	hasNatAnchor := pfContainsRule(natLines, natAnchorRef)
 	hasRdrAnchor := pfContainsRule(natLines, rdrAnchorRef)
 	hasAnchor := pfContainsRule(filterLines, anchorRef)

-	if hasNatAnchor && hasRdrAnchor && hasAnchor {
+	if hasRdrAnchor && hasAnchor {
 		// Verify anchor ordering: our anchor should appear before other anchors
 		// for reliable DNS interception priority. Log a warning if out of order,
 		// but don't force a reload (the interface-specific rules in our anchor
@@ -380,15 +378,8 @@ func (p *prog) ensurePFAnchorReference() error {
 	// rules in whichever anchor appears first win. By prepending, our DNS
 	// intercept rules match port 53 traffic before a VPN app's broader
 	// "pass out quick on <iface> all" rules in their anchor.
-	if !hasNatAnchor || !hasRdrAnchor {
-		var newRefs []string
-		if !hasNatAnchor {
-			newRefs = append(newRefs, natAnchorRef)
-		}
-		if !hasRdrAnchor {
-			newRefs = append(newRefs, rdrAnchorRef)
-		}
-		natLines = append(newRefs, natLines...)
+	if !hasRdrAnchor {
+		natLines = append([]string{rdrAnchorRef}, natLines...)
 	}
 	if !hasAnchor {
 		pureFilterLines = append([]string{anchorRef}, pureFilterLines...)
@@ -590,7 +581,6 @@ func (p *prog) stopDNSIntercept() error {
 // The anchor itself is already flushed by stopDNSIntercept, so even if removal
 // fails, the empty anchor is a no-op.
 func (p *prog) removePFAnchorReference() error {
-	natAnchorRef := fmt.Sprintf("nat-anchor \"%s\"", pfAnchorName)
 	rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName)
 	anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName)

@@ -609,7 +599,7 @@ func (p *prog) removePFAnchorReference() error {

 	var cleanNat []string
 	for _, line := range natLines {
-		if !strings.Contains(line, rdrAnchorRef) && !strings.Contains(line, natAnchorRef) {
+		if !strings.Contains(line, rdrAnchorRef) {
 			cleanNat = append(cleanNat, line)
 		}
 	}
@@ -804,23 +794,13 @@ func (p *prog) buildPFAnchorRules(vpnExemptions []vpnDNSExemption) string {
 	// a stateful entry that handles response routing. Using "rdr pass" would skip filter
 	// evaluation, and its implicit state alone is insufficient for response delivery —
 	// proven by commit 51cf029 where responses were silently dropped.
-	rules.WriteString("# --- Translation rules (nat + rdr) ---\n")
-
-	// NAT source to ::1 for IPv6 DNS on loopback. macOS/BSD rejects sendmsg from
-	// [::1] to a global unicast IPv6 address (EINVAL), unlike IPv4 where sendmsg from
-	// 127.0.0.1 to local private IPs works fine. The rdr rewrites the destination but
-	// preserves the original source (machine's global IPv6). Without nat, ctrld cannot
-	// reply. pf reverses both translations on the response path.
-	// Note: nat must appear before rdr (pf evaluates nat first in translation phase).
-	listenerAddr6 := fmt.Sprintf("::1 port %d", listenerPort)
-	rules.WriteString("nat on lo0 inet6 proto udp from ! ::1 to ! ::1 port 53 -> ::1\n")
-	rules.WriteString("nat on lo0 inet6 proto tcp from ! ::1 to ! ::1 port 53 -> ::1\n")
+	rules.WriteString("# --- Translation rules (rdr) ---\n")

 	rules.WriteString("# Redirect DNS on loopback to ctrld's listener.\n")
 	rules.WriteString(fmt.Sprintf("rdr on lo0 inet proto udp from any to ! %s port 53 -> %s\n", listenerIP, listenerAddr))
 	rules.WriteString(fmt.Sprintf("rdr on lo0 inet proto tcp from any to ! %s port 53 -> %s\n", listenerIP, listenerAddr))
-	rules.WriteString(fmt.Sprintf("rdr on lo0 inet6 proto udp from any to ! ::1 port 53 -> %s\n", listenerAddr6))
-	rules.WriteString(fmt.Sprintf("rdr on lo0 inet6 proto tcp from any to ! ::1 port 53 -> %s\n\n", listenerAddr6))
+	// No IPv6 rdr — IPv6 DNS is blocked at the filter level (see below).
+	rules.WriteString("\n")

 	// --- Filtering rules ---
 	rules.WriteString("# --- Filtering rules (pass) ---\n\n")
@@ -982,8 +962,7 @@ func (p *prog) buildPFAnchorRules(vpnExemptions []vpnDNSExemption) string {
 			}
 			rules.WriteString(fmt.Sprintf("pass out quick on %s route-to lo0 inet proto udp from any to ! %s port 53\n", iface, listenerIP))
 			rules.WriteString(fmt.Sprintf("pass out quick on %s route-to lo0 inet proto tcp from any to ! %s port 53\n", iface, listenerIP))
-			rules.WriteString(fmt.Sprintf("pass out quick on %s route-to lo0 inet6 proto udp from any to ! ::1 port 53\n", iface))
-			rules.WriteString(fmt.Sprintf("pass out quick on %s route-to lo0 inet6 proto tcp from any to ! ::1 port 53\n", iface))
+			// No IPv6 route-to — IPv6 DNS is blocked, not intercepted.
 		}
 		rules.WriteString("\n")
 	}
@@ -1003,10 +982,14 @@ func (p *prog) buildPFAnchorRules(vpnExemptions []vpnDNSExemption) string {
 	rules.WriteString(fmt.Sprintf("pass out quick on ! lo0 route-to lo0 inet proto udp from any to ! %s port 53\n", listenerIP))
 	rules.WriteString(fmt.Sprintf("pass out quick on ! lo0 route-to lo0 inet proto tcp from any to ! %s port 53\n\n", listenerIP))

-	// Force remaining outbound IPv6 DNS through loopback for interception.
-	rules.WriteString("# Force remaining outbound IPv6 DNS through loopback for interception.\n")
-	rules.WriteString("pass out quick on ! lo0 route-to lo0 inet6 proto udp from any to ! ::1 port 53\n")
-	rules.WriteString("pass out quick on ! lo0 route-to lo0 inet6 proto tcp from any to ! ::1 port 53\n\n")
+	// Block all outbound IPv6 DNS. ctrld only intercepts IPv4 DNS via the loopback
+	// redirect. IPv6 DNS interception on macOS is not feasible because the kernel rejects
+	// sendmsg from [::1] to global unicast IPv6 (EINVAL), and pf's nat-on-lo0 doesn't
+	// fire for route-to'd packets. Blocking forces macOS to fall back to IPv4 DNS,
+	// which is fully intercepted. See docs/pf-dns-intercept.md for details.
+	rules.WriteString("# Block outbound IPv6 DNS — ctrld intercepts IPv4 only.\n")
+	rules.WriteString("# macOS falls back to IPv4 DNS automatically.\n")
+	rules.WriteString("block out quick on ! lo0 inet6 proto { udp, tcp } from any to any port 53\n\n")

 	// Allow route-to'd DNS packets to pass outbound on lo0.
 	// Without this, VPN firewalls with "block drop all" (e.g., Windscribe) drop the packet
@@ -1018,8 +1001,8 @@ func (p *prog) buildPFAnchorRules(vpnExemptions []vpnDNSExemption) string {
 	rules.WriteString("# Pass route-to'd DNS outbound on lo0 — no state to avoid bypassing rdr inbound.\n")
 	rules.WriteString(fmt.Sprintf("pass out quick on lo0 inet proto udp from any to ! %s port 53 no state\n", listenerIP))
 	rules.WriteString(fmt.Sprintf("pass out quick on lo0 inet proto tcp from any to ! %s port 53 no state\n", listenerIP))
-	rules.WriteString("pass out quick on lo0 inet6 proto udp from any to ! ::1 port 53 no state\n")
-	rules.WriteString("pass out quick on lo0 inet6 proto tcp from any to ! ::1 port 53 no state\n\n")
+	// No IPv6 lo0 pass — IPv6 DNS is blocked, not routed through lo0.
+	rules.WriteString("\n")

 	// Allow the redirected traffic through on loopback (inbound after rdr).
 	//
@@ -1034,7 +1017,7 @@ func (p *prog) buildPFAnchorRules(vpnExemptions []vpnDNSExemption) string {
 	// (source 127.0.0.1 → original DNS server IP, e.g., 10.255.255.3).
 	rules.WriteString("# Accept redirected DNS — reply-to lo0 forces response through loopback.\n")
 	rules.WriteString(fmt.Sprintf("pass in quick on lo0 reply-to lo0 inet proto { udp, tcp } from any to %s\n", listenerAddr))
-	rules.WriteString(fmt.Sprintf("pass in quick on lo0 reply-to lo0 inet6 proto { udp, tcp } from any to %s\n", listenerAddr6))
+	// No IPv6 pass-in — IPv6 DNS is blocked, not redirected to [::1].

 	return rules.String()
 }
@@ -1043,12 +1026,11 @@ func (p *prog) buildPFAnchorRules(vpnExemptions []vpnDNSExemption) string {
 // It verifies both the anchor references in the main ruleset and the rules within
 // our anchor. Failures are logged at ERROR level to make them impossible to miss.
 func (p *prog) verifyPFState() {
-	natAnchorRef := fmt.Sprintf("nat-anchor \"%s\"", pfAnchorName)
 	rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName)
 	anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName)
 	verified := true

-	// Check main ruleset for anchor references (nat-anchor + rdr-anchor in translation rules).
+	// Check main ruleset for anchor references (rdr-anchor in translation rules).
 	natOut, err := exec.Command("pfctl", "-sn").CombinedOutput()
 	if err != nil {
 		mainLog.Load().Error().Err(err).Msg("DNS intercept: VERIFICATION FAILED — could not dump NAT rules")
@@ -1059,10 +1041,6 @@ func (p *prog) verifyPFState() {
 			mainLog.Load().Error().Msg("DNS intercept: VERIFICATION FAILED — rdr-anchor reference missing from running NAT rules")
 			verified = false
 		}
-		if !strings.Contains(natStr, natAnchorRef) {
-			mainLog.Load().Error().Msg("DNS intercept: VERIFICATION FAILED — nat-anchor reference missing from running NAT rules")
-			verified = false
-		}
 	}

 	filterOut, err := exec.Command("pfctl", "-sr").CombinedOutput()
@@ -1229,6 +1207,7 @@ func stringSlicesEqual(a, b []string) bool {
 	return true
 }

+
 // pfStartStabilization enters stabilization mode, suppressing all pf restores
 // until the VPN's ruleset stops changing. This prevents a death spiral where
 // ctrld and the VPN repeatedly overwrite each other's pf rules.
@@ -1347,7 +1326,6 @@ func (p *prog) ensurePFAnchorActive() bool {
 		}
 	}

-	natAnchorRef := fmt.Sprintf("nat-anchor \"%s\"", pfAnchorName)
 	rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName)
 	anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName)
 	needsRestore := false
@@ -1363,10 +1341,6 @@ func (p *prog) ensurePFAnchorActive() bool {
 		mainLog.Load().Warn().Msg("DNS intercept watchdog: rdr-anchor reference missing from running ruleset")
 		needsRestore = true
 	}
-	if !strings.Contains(natStr, natAnchorRef) {
-		mainLog.Load().Warn().Msg("DNS intercept watchdog: nat-anchor reference missing from running ruleset")
-		needsRestore = true
-	}

 	if !needsRestore {
 		filterOut, err := exec.Command("pfctl", "-sr").CombinedOutput()
@@ -1762,7 +1736,6 @@ func (p *prog) pfInterceptMonitor() {
 // The reload is safe for VPN interop because it reassembles from the current running
 // ruleset (pfctl -sr/-sn), preserving all existing anchors and rules.
 func (p *prog) forceReloadPFMainRuleset() {
-	natAnchorRef := fmt.Sprintf("nat-anchor \"%s\"", pfAnchorName)
 	rdrAnchorRef := fmt.Sprintf("rdr-anchor \"%s\"", pfAnchorName)
 	anchorRef := fmt.Sprintf("anchor \"%s\"", pfAnchorName)

@@ -1793,9 +1766,6 @@ func (p *prog) forceReloadPFMainRuleset() {
 	}

 	// Ensure our anchor references are present (they may have been wiped).
-	if !pfContainsRule(natLines, natAnchorRef) {
-		natLines = append([]string{natAnchorRef}, natLines...)
-	}
 	if !pfContainsRule(natLines, rdrAnchorRef) {
 		natLines = append([]string{rdrAnchorRef}, natLines...)
 	}
@@ -1846,3 +1816,7 @@ func (p *prog) forceReloadPFMainRuleset() {

 	mainLog.Load().Info().Msg("DNS intercept: force reload — pf ruleset and anchor reloaded successfully")
 }
+
+// osHealthcheckSuppressed always returns false on darwin — WFP loopback
+// protect (the trigger for suppression) is Windows-only.
+func (p *prog) osHealthcheckSuppressed() bool { return false }
@@ -37,3 +37,7 @@ func (p *prog) scheduleDelayedRechecks() {}

 // pfInterceptMonitor is a no-op on unsupported platforms.
 func (p *prog) pfInterceptMonitor() {}
+
+// osHealthcheckSuppressed always returns false on non-Windows platforms —
+// WFP loopback protect (the trigger for suppression) is Windows-only.
+func (p *prog) osHealthcheckSuppressed() bool { return false }
@@ -9,6 +9,7 @@ import (
 	"net"
 	"os/exec"
 	"runtime"
+	"sync"
 	"sync/atomic"
 	"time"
 	"unsafe"
@@ -118,6 +119,14 @@ const (

 	// DNS port.
 	dnsPort uint16 = 53
+
+	// FWPM_FILTER_FLAG constants from fwpmtypes.h.
+	// See: https://learn.microsoft.com/en-us/windows/win32/api/fwpmtypes/ns-fwpmtypes-fwpm_filter0
+	//
+	// FWPM_FILTER_FLAG_CLEAR_ACTION_RIGHT (0x08) prevents lower-weight sublayers
+	// from overriding this filter's PERMIT action ("hard permit"). Used in DNS
+	// mode to override third-party WFP blocks (e.g., OpenVPN's block-outside-dns).
+	fwpmFilterFlagClearActionRight uint32 = 0x00000008
 )

 // WFP API structures. These mirror the C structures from fwpmtypes.h and fwptypes.h.
@@ -258,6 +267,17 @@ type wfpState struct {
 	listenerIP string
 	// stopCh is used to shut down the NRPT health monitor goroutine.
 	stopCh chan struct{}
+	// mu protects loopbackProtectActive, loopbackPermitIDs, and engineHandle
+	// from concurrent access between nrptProbeAndHeal (goroutine) and
+	// stopDNSIntercept / cleanupWFPFilters (main goroutine).
+	mu sync.Mutex
+	// loopbackProtectActive is true when DNS mode has activated a minimal WFP
+	// session to permit loopback DNS. This counters third-party WFP block filters
+	// (e.g., OpenVPN's block-outside-dns) that prevent NRPT from routing queries
+	// to ctrld's listener on 127.0.0.1. See issue #526.
+	loopbackProtectActive bool
+	// loopbackPermitIDs stores the filter IDs for the loopback protect permits.
+	loopbackPermitIDs []uint64
 }

 // Lazy-loaded WFP DLL procedures.
@@ -607,6 +627,17 @@ func (p *prog) startDNSIntercept() error {
 		}
 	} else {
 		mainLog.Load().Info().Msg("DNS intercept: dns mode — NRPT only, no WFP filters (graceful)")
+		// Proactively add loopback WFP permit filters to protect the NRPT
+		// → 127.0.0.1 path from third-party DNS block filters (e.g., OpenVPN's
+		// block-outside-dns). These are narrowly scoped (port 53 to localhost
+		// only) and use CLEAR_ACTION_RIGHT to override any block from other
+		// sublayers. Adding them at startup eliminates the DNS outage window
+		// that would otherwise occur between VPN connect and reactive activation.
+		if err := p.activateLoopbackWFPProtect(state); err != nil {
+			// Non-fatal: loopback protect is a defense-in-depth measure.
+			// NRPT still works when no third-party WFP blocks are present.
+			mainLog.Load().Warn().Err(err).Msg("DNS intercept: failed to activate proactive loopback WFP protect — will retry on probe failure")
+		}
 	}

 	p.dnsInterceptState = state
@@ -878,14 +909,13 @@ func (p *prog) addWFPPermitLocalhostFilter(engineHandle uintptr, name string, la
 	return filterID, nil
 }

-// addWFPPermitSubnetFilter adds a WFP filter that permits outbound DNS to a given
-// IPv4 subnet (addr/mask in host byte order). Used to exempt RFC1918 and CGNAT ranges
-// so VPN DNS servers on private IPs are not blocked.
-func (p *prog) addWFPPermitSubnetFilter(engineHandle uintptr, name string, proto uint8, addr, mask uint32) (uint64, error) {
+// addWFPPermitDNSFilter is the unified helper for adding a WFP permit filter for
+// outbound DNS (port 53) with caller-specified address condition, flags, and weight.
+// Both subnet permits (RFC1918/CGNAT, flags=0, weight=10) and hard loopback permits
+// (CLEAR_ACTION_RIGHT, weight=15) use this to avoid code drift.
+func (p *prog) addWFPPermitDNSFilter(engineHandle uintptr, name string, layerKey windows.GUID, proto uint8, addrCond fwpmFilterCondition0, flags uint32, weight uint8) (uint64, error) {
 	filterName, _ := windows.UTF16PtrFromString("ctrld: " + name)

-	addrMask := fwpV4AddrAndMask{addr: addr, mask: mask}
-
 	conditions := make([]fwpmFilterCondition0, 3)

 	conditions[0] = fwpmFilterCondition0{
@@ -902,22 +932,18 @@ func (p *prog) addWFPPermitSubnetFilter(engineHandle uintptr, name string, proto
 	conditions[1].condValue.valueType = fwpUint16
 	conditions[1].condValue.value = uint64(dnsPort)

-	conditions[2] = fwpmFilterCondition0{
-		fieldKey:  fwpmConditionIPRemoteAddress,
-		matchType: fwpMatchEqual,
-	}
-	conditions[2].condValue.valueType = fwpV4AddrMask
-	conditions[2].condValue.value = uint64(uintptr(unsafe.Pointer(&addrMask)))
+	conditions[2] = addrCond

 	filter := fwpmFilter0{
-		layerKey:        fwpmLayerALEAuthConnectV4,
+		flags:           flags,
+		layerKey:        layerKey,
 		subLayerKey:     ctrldSubLayerGUID,
 		numFilterConds:  3,
 		filterCondition: &conditions[0],
 	}
 	filter.displayData.name = filterName
 	filter.weight.valueType = fwpUint8
-	filter.weight.value = 10
+	filter.weight.value = uint64(weight)
 	filter.action.actionType = fwpActionPermit

 	var filterID uint64
@@ -927,7 +953,6 @@ func (p *prog) addWFPPermitSubnetFilter(engineHandle uintptr, name string, proto
 		0,
 		uintptr(unsafe.Pointer(&filterID)),
 	)
-	runtime.KeepAlive(&addrMask)
 	runtime.KeepAlive(conditions)
 	if r1 != 0 {
 		return 0, fmt.Errorf("FwpmFilterAdd0 failed: HRESULT 0x%x", r1)
@@ -935,6 +960,24 @@ func (p *prog) addWFPPermitSubnetFilter(engineHandle uintptr, name string, proto
 	return filterID, nil
 }

+// addWFPPermitSubnetFilter adds a WFP filter that permits outbound DNS to a given
+// IPv4 subnet (addr/mask in host byte order). Used to exempt RFC1918 and CGNAT ranges
+// so VPN DNS servers on private IPs are not blocked.
+func (p *prog) addWFPPermitSubnetFilter(engineHandle uintptr, name string, proto uint8, addr, mask uint32) (uint64, error) {
+	addrMask := fwpV4AddrAndMask{addr: addr, mask: mask}
+
+	addrCond := fwpmFilterCondition0{
+		fieldKey:  fwpmConditionIPRemoteAddress,
+		matchType: fwpMatchEqual,
+	}
+	addrCond.condValue.valueType = fwpV4AddrMask
+	addrCond.condValue.value = uint64(uintptr(unsafe.Pointer(&addrMask)))
+
+	filterID, err := p.addWFPPermitDNSFilter(engineHandle, name, fwpmLayerALEAuthConnectV4, proto, addrCond, 0, 10)
+	runtime.KeepAlive(&addrMask)
+	return filterID, err
+}
+
 // wfpSublayerExists checks whether our WFP sublayer still exists in the engine.
 // Used by the watchdog to detect if another program removed our filters.
 func wfpSublayerExists(engineHandle uintptr) bool {
@@ -962,6 +1005,21 @@ func (p *prog) cleanupWFPFilters(state *wfpState) {
 		return
 	}

+	// Clean up loopback protect filters (DNS mode VPN workaround).
+	state.mu.Lock()
+	loopbackIDs := state.loopbackPermitIDs
+	state.loopbackPermitIDs = nil
+	state.loopbackProtectActive = false
+	state.mu.Unlock()
+	for _, filterID := range loopbackIDs {
+		r1, _, _ := procFwpmFilterDeleteById0.Call(state.engineHandle, uintptr(filterID))
+		if r1 != 0 {
+			mainLog.Load().Warn().Msgf("DNS intercept: failed to remove loopback protect filter (ID: %d, code: 0x%x)", filterID, r1)
+		} else {
+			mainLog.Load().Debug().Msgf("DNS intercept: removed loopback protect filter (ID: %d)", filterID)
+		}
+	}
+
 	for _, filterID := range state.vpnPermitFilterIDs {
 		r1, _, _ := procFwpmFilterDeleteById0.Call(state.engineHandle, uintptr(filterID))
 		if r1 != 0 {
@@ -1024,6 +1082,174 @@ func (p *prog) cleanupWFPFilters(state *wfpState) {
 	}
 }

+// activateLoopbackWFPProtect opens a minimal WFP session and adds "hard permit"
+// filters for DNS to localhost. This is used in DNS mode when NRPT probe failures
+// are detected, typically caused by third-party VPN software (e.g., OpenVPN) that
+// installs WFP block filters via block-outside-dns. The hard permit (with
+// FWPM_FILTER_FLAG_CLEAR_ACTION_RIGHT) in a max-weight sublayer overrides the
+// third-party blocks without affecting their protection for non-loopback DNS.
+//
+// See: https://gitlab.int.windscribe.com/controld/clients/ctrld/-/issues/526
+func (p *prog) activateLoopbackWFPProtect(state *wfpState) error {
+	state.mu.Lock()
+	defer state.mu.Unlock()
+
+	if state.loopbackProtectActive {
+		mainLog.Load().Debug().Msg("DNS intercept: loopback WFP protect already active")
+		return nil
+	}
+	// Only activate in DNS mode. Hard mode manages its own full WFP state
+	// (block + permit filters in the same sublayer). Activating loopback
+	// protect would delete the hard mode sublayer and all its filters.
+	if hardIntercept {
+		mainLog.Load().Debug().Msg("DNS intercept: skipping loopback WFP protect in hard mode")
+		return nil
+	}
+
+	mainLog.Load().Info().Msg("DNS intercept: activating loopback WFP protect (countering third-party DNS block filters)")
+
+	// Open WFP engine if not already open (DNS mode doesn't open it normally).
+	if state.engineHandle == 0 {
+		var engineHandle uintptr
+		session := fwpmSession0{}
+		sessionName, _ := windows.UTF16PtrFromString("ctrld DNS Loopback Protect")
+		session.displayData.name = sessionName
+
+		const rpcCAuthnDefault = 0xFFFFFFFF
+		r1, _, _ := procFwpmEngineOpen0.Call(
+			0,
+			uintptr(rpcCAuthnDefault),
+			0,
+			uintptr(unsafe.Pointer(&session)),
+			uintptr(unsafe.Pointer(&engineHandle)),
+		)
+		if r1 != 0 {
+			return fmt.Errorf("FwpmEngineOpen0 failed: HRESULT 0x%x", r1)
+		}
+		mainLog.Load().Info().Msgf("DNS intercept: WFP engine opened for loopback protect (handle: 0x%x)", engineHandle)
+		state.engineHandle = engineHandle
+	}
+
+	// Clean up any stale sublayer from a previous session.
+	procFwpmSubLayerDeleteByKey0.Call(
+		state.engineHandle,
+		uintptr(unsafe.Pointer(&ctrldSubLayerGUID)),
+	)
+
+	// Create sublayer at maximum priority.
+	sublayer := fwpmSublayer0{
+		subLayerKey: ctrldSubLayerGUID,
+		weight:      0xFFFF,
+	}
+	sublayerName, _ := windows.UTF16PtrFromString("ctrld DNS Loopback Protect Sublayer")
+	sublayerDesc, _ := windows.UTF16PtrFromString("Permits DNS to localhost, overriding third-party VPN block filters")
+	sublayer.displayData.name = sublayerName
+	sublayer.displayData.description = sublayerDesc
+
+	r1, _, _ := procFwpmSubLayerAdd0.Call(
+		state.engineHandle,
+		uintptr(unsafe.Pointer(&sublayer)),
+		0,
+	)
+	if r1 != 0 {
+		return fmt.Errorf("FwpmSubLayerAdd0 failed: HRESULT 0x%x", r1)
+	}
+
+	// Add hard permit filters for loopback DNS (v4+v6, UDP+TCP).
+	permitFilters := []struct {
+		name  string
+		layer windows.GUID
+		proto uint8
+	}{
+		{"Loopback Protect: Permit DNS to localhost (IPv4/UDP)", fwpmLayerALEAuthConnectV4, ipprotoUDP},
+		{"Loopback Protect: Permit DNS to localhost (IPv4/TCP)", fwpmLayerALEAuthConnectV4, ipprotoTCP},
+		{"Loopback Protect: Permit DNS to localhost (IPv6/UDP)", fwpmLayerALEAuthConnectV6, ipprotoUDP},
+		{"Loopback Protect: Permit DNS to localhost (IPv6/TCP)", fwpmLayerALEAuthConnectV6, ipprotoTCP},
+	}
+
+	for _, pf := range permitFilters {
+		filterID, err := p.addWFPHardPermitLocalhostFilter(state.engineHandle, pf.name, pf.layer, pf.proto, state.listenerIP)
+		if err != nil {
+			// Partial failure — clean up what we added (already holding mu).
+			p.deactivateLoopbackWFPProtectLocked(state)
+			return fmt.Errorf("failed to add loopback protect filter %q: %w", pf.name, err)
+		}
+		state.loopbackPermitIDs = append(state.loopbackPermitIDs, filterID)
+		mainLog.Load().Debug().Str("filter", pf.name).Uint64("id", filterID).Msg("DNS intercept: added loopback protect filter")
+	}
+
+	state.loopbackProtectActive = true
+	mainLog.Load().Info().Int("filters", len(state.loopbackPermitIDs)).
+		Msg("DNS intercept: loopback WFP protect activated — localhost DNS permitted with CLEAR_ACTION_RIGHT")
+	return nil
+}
+
+// osHealthcheckSuppressed reports whether the upstream.os healthcheck should
+// be skipped because DNS intercept mode is active and the WFP loopback protect
+// has been engaged. Loopback protect is only activated when an external WFP
+// block filter (e.g. OpenVPN's block-outside-dns) is interfering with DNS,
+// which is the same condition that makes the OS resolver healthcheck fail
+// every 2s with i/o timeout — so suppressing the check avoids the log spam
+// described in issue #526.
+func (p *prog) osHealthcheckSuppressed() bool {
+	if !dnsIntercept || p.dnsInterceptState == nil {
+		return false
+	}
+	state, ok := p.dnsInterceptState.(*wfpState)
+	if !ok || state == nil {
+		return false
+	}
+	state.mu.Lock()
+	defer state.mu.Unlock()
+	return state.loopbackProtectActive
+}
+
+// deactivateLoopbackWFPProtectLocked is the lock-free inner implementation.
+// Caller must hold state.mu.
+func (p *prog) deactivateLoopbackWFPProtectLocked(state *wfpState) {
+	if !state.loopbackProtectActive && len(state.loopbackPermitIDs) == 0 {
+		return
+	}
+
+	for _, filterID := range state.loopbackPermitIDs {
+		if state.engineHandle != 0 {
+			r1, _, _ := procFwpmFilterDeleteById0.Call(state.engineHandle, uintptr(filterID))
+			if r1 != 0 {
+				mainLog.Load().Warn().Msgf("DNS intercept: failed to remove loopback protect filter (ID: %d, code: 0x%x)", filterID, r1)
+			}
+		}
+	}
+	state.loopbackPermitIDs = nil
+	state.loopbackProtectActive = false
+	mainLog.Load().Info().Msg("DNS intercept: loopback WFP protect deactivated")
+}
+
+// addWFPHardPermitLocalhostFilter adds a WFP permit filter for DNS to localhost with
+// FWPM_FILTER_FLAG_CLEAR_ACTION_RIGHT. This "hard permit" prevents lower-priority
+// sublayers (e.g., OpenVPN's block-outside-dns sublayer) from blocking DNS to
+// ctrld's loopback listener. Weight is set to 15 (above hard mode's permit=10).
+// For IPv4, the address is derived from listenerIP (e.g., 127.0.0.1 or 127.0.0.2).
+func (p *prog) addWFPHardPermitLocalhostFilter(engineHandle uintptr, name string, layerKey windows.GUID, proto uint8, listenerIP string) (uint64, error) {
+	addrCond := fwpmFilterCondition0{
+		fieldKey:  fwpmConditionIPRemoteAddress,
+		matchType: fwpMatchEqual,
+	}
+
+	ipv6Loopback := [16]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}
+
+	if layerKey == fwpmLayerALEAuthConnectV4 {
+		addrCond.condValue.valueType = fwpUint32
+		addrCond.condValue.value = uint64(parseIPv4AsUint32(listenerIP))
+	} else {
+		addrCond.condValue.valueType = fwpByteArray16Type
+		addrCond.condValue.value = uint64(uintptr(unsafe.Pointer(&ipv6Loopback)))
+	}
+
+	filterID, err := p.addWFPPermitDNSFilter(engineHandle, name, layerKey, proto, addrCond, fwpmFilterFlagClearActionRight, 15)
+	runtime.KeepAlive(&ipv6Loopback)
+	return filterID, err
+}
+
 // stopDNSIntercept removes all WFP filters and shuts down the DNS interception.
 func (p *prog) stopDNSIntercept() error {
 	if p.dnsInterceptState == nil {
@@ -1050,7 +1276,7 @@ func (p *prog) stopDNSIntercept() error {
 		state.nrptActive = false
 	}

-	// Only clean up WFP if we actually opened the engine (hard mode).
+	// Clean up WFP if the engine was opened (hard mode or loopback protect).
 	if state.engineHandle != 0 {
 		mainLog.Load().Info().Msg("DNS intercept: shutting down WFP filters")
 		p.cleanupWFPFilters(state)
@@ -1079,10 +1305,13 @@ func (p *prog) exemptVPNDNSServers(exemptions []vpnDNSExemption) error {
 	if !ok || state == nil {
 		return fmt.Errorf("DNS intercept state not available")
 	}
-	// In dns mode (no WFP), VPN DNS exemptions are not needed — there are no
-	// block filters to exempt from.
-	if state.engineHandle == 0 {
-		mainLog.Load().Debug().Msg("DNS intercept: dns mode — skipping VPN DNS exemptions (no WFP filters)")
+	// In dns mode (no WFP) or loopback-protect-only mode, VPN DNS exemptions
+	// are not needed — there are no ctrld block filters to exempt from.
+	// Loopback protect only adds hard-permit filters for localhost DNS;
+	// VPN DNS traffic uses the tunnel interface and is already permitted by
+	// the VPN's own WFP rules.
+	if state.engineHandle == 0 || state.loopbackProtectActive {
+		mainLog.Load().Debug().Msg("DNS intercept: dns mode — skipping VPN DNS exemptions (no WFP block filters)")
 		return nil
 	}

@@ -1634,6 +1863,40 @@ func (p *prog) nrptProbeAndHeal() {
 	}

 	logNRPTParentKeyState("probe-failed-final")
-	mainLog.Load().Error().Msg("DNS intercept: NRPT verification failed after all retries including two-phase recovery — " +
+	mainLog.Load().Warn().Msg("DNS intercept: NRPT verification failed after all retries including two-phase recovery")
+
+	// Last resort: activate WFP loopback protection.
+	// Third-party VPN software (e.g., OpenVPN with block-outside-dns) may have
+	// installed WFP filters that block DNS to non-tunnel interfaces, including
+	// loopback. A high-priority "hard permit" for localhost DNS overrides these
+	// blocks and restores NRPT routing to ctrld's listener.
+	// See: https://gitlab.int.windscribe.com/controld/clients/ctrld/-/issues/526
+	loopbackState, ok := p.dnsInterceptState.(*wfpState)
+	if !ok || loopbackState == nil {
+		mainLog.Load().Error().Msg("DNS intercept: no state available for loopback WFP protect")
+		return
+	}
+
+	// Bail out if shutdown is in progress — avoid racing with cleanupWFPFilters.
+	select {
+	case <-loopbackState.stopCh:
+		mainLog.Load().Info().Msg("DNS intercept: shutdown in progress, skipping loopback WFP protect activation")
+		return
+	default:
+	}
+
+	if err := p.activateLoopbackWFPProtect(loopbackState); err != nil {
+		mainLog.Load().Error().Err(err).Msg("DNS intercept: failed to activate loopback WFP protect — " +
+			"DNS queries may not be routed through ctrld. A network interface toggle may be needed.")
+		return
+	}
+
+	// Retry NRPT probe now that loopback DNS is explicitly permitted through WFP.
+	time.Sleep(500 * time.Millisecond)
+	if p.probeNRPT() {
+		mainLog.Load().Info().Msg("DNS intercept: NRPT verified working after loopback WFP protect activation")
+		return
+	}
+	mainLog.Load().Error().Msg("DNS intercept: NRPT probe still failing after loopback WFP protect — " +
 		"DNS queries may not be routed through ctrld. A network interface toggle may be needed.")
 }
@@ -744,6 +744,7 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
 				var reason RecoveryReason
 				if upstreams[0] == upstreamOS {
 					reason = RecoveryReasonOSFailure
+
 				} else {
 					reason = RecoveryReasonRegularFailure
 				}
@@ -903,16 +904,13 @@ func needLocalIPv6Listener(interceptMode string) bool {
 		mainLog.Load().Debug().Msg("IPv6 listener: enabled (Windows)")
 		return true
 	}
-	// On macOS in intercept mode, pf can't redirect IPv6 DNS to an IPv4 listener (cross-AF rdr
-	// not supported), and blocking IPv6 DNS causes ~1s timeouts (BSD doesn't deliver ICMP errors
-	// to unconnected UDP sockets). Listening on [::1] lets us intercept IPv6 DNS directly.
-	//
-	// NOTE: We accept the intercept mode string as a parameter instead of reading the global
-	// dnsIntercept bool, because dnsIntercept is derived later in prog.run() — after the
-	// listener goroutines are already spawned. Same pattern as the port 5354 fallback fix (MR !860).
-	if (interceptMode == "dns" || interceptMode == "hard") && runtime.GOOS == "darwin" {
-		mainLog.Load().Debug().Msg("IPv6 listener: enabled (macOS intercept mode)")
-		return true
+	// macOS: IPv6 DNS is blocked at the pf level (not intercepted). The [::1] listener
+	// is not needed — macOS falls back to IPv4 DNS automatically. See #507 and
+	// docs/pf-dns-intercept.md for why IPv6 interception on macOS is not feasible
+	// (sendmsg EINVAL from ::1 to global unicast, nat-on-lo0 doesn't fire for route-to).
+	if runtime.GOOS == "darwin" {
+		mainLog.Load().Debug().Msg("IPv6 listener: not needed (macOS — IPv6 DNS blocked at pf, fallback to IPv4)")
+		return false
 	}
 	mainLog.Load().Debug().Str("os", runtime.GOOS).Str("interceptMode", interceptMode).Msg("IPv6 listener: not needed")
 	return false
@@ -1583,7 +1581,7 @@ func (p *prog) monitorNetworkChanges() error {

 		// we only trigger recovery flow for network changes on non router devices
 		if router.Name() == "" {
-			p.handleRecovery(RecoveryReasonNetworkChange)
+			p.debounceRecovery()
 		}

 		// After network changes, verify our pf anchor is still active and
@@ -1660,6 +1658,8 @@ func interfaceIPsEqual(a, b []netip.Prefix) bool {
 	return true
 }

+var errOsHealthcheckSuppressed = errors.New("upstream os health check suppressed")
+
 // checkUpstreamOnce sends a test query to the specified upstream.
 // Returns nil if the upstream responds successfully.
 func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) error {
@@ -1689,11 +1689,48 @@ func (p *prog) checkUpstreamOnce(upstream string, uc *ctrld.UpstreamConfig) erro
 	duration := time.Since(start)

 	if err != nil {
+		// Demote upstream.os check failures to debug while WFP loopback
+		// protect is active: an external WFP block filter is interfering
+		// with plain DNS so repeated failures here are expected. Other
+		// upstreams keep error level so real outages stay visible.
+		if upstream == upstreamOS && p.osHealthcheckSuppressed() {
+			mainLog.Load().Debug().Err(err).Msgf("Upstream %s check failed after %v (WFP loopback protect active)", upstream, duration)
+			return errOsHealthcheckSuppressed
+		}
 		mainLog.Load().Error().Err(err).Msgf("Upstream %s check failed after %v", upstream, duration)
-	} else {
-		mainLog.Load().Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration)
+		return err
 	}
-	return err
+	mainLog.Load().Debug().Msgf("Upstream %s responded successfully in %v", upstream, duration)
+	return nil
+}
+
+// recoveryDebounceWindow is the time to wait after the last network change
+// before triggering handleRecovery. This coalesces rapid consecutive network
+// changes (e.g., hotspot→LAN causing en1 drop + en0 pickup + en1 re-pickup)
+// into a single recovery pass, avoiding the cancel-and-restart race that
+// leaves DoH transports in a stale state.
+const recoveryDebounceWindow = 500 * time.Millisecond
+
+// debounceRecovery schedules a handleRecovery(NetworkChange) call after a debounce
+// window. If called again before the window expires, the timer is reset so that
+// recovery runs once with the final network state. All other state updates (IP,
+// pf anchor, VPN DNS, tunnel checks) run immediately — only the recovery flow
+// with its upstream probing and DHCP bypass logic is debounced.
+func (p *prog) debounceRecovery() {
+	p.recoveryDebounceMu.Lock()
+	defer p.recoveryDebounceMu.Unlock()
+
+	if p.recoveryDebounceTimer != nil {
+		p.recoveryDebounceTimer.Stop()
+		mainLog.Load().Debug().Msg("Recovery debounce: resetting timer (rapid network change)")
+	}
+	p.recoveryDebounceTimer = time.AfterFunc(recoveryDebounceWindow, func() {
+		p.recoveryDebounceMu.Lock()
+		p.recoveryDebounceTimer = nil
+		p.recoveryDebounceMu.Unlock()
+		p.handleRecovery(RecoveryReasonNetworkChange)
+	})
+	mainLog.Load().Debug().Msg("Recovery debounce: scheduled (500ms window)")
 }

 // handleRecovery performs a unified recovery by removing DNS settings,
@@ -1723,6 +1760,21 @@ func (p *prog) handleRecovery(reason RecoveryReason) {
 		p.recoveryCancelMu.Unlock()
 	}

+	// For network changes, force-reset all upstream transports synchronously.
+	// The lazy ReBootstrap() called earlier in the network change callback only
+	// sets a flag — the old transport's dead connections can still be used by
+	// recovery probes, causing context deadline timeouts. ForceReBootstrap()
+	// closes old connections and creates fresh transports so probes succeed on
+	// first attempt.
+	if reason == RecoveryReasonNetworkChange {
+		for _, uc := range p.cfg.Upstream {
+			if uc != nil {
+				uc.ForceReBootstrap()
+			}
+		}
+		mainLog.Load().Info().Msg("Force-reset upstream transports for network change recovery")
+	}
+
 	// Create a new recovery context without a fixed timeout.
 	p.recoveryCancelMu.Lock()
 	recoveryCtx, cancel := context.WithCancel(context.Background())
@@ -1868,7 +1920,8 @@ func (p *prog) waitForUpstreamRecovery(ctx context.Context, upstreams map[string
 				default:
 					attempts++
 					// checkUpstreamOnce will reset any failure counters on success.
-					if err := p.checkUpstreamOnce(name, uc); err == nil {
+					err := p.checkUpstreamOnce(name, uc)
+					if err == nil || errors.Is(err, errOsHealthcheckSuppressed) {
 						mainLog.Load().Debug().Msgf("Upstream %s recovered successfully", name)
 						select {
 						case recoveredCh <- name:
@@ -0,0 +1,339 @@
+package cli
+
+import (
+	"io"
+	"os"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+)
+
+// =============================================================================
+// logWriter.tailLastLines tests
+// =============================================================================
+
+func Test_logWriter_tailLastLines_Empty(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	if got := lw.tailLastLines(10); got != nil {
+		t.Fatalf("expected nil for empty buffer, got %q", got)
+	}
+}
+
+func Test_logWriter_tailLastLines_ZeroLines(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	lw.Write([]byte("line1\nline2\n"))
+	if got := lw.tailLastLines(0); got != nil {
+		t.Fatalf("expected nil for n=0, got %q", got)
+	}
+}
+
+func Test_logWriter_tailLastLines_NegativeLines(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	lw.Write([]byte("line1\nline2\n"))
+	if got := lw.tailLastLines(-1); got != nil {
+		t.Fatalf("expected nil for n=-1, got %q", got)
+	}
+}
+
+func Test_logWriter_tailLastLines_FewerThanN(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	lw.Write([]byte("line1\nline2\n"))
+	got := string(lw.tailLastLines(10))
+	want := "line1\nline2\n"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_logWriter_tailLastLines_ExactN(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	lw.Write([]byte("line1\nline2\nline3\n"))
+	got := string(lw.tailLastLines(3))
+	want := "line1\nline2\nline3\n"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_logWriter_tailLastLines_MoreThanN(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	lw.Write([]byte("line1\nline2\nline3\nline4\nline5\n"))
+	got := string(lw.tailLastLines(2))
+	want := "line4\nline5\n"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_logWriter_tailLastLines_NoTrailingNewline(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	lw.Write([]byte("line1\nline2\nline3"))
+	// Without trailing newline, "line3" is a partial line.
+	// Asking for 1 line returns the last newline-terminated line plus the partial.
+	got := string(lw.tailLastLines(1))
+	want := "line2\nline3"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_logWriter_tailLastLines_SingleLineNoNewline(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	lw.Write([]byte("only line"))
+	got := string(lw.tailLastLines(5))
+	want := "only line"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_logWriter_tailLastLines_SingleLineWithNewline(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	lw.Write([]byte("only line\n"))
+	got := string(lw.tailLastLines(1))
+	want := "only line\n"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+// =============================================================================
+// logWriter.Subscribe tests
+// =============================================================================
+
+func Test_logWriter_Subscribe_Basic(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	ch, unsub := lw.Subscribe()
+	defer unsub()
+
+	msg := []byte("hello world\n")
+	lw.Write(msg)
+
+	select {
+	case got := <-ch:
+		if string(got) != string(msg) {
+			t.Fatalf("got %q, want %q", got, msg)
+		}
+	case <-time.After(time.Second):
+		t.Fatal("timed out waiting for subscriber data")
+	}
+}
+
+func Test_logWriter_Subscribe_MultipleSubscribers(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	ch1, unsub1 := lw.Subscribe()
+	defer unsub1()
+	ch2, unsub2 := lw.Subscribe()
+	defer unsub2()
+
+	msg := []byte("broadcast\n")
+	lw.Write(msg)
+
+	for i, ch := range []<-chan []byte{ch1, ch2} {
+		select {
+		case got := <-ch:
+			if string(got) != string(msg) {
+				t.Fatalf("subscriber %d: got %q, want %q", i, got, msg)
+			}
+		case <-time.After(time.Second):
+			t.Fatalf("subscriber %d: timed out", i)
+		}
+	}
+}
+
+func Test_logWriter_Subscribe_Unsubscribe(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	ch, unsub := lw.Subscribe()
+
+	// Verify subscribed.
+	lw.Write([]byte("before unsub\n"))
+	select {
+	case <-ch:
+	case <-time.After(time.Second):
+		t.Fatal("timed out before unsub")
+	}
+
+	unsub()
+
+	// Channel should be closed after unsub.
+	if _, ok := <-ch; ok {
+		t.Fatal("channel should be closed after unsubscribe")
+	}
+
+	// Verify subscriber list is empty.
+	lw.mu.Lock()
+	count := len(lw.subscribers)
+	lw.mu.Unlock()
+	if count != 0 {
+		t.Fatalf("expected 0 subscribers after unsub, got %d", count)
+	}
+}
+
+func Test_logWriter_Subscribe_UnsubscribeIdempotent(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	_, unsub := lw.Subscribe()
+	unsub()
+	// Second unsub should not panic.
+	unsub()
+}
+
+func Test_logWriter_Subscribe_SlowSubscriberDropped(t *testing.T) {
+	lw := newLogWriterWithSize(4096)
+	ch, unsub := lw.Subscribe()
+	defer unsub()
+
+	// Fill the subscriber channel (buffer size is 256).
+	for i := 0; i < 300; i++ {
+		lw.Write([]byte("msg\n"))
+	}
+
+	// Should have 256 buffered messages, rest dropped.
+	count := 0
+	for {
+		select {
+		case <-ch:
+			count++
+		default:
+			goto done
+		}
+	}
+done:
+	if count != 256 {
+		t.Fatalf("expected 256 buffered messages, got %d", count)
+	}
+}
+
+func Test_logWriter_Subscribe_ConcurrentWriteAndRead(t *testing.T) {
+	lw := newLogWriterWithSize(64 * 1024)
+	ch, unsub := lw.Subscribe()
+	defer unsub()
+
+	const numWrites = 100
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for i := 0; i < numWrites; i++ {
+			lw.Write([]byte("concurrent write\n"))
+		}
+	}()
+
+	received := 0
+	timeout := time.After(5 * time.Second)
+	for received < numWrites {
+		select {
+		case <-ch:
+			received++
+		case <-timeout:
+			t.Fatalf("timed out after receiving %d/%d messages", received, numWrites)
+		}
+	}
+	wg.Wait()
+}
+
+// =============================================================================
+// tailFileLastLines tests
+// =============================================================================
+
+func writeTempFile(t *testing.T, content string) *os.File {
+	t.Helper()
+	f, err := os.CreateTemp(t.TempDir(), "tail-test-*")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err := f.WriteString(content); err != nil {
+		t.Fatal(err)
+	}
+	return f
+}
+
+func Test_tailFileLastLines_Empty(t *testing.T) {
+	f := writeTempFile(t, "")
+	defer f.Close()
+	if got := tailFileLastLines(f, 10); got != nil {
+		t.Fatalf("expected nil for empty file, got %q", got)
+	}
+}
+
+func Test_tailFileLastLines_FewerThanN(t *testing.T) {
+	f := writeTempFile(t, "line1\nline2\n")
+	defer f.Close()
+	got := string(tailFileLastLines(f, 10))
+	want := "line1\nline2\n"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_tailFileLastLines_ExactN(t *testing.T) {
+	f := writeTempFile(t, "a\nb\nc\n")
+	defer f.Close()
+	got := string(tailFileLastLines(f, 3))
+	want := "a\nb\nc\n"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_tailFileLastLines_MoreThanN(t *testing.T) {
+	f := writeTempFile(t, "line1\nline2\nline3\nline4\nline5\n")
+	defer f.Close()
+	got := string(tailFileLastLines(f, 2))
+	want := "line4\nline5\n"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_tailFileLastLines_NoTrailingNewline(t *testing.T) {
+	f := writeTempFile(t, "line1\nline2\nline3")
+	defer f.Close()
+	// Without trailing newline, partial last line comes with the previous line.
+	got := string(tailFileLastLines(f, 1))
+	want := "line2\nline3"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func Test_tailFileLastLines_LargerThanChunk(t *testing.T) {
+	// Build content larger than the 4096 chunk size to exercise multi-chunk reads.
+	var sb strings.Builder
+	for i := 0; i < 200; i++ {
+		sb.WriteString(strings.Repeat("x", 50))
+		sb.WriteByte('\n')
+	}
+	f := writeTempFile(t, sb.String())
+	defer f.Close()
+	got := string(tailFileLastLines(f, 3))
+	lines := strings.Split(strings.TrimRight(got, "\n"), "\n")
+	if len(lines) != 3 {
+		t.Fatalf("expected 3 lines, got %d: %q", len(lines), got)
+	}
+	expectedLine := strings.Repeat("x", 50)
+	for _, line := range lines {
+		if line != expectedLine {
+			t.Fatalf("unexpected line content: %q", line)
+		}
+	}
+}
+
+func Test_tailFileLastLines_SeeksToEnd(t *testing.T) {
+	f := writeTempFile(t, "line1\nline2\nline3\n")
+	defer f.Close()
+	tailFileLastLines(f, 1)
+
+	// After tailFileLastLines, file position should be at the end.
+	pos, err := f.Seek(0, io.SeekCurrent)
+	if err != nil {
+		t.Fatal(err)
+	}
+	stat, err := f.Stat()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if pos != stat.Size() {
+		t.Fatalf("expected file position at end (%d), got %d", stat.Size(), pos)
+	}
+}
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"path/filepath"
 	"strings"
 	"sync"
 	"time"
@@ -22,6 +23,9 @@ const (
 	logWriterSentInterval  = time.Minute
 	logWriterInitEndMarker = "\n\n=== INIT_END ===\n\n"
 	logWriterLogEndMarker  = "\n\n=== LOG_END ===\n\n"
+
+	logFileName    = "ctrld.log"
+	logFileMaxSize = 1024 * 1024 * 5 // 5 MB
 )

 type logViewResponse struct {
@@ -38,11 +42,24 @@ type logReader struct {
 	size int64
 }

+// logSubscriber represents a subscriber to live log output.
+type logSubscriber struct {
+	ch chan []byte
+}
+
 // logWriter is an internal buffer to keep track of runtime log when no logging is enabled.
+// When a file path is configured via setLogFile, writes are also persisted to
+// a rotated file on disk (max logFileMaxSize, 1 backup) so logs survive restarts.
 type logWriter struct {
-	mu   sync.Mutex
-	buf  bytes.Buffer
-	size int
+	mu          sync.Mutex
+	buf         bytes.Buffer
+	size        int
+	subscribers []*logSubscriber
+
+	// File persistence fields.
+	logFile     *os.File
+	logFilePath string
+	logFileSize int64
 }

 // newLogWriter creates an internal log writer.
@@ -61,10 +78,154 @@ func newLogWriterWithSize(size int) *logWriter {
 	return lw
 }

+// setLogFile configures file-backed persistence for the log writer.
+// The directory is created if it does not exist. An existing file is
+// opened in append mode and its current size is tracked for rotation.
+func (lw *logWriter) setLogFile(path string) error {
+	dir := filepath.Dir(path)
+	if err := os.MkdirAll(dir, 0750); err != nil {
+		return fmt.Errorf("creating log directory: %w", err)
+	}
+	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0600)
+	if err != nil {
+		return fmt.Errorf("opening log file: %w", err)
+	}
+	st, err := f.Stat()
+	if err != nil {
+		f.Close()
+		return fmt.Errorf("stat log file: %w", err)
+	}
+	lw.mu.Lock()
+	defer lw.mu.Unlock()
+	lw.logFile = f
+	lw.logFilePath = path
+	lw.logFileSize = st.Size()
+	return nil
+}
+
+// rotateLogFile rotates the current log file to a .1 backup.
+// It returns true if lw.logFile is usable after the call, false otherwise.
+// Must be called with lw.mu held.
+func (lw *logWriter) rotateLogFile() bool {
+	if lw.logFile == nil {
+		return false
+	}
+	lw.logFile.Close()
+	backupPath := lw.logFilePath + ".1"
+	// Best effort: rename current to backup (overwrites old backup).
+	os.Rename(lw.logFilePath, backupPath)
+	f, err := os.OpenFile(lw.logFilePath, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0600)
+	if err != nil {
+		// If we can't reopen, disable file logging.
+		lw.logFile = nil
+		lw.logFileSize = 0
+		return false
+	}
+	lw.logFile = f
+	lw.logFileSize = 0
+	return true
+}
+
+// closeLogFile closes the backing file if open.
+func (lw *logWriter) closeLogFile() {
+	lw.mu.Lock()
+	defer lw.mu.Unlock()
+	if lw.logFile != nil {
+		lw.logFile.Close()
+		lw.logFile = nil
+	}
+}
+
+// logFilePaths returns the paths to the current log file and its backup
+// (if they exist) for inclusion in log send payloads.
+func (lw *logWriter) logFilePaths() (current, backup string) {
+	lw.mu.Lock()
+	defer lw.mu.Unlock()
+	if lw.logFilePath == "" {
+		return "", ""
+	}
+	current = lw.logFilePath
+	bp := lw.logFilePath + ".1"
+	if _, err := os.Stat(bp); err == nil {
+		backup = bp
+	}
+	return current, backup
+}
+
+// Subscribe returns a channel that receives new log data as it's written,
+// and an unsubscribe function to clean up when done.
+func (lw *logWriter) Subscribe() (<-chan []byte, func()) {
+	lw.mu.Lock()
+	defer lw.mu.Unlock()
+	sub := &logSubscriber{ch: make(chan []byte, 256)}
+	lw.subscribers = append(lw.subscribers, sub)
+	unsub := func() {
+		lw.mu.Lock()
+		defer lw.mu.Unlock()
+		for i, s := range lw.subscribers {
+			if s == sub {
+				lw.subscribers = append(lw.subscribers[:i], lw.subscribers[i+1:]...)
+				close(sub.ch)
+				break
+			}
+		}
+	}
+	return sub.ch, unsub
+}
+
+// tailLastLines returns the last n lines from the current buffer.
+func (lw *logWriter) tailLastLines(n int) []byte {
+	lw.mu.Lock()
+	defer lw.mu.Unlock()
+	data := lw.buf.Bytes()
+	if n <= 0 || len(data) == 0 {
+		return nil
+	}
+	// Find the last n newlines from the end.
+	count := 0
+	pos := len(data)
+	for pos > 0 {
+		pos--
+		if data[pos] == '\n' {
+			count++
+			if count == n+1 {
+				pos++ // move past this newline
+				break
+			}
+		}
+	}
+	result := make([]byte, len(data)-pos)
+	copy(result, data[pos:])
+	return result
+}
+
 func (lw *logWriter) Write(p []byte) (int, error) {
 	lw.mu.Lock()
 	defer lw.mu.Unlock()

+	// Fan-out to subscribers (non-blocking).
+	if len(lw.subscribers) > 0 {
+		cp := make([]byte, len(p))
+		copy(cp, p)
+		for _, sub := range lw.subscribers {
+			select {
+			case sub.ch <- cp:
+			default:
+				// Drop if subscriber is slow to avoid blocking the logger.
+			}
+		}
+	}
+
+	// Write to backing file if configured.
+	if lw.logFile != nil {
+		needsRotation := lw.logFileSize+int64(len(p)) > logFileMaxSize
+		if !needsRotation || lw.rotateLogFile() {
+			if n, err := lw.logFile.Write(p); err == nil {
+				lw.logFileSize += int64(n)
+			}
+		}
+	}
+
 	// If writing p causes overflows, discard old data.
 	if lw.buf.Len()+len(p) > lw.size {
 		buf := lw.buf.Bytes()
@@ -102,6 +263,12 @@ func (p *prog) initLogging(backup bool) {
 	p.initInternalLogging(logWriters)
 }

+// internalLogFilePath returns the path for persisted internal logs.
+// The file lives in the ctrld home directory alongside other runtime state.
+func internalLogFilePath() string {
+	return absHomeDir(logFileName)
+}
+
 // initInternalLogging performs internal logging if there's no log enabled.
 func (p *prog) initInternalLogging(writers []io.Writer) {
 	if !p.needInternalLogging() {
@@ -112,6 +279,14 @@ func (p *prog) initInternalLogging(writers []io.Writer) {
 		p.internalLogWriter = newLogWriter()
 		p.internalLogSent = time.Now().Add(-logWriterSentInterval)
 		p.internalWarnLogWriter = newSmallLogWriter()
+		// Persist internal logs to disk so they survive restarts.
+		if path := internalLogFilePath(); path != "" {
+			if err := p.internalLogWriter.setLogFile(path); err != nil {
+				mainLog.Load().Warn().Err(err).Msg("could not enable persistent internal logging")
+			} else {
+				mainLog.Load().Notice().Msgf("internal log file: %s", path)
+			}
+		}
 	})
 	p.mu.Lock()
 	lw := p.internalLogWriter
@@ -166,7 +341,15 @@ func (p *prog) logReader() (*logReader, error) {
 		if wlw == nil {
 			return nil, errors.New("nil internal warn log writer")
 		}
-		// Normal log content.
+
+		// If we have a persisted log file, read from disk (includes data
+		// from previous runs that the in-memory buffer wouldn't have).
+		current, backup := lw.logFilePaths()
+		if current != "" {
+			return p.logReaderFromFiles(current, backup, wlw)
+		}
+
+		// Fall back to in-memory buffer.
 		lw.mu.Lock()
 		lwReader := bytes.NewReader(lw.buf.Bytes())
 		lwSize := lw.buf.Len()
@@ -202,3 +385,83 @@ func (p *prog) logReader() (*logReader, error) {
 	}
 	return lr, nil
 }
+
+// logReaderFromFiles builds a logReader that concatenates the backup file
+// (if it exists), the current log file, and the in-memory warn log buffer.
+func (p *prog) logReaderFromFiles(current, backup string, wlw *logWriter) (*logReader, error) {
+	var rcs []io.ReadCloser
+	var totalSize int64
+
+	closeAll := func() {
+		for _, rc := range rcs {
+			rc.Close()
+		}
+	}
+
+	// Read backup file first (older entries).
+	if backup != "" {
+		if bf, err := os.Open(backup); err == nil {
+			if st, err := bf.Stat(); err == nil {
+				totalSize += st.Size()
+			}
+			rcs = append(rcs, bf)
+		}
+	}
+
+	// Read current file.
+	cf, err := os.Open(current)
+	if err != nil {
+		closeAll()
+		return nil, fmt.Errorf("opening current log file: %w", err)
+	}
+	if st, err := cf.Stat(); err == nil {
+		totalSize += st.Size()
+	}
+	rcs = append(rcs, cf)
+
+	// Append warn log content from memory.
+	wlw.mu.Lock()
+	warnData := make([]byte, wlw.buf.Len())
+	copy(warnData, wlw.buf.Bytes())
+	wlw.mu.Unlock()
+
+	if len(warnData) > 0 {
+		rcs = append(rcs, io.NopCloser(bytes.NewReader([]byte(logWriterLogEndMarker))))
+		rcs = append(rcs, io.NopCloser(bytes.NewReader(warnData)))
+		totalSize += int64(len(logWriterLogEndMarker) + len(warnData))
+	}
+
+	if totalSize == 0 {
+		closeAll()
+		return nil, errors.New("internal log is empty")
+	}
+
+	readers := make([]io.Reader, len(rcs))
+	closers := make([]io.Closer, len(rcs))
+	for i, rc := range rcs {
+		readers[i] = rc
+		closers[i] = rc
+	}
+	combined := io.MultiReader(readers...)
+	lr := &logReader{
+		r:    &multiCloser{Reader: combined, closers: closers},
+		size: totalSize,
+	}
+	return lr, nil
+}
+
+// multiCloser wraps an io.Reader and closes multiple underlying closers.
+type multiCloser struct {
+	io.Reader
+	closers []io.Closer
+}
+
+func (mc *multiCloser) Close() error {
+	var firstErr error
+	for _, c := range mc.closers {
+		if err := c.Close(); err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
+}
@@ -1,6 +1,8 @@
 package cli

 import (
+	"os"
+	"path/filepath"
 	"strings"
 	"sync"
 	"testing"
@@ -83,3 +85,126 @@ func Test_logWriter_MarkerInitEnd(t *testing.T) {
 		t.Fatalf("unexpected log content: %s", lw.buf.String())
 	}
 }
+
+func Test_logWriter_SetLogFile(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "test.log")
+	lw := newLogWriterWithSize(logWriterSize)
+	if err := lw.setLogFile(path); err != nil {
+		t.Fatalf("setLogFile: %v", err)
+	}
+	defer lw.closeLogFile()
+
+	msg := "hello file\n"
+	lw.Write([]byte(msg))
+
+	// Verify data in memory buffer.
+	if lw.buf.String() != msg {
+		t.Fatalf("buffer: got %q, want %q", lw.buf.String(), msg)
+	}
+	// Verify data on disk.
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("ReadFile: %v", err)
+	}
+	if string(data) != msg {
+		t.Fatalf("file: got %q, want %q", data, msg)
+	}
+}
+
+func Test_logWriter_FileRotation(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "test.log")
+	// Use a tiny max size to trigger rotation quickly.
+	lw := newLogWriterWithSize(logWriterSize)
+	if err := lw.setLogFile(path); err != nil {
+		t.Fatalf("setLogFile: %v", err)
+	}
+	defer lw.closeLogFile()
+
+	// Write enough to exceed logFileMaxSize.
+	chunk := strings.Repeat("X", 1024) + "\n"
+	written := 0
+	for written < logFileMaxSize+1024 {
+		lw.Write([]byte(chunk))
+		written += len(chunk)
+	}
+
+	// Backup file should exist.
+	backupPath := path + ".1"
+	if _, err := os.Stat(backupPath); os.IsNotExist(err) {
+		t.Fatal("expected backup file to exist after rotation")
+	}
+
+	// Current file should be smaller than max (it was rotated).
+	st, err := os.Stat(path)
+	if err != nil {
+		t.Fatalf("stat current: %v", err)
+	}
+	if st.Size() > logFileMaxSize {
+		t.Fatalf("current file too large after rotation: %d", st.Size())
+	}
+}
+
+func Test_logWriter_FilePaths(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "test.log")
+	lw := newLogWriterWithSize(logWriterSize)
+
+	// No file configured.
+	c, b := lw.logFilePaths()
+	if c != "" || b != "" {
+		t.Fatalf("expected empty paths, got %q %q", c, b)
+	}
+
+	if err := lw.setLogFile(path); err != nil {
+		t.Fatalf("setLogFile: %v", err)
+	}
+	defer lw.closeLogFile()
+
+	// Current exists, no backup yet.
+	c, b = lw.logFilePaths()
+	if c != path {
+		t.Fatalf("current: got %q, want %q", c, path)
+	}
+	if b != "" {
+		t.Fatalf("backup should be empty, got %q", b)
+	}
+
+	// Create a backup file manually.
+	os.WriteFile(path+".1", []byte("old"), 0600)
+	_, b = lw.logFilePaths()
+	if b != path+".1" {
+		t.Fatalf("backup: got %q, want %q", b, path+".1")
+	}
+}
+
+func Test_logWriter_FileAppendOnRestart(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "test.log")
+
+	// Simulate first run.
+	lw1 := newLogWriterWithSize(logWriterSize)
+	if err := lw1.setLogFile(path); err != nil {
+		t.Fatalf("setLogFile: %v", err)
+	}
+	lw1.Write([]byte("run1\n"))
+	lw1.closeLogFile()
+
+	// Simulate second run (restart) — file should be appended.
+	lw2 := newLogWriterWithSize(logWriterSize)
+	if err := lw2.setLogFile(path); err != nil {
+		t.Fatalf("setLogFile: %v", err)
+	}
+	lw2.Write([]byte("run2\n"))
+	lw2.closeLogFile()
+
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("ReadFile: %v", err)
+	}
+	want := "run1\nrun2\n"
+	if string(data) != want {
+		t.Fatalf("file: got %q, want %q", data, want)
+	}
+}
@@ -146,6 +146,12 @@ type prog struct {
 	recoveryCancel   context.CancelFunc
 	recoveryRunning  atomic.Bool

+	// recoveryDebounceTimer coalesces rapid NetworkChange recovery triggers
+	// into a single handleRecovery call. Only handleRecovery is debounced —
+	// all other state updates (IP, pf anchor, VPN DNS) run immediately.
+	recoveryDebounceMu    sync.Mutex
+	recoveryDebounceTimer *time.Timer
+
 	// recoveryBypass is set when dns-intercept mode enters recovery.
 	// When true, proxy() forwards all queries to OS/DHCP resolver
 	// instead of using the normal upstream flow.
@@ -160,6 +160,7 @@ func hasLocalDnsServerRunning() bool {
 	if e != nil {
 		return false
 	}
+	defer windows.CloseHandle(h)
 	p := windows.ProcessEntry32{Size: processEntrySize}
 	for {
 		e := windows.Process32Next(h, &p)
@@ -2,6 +2,7 @@ package cli

 import (
 	"context"
+	"net"
 	"strings"
 	"sync"

@@ -241,10 +242,12 @@ func (m *vpnDNSManager) Routes() map[string][]string {

 // upstreamConfigFor creates a legacy upstream configuration for the given VPN DNS server.
 func (m *vpnDNSManager) upstreamConfigFor(server string) *ctrld.UpstreamConfig {
-	endpoint := server
-	if !strings.Contains(server, ":") {
-		endpoint = server + ":53"
-	}
+	// Use net.JoinHostPort to correctly handle both IPv4 and IPv6 addresses.
+	// Previously, the strings.Contains(":") check would skip appending ":53"
+	// for IPv6 addresses (they contain colons), leaving a bare address like
+	// "2a0d:6fc0:9b0:3600::1" which net.Dial rejects with "too many colons".
+	// net.JoinHostPort produces "[2a0d:6fc0:9b0:3600::1]:53" as required.
+	endpoint := net.JoinHostPort(server, "53")

 	return &ctrld.UpstreamConfig{
 		Name:     "VPN DNS",
@@ -56,7 +56,7 @@ ctrld run --intercept-mode hard --cd <resolver-uid>

 Windows DNS intercept uses a two-tier architecture with mode-dependent enforcement:

- **`dns` mode**: NRPT only — graceful DNS routing through the Windows DNS Client service. At worst, a VPN overwrites NRPT and queries bypass ctrld temporarily. DNS never breaks.
+- **`dns` mode**: NRPT + loopback WFP protect — graceful DNS routing through the Windows DNS Client service, with proactive WFP permit filters that protect the NRPT → localhost path from third-party DNS block filters (e.g., OpenVPN's `block-outside-dns`).
 - **`hard` mode**: NRPT + WFP — same NRPT routing, plus WFP kernel-level block filters that prevent any outbound DNS bypass. Equivalent enforcement to macOS pf.

 #### Why This Design?
@@ -70,8 +70,9 @@ Separating them into modes means most users get `dns` mode (safe, can never brea
 1. Creates NRPT catch-all registry rule (`.` → `127.0.0.1`) under `HKLM\...\DnsPolicyConfig\CtrldCatchAll`
 2. Triggers Group Policy refresh via `RefreshPolicyEx` (userenv.dll) so DNS Client loads NRPT immediately
 3. Flushes DNS cache to clear stale entries
-4. Starts NRPT health monitor (30s periodic check)
-5. Launches async NRPT probe-and-heal to verify NRPT is actually routing queries
+4. **Activates loopback WFP protect** — adds 4 permit filters (IPv4/IPv6 × UDP/TCP) for DNS to localhost with `FWPM_FILTER_FLAG_CLEAR_ACTION_RIGHT`. These prevent third-party WFP block filters from blocking the NRPT → `127.0.0.1` path (see [Loopback WFP Protect](#loopback-wfp-protect) below). Non-fatal if this fails.
+5. Starts NRPT health monitor (30s periodic check)
+6. Launches async NRPT probe-and-heal to verify NRPT is actually routing queries

 #### Startup Sequence (hard mode)

@@ -112,6 +113,32 @@ The **Name Resolution Policy Table** is a Windows feature (originally for Direct

 **VPN coexistence**: VPN software can set DNS to whatever it wants on the interface — for public IPs, the WFP block filter prevents those servers from being reached on port 53. For private IPs, the subnet permits allow it. ctrld handles all DNS routing through NRPT and can forward VPN-specific domains to VPN DNS servers through its own upstream mechanism.

+#### Loopback WFP Protect (dns mode)
+
+Third-party VPN software (e.g., OpenVPN, Securepoint SSL VPN) can install WFP block filters via `block-outside-dns` that block **all** DNS traffic to non-tunnel interfaces — including loopback. This breaks the NRPT → `127.0.0.1:53` path that ctrld depends on, causing DNS resolution to time out.
+
+ctrld proactively adds 4 WFP "hard permit" filters at startup:
+
+| Filter | Layer | Protocol |
+|---|---|---|
+| Permit DNS to localhost (IPv4/UDP) | ALE_AUTH_CONNECT_V4 | UDP |
+| Permit DNS to localhost (IPv4/TCP) | ALE_AUTH_CONNECT_V4 | TCP |
+| Permit DNS to localhost (IPv6/UDP) | ALE_AUTH_CONNECT_V6 | UDP |
+| Permit DNS to localhost (IPv6/TCP) | ALE_AUTH_CONNECT_V6 | TCP |
+
+**Key properties:**
+- **Scope**: Port 53 to `127.0.0.1` (or configured listener IP) and `::1` only
+- **Flag**: `FWPM_FILTER_FLAG_CLEAR_ACTION_RIGHT` (0x08) — "hard permit" that overrides BLOCK decisions from other sublayers regardless of weight or insertion order
+- **Weight**: 15 (above hard mode's permit=10)
+- **Sublayer**: ctrld's sublayer at maximum priority (0xFFFF)
+- **Lifetime**: Process lifetime — added at startup, removed on shutdown/uninstall
+
+Because `CLEAR_ACTION_RIGHT` is a cross-sublayer override, the order of filter installation doesn't matter — even if a VPN connects hours later and adds its own WFP block filters, ctrld's hard permit for loopback DNS is never overridden.
+
+The reactive fallback in `nrptProbeAndHeal()` is preserved as defense-in-depth for edge cases where proactive activation fails at startup.
+
+See: [Issue #526](https://gitlab.int.windscribe.com/controld/clients/ctrld/-/issues/526)
+
 #### NRPT Probe and Auto-Heal

 `RefreshPolicyEx` returns immediately — it does NOT wait for the DNS Client service to actually load the NRPT rule. On cold machines (first boot, fresh install), the DNS Client may take several seconds to process the policy refresh. During this window, the NRPT rule exists in the registry but isn't active.
@@ -22,6 +22,57 @@ This document outlines known issues with ctrld and their current status, workaro

 ---

+## Merlin Issues
+
+### Daemon Crashing on `Ctrl+C`
+
+**Issue**: `ctrld` daemon terminates unexpectedly after stopping a log tailing command. This typically occurs when running the daemon and the log viewer within the same SSH session on ASUSWRT-Merlin routers.
+
+**Description**
+
+The issue is caused by `Signal Propagation` within a shared `Process Group (PGID)`.
+
+Steps to reproduce:
+
+1. You start the daemon manually: `ctrld start --cd=<uid>`.
+2. You view internal logs in the same terminal: `ctrld log tail`.
+3. You press `Ctrl+C` to stop viewing logs.
+4. The `ctrld` daemon service stops immediately along with the log command.
+
+When you execute commands sequentially in a single interactive SSH session on Merlin, the shell often assigns them to the same Process Group. In Linux, the `SIGINT` signal (triggered by `Ctrl+C`) is not just sent to the foreground application, but is frequently propagated to every process belonging to that specific process group.
+
+Because the `ctrld` daemon remains "attached" to the terminal session's process group, it "hears" the interrupt signal intended for the `log tail` command and shuts down.
+
+**Workarounds**:
+
+To isolate the signals, avoid running the log viewer in the same window as the daemon:
+* **Window A:** Start the daemon and leave it running.
+* **Window B:** Open a new SSH connection to run `ctrld log tail`.
+Because Window B has a different **Session ID** and **Process Group ID**, pressing `Ctrl+C` in Window B will not affect the process in Window A.
+
+## Windows Issues
+
+### VPN `block-outside-dns` Breaks DNS When Using ctrld in DNS Mode
+
+**Issue**: VPN software that uses OpenVPN's `block-outside-dns` directive installs WFP (Windows Filtering Platform) block filters that prevent DNS queries from reaching ctrld's loopback listener.
+
+**Status**: Fixed in v1.5.1
+
+**Description**: When a VPN connects with `block-outside-dns` enabled, OpenVPN adds WFP filters that block all DNS traffic to non-tunnel interfaces — including loopback (`127.0.0.1`). Since ctrld's NRPT catch-all rule routes DNS through the Windows DNS Client to `127.0.0.1:53`, the WFP block filters prevent DNS Client from reaching ctrld, causing all DNS queries to time out.
+
+This affects any VPN client that implements `block-outside-dns` via WFP, including:
+- OpenVPN GUI (community)
+- Securepoint SSL VPN
+- Any OpenVPN-based client that honors the `block-outside-dns` push directive
+
+**Fix**: ctrld now proactively adds WFP "hard permit" filters for DNS to localhost at startup. These use `FWPM_FILTER_FLAG_CLEAR_ACTION_RIGHT` to override block decisions from any other WFP sublayer, ensuring the NRPT → loopback path is always available regardless of VPN state. See `docs/dns-intercept-mode.md` for technical details.
+
+**Affected Versions**: ctrld ≤ v1.5.0 in `dns` intercept mode on Windows
+
+**Last Updated**: 04/28/2026
+
+---
+
 ## Contributing to Known Issues

 If you encounter an issue not listed here, please:
@@ -17,7 +17,7 @@ options (set) → normalization (scrub) → queueing → translation (nat/rdr)
 | Anchor Type | Section | Purpose |
 |-------------|---------|---------|
 | `scrub-anchor` | Normalization | Packet normalization |
-| `nat-anchor` | Translation | NAT rules |
+| `nat-anchor` | Translation | NAT rules (not used by ctrld) |
 | `rdr-anchor` | Translation | Redirect rules |
 | `anchor` | Filtering | Pass/block rules |

@@ -122,69 +122,31 @@ Three problems prevent a simple "mirror the IPv4 rules" approach:

 3. **sendmsg from `[::1]` to global unicast fails**: Unlike IPv4 where the kernel allows `sendmsg` from `127.0.0.1` to local private IPs (e.g., `10.x.x.x`), macOS/BSD rejects `sendmsg` from `[::1]` to a global unicast IPv6 address with `EINVAL`. Since pf's `rdr` preserves the original source IP (the machine's global IPv6 address), ctrld's reply would fail.

-### Solution: nat + rdr + [::1] Listener
+### Solution: Block IPv6 DNS, Fallback to IPv4
+
+After extensive testing (#507), IPv6 DNS interception on macOS is not feasible with current pf capabilities. The solution is to block all outbound IPv6 DNS:

 ```
-# NAT: rewrite source to ::1 so ctrld can reply
-nat on lo0 inet6 proto udp from ! ::1 to ! ::1 port 53 -> ::1
-nat on lo0 inet6 proto tcp from ! ::1 to ! ::1 port 53 -> ::1
-
-# RDR: redirect destination to ctrld's IPv6 listener
-rdr on lo0 inet6 proto udp from any to ! ::1 port 53 -> ::1 port 53
-rdr on lo0 inet6 proto tcp from any to ! ::1 port 53 -> ::1 port 53
-
-# Filter: route-to forces IPv6 DNS to loopback (mirrors IPv4 rules)
-pass out quick on ! lo0 route-to lo0 inet6 proto udp from any to ! ::1 port 53
-pass out quick on ! lo0 route-to lo0 inet6 proto tcp from any to ! ::1 port 53
-
-# Pass on lo0 without state (mirrors IPv4)
-pass out quick on lo0 inet6 proto udp from any to ! ::1 port 53 no state
-pass out quick on lo0 inet6 proto tcp from any to ! ::1 port 53 no state
-
-# Accept redirected IPv6 DNS with reply-to (mirrors IPv4)
-pass in quick on lo0 reply-to lo0 inet6 proto { udp, tcp } from any to ::1 port 53
+block out quick on ! lo0 inet6 proto { udp, tcp } from any to any port 53
 ```

-### IPv6 Packet Flow
+macOS automatically retries DNS over IPv4 when the IPv6 path is blocked. The IPv4 path is fully intercepted via the normal route-to + rdr mechanism. Impact is minimal — at most ~1s latency on the very first DNS query while the IPv6 attempt is blocked.

-```
-Application queries [2607:f0c8:8000:8210::1]:53 (IPv6 DNS server)
-    ↓
-pf filter: "pass out route-to lo0 inet6 ... port 53" → redirects to lo0
-    ↓
-pf (outbound lo0): "pass out on lo0 inet6 ... no state" → passes
-    ↓
-Loopback reflects packet inbound on lo0
-    ↓
-pf nat: rewrites source 2607:f0c8:...:ec6e → ::1
-pf rdr: rewrites dest [2607:f0c8:8000:8210::1]:53 → [::1]:53
-    ↓
-ctrld receives query from [::1]:port → [::1]:53
-    ↓
-ctrld resolves via DoH, replies to [::1]:port (kernel accepts ::1 → ::1)
-    ↓
-pf reverses both translations:
-  - nat reverse: dest ::1 → 2607:f0c8:...:ec6e (original client)
-  - rdr reverse: src ::1 → 2607:f0c8:8000:8210::1 (original DNS server)
-    ↓
-Application receives response from [2607:f0c8:8000:8210::1]:53 ✓
-```
+### What Was Tried and Why It Failed

-### Client IP Recovery
-
-The `nat` rewrites the source to `::1`, so ctrld sees the client as `::1` (loopback). The existing `spoofLoopbackIpInClientInfo()` logic detects this and replaces it with the machine's real RFC1918 IPv4 address (e.g., `10.0.10.211`). This is the same mechanism used when queries arrive from `127.0.0.1` — no client identity is lost.
+| Approach | Result |
+|----------|--------|
+| `nat on lo0 inet6` to rewrite source to `::1` | pf skips translation on second interface pass — nat doesn't fire for route-to'd packets arriving on lo0 |
+| ULA address on lo0 (`fd00:53::1`) | Kernel rejects: `EHOSTUNREACH` — lo0's routing table is segregated from global unicast |
+| Raw IPv6 socket (`SOCK_RAW` + `IPPROTO_UDP`) | Bypasses sendmsg validation, but pf doesn't match raw socket packets against rdr state — response arrives from `::1` not the original server |
+| `DIOCNATLOOK` to get original dest + raw socket from that addr | Can't `bind()` to a non-local address (`EADDRNOTAVAIL`) — macOS has no `IPV6_HDRINCL` for source spoofing |
+| BPF packet injection on lo0 | Theoretically possible but extremely complex — not justified for the marginal benefit |

 ### IPv6 Listener

-The `[::1]` listener reuses the existing infrastructure from Windows (where it was added for the same reason — can't suppress IPv6 DNS resolvers from the system config). The `needLocalIPv6Listener()` function gates it, returning `true` on:
- **Windows**: Always (if IPv6 is available)
- **macOS**: Only in intercept mode
-
-If the `[::1]` listener fails to bind, it logs a warning and continues — the IPv4 listener is primary.
-
-### nat-anchor Requirement
-
-The `nat` rules in our anchor require a `nat-anchor "com.controld.ctrld"` reference in the main pf ruleset, in addition to the existing `rdr-anchor` and `anchor` references. All pf management functions (inject, remove, verify, watchdog, force-reload) handle all three anchor types.
+The `[::1]` listener is used on:
+- **Windows**: Always (if IPv6 is available) — Windows can't easily suppress IPv6 DNS resolvers
+- **macOS**: **Not used** — IPv6 DNS is blocked at pf, no listener needed

 ## Rule Ordering Within the Anchor

@@ -236,7 +198,7 @@ The trickiest part. macOS only processes anchors declared in the active pf rules

 1. Read `/etc/pf.conf`
 2. If our anchor reference already exists, reload as-is
-3. Otherwise, inject `nat-anchor "com.controld.ctrld"` and `rdr-anchor "com.controld.ctrld"` in the translation section and `anchor "com.controld.ctrld"` in the filter section
+3. Otherwise, inject `rdr-anchor "com.controld.ctrld"` in the translation section and `anchor "com.controld.ctrld"` in the filter section
 4. Write to a **temp file** and load with `pfctl -f <tmpfile>`
 5. **We never modify `/etc/pf.conf` on disk** — changes are runtime-only and don't survive reboot (ctrld re-injects on every start)

@@ -376,5 +338,8 @@ We chose `route-to + rdr` as the best balance of effectiveness and deployability
 9. **`pass out quick` exemptions work with route-to** — they fire in the same phase (filter), so `quick` + rule ordering means exempted packets never hit the route-to rule
 10. **pf cannot cross-AF redirect** — `rdr on lo0 inet6 ... -> 127.0.0.1` is invalid. IPv6 DNS must be handled by an `[::1]` listener.
 11. **`block return` doesn't work for IPv6 DNS** — BSD doesn't deliver ICMPv6 unreachable to unconnected UDP sockets (`sendto`). Apps timeout waiting for a response that never comes.
-12. **sendmsg from `::1` to global unicast fails on macOS** — unlike IPv4 where `127.0.0.1` can send to any local address, `::1` cannot send to the machine's own global IPv6 address. `nat` on lo0 is required to rewrite the source.
-13. **`nat-anchor` is separate from `rdr-anchor`** — pf requires both in the main ruleset for nat and rdr rules in an anchor to be evaluated. `rdr-anchor` alone does not cover nat rules.
+12. **sendmsg from `::1` to global unicast fails on macOS** — unlike IPv4 where `127.0.0.1` can send to any local address, `::1` cannot send to the machine's own global IPv6 address (`EINVAL`). This is the fundamental asymmetry that makes IPv6 DNS interception infeasible.
+13. **`nat on lo0` doesn't fire for `route-to`'d packets** — pf runs translation on the original outbound interface (en0), then skips it on lo0's outbound pass. `rdr` works because lo0 inbound is a genuinely new direction. Any lo0 address (including ULAs) can't route to global unicast — the kernel segregates lo0's routing table.
+14. **Raw IPv6 sockets bypass routing validation but pf doesn't match them** — `SOCK_RAW` can send from `::1` to global unicast, but pf treats raw socket packets as new connections (not matching rdr state), so reverse-translation doesn't happen. The client sees `::1` as the source, not the original DNS server.
+15. **`DIOCNATLOOK` can find the original dest but you can't use it** — The ioctl returns the pre-rdr destination, but `bind()` fails with `EADDRNOTAVAIL` because it's not a local address. macOS IPv6 raw sockets don't support `IPV6_HDRINCL` for source spoofing.
+16. **Blocking IPv6 DNS is the pragmatic solution** — macOS automatically retries over IPv4. The ~1s penalty on the first blocked query is negligible compared to the complexity of working around the kernel's IPv6 loopback restrictions.
@@ -15,11 +15,15 @@ the same enforcement guarantees as macOS pf.

 ```
 ┌─────────────────────────────────────────────────────────────────┐
-│                     dns mode (NRPT only)                        │
+│              dns mode (NRPT + loopback WFP protect)             │
 │                                                                 │
 │  App DNS query → DNS Client service → NRPT lookup               │
 │     → "." catch-all matches → forward to 127.0.0.1 (ctrld)    │
 │                                                                 │
+│  Loopback WFP protect: 4 hard-permit filters (port 53 to       │
+│  localhost, CLEAR_ACTION_RIGHT) prevent third-party VPN WFP     │
+│  blocks (e.g., OpenVPN block-outside-dns) from breaking NRPT.   │
+│                                                                 │
 │  If VPN clears NRPT: health monitor re-adds within 30s         │
 │  Worst case: queries go to VPN DNS until NRPT restored          │
 │  DNS never breaks — graceful degradation                        │
@@ -182,8 +186,10 @@ When `vpnDNSManager.Refresh()` discovers VPN DNS servers on public IPs:
   - Both UDP and TCP for each IP
 3. Store new filter IDs for next cleanup cycle

-**In `dns` mode, VPN DNS exemptions are skipped** — there are no WFP block
-filters to exempt from.
+**In `dns` mode, VPN DNS exemptions are skipped** — there are no ctrld WFP block
+filters to exempt from. The loopback WFP protect filters only permit localhost
+DNS; VPN DNS traffic goes through the tunnel interface and is already permitted
+by the VPN's own WFP rules.

 ### Session Lifecycle

@@ -202,8 +208,8 @@ filters to exempt from.
 **Startup (dns mode):**
 ```
 1. Add NRPT catch-all rule + GP refresh + DNS flush
-2. Start NRPT health monitor goroutine
-3. (No WFP — done)
+2. Activate loopback WFP protect (4 hard-permit filters for localhost DNS)
+3. Start NRPT health monitor goroutine
 ```

 **Shutdown:**
@@ -338,9 +344,9 @@ breaking DNS.
 | Aspect | macOS (pf) | Windows dns mode | Windows hard mode |
 |--------|-----------|------------------|-------------------|
 | **Routing** | `rdr` redirect | NRPT policy | NRPT policy |
-| **Enforcement** | `route-to` + block rules | None (graceful) | WFP block filters |
+| **Enforcement** | `route-to` + block rules | Loopback WFP protect | WFP block filters |
 | **Can break DNS?** | Yes (pf corruption) | No | Yes (if NRPT lost) |
-| **VPN coexistence** | Watchdog + stabilization | NRPT most-specific-match | Same + WFP permits |
+| **VPN coexistence** | Watchdog + stabilization | NRPT + loopback hard-permit | Same + WFP permits |
 | **Bypass protection** | pf catches all packets | None | WFP catches all connections |
 | **Recovery** | Probe + auto-heal | Health monitor re-adds | Full restart on sublayer loss |

@@ -42,11 +42,12 @@ const doqPoolSize = 16

 // doqConnPool manages a pool of QUIC connections for DoQ queries using a buffered channel.
 type doqConnPool struct {
-	uc        *UpstreamConfig
-	addrs     []string
-	port      string
-	tlsConfig *tls.Config
-	conns     chan *doqConn
+	uc         *UpstreamConfig
+	addrs      []string
+	port       string
+	tlsConfig  *tls.Config
+	quicConfig *quic.Config
+	conns      chan *doqConn
 }

 type doqConn struct {
@@ -65,12 +66,17 @@ func newDOQConnPool(uc *UpstreamConfig, addrs []string) *doqConnPool {
 		ServerName: uc.Domain,
 	}

+	quicConfig := &quic.Config{
+		KeepAlivePeriod: 15 * time.Second,
+	}
+
 	pool := &doqConnPool{
-		uc:        uc,
-		addrs:     addrs,
-		port:      port,
-		tlsConfig: tlsConfig,
-		conns:     make(chan *doqConn, doqPoolSize),
+		uc:         uc,
+		addrs:      addrs,
+		port:       port,
+		tlsConfig:  tlsConfig,
+		quicConfig: quicConfig,
+		conns:      make(chan *doqConn, doqPoolSize),
 	}

 	// Use SetFinalizer here because we need to call a method on the pool itself.
@@ -85,12 +91,22 @@ func newDOQConnPool(uc *UpstreamConfig, addrs []string) *doqConnPool {

 // Resolve performs a DNS query using a pooled QUIC connection.
 func (p *doqConnPool) Resolve(ctx context.Context, msg *dns.Msg) (*dns.Msg, error) {
-	// Retry logic for io.EOF errors (as per original implementation)
+	// Retry logic for transient errors: io.EOF (connection reset),
+	// IdleTimeoutError (stale pooled connection timed out), and
+	// StreamLimitReachedError (stream credit exhausted before server MAX_STREAMS arrived).
 	for range 5 {
 		answer, err := p.doResolve(ctx, msg)
 		if err == io.EOF {
 			continue
 		}
+		var idleErr *quic.IdleTimeoutError
+		if errors.As(err, &idleErr) {
+			continue
+		}
+		var streamLimitErr quic.StreamLimitReachedError
+		if errors.As(err, &streamLimitErr) {
+			continue
+		}
 		if err != nil {
 			return nil, wrapCertificateVerificationError(err)
 		}
@@ -115,18 +131,25 @@ func (p *doqConnPool) doResolve(ctx context.Context, msg *dns.Msg) (*dns.Msg, er
 		return nil, err
 	}

-	// Open a new stream for this query
-	stream, err := conn.OpenStream()
+	// Ensure the context has a deadline before calling OpenStreamSync, which
+	// blocks until the server sends a MAX_STREAMS update. Without a deadline the
+	// call could block indefinitely when the server never sends the update.
+	deadline, ok := ctx.Deadline()
+	if !ok {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, 5*time.Second)
+		defer cancel()
+		deadline, _ = ctx.Deadline()
+	}
+
+	// OpenStreamSync blocks until the server's MAX_STREAMS credit arrives,
+	// avoiding the StreamLimitReachedError race that OpenStream (non-blocking)
+	// triggers when the credit replenishment frame is still in flight.
+	stream, err := conn.OpenStreamSync(ctx)
 	if err != nil {
 		p.putConn(conn, false)
 		return nil, err
 	}
-
-	// Set deadline
-	deadline, ok := ctx.Deadline()
-	if !ok {
-		deadline = time.Now().Add(5 * time.Second)
-	}
 	_ = stream.SetDeadline(deadline)

 	// Write message length (2 bytes) followed by message
@@ -226,7 +249,7 @@ func (p *doqConnPool) dialConn(ctx context.Context) (string, *quic.Conn, error)
 			udpConn.Close()
 			return "", nil, err
 		}
-		conn, err := quic.DialEarly(ctx, udpConn, remoteAddr, p.tlsConfig, nil)
+		conn, err := quic.DialEarly(ctx, udpConn, remoteAddr, p.tlsConfig, p.quicConfig)
 		if err != nil {
 			udpConn.Close()
 			return "", nil, err
@@ -241,7 +264,7 @@ func (p *doqConnPool) dialConn(ctx context.Context) (string, *quic.Conn, error)
 	}

 	pd := &quicParallelDialer{}
-	conn, err := pd.Dial(ctx, dialAddrs, p.tlsConfig, nil)
+	conn, err := pd.Dial(ctx, dialAddrs, p.tlsConfig, p.quicConfig)
 	if err != nil {
 		return "", nil, err
 	}
@@ -173,8 +173,10 @@ func (t *Table) SetSelfIP(ip string) {
 	t.selfIPLock.Lock()
 	defer t.selfIPLock.Unlock()
 	t.selfIP = ip
-	t.dhcp.selfIP = t.selfIP
-	t.dhcp.addSelf()
+	if t.dhcp != nil {
+		t.dhcp.selfIP = t.selfIP
+		t.dhcp.addSelf()
+	}
 }

 // initSelfDiscover initializes necessary client metadata for self query.
@@ -1,9 +1,64 @@
 package clientinfo

 import (
+	"sync"
 	"testing"
 )

+// TestTable_SetSelfIP_NilDHCP ensures SetSelfIP does not panic when t.dhcp is
+// nil, which happens when DHCP discovery is disabled and the network-change
+// callback fires before or without initialisation.
+func TestTable_SetSelfIP_NilDHCP(t *testing.T) {
+	table := &Table{} // dhcp is nil
+	// Must not panic.
+	table.SetSelfIP("192.168.1.1")
+	if got := table.SelfIP(); got != "192.168.1.1" {
+		t.Fatalf("SelfIP() = %q, want %q", got, "192.168.1.1")
+	}
+}
+
+// TestTable_SetSelfIP_UpdatesDHCP ensures SetSelfIP propagates the new IP to
+// the dhcp discover and calls addSelf when dhcp is initialised.
+func TestTable_SetSelfIP_UpdatesDHCP(t *testing.T) {
+	table := &Table{
+		dhcp: &dhcp{selfIP: "10.0.0.1"},
+	}
+	table.SetSelfIP("10.0.0.2")
+	if got := table.SelfIP(); got != "10.0.0.2" {
+		t.Fatalf("SelfIP() = %q, want %q", got, "10.0.0.2")
+	}
+	if table.dhcp.selfIP != "10.0.0.2" {
+		t.Fatalf("dhcp.selfIP = %q, want %q", table.dhcp.selfIP, "10.0.0.2")
+	}
+}
+
+// TestTable_SetSelfIP_Concurrent ensures concurrent calls to SetSelfIP do not
+// race, regardless of whether dhcp is nil or not.
+func TestTable_SetSelfIP_Concurrent(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		table *Table
+	}{
+		{"nil dhcp", &Table{}},
+		{"with dhcp", &Table{dhcp: &dhcp{}}},
+	} {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			var wg sync.WaitGroup
+			for range 10 {
+				wg.Add(1)
+				go func() {
+					defer wg.Done()
+					tc.table.SetSelfIP("192.168.1.1")
+					_ = tc.table.SelfIP()
+				}()
+			}
+			wg.Wait()
+		})
+	}
+}
+
 func Test_normalizeIP(t *testing.T) {
 	tests := []struct {
 		name string
@@ -4,6 +4,7 @@ package ctrld

 import (
 	"net"
+	"net/netip"
 	"slices"
 	"time"

@@ -17,6 +18,31 @@ func currentNameserversFromResolvconf() []string {
 	return resolvconffile.NameServers()
 }

+// localNameservers filters a list of nameserver strings, returning only those
+// that are not loopback or local machine IP addresses.
+func localNameservers(nss []string, regularIPs, loopbackIPs []netip.Addr) []string {
+	var result []string
+	seen := make(map[string]bool)
+
+	for _, ns := range nss {
+		if ip := net.ParseIP(ns); ip != nil {
+			// skip loopback and local IPs
+			isLocal := false
+			for _, v := range slices.Concat(regularIPs, loopbackIPs) {
+				if ip.String() == v.String() {
+					isLocal = true
+					break
+				}
+			}
+			if !isLocal && !seen[ip.String()] {
+				seen[ip.String()] = true
+				result = append(result, ip.String())
+			}
+		}
+	}
+	return result
+}
+
 // dnsFromResolvConf reads usable nameservers from /etc/resolv.conf file.
 // A nameserver is usable if it's not one of current machine's IP addresses
 // and loopback IP addresses.
@@ -35,24 +61,7 @@ func dnsFromResolvConf() []string {
 		}

 		nss := resolvconffile.NameServers()
-		var localDNS []string
-		seen := make(map[string]bool)
-
-		for _, ns := range nss {
-			if ip := net.ParseIP(ns); ip != nil {
-				// skip loopback IPs
-				for _, v := range slices.Concat(regularIPs, loopbackIPs) {
-					ipStr := v.String()
-					if ip.String() == ipStr {
-						continue
-					}
-				}
-				if !seen[ip.String()] {
-					seen[ip.String()] = true
-					localDNS = append(localDNS, ip.String())
-				}
-			}
-		}
+		localDNS := localNameservers(nss, regularIPs, loopbackIPs)

 		// If we successfully read the file and found nameservers, return them
 		if len(localDNS) > 0 {
@@ -0,0 +1,105 @@
+//go:build unix
+
+package ctrld
+
+import (
+	"net/netip"
+	"testing"
+)
+
+func Test_localNameservers(t *testing.T) {
+	loopbackIPs := []netip.Addr{
+		netip.MustParseAddr("127.0.0.1"),
+		netip.MustParseAddr("::1"),
+	}
+	regularIPs := []netip.Addr{
+		netip.MustParseAddr("192.168.1.100"),
+		netip.MustParseAddr("10.0.0.5"),
+	}
+
+	tests := []struct {
+		name       string
+		nss        []string
+		regularIPs []netip.Addr
+		loopbackIPs []netip.Addr
+		want       []string
+	}{
+		{
+			name:        "filters loopback IPv4",
+			nss:         []string{"127.0.0.1", "8.8.8.8"},
+			regularIPs:  nil,
+			loopbackIPs: loopbackIPs,
+			want:        []string{"8.8.8.8"},
+		},
+		{
+			name:        "filters loopback IPv6",
+			nss:         []string{"::1", "1.1.1.1"},
+			regularIPs:  nil,
+			loopbackIPs: loopbackIPs,
+			want:        []string{"1.1.1.1"},
+		},
+		{
+			name:        "filters local machine IPs",
+			nss:         []string{"192.168.1.100", "8.8.4.4"},
+			regularIPs:  regularIPs,
+			loopbackIPs: nil,
+			want:        []string{"8.8.4.4"},
+		},
+		{
+			name:        "filters both loopback and local IPs",
+			nss:         []string{"127.0.0.1", "192.168.1.100", "8.8.8.8"},
+			regularIPs:  regularIPs,
+			loopbackIPs: loopbackIPs,
+			want:        []string{"8.8.8.8"},
+		},
+		{
+			name:        "deduplicates results",
+			nss:         []string{"8.8.8.8", "8.8.8.8", "1.1.1.1"},
+			regularIPs:  regularIPs,
+			loopbackIPs: loopbackIPs,
+			want:        []string{"8.8.8.8", "1.1.1.1"},
+		},
+		{
+			name:        "all filtered returns nil",
+			nss:         []string{"127.0.0.1", "::1", "192.168.1.100"},
+			regularIPs:  regularIPs,
+			loopbackIPs: loopbackIPs,
+			want:        nil,
+		},
+		{
+			name:        "empty input returns nil",
+			nss:         nil,
+			regularIPs:  regularIPs,
+			loopbackIPs: loopbackIPs,
+			want:        nil,
+		},
+		{
+			name:        "skips unparseable entries",
+			nss:         []string{"not-an-ip", "8.8.8.8"},
+			regularIPs:  regularIPs,
+			loopbackIPs: loopbackIPs,
+			want:        []string{"8.8.8.8"},
+		},
+		{
+			name:        "no local IPs filters nothing",
+			nss:         []string{"8.8.8.8", "1.1.1.1"},
+			regularIPs:  nil,
+			loopbackIPs: nil,
+			want:        []string{"8.8.8.8", "1.1.1.1"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := localNameservers(tt.nss, tt.regularIPs, tt.loopbackIPs)
+			if len(got) != len(tt.want) {
+				t.Fatalf("localNameservers() = %v, want %v", got, tt.want)
+			}
+			for i := range got {
+				if got[i] != tt.want[i] {
+					t.Errorf("localNameservers()[%d] = %q, want %q", i, got[i], tt.want[i])
+				}
+			}
+		})
+	}
+}