From b9fb3b917668b71084e0526232f7f80e66ac6453 Mon Sep 17 00:00:00 2001 From: Codescribe Date: Thu, 5 Mar 2026 04:50:16 -0500 Subject: [PATCH] feat: add Windows NRPT and WFP DNS interception --- cmd/cli/dns_intercept_windows.go | 1685 ++++++++++++++++++++++++++++++ docs/wfp-dns-intercept.md | 449 ++++++++ scripts/nrpt-diag.ps1 | 132 +++ 3 files changed, 2266 insertions(+) create mode 100644 cmd/cli/dns_intercept_windows.go create mode 100644 docs/wfp-dns-intercept.md create mode 100644 scripts/nrpt-diag.ps1 diff --git a/cmd/cli/dns_intercept_windows.go b/cmd/cli/dns_intercept_windows.go new file mode 100644 index 0000000..1da790d --- /dev/null +++ b/cmd/cli/dns_intercept_windows.go @@ -0,0 +1,1685 @@ +//go:build windows + +package cli + +import ( + "context" + "fmt" + "math/rand" + "net" + "os/exec" + "runtime" + "sync/atomic" + "time" + "unsafe" + + "golang.org/x/sys/windows" + "golang.org/x/sys/windows/registry" + + "github.com/Control-D-Inc/ctrld" +) + +// DNS Intercept Mode — Windows Implementation (WFP) +// +// This file implements DNS interception using Windows Filtering Platform (WFP). +// WFP is a kernel-level network filtering framework that allows applications to +// inspect and modify network traffic at various layers of the TCP/IP stack. +// +// Strategy: +// - Create a WFP sublayer at maximum priority (weight 0xFFFF) +// - Add PERMIT filters (weight 10) for DNS to localhost (ctrld's listener) +// - Add BLOCK filters (weight 1) for all other outbound DNS +// - Dynamically add/remove PERMIT filters for VPN DNS server exemptions +// +// This means even if VPN software overwrites adapter DNS settings, the OS +// cannot reach those DNS servers on port 53 — all DNS must flow through ctrld. 
//
// Key advantages over macOS pf:
//   - WFP filters are per-process kernel objects — other apps can't wipe them
//   - No watchdog or stabilization needed
//   - Connection-level filtering — no packet state/return-path complications
//   - Full IPv4 + IPv6 support
//
// See docs/wfp-dns-intercept.md for architecture diagrams and debugging tips.

// WFP GUIDs and constants for DNS interception.
// These are defined by Microsoft's Windows Filtering Platform API.
//
// Each GUID is also given in canonical string form so it can be matched
// against the output of WFP diagnostic tooling.
var (
	// ctrldSubLayerGUID is a unique GUID for ctrld's WFP sublayer.
	// Generated specifically for ctrld DNS intercept mode.
	// Canonical form: {7a4e5b6c-3d2f-4a1e-9b8c-1d2e3f4a5b6c}.
	ctrldSubLayerGUID = windows.GUID{
		Data1: 0x7a4e5b6c,
		Data2: 0x3d2f,
		Data3: 0x4a1e,
		Data4: [8]byte{0x9b, 0x8c, 0x1d, 0x2e, 0x3f, 0x4a, 0x5b, 0x6c},
	}

	// Well-known WFP layer GUIDs from Microsoft documentation.
	// FWPM_LAYER_ALE_AUTH_CONNECT_V4: filters outbound IPv4 connection attempts.
	// Canonical form: {c38d57d1-05a7-4c33-904f-7fbceee60e82}.
	fwpmLayerALEAuthConnectV4 = windows.GUID{
		Data1: 0xc38d57d1,
		Data2: 0x05a7,
		Data3: 0x4c33,
		Data4: [8]byte{0x90, 0x4f, 0x7f, 0xbc, 0xee, 0xe6, 0x0e, 0x82},
	}
	// FWPM_LAYER_ALE_AUTH_CONNECT_V6: filters outbound IPv6 connection attempts.
	// Canonical form: {4a72393b-319f-44bc-84c3-ba54dcb3b6b4}.
	fwpmLayerALEAuthConnectV6 = windows.GUID{
		Data1: 0x4a72393b,
		Data2: 0x319f,
		Data3: 0x44bc,
		Data4: [8]byte{0x84, 0xc3, 0xba, 0x54, 0xdc, 0xb3, 0xb6, 0xb4},
	}

	// FWPM_CONDITION_IP_REMOTE_PORT: condition matching on remote port.
	// Canonical form: {c35a604d-d22b-4e1a-91b4-68f674ee674b}.
	fwpmConditionIPRemotePort = windows.GUID{
		Data1: 0xc35a604d,
		Data2: 0xd22b,
		Data3: 0x4e1a,
		Data4: [8]byte{0x91, 0xb4, 0x68, 0xf6, 0x74, 0xee, 0x67, 0x4b},
	}
	// FWPM_CONDITION_IP_REMOTE_ADDRESS: condition matching on remote address.
	// Canonical form: {b235ae9a-1d64-49b8-a44c-5ff3d9095045}.
	fwpmConditionIPRemoteAddress = windows.GUID{
		Data1: 0xb235ae9a,
		Data2: 0x1d64,
		Data3: 0x49b8,
		Data4: [8]byte{0xa4, 0x4c, 0x5f, 0xf3, 0xd9, 0x09, 0x50, 0x45},
	}
	// FWPM_CONDITION_IP_PROTOCOL: condition matching on IP protocol.
	// Canonical form: {3971ef2b-623e-4f9a-8cb1-6e79b806b9a7}.
	fwpmConditionIPProtocol = windows.GUID{
		Data1: 0x3971ef2b,
		Data2: 0x623e,
		Data3: 0x4f9a,
		Data4: [8]byte{0x8c, 0xb1, 0x6e, 0x79, 0xb8, 0x06, 0xb9, 0xa7},
	}
)

const (
	// WFP action constants. These combine a base action with the TERMINATING flag.
	// See: https://docs.microsoft.com/en-us/windows/win32/api/fwptypes/ne-fwptypes-fwp_action_type
	fwpActionFlagTerminating uint32 = 0x00001000
	fwpActionBlock           uint32 = 0x00000001 | fwpActionFlagTerminating // 0x00001001
	fwpActionPermit          uint32 = 0x00000002 | fwpActionFlagTerminating // 0x00001002

	// FWP_MATCH_EQUAL is the match type for exact value comparison.
	fwpMatchEqual uint32 = 0 // FWP_MATCH_EQUAL

	// FWP_DATA_TYPE constants for condition values.
	// Enum starts at FWP_EMPTY=0, so FWP_UINT8=1, etc.
	// See: https://learn.microsoft.com/en-us/windows/win32/api/fwptypes/ne-fwptypes-fwp_data_type
	fwpUint8           uint32 = 1     // FWP_UINT8
	fwpUint16          uint32 = 2     // FWP_UINT16
	fwpUint32          uint32 = 3     // FWP_UINT32
	fwpByteArray16Type uint32 = 11    // FWP_BYTE_ARRAY16_TYPE
	fwpV4AddrMask      uint32 = 0x100 // FWP_V4_ADDR_MASK (after FWP_SINGLE_DATA_TYPE_MAX=0xff)

	// IP protocol numbers (values stored in a FWP_UINT8 condition).
	ipprotoUDP uint8 = 17
	ipprotoTCP uint8 = 6

	// DNS port (value stored in a FWP_UINT16 condition).
	dnsPort uint16 = 53
)

// WFP API structures. These mirror the C structures from fwpmtypes.h and fwptypes.h.
// We define them here because golang.org/x/sys/windows doesn't include WFP types.
//
// IMPORTANT: These struct layouts must match the C ABI exactly (64-bit Windows).
// Field alignment and padding are critical. Any mismatch will cause access violations
// or silent corruption. The layouts below are for AMD64 only.
// If issues arise, verify against the Windows SDK headers with offsetof() checks.

// fwpmSession0 represents FWPM_SESSION0 for opening a WFP engine handle.
// A zero-valued session with only displayData.name set is what this file
// passes to FwpmEngineOpen0.
type fwpmSession0 struct {
	sessionKey           windows.GUID
	displayData          fwpmDisplayData0
	flags                uint32
	txnWaitTimeoutInMSec uint32
	processId            uint32
	// The Go compiler inserts 4 bytes of implicit padding here on amd64 so
	// that the following pointer is 8-byte aligned.
	sid        *windows.SID
	username   *uint16
	kernelMode int32   // Windows BOOL is int32, not Go bool
	_          [4]byte // padding to next 8-byte boundary
}

// fwpmDisplayData0 represents FWPM_DISPLAY_DATA0 for naming WFP objects.
// Both fields are pointers to NUL-terminated UTF-16 strings (as produced by
// windows.UTF16PtrFromString); description may be left nil.
type fwpmDisplayData0 struct {
	name        *uint16
	description *uint16
}

// fwpmSublayer0 represents FWPM_SUBLAYER0 for creating a WFP sublayer.
type fwpmSublayer0 struct {
	subLayerKey  windows.GUID
	displayData  fwpmDisplayData0
	flags        uint32
	_            [4]byte // padding
	providerKey  *windows.GUID
	providerData fwpByteBlob
	weight       uint16  // sublayer priority; this file uses 0xFFFF
	_            [6]byte // padding
}

// fwpByteBlob represents FWP_BYTE_BLOB for raw data blobs.
type fwpByteBlob struct {
	size uint32
	_    [4]byte // padding
	data *byte
}

// fwpmFilter0 represents FWPM_FILTER0 for adding WFP filters.
type fwpmFilter0 struct {
	filterKey       windows.GUID
	displayData     fwpmDisplayData0
	flags           uint32
	_               [4]byte // padding
	providerKey     *windows.GUID
	providerData    fwpByteBlob
	layerKey        windows.GUID
	subLayerKey     windows.GUID
	weight          fwpValue0
	numFilterConds  uint32
	_               [4]byte // padding
	filterCondition *fwpmFilterCondition0
	action          fwpmAction0
	// fwpmAction0 is 20 bytes with 4-byte alignment, so on amd64 the compiler
	// inserts 4 bytes of implicit padding before the 8-byte-aligned union below.
	//
	// After action is a union of UINT64 (rawContext) and GUID (providerContextKey).
	// GUID is 16 bytes, UINT64 is 8 bytes. Union size = 16 bytes.
	rawContext      uint64 // first 8 bytes of the union
	_rawContextPad  uint64 // remaining 8 bytes (unused, for GUID alignment)
	reserved        *windows.GUID
	filterId        uint64
	effectiveWeight fwpValue0
}

// fwpValue0 represents FWP_VALUE0, a tagged union for filter weights and values.
// valueType holds one of the fwp* data-type constants and selects how value
// is interpreted.
type fwpValue0 struct {
	valueType uint32
	_         [4]byte // padding
	value     uint64  // union: uint8/uint16/uint32/uint64/pointer
}

// fwpmFilterCondition0 represents FWPM_FILTER_CONDITION0 for filter match conditions.
// fieldKey names the field being tested (one of the fwpmCondition* GUIDs),
// matchType is the comparison operator, and condValue is the operand.
type fwpmFilterCondition0 struct {
	fieldKey  windows.GUID
	matchType uint32
	_         [4]byte // padding
	condValue fwpConditionValue0
}

// fwpConditionValue0 represents FWP_CONDITION_VALUE0, the value to match against.
// It has the same tag + 8-byte union shape as fwpValue0.
type fwpConditionValue0 struct {
	valueType uint32
	_         [4]byte // padding
	value     uint64  // union
}

// fwpV4AddrAndMask represents FWP_V4_ADDR_AND_MASK for subnet matching.
// Both addr and mask are in host byte order.
type fwpV4AddrAndMask struct {
	addr uint32
	mask uint32
}

// fwpmAction0 represents FWPM_ACTION0 for specifying what happens on match.
// Size: 20 bytes (uint32 + GUID). No padding needed — GUID has 4-byte alignment.
type fwpmAction0 struct {
	actionType uint32
	filterType windows.GUID // union: filterType or calloutKey
}

// wfpState holds the state of the WFP DNS interception filters.
// It tracks the engine handle and all filter IDs for cleanup on shutdown.
// All filter IDs are stored so we can remove them individually without
// needing to enumerate the sublayer's filters via WFP API.
//
// In "dns" mode, engineHandle is 0 (no WFP filters) and only NRPT is active.
// In "hard" mode, both NRPT and WFP filters are active.
//
// The engine handle is opened once at startup and kept for the lifetime
// of the ctrld process. Filter additions/removals happen through this handle.
type wfpState struct {
	engineHandle uintptr
	// Block filter IDs for outbound port-53 traffic, one per layer/protocol pair.
	filterIDv4UDP uint64
	filterIDv4TCP uint64
	filterIDv6UDP uint64
	filterIDv6TCP uint64
	// Permit filter IDs for localhost traffic (prevent blocking ctrld's own listener).
	permitIDv4UDP uint64
	permitIDv4TCP uint64
	permitIDv6UDP uint64
	permitIDv6TCP uint64
	// Dynamic permit filter IDs for VPN DNS server IPs.
	vpnPermitFilterIDs []uint64
	// Static permit filter IDs for RFC1918/CGNAT subnet ranges.
	// These allow VPN DNS servers on private IPs to work without dynamic exemptions.
	subnetPermitFilterIDs []uint64
	// nrptActive tracks whether the NRPT catch-all rule was successfully added.
	// Used by stopDNSIntercept to know whether cleanup is needed.
	nrptActive bool
	// listenerIP is the actual IP address ctrld is listening on (e.g., "127.0.0.1"
	// or "127.0.0.2" on AD DC). Used by NRPT rule creation and health monitor to
	// ensure NRPT points to the correct address.
	listenerIP string
	// stopCh is used to shut down the NRPT health monitor goroutine.
	stopCh chan struct{}
}

// Lazy-loaded WFP DLL procedures.
// NewLazySystemDLL/NewProc defer the actual load and symbol lookup until the
// first Load/Find/Call, so declaring these has no cost on systems where the
// intercept feature is never started.
var (
	fwpuclntDLL                  = windows.NewLazySystemDLL("fwpuclnt.dll")
	procFwpmEngineOpen0          = fwpuclntDLL.NewProc("FwpmEngineOpen0")
	procFwpmEngineClose0         = fwpuclntDLL.NewProc("FwpmEngineClose0")
	procFwpmSubLayerAdd0         = fwpuclntDLL.NewProc("FwpmSubLayerAdd0")
	procFwpmSubLayerDeleteByKey0 = fwpuclntDLL.NewProc("FwpmSubLayerDeleteByKey0")
	procFwpmFilterAdd0           = fwpuclntDLL.NewProc("FwpmFilterAdd0")
	procFwpmFilterDeleteById0    = fwpuclntDLL.NewProc("FwpmFilterDeleteById0")
	procFwpmSubLayerGetByKey0    = fwpuclntDLL.NewProc("FwpmSubLayerGetByKey0")
	procFwpmFreeMemory0          = fwpuclntDLL.NewProc("FwpmFreeMemory0")
)

// Lazy-loaded dnsapi.dll for flushing the DNS Client cache after NRPT changes.
var (
	dnsapiDLL                 = windows.NewLazySystemDLL("dnsapi.dll")
	procDnsFlushResolverCache = dnsapiDLL.NewProc("DnsFlushResolverCache")
)

// Lazy-loaded userenv.dll for triggering Group Policy refresh so DNS Client
// picks up new NRPT registry entries without waiting for the next GP cycle.
var (
	userenvDLL          = windows.NewLazySystemDLL("userenv.dll")
	procRefreshPolicyEx = userenvDLL.NewProc("RefreshPolicyEx")
)

// NRPT (Name Resolution Policy Table) Registry Constants
//
// NRPT tells the Windows DNS Client service where to send queries for specific
// namespaces. We add a catch-all rule ("." matches everything) that directs all
// DNS queries to ctrld's listener (typically 127.0.0.1, but may be 127.0.0.x on AD DC).
+// +// This complements the WFP block filters: +// - NRPT: tells Windows DNS Client to send queries to ctrld (positive routing) +// - WFP: blocks any DNS that somehow bypasses NRPT (enforcement backstop) +// +// Without NRPT, WFP blocks outbound DNS but doesn't redirect it — applications +// would just see DNS failures instead of getting answers from ctrld. +const ( + // nrptBaseKey is the GP registry path where Windows stores NRPT policy rules. + nrptBaseKey = `SOFTWARE\Policies\Microsoft\Windows NT\DNSClient\DnsPolicyConfig` + // nrptDirectKey is the local service store path. The DNS Client reads NRPT + // from both locations, but on some machines (including stock Win11) it only + // honors the direct path. This is the same path Add-DnsClientNrptRule uses. + nrptDirectKey = `SYSTEM\CurrentControlSet\Services\Dnscache\Parameters\DnsPolicyConfig` + // nrptRuleName is the name of our specific rule key under the GP path. + nrptRuleName = `CtrldCatchAll` + // nrptDirectRuleName is the key name for the direct service store path. + // The DNS Client requires direct-path rules to use GUID-in-braces format. + // Using a plain name like "CtrldCatchAll" makes the rule visible in + // Get-DnsClientNrptRule but DNS Client won't apply it for resolution + // (Get-DnsClientNrptPolicy returns empty). This is a deterministic GUID + // so we can reliably find and clean up our own rule. + nrptDirectRuleName = `{B2E9A3C1-7F4D-4A8E-9D6B-5C1E0F3A2B8D}` +) + +// addNRPTCatchAllRule creates an NRPT catch-all rule that directs all DNS queries +// to the specified listener IP. +// +// Windows NRPT has two registry paths with all-or-nothing precedence: +// - GP path: SOFTWARE\Policies\...\DnsPolicyConfig (Group Policy) +// - Local path: SYSTEM\CurrentControlSet\...\DnsPolicyConfig (service store) +// +// If ANY rules exist in the GP path (from IT policy, VPN, MDM, etc.), DNS Client +// enters "GP mode" and ignores ALL local-path rules entirely. 
Conversely, if the +// GP path is empty/absent, DNS Client reads from the local path only. +// +// Strategy (matching Tailscale's approach): +// - Always write to the local path (baseline for non-domain machines). +// - Check if OTHER software has GP rules. If yes, also write to the GP path +// so our rule isn't invisible. If no, clean our stale GP rules and delete the +// empty GP key to stay in "local mode". +// - After GP writes, call RefreshPolicyEx to activate. +func addNRPTCatchAllRule(listenerIP string) error { + // Always write to local/direct service store path. + if err := writeNRPTRule(nrptDirectKey+`\`+nrptDirectRuleName, listenerIP); err != nil { + return fmt.Errorf("failed to write NRPT local path rule: %w", err) + } + + // Check if other software has GP NRPT rules. If so, we must also write + // to the GP path — otherwise DNS Client's "GP mode" hides our local rule. + if otherGPRulesExist() { + mainLog.Load().Info().Msg("DNS intercept: other GP NRPT rules detected — also writing to GP path") + if err := writeNRPTRule(nrptBaseKey+`\`+nrptRuleName, listenerIP); err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept: failed to write NRPT GP rule (local rule still active if GP clears)") + } + } else { + // No other GP rules — clean our stale GP entry and delete the empty + // GP parent key so DNS Client stays in "local mode". + cleanGPPath() + } + return nil +} + +// otherGPRulesExist checks if non-ctrld NRPT rules exist in the GP path. +// When other software (IT policy, VPN, MDM) has GP rules, DNS Client enters +// "GP mode" and ignores ALL local-path rules. +func otherGPRulesExist() bool { + k, err := registry.OpenKey(registry.LOCAL_MACHINE, nrptBaseKey, registry.ENUMERATE_SUB_KEYS) + if err != nil { + return false // GP key doesn't exist — no GP rules. 
+ } + names, err := k.ReadSubKeyNames(-1) + k.Close() + if err != nil { + return false + } + for _, name := range names { + if name != nrptRuleName { // Not our CtrldCatchAll + return true + } + } + return false +} + +// cleanGPPath removes our CtrldCatchAll rule from the GP path and deletes +// the GP DnsPolicyConfig parent key if no other rules remain. Removing the +// empty GP key is critical: its mere existence forces DNS Client into "GP mode" +// where local-path rules are ignored. +func cleanGPPath() { + // Delete our specific rule. + registry.DeleteKey(registry.LOCAL_MACHINE, nrptBaseKey+`\`+nrptRuleName) + + // If the GP parent key is now empty, delete it entirely to exit "GP mode". + k, err := registry.OpenKey(registry.LOCAL_MACHINE, nrptBaseKey, registry.ENUMERATE_SUB_KEYS) + if err != nil { + return // Key doesn't exist — clean state. + } + names, err := k.ReadSubKeyNames(-1) + k.Close() + if err != nil || len(names) > 0 { + if len(names) > 0 { + mainLog.Load().Debug().Strs("remaining", names).Msg("DNS intercept: GP path has other rules, leaving parent key") + } + return + } + // Empty — delete it to exit "GP mode". + if err := registry.DeleteKey(registry.LOCAL_MACHINE, nrptBaseKey); err == nil { + mainLog.Load().Info().Msg("DNS intercept: deleted empty GP DnsPolicyConfig key (exits GP mode)") + } +} + +// writeNRPTRule writes a single NRPT catch-all rule at the given registry keyPath. +func writeNRPTRule(keyPath, listenerIP string) error { + k, _, err := registry.CreateKey(registry.LOCAL_MACHINE, keyPath, registry.SET_VALUE) + if err != nil { + return fmt.Errorf("failed to create NRPT registry key %q: %w", keyPath, err) + } + defer k.Close() + + // Name (REG_MULTI_SZ): namespace patterns to match. "." = catch-all. + if err := k.SetStringsValue("Name", []string{"."}); err != nil { + return fmt.Errorf("failed to set NRPT Name value: %w", err) + } + // GenericDNSServers (REG_SZ): DNS server(s) to use for matching queries. 
+ if err := k.SetStringValue("GenericDNSServers", listenerIP); err != nil { + return fmt.Errorf("failed to set NRPT GenericDNSServers value: %w", err) + } + // ConfigOptions (REG_DWORD): 0x8 = use standard DNS resolution (no DirectAccess). + if err := k.SetDWordValue("ConfigOptions", 0x8); err != nil { + return fmt.Errorf("failed to set NRPT ConfigOptions value: %w", err) + } + // Version (REG_DWORD): 0x2 = NRPT rule version 2. + if err := k.SetDWordValue("Version", 0x2); err != nil { + return fmt.Errorf("failed to set NRPT Version value: %w", err) + } + // Match the exact fields Add-DnsClientNrptRule creates. The DNS Client CIM + // provider writes these as empty strings; their absence may cause the service + // to skip the rule on some Windows builds. + k.SetStringValue("Comment", "") + k.SetStringValue("DisplayName", "") + k.SetStringValue("IPSECCARestriction", "") + return nil +} + +// removeNRPTCatchAllRule deletes the ctrld NRPT catch-all registry key and +// cleans up the empty parent key if no other NRPT rules remain. +// +// The empty parent cleanup is critical: an empty DnsPolicyConfig key causes +// DNS Client to cache a "no rules" state. On next start, DNS Client ignores +// newly written rules because it still has the cached empty state. By deleting +// the empty parent on stop, we ensure a clean slate for the next start. +func removeNRPTCatchAllRule() error { + // Remove our GUID-named rule from local/direct path. + if err := registry.DeleteKey(registry.LOCAL_MACHINE, nrptDirectKey+`\`+nrptDirectRuleName); err != nil { + if err != registry.ErrNotExist { + return fmt.Errorf("failed to delete NRPT local rule: %w", err) + } + } + deleteEmptyParentKey(nrptDirectKey) + // Clean up legacy rules from earlier builds (plain name in direct path, GP path rules). + registry.DeleteKey(registry.LOCAL_MACHINE, nrptDirectKey+`\`+nrptRuleName) + cleanGPPath() + return nil +} + +// deleteEmptyParentKey removes a registry key if it exists but has no subkeys. 
+func deleteEmptyParentKey(keyPath string) { + k, err := registry.OpenKey(registry.LOCAL_MACHINE, keyPath, registry.ENUMERATE_SUB_KEYS) + if err != nil { + return + } + names, err := k.ReadSubKeyNames(-1) + k.Close() + if err != nil || len(names) > 0 { + return + } + registry.DeleteKey(registry.LOCAL_MACHINE, keyPath) +} + +// nrptCatchAllRuleExists checks whether our NRPT catch-all rule exists +// in either the local or GP path. +func nrptCatchAllRuleExists() bool { + for _, path := range []string{ + nrptDirectKey + `\` + nrptDirectRuleName, + nrptBaseKey + `\` + nrptRuleName, + } { + k, err := registry.OpenKey(registry.LOCAL_MACHINE, path, registry.QUERY_VALUE) + if err == nil { + k.Close() + return true + } + } + return false +} + +// refreshNRPTPolicy triggers a machine Group Policy refresh so the DNS Client +// service picks up new/changed NRPT registry entries immediately. Without this, +// NRPT changes only take effect on the next GP cycle (default: 90 minutes). +// +// Uses RefreshPolicyEx(bMachine=TRUE, dwOptions=RP_FORCE=1) from userenv.dll. 
+// See: https://learn.microsoft.com/en-us/windows/win32/api/userenv/nf-userenv-refreshpolicyex +func refreshNRPTPolicy() { + if err := userenvDLL.Load(); err != nil { + mainLog.Load().Debug().Err(err).Msg("DNS intercept: userenv.dll not available, falling back to gpupdate") + if out, err := exec.Command("gpupdate", "/target:computer", "/force").CombinedOutput(); err != nil { + mainLog.Load().Debug().Msgf("DNS intercept: gpupdate failed: %v: %s", err, string(out)) + } else { + mainLog.Load().Debug().Msg("DNS intercept: triggered GP refresh via gpupdate") + } + return + } + if err := procRefreshPolicyEx.Find(); err != nil { + mainLog.Load().Debug().Err(err).Msg("DNS intercept: RefreshPolicyEx not found, falling back to gpupdate") + exec.Command("gpupdate", "/target:computer", "/force").Run() + return + } + ret, _, _ := procRefreshPolicyEx.Call(1, 1) + if ret != 0 { + mainLog.Load().Debug().Msg("DNS intercept: triggered machine GP refresh via RefreshPolicyEx") + } else { + mainLog.Load().Debug().Msg("DNS intercept: RefreshPolicyEx returned FALSE, falling back to gpupdate") + exec.Command("gpupdate", "/target:computer", "/force").Run() + } +} + +// flushDNSCache flushes the Windows DNS Client resolver cache and triggers a +// Group Policy refresh so NRPT changes take effect immediately. 
+func flushDNSCache() { + refreshNRPTPolicy() + if err := dnsapiDLL.Load(); err == nil { + if err := procDnsFlushResolverCache.Find(); err == nil { + ret, _, _ := procDnsFlushResolverCache.Call() + if ret != 0 { + mainLog.Load().Debug().Msg("DNS intercept: flushed DNS resolver cache via DnsFlushResolverCache") + return + } + } + } + if out, err := exec.Command("ipconfig", "/flushdns").CombinedOutput(); err != nil { + mainLog.Load().Debug().Msgf("DNS intercept: ipconfig /flushdns failed: %v: %s", err, string(out)) + } else { + mainLog.Load().Debug().Msg("DNS intercept: flushed DNS resolver cache via ipconfig /flushdns") + } +} + +// sendParamChange sends SERVICE_CONTROL_PARAMCHANGE to the DNS Client (Dnscache) +// service, signaling it to re-read its configuration including NRPT rules from +// the registry. This is the standard mechanism used by FortiClient, Tailscale, +// and other DNS-aware software — it's reliable and non-disruptive unlike +// restarting the Dnscache service (which always fails on modern Windows because +// Dnscache is a protected shared svchost service). +func sendParamChange() { + if out, err := exec.Command("sc", "control", "dnscache", "paramchange").CombinedOutput(); err != nil { + mainLog.Load().Debug().Err(err).Str("output", string(out)).Msg("DNS intercept: sc control dnscache paramchange failed") + } else { + mainLog.Load().Debug().Msg("DNS intercept: sent paramchange to Dnscache service") + } +} + +// cleanEmptyNRPTParent removes empty NRPT parent keys that block activation. +// An empty DnsPolicyConfig key (exists but no subkeys) causes DNS Client to +// cache "no rules" and ignore subsequently-added rules. +// +// Also cleans the GP path entirely if it has no non-ctrld rules, since the GP +// path's existence forces DNS Client into "GP mode" where local-path rules +// are ignored. +// +// Returns true if cleanup was performed (caller should add a delay). 
+func cleanEmptyNRPTParent() bool { + cleaned := false + + // Always clean the GP path — its existence blocks local path activation. + cleanGPPath() + + // Clean empty local/direct path parent key. + k, err := registry.OpenKey(registry.LOCAL_MACHINE, nrptDirectKey, registry.ENUMERATE_SUB_KEYS) + if err != nil { + return false + } + names, err := k.ReadSubKeyNames(-1) + k.Close() + if err != nil || len(names) > 0 { + return false + } + + mainLog.Load().Warn().Msg("DNS intercept: found empty NRPT local parent key (blocks activation) — removing") + if err := registry.DeleteKey(registry.LOCAL_MACHINE, nrptDirectKey); err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept: failed to delete empty NRPT local parent key") + return false + } + cleaned = true + + // Signal DNS Client to process the deletion and reset its internal cache. + mainLog.Load().Info().Msg("DNS intercept: empty NRPT parent key removed — signaling DNS Client") + sendParamChange() + flushDNSCache() + return cleaned +} + +// logNRPTParentKeyState logs the state of both NRPT registry paths for diagnostics. +func logNRPTParentKeyState(context string) { + for _, path := range []struct { + name string + key string + }{ + {"GP", nrptBaseKey}, + {"local", nrptDirectKey}, + } { + k, err := registry.OpenKey(registry.LOCAL_MACHINE, path.key, registry.ENUMERATE_SUB_KEYS) + if err != nil { + mainLog.Load().Debug().Str("context", context).Str("path", path.name). + Msg("DNS intercept: NRPT parent key does not exist") + continue + } + names, err := k.ReadSubKeyNames(-1) + k.Close() + if err != nil { + continue + } + if len(names) == 0 { + mainLog.Load().Warn().Str("context", context).Str("path", path.name). + Msg("DNS intercept: NRPT parent key exists but is EMPTY — blocks activation") + } else { + mainLog.Load().Debug().Str("context", context).Str("path", path.name). + Int("subkeys", len(names)).Strs("names", names). 
+ Msg("DNS intercept: NRPT parent key state") + } + } +} + +// startDNSIntercept activates WFP-based DNS interception on Windows. +// It creates a WFP sublayer and adds filters that block all outbound DNS (port 53) +// traffic except to localhost (127.0.0.1/::1), ensuring all DNS queries must go +// through ctrld's local listener. This eliminates the race condition with VPN +// software that overwrites interface DNS settings. +// +// The approach: +// 1. Permit outbound DNS to 127.0.0.1/::1 (ctrld's listener) +// 2. Block all other outbound DNS (port 53 UDP+TCP) +// +// This means even if a VPN overwrites DNS settings to its own servers, +// the OS cannot reach those servers on port 53 — queries fail and fall back +// to ctrld via the loopback address. +func (p *prog) startDNSIntercept() error { + // Resolve the actual listener IP. On AD DC / Windows Server with a local DNS + // server, ctrld may have fallen back to 127.0.0.x:53 instead of 127.0.0.1:53. + // NRPT must point to whichever address ctrld is actually listening on. + listenerIP := "127.0.0.1" + if lc := p.cfg.FirstListener(); lc != nil && lc.IP != "" { + listenerIP = lc.IP + } + + state := &wfpState{ + stopCh: make(chan struct{}), + listenerIP: listenerIP, + } + + // Step 1: Add NRPT catch-all rule (both dns and hard modes). + // NRPT must succeed before proceeding with WFP in hard mode. + mainLog.Load().Info().Msgf("DNS intercept: initializing (mode: %s)", interceptMode) + + logNRPTParentKeyState("pre-write") + + // Two-phase empty parent key recovery: if the GP DnsPolicyConfig key exists + // but is empty, it poisons DNS Client's cache. Clean it before writing. 
+ cleanEmptyNRPTParent() + + if err := addNRPTCatchAllRule(listenerIP); err != nil { + return fmt.Errorf("dns intercept: failed to add NRPT catch-all rule: %w", err) + } + logNRPTParentKeyState("post-write") + + state.nrptActive = true + refreshNRPTPolicy() + sendParamChange() + flushDNSCache() + mainLog.Load().Info().Msgf("DNS intercept: NRPT catch-all rule active — all DNS queries directed to %s", listenerIP) + + // Step 2: In hard mode, also set up WFP filters to block non-local DNS. + if hardIntercept { + if err := p.startWFPFilters(state); err != nil { + // Roll back NRPT since WFP failed. + mainLog.Load().Error().Err(err).Msg("DNS intercept: WFP setup failed, rolling back NRPT") + _ = removeNRPTCatchAllRule() + flushDNSCache() + state.nrptActive = false + return fmt.Errorf("dns intercept: WFP setup failed: %w", err) + } + } else { + mainLog.Load().Info().Msg("DNS intercept: dns mode — NRPT only, no WFP filters (graceful)") + } + + p.dnsInterceptState = state + + // Start periodic NRPT health monitor. + go p.nrptHealthMonitor(state) + + // Verify NRPT is actually working (async — doesn't block startup). + // This catches the race condition where RefreshPolicyEx returns before + // the DNS Client service has loaded the NRPT rule from registry. + go p.nrptProbeAndHeal() + + return nil +} + +// startWFPFilters opens the WFP engine and adds all block/permit filters. +// Called only in hard intercept mode. +func (p *prog) startWFPFilters(state *wfpState) error { + mainLog.Load().Info().Msg("DNS intercept: initializing Windows Filtering Platform (WFP)") + + var engineHandle uintptr + session := fwpmSession0{} + sessionName, _ := windows.UTF16PtrFromString("ctrld DNS Intercept") + session.displayData.name = sessionName + + // RPC_C_AUTHN_DEFAULT (0xFFFFFFFF) lets the system pick the appropriate + // authentication service. RPC_C_AUTHN_NONE (0) returns ERROR_NOT_SUPPORTED + // on some Windows configurations (e.g., Parallels VMs). 
+ const rpcCAuthnDefault = 0xFFFFFFFF + r1, _, _ := procFwpmEngineOpen0.Call( + 0, + uintptr(rpcCAuthnDefault), + 0, + uintptr(unsafe.Pointer(&session)), + uintptr(unsafe.Pointer(&engineHandle)), + ) + if r1 != 0 { + return fmt.Errorf("FwpmEngineOpen0 failed: HRESULT 0x%x", r1) + } + mainLog.Load().Info().Msgf("DNS intercept: WFP engine opened (handle: 0x%x)", engineHandle) + + // Clean up any stale sublayer from a previous unclean shutdown. + // If ctrld crashed or was killed, the non-dynamic WFP session may have left + // orphaned filters. Deleting the sublayer removes all its child filters. + r1, _, _ = procFwpmSubLayerDeleteByKey0.Call( + engineHandle, + uintptr(unsafe.Pointer(&ctrldSubLayerGUID)), + ) + if r1 == 0 { + mainLog.Load().Info().Msg("DNS intercept: cleaned up stale WFP sublayer from previous session") + } + // r1 != 0 means sublayer didn't exist — that's fine, nothing to clean up. + + sublayer := fwpmSublayer0{ + subLayerKey: ctrldSubLayerGUID, + weight: 0xFFFF, + } + sublayerName, _ := windows.UTF16PtrFromString("ctrld DNS Intercept Sublayer") + sublayerDesc, _ := windows.UTF16PtrFromString("Blocks outbound DNS except to ctrld listener. 
Prevents VPN DNS conflicts.") + sublayer.displayData.name = sublayerName + sublayer.displayData.description = sublayerDesc + + r1, _, _ = procFwpmSubLayerAdd0.Call( + engineHandle, + uintptr(unsafe.Pointer(&sublayer)), + 0, + ) + if r1 != 0 { + procFwpmEngineClose0.Call(engineHandle) + return fmt.Errorf("FwpmSubLayerAdd0 failed: HRESULT 0x%x", r1) + } + mainLog.Load().Info().Msg("DNS intercept: WFP sublayer created (weight: 0xFFFF — maximum priority)") + + state.engineHandle = engineHandle + + permitFilters := []struct { + name string + layer windows.GUID + proto uint8 + idField *uint64 + }{ + {"Permit DNS to localhost (IPv4/UDP)", fwpmLayerALEAuthConnectV4, ipprotoUDP, &state.permitIDv4UDP}, + {"Permit DNS to localhost (IPv4/TCP)", fwpmLayerALEAuthConnectV4, ipprotoTCP, &state.permitIDv4TCP}, + {"Permit DNS to localhost (IPv6/UDP)", fwpmLayerALEAuthConnectV6, ipprotoUDP, &state.permitIDv6UDP}, + {"Permit DNS to localhost (IPv6/TCP)", fwpmLayerALEAuthConnectV6, ipprotoTCP, &state.permitIDv6TCP}, + } + + for _, pf := range permitFilters { + filterID, err := p.addWFPPermitLocalhostFilter(engineHandle, pf.name, pf.layer, pf.proto) + if err != nil { + p.cleanupWFPFilters(state) + return fmt.Errorf("failed to add permit filter %q: %w", pf.name, err) + } + *pf.idField = filterID + mainLog.Load().Debug().Msgf("DNS intercept: added permit filter %q (ID: %d)", pf.name, filterID) + } + + blockFilters := []struct { + name string + layer windows.GUID + proto uint8 + idField *uint64 + }{ + {"Block outbound DNS (IPv4/UDP)", fwpmLayerALEAuthConnectV4, ipprotoUDP, &state.filterIDv4UDP}, + {"Block outbound DNS (IPv4/TCP)", fwpmLayerALEAuthConnectV4, ipprotoTCP, &state.filterIDv4TCP}, + {"Block outbound DNS (IPv6/UDP)", fwpmLayerALEAuthConnectV6, ipprotoUDP, &state.filterIDv6UDP}, + {"Block outbound DNS (IPv6/TCP)", fwpmLayerALEAuthConnectV6, ipprotoTCP, &state.filterIDv6TCP}, + } + + for _, bf := range blockFilters { + filterID, err := p.addWFPBlockDNSFilter(engineHandle, bf.name, 
bf.layer, bf.proto) + if err != nil { + p.cleanupWFPFilters(state) + return fmt.Errorf("failed to add block filter %q: %w", bf.name, err) + } + *bf.idField = filterID + mainLog.Load().Debug().Msgf("DNS intercept: added block filter %q (ID: %d)", bf.name, filterID) + } + + // Add static permit filters for RFC1918 + CGNAT ranges (UDP + TCP). + // This allows VPN DNS servers on private IPs (MagicDNS upstreams, F5, Windscribe, etc.) + // to work without dynamic per-server exemptions. + privateRanges := []struct { + name string + addr uint32 + mask uint32 + }{ + {"10.0.0.0/8", 0x0A000000, 0xFF000000}, + {"172.16.0.0/12", 0xAC100000, 0xFFF00000}, + {"192.168.0.0/16", 0xC0A80000, 0xFFFF0000}, + {"100.64.0.0/10", 0x64400000, 0xFFC00000}, + } + for _, r := range privateRanges { + for _, proto := range []struct { + num uint8 + name string + }{{ipprotoUDP, "UDP"}, {ipprotoTCP, "TCP"}} { + filterName := fmt.Sprintf("Permit DNS to %s (%s)", r.name, proto.name) + filterID, err := p.addWFPPermitSubnetFilter(engineHandle, filterName, proto.num, r.addr, r.mask) + if err != nil { + mainLog.Load().Warn().Err(err).Msgf("DNS intercept: failed to add subnet permit for %s/%s", r.name, proto.name) + continue + } + state.subnetPermitFilterIDs = append(state.subnetPermitFilterIDs, filterID) + mainLog.Load().Debug().Msgf("DNS intercept: added subnet permit %q (ID: %d)", filterName, filterID) + } + } + mainLog.Load().Info().Msgf("DNS intercept: %d subnet permit filters active (RFC1918 + CGNAT)", len(state.subnetPermitFilterIDs)) + + mainLog.Load().Info().Msgf("DNS intercept: WFP filters active — all outbound DNS (port 53) blocked except to localhost and private ranges. 
"+ + "Filter IDs: v4UDP=%d, v4TCP=%d, v6UDP=%d, v6TCP=%d (block), "+ + "v4UDP=%d, v4TCP=%d, v6UDP=%d, v6TCP=%d (permit localhost)", + state.filterIDv4UDP, state.filterIDv4TCP, state.filterIDv6UDP, state.filterIDv6TCP, + state.permitIDv4UDP, state.permitIDv4TCP, state.permitIDv6UDP, state.permitIDv6TCP) + + return nil +} + +// addWFPBlockDNSFilter adds a WFP filter that blocks outbound DNS traffic (port 53) +// for the given protocol (UDP or TCP) on the specified layer (V4 or V6). +func (p *prog) addWFPBlockDNSFilter(engineHandle uintptr, name string, layerKey windows.GUID, proto uint8) (uint64, error) { + filterName, _ := windows.UTF16PtrFromString("ctrld: " + name) + + conditions := make([]fwpmFilterCondition0, 2) + + conditions[0] = fwpmFilterCondition0{ + fieldKey: fwpmConditionIPProtocol, + matchType: fwpMatchEqual, + } + conditions[0].condValue.valueType = fwpUint8 + conditions[0].condValue.value = uint64(proto) + + conditions[1] = fwpmFilterCondition0{ + fieldKey: fwpmConditionIPRemotePort, + matchType: fwpMatchEqual, + } + conditions[1].condValue.valueType = fwpUint16 + conditions[1].condValue.value = uint64(dnsPort) + + filter := fwpmFilter0{ + layerKey: layerKey, + subLayerKey: ctrldSubLayerGUID, + numFilterConds: 2, + filterCondition: &conditions[0], + } + filter.displayData.name = filterName + filter.weight.valueType = fwpUint8 + filter.weight.value = 1 + filter.action.actionType = fwpActionBlock + + var filterID uint64 + r1, _, _ := procFwpmFilterAdd0.Call( + engineHandle, + uintptr(unsafe.Pointer(&filter)), + 0, + uintptr(unsafe.Pointer(&filterID)), + ) + runtime.KeepAlive(conditions) + if r1 != 0 { + return 0, fmt.Errorf("FwpmFilterAdd0 failed: HRESULT 0x%x", r1) + } + return filterID, nil +} + +// addWFPPermitLocalhostFilter adds a WFP filter that permits outbound DNS to localhost. +// This ensures ctrld's listener at 127.0.0.1/::1 can receive DNS queries. 
+// +// TODO: On AD DC where ctrld listens on 127.0.0.x, this filter should match +// the actual listener IP instead of hardcoded 127.0.0.1. Currently hard mode +// is unlikely on AD DC (NRPT dns mode is preferred), but if needed, this must +// be parameterized like addNRPTCatchAllRule. +// These filters have higher weight than block filters so they're matched first. +func (p *prog) addWFPPermitLocalhostFilter(engineHandle uintptr, name string, layerKey windows.GUID, proto uint8) (uint64, error) { + filterName, _ := windows.UTF16PtrFromString("ctrld: " + name) + + ipv6Loopback := [16]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1} + + conditions := make([]fwpmFilterCondition0, 3) + + conditions[0] = fwpmFilterCondition0{ + fieldKey: fwpmConditionIPProtocol, + matchType: fwpMatchEqual, + } + conditions[0].condValue.valueType = fwpUint8 + conditions[0].condValue.value = uint64(proto) + + conditions[1] = fwpmFilterCondition0{ + fieldKey: fwpmConditionIPRemotePort, + matchType: fwpMatchEqual, + } + conditions[1].condValue.valueType = fwpUint16 + conditions[1].condValue.value = uint64(dnsPort) + + conditions[2] = fwpmFilterCondition0{ + fieldKey: fwpmConditionIPRemoteAddress, + matchType: fwpMatchEqual, + } + if layerKey == fwpmLayerALEAuthConnectV4 { + conditions[2].condValue.valueType = fwpUint32 + conditions[2].condValue.value = 0x7F000001 + } else { + conditions[2].condValue.valueType = fwpByteArray16Type + conditions[2].condValue.value = uint64(uintptr(unsafe.Pointer(&ipv6Loopback))) + } + + filter := fwpmFilter0{ + layerKey: layerKey, + subLayerKey: ctrldSubLayerGUID, + numFilterConds: 3, + filterCondition: &conditions[0], + } + filter.displayData.name = filterName + filter.weight.valueType = fwpUint8 + filter.weight.value = 10 + filter.action.actionType = fwpActionPermit + + var filterID uint64 + r1, _, _ := procFwpmFilterAdd0.Call( + engineHandle, + uintptr(unsafe.Pointer(&filter)), + 0, + uintptr(unsafe.Pointer(&filterID)), + ) + 
runtime.KeepAlive(&ipv6Loopback) + runtime.KeepAlive(conditions) + if r1 != 0 { + return 0, fmt.Errorf("FwpmFilterAdd0 failed: HRESULT 0x%x", r1) + } + return filterID, nil +} + +// addWFPPermitSubnetFilter adds a WFP filter that permits outbound DNS to a given +// IPv4 subnet (addr/mask in host byte order). Used to exempt RFC1918 and CGNAT ranges +// so VPN DNS servers on private IPs are not blocked. +func (p *prog) addWFPPermitSubnetFilter(engineHandle uintptr, name string, proto uint8, addr, mask uint32) (uint64, error) { + filterName, _ := windows.UTF16PtrFromString("ctrld: " + name) + + addrMask := fwpV4AddrAndMask{addr: addr, mask: mask} + + conditions := make([]fwpmFilterCondition0, 3) + + conditions[0] = fwpmFilterCondition0{ + fieldKey: fwpmConditionIPProtocol, + matchType: fwpMatchEqual, + } + conditions[0].condValue.valueType = fwpUint8 + conditions[0].condValue.value = uint64(proto) + + conditions[1] = fwpmFilterCondition0{ + fieldKey: fwpmConditionIPRemotePort, + matchType: fwpMatchEqual, + } + conditions[1].condValue.valueType = fwpUint16 + conditions[1].condValue.value = uint64(dnsPort) + + conditions[2] = fwpmFilterCondition0{ + fieldKey: fwpmConditionIPRemoteAddress, + matchType: fwpMatchEqual, + } + conditions[2].condValue.valueType = fwpV4AddrMask + conditions[2].condValue.value = uint64(uintptr(unsafe.Pointer(&addrMask))) + + filter := fwpmFilter0{ + layerKey: fwpmLayerALEAuthConnectV4, + subLayerKey: ctrldSubLayerGUID, + numFilterConds: 3, + filterCondition: &conditions[0], + } + filter.displayData.name = filterName + filter.weight.valueType = fwpUint8 + filter.weight.value = 10 + filter.action.actionType = fwpActionPermit + + var filterID uint64 + r1, _, _ := procFwpmFilterAdd0.Call( + engineHandle, + uintptr(unsafe.Pointer(&filter)), + 0, + uintptr(unsafe.Pointer(&filterID)), + ) + runtime.KeepAlive(&addrMask) + runtime.KeepAlive(conditions) + if r1 != 0 { + return 0, fmt.Errorf("FwpmFilterAdd0 failed: HRESULT 0x%x", r1) + } + return 
filterID, nil +} + +// wfpSublayerExists checks whether our WFP sublayer still exists in the engine. +// Used by the watchdog to detect if another program removed our filters. +func wfpSublayerExists(engineHandle uintptr) bool { + var sublayerPtr uintptr + r1, _, _ := procFwpmSubLayerGetByKey0.Call( + engineHandle, + uintptr(unsafe.Pointer(&ctrldSubLayerGUID)), + uintptr(unsafe.Pointer(&sublayerPtr)), + ) + if r1 != 0 { + return false + } + if sublayerPtr != 0 { + procFwpmFreeMemory0.Call(uintptr(unsafe.Pointer(&sublayerPtr))) + } + return true +} + +// cleanupWFPFilters removes all WFP filters and the sublayer, then closes the engine. +// It logs each step and continues cleanup even if individual removals fail, +// to ensure maximum cleanup on shutdown. +func (p *prog) cleanupWFPFilters(state *wfpState) { + if state == nil || state.engineHandle == 0 { + return + } + + for _, filterID := range state.vpnPermitFilterIDs { + r1, _, _ := procFwpmFilterDeleteById0.Call(state.engineHandle, uintptr(filterID)) + if r1 != 0 { + mainLog.Load().Warn().Msgf("DNS intercept: failed to remove VPN permit filter (ID: %d, code: 0x%x)", filterID, r1) + } else { + mainLog.Load().Debug().Msgf("DNS intercept: removed VPN permit filter (ID: %d)", filterID) + } + } + + for _, filterID := range state.subnetPermitFilterIDs { + r1, _, _ := procFwpmFilterDeleteById0.Call(state.engineHandle, uintptr(filterID)) + if r1 != 0 { + mainLog.Load().Warn().Msgf("DNS intercept: failed to remove subnet permit filter (ID: %d, code: 0x%x)", filterID, r1) + } else { + mainLog.Load().Debug().Msgf("DNS intercept: removed subnet permit filter (ID: %d)", filterID) + } + } + + filterIDs := []struct { + name string + id uint64 + }{ + {"permit v4 UDP", state.permitIDv4UDP}, + {"permit v4 TCP", state.permitIDv4TCP}, + {"permit v6 UDP", state.permitIDv6UDP}, + {"permit v6 TCP", state.permitIDv6TCP}, + {"block v4 UDP", state.filterIDv4UDP}, + {"block v4 TCP", state.filterIDv4TCP}, + {"block v6 UDP", 
state.filterIDv6UDP}, + {"block v6 TCP", state.filterIDv6TCP}, + } + + for _, f := range filterIDs { + if f.id == 0 { + continue + } + r1, _, _ := procFwpmFilterDeleteById0.Call(state.engineHandle, uintptr(f.id)) + if r1 != 0 { + mainLog.Load().Warn().Msgf("DNS intercept: failed to remove WFP filter %q (ID: %d, code: 0x%x)", f.name, f.id, r1) + } else { + mainLog.Load().Debug().Msgf("DNS intercept: removed WFP filter %q (ID: %d)", f.name, f.id) + } + } + + r1, _, _ := procFwpmSubLayerDeleteByKey0.Call( + state.engineHandle, + uintptr(unsafe.Pointer(&ctrldSubLayerGUID)), + ) + if r1 != 0 { + mainLog.Load().Warn().Msgf("DNS intercept: failed to remove WFP sublayer (code: 0x%x)", r1) + } else { + mainLog.Load().Debug().Msg("DNS intercept: removed WFP sublayer") + } + + r1, _, _ = procFwpmEngineClose0.Call(state.engineHandle) + if r1 != 0 { + mainLog.Load().Warn().Msgf("DNS intercept: failed to close WFP engine (code: 0x%x)", r1) + } else { + mainLog.Load().Debug().Msg("DNS intercept: WFP engine closed") + } +} + +// stopDNSIntercept removes all WFP filters and shuts down the DNS interception. +func (p *prog) stopDNSIntercept() error { + if p.dnsInterceptState == nil { + mainLog.Load().Debug().Msg("DNS intercept: no state to clean up") + return nil + } + + state := p.dnsInterceptState.(*wfpState) + + // Stop the health monitor goroutine. + if state.stopCh != nil { + close(state.stopCh) + } + + // Remove NRPT rule BEFORE WFP cleanup — restore normal DNS resolution + // before removing the block filters that enforce it. + if state.nrptActive { + if err := removeNRPTCatchAllRule(); err != nil { + mainLog.Load().Warn().Err(err).Msg("DNS intercept: failed to remove NRPT catch-all rule") + } else { + mainLog.Load().Info().Msg("DNS intercept: removed NRPT catch-all rule") + } + flushDNSCache() + state.nrptActive = false + } + + // Only clean up WFP if we actually opened the engine (hard mode). 
+ if state.engineHandle != 0 { + mainLog.Load().Info().Msg("DNS intercept: shutting down WFP filters") + p.cleanupWFPFilters(state) + mainLog.Load().Info().Msg("DNS intercept: WFP shutdown complete") + } + + p.dnsInterceptState = nil + mainLog.Load().Info().Msg("DNS intercept: shutdown complete") + return nil +} + +// dnsInterceptSupported reports whether DNS intercept mode is supported on this platform. +func dnsInterceptSupported() bool { + if err := fwpuclntDLL.Load(); err != nil { + return false + } + return true +} + +// validateDNSIntercept checks that the system meets requirements for DNS intercept mode. +func (p *prog) validateDNSIntercept() error { + // Hard mode requires WFP and elevation for filter management. + if hardIntercept { + if !dnsInterceptSupported() { + return fmt.Errorf("dns intercept: fwpuclnt.dll not available — WFP requires Windows Vista or later") + } + if !isElevated() { + return fmt.Errorf("dns intercept: administrator privileges required for WFP filter management in hard mode") + } + } + // dns mode only needs NRPT (HKLM registry writes), which services can do + // without explicit elevation checks. + return nil +} + +// isElevated checks if the current process has administrator privileges. +func isElevated() bool { + token := windows.GetCurrentProcessToken() + return token.IsElevated() +} + +// exemptVPNDNSServers updates the WFP filters to permit outbound DNS to VPN DNS servers. +// This prevents the block filters from intercepting ctrld's own forwarded queries to +// VPN DNS servers (split DNS routing). +// +// The function is idempotent: it first removes ALL existing VPN permit filters, +// then adds new ones for the current server list. When called with nil/empty +// exemptions (VPN disconnected), it just removes the old permits — leaving only +// the localhost permits and block-all filters active. 
+// +// On Windows, WFP filters are process-scoped (not interface-scoped like macOS pf), +// so we only use the server IPs from the exemptions. +// +// Supports both IPv4 and IPv6 VPN DNS servers. +// +// Called by vpnDNSManager.onServersChanged() whenever VPN DNS servers change. +func (p *prog) exemptVPNDNSServers(exemptions []vpnDNSExemption) error { + state, ok := p.dnsInterceptState.(*wfpState) + if !ok || state == nil { + return fmt.Errorf("DNS intercept state not available") + } + // In dns mode (no WFP), VPN DNS exemptions are not needed — there are no + // block filters to exempt from. + if state.engineHandle == 0 { + mainLog.Load().Debug().Msg("DNS intercept: dns mode — skipping VPN DNS exemptions (no WFP filters)") + return nil + } + + for _, filterID := range state.vpnPermitFilterIDs { + r1, _, _ := procFwpmFilterDeleteById0.Call(state.engineHandle, uintptr(filterID)) + if r1 != 0 { + mainLog.Load().Warn().Msgf("DNS intercept: failed to remove old VPN permit filter (ID: %d, code: 0x%x)", filterID, r1) + } + } + state.vpnPermitFilterIDs = nil + + // Extract unique server IPs from exemptions (WFP doesn't need interface info). 
+ seen := make(map[string]bool) + var servers []string + for _, ex := range exemptions { + if !seen[ex.Server] { + seen[ex.Server] = true + servers = append(servers, ex.Server) + } + } + + for _, server := range servers { + ipv4 := parseIPv4AsUint32(server) + isIPv6 := ipv4 == 0 + + for _, proto := range []uint8{ipprotoUDP, ipprotoTCP} { + protoName := "UDP" + if proto == ipprotoTCP { + protoName = "TCP" + } + filterName := fmt.Sprintf("ctrld: Permit VPN DNS to %s (%s)", server, protoName) + + var filterID uint64 + var err error + if isIPv6 { + ipv6Bytes := parseIPv6AsBytes(server) + if ipv6Bytes == nil { + mainLog.Load().Warn().Msgf("DNS intercept: skipping invalid VPN DNS server: %s", server) + continue + } + filterID, err = p.addWFPPermitIPv6Filter(state.engineHandle, filterName, fwpmLayerALEAuthConnectV6, proto, ipv6Bytes) + } else { + filterID, err = p.addWFPPermitIPFilter(state.engineHandle, filterName, fwpmLayerALEAuthConnectV4, proto, ipv4) + } + if err != nil { + return fmt.Errorf("failed to add VPN DNS permit filter for %s/%s: %w", server, protoName, err) + } + state.vpnPermitFilterIDs = append(state.vpnPermitFilterIDs, filterID) + mainLog.Load().Debug().Msgf("DNS intercept: added VPN DNS permit filter for %s/%s (ID: %d)", server, protoName, filterID) + } + } + + mainLog.Load().Info().Msgf("DNS intercept: exempted %d VPN DNS servers from WFP block (%d filters)", len(servers), len(state.vpnPermitFilterIDs)) + return nil +} + +// addWFPPermitIPFilter adds a WFP permit filter for outbound DNS to a specific IPv4 address. 
//
// name is used verbatim as the filter's display name (no prefix is added here).
// ipAddr is the address packed into a uint32, in the form produced by
// parseIPv4AsUint32. The filter matches protocol == proto, remote port ==
// dnsPort and remote address == ipAddr, with weight 10 and a PERMIT action in
// ctrld's sublayer. It returns the filter ID assigned by FwpmFilterAdd0, or an
// error carrying the returned HRESULT.
func (p *prog) addWFPPermitIPFilter(engineHandle uintptr, name string, layerKey windows.GUID, proto uint8, ipAddr uint32) (uint64, error) {
	filterName, _ := windows.UTF16PtrFromString(name)

	conditions := make([]fwpmFilterCondition0, 3)

	// Condition 0: IP protocol (UDP or TCP).
	conditions[0] = fwpmFilterCondition0{
		fieldKey:  fwpmConditionIPProtocol,
		matchType: fwpMatchEqual,
	}
	conditions[0].condValue.valueType = fwpUint8
	conditions[0].condValue.value = uint64(proto)

	// Condition 1: remote port is the DNS port.
	conditions[1] = fwpmFilterCondition0{
		fieldKey:  fwpmConditionIPRemotePort,
		matchType: fwpMatchEqual,
	}
	conditions[1].condValue.valueType = fwpUint16
	conditions[1].condValue.value = uint64(dnsPort)

	// Condition 2: remote address equals the exempted server; the IPv4
	// address fits in the value field directly.
	conditions[2] = fwpmFilterCondition0{
		fieldKey:  fwpmConditionIPRemoteAddress,
		matchType: fwpMatchEqual,
	}
	conditions[2].condValue.valueType = fwpUint32
	conditions[2].condValue.value = uint64(ipAddr)

	filter := fwpmFilter0{
		layerKey:        layerKey,
		subLayerKey:     ctrldSubLayerGUID,
		numFilterConds:  3,
		filterCondition: &conditions[0],
	}
	filter.displayData.name = filterName
	filter.weight.valueType = fwpUint8
	filter.weight.value = 10
	filter.action.actionType = fwpActionPermit

	var filterID uint64
	r1, _, _ := procFwpmFilterAdd0.Call(
		engineHandle,
		uintptr(unsafe.Pointer(&filter)),
		0,
		uintptr(unsafe.Pointer(&filterID)),
	)
	// Keep the condition array alive until the call has returned.
	runtime.KeepAlive(conditions)
	if r1 != 0 {
		return 0, fmt.Errorf("FwpmFilterAdd0 failed: HRESULT 0x%x", r1)
	}
	return filterID, nil
}

// addWFPPermitIPv6Filter adds a WFP permit filter for outbound DNS to a specific IPv6 address.
//
// name is used verbatim as the filter's display name. ipAddr points at the
// 16-byte address; its address is handed to WFP through the condition value.
// NOTE(review): a nil ipAddr is not checked here — the visible caller
// (exemptVPNDNSServers) filters out nil before calling; confirm for any new caller.
// It returns the filter ID assigned by FwpmFilterAdd0, or an error carrying the
// returned HRESULT.
func (p *prog) addWFPPermitIPv6Filter(engineHandle uintptr, name string, layerKey windows.GUID, proto uint8, ipAddr *[16]byte) (uint64, error) {
	filterName, _ := windows.UTF16PtrFromString(name)

	conditions := make([]fwpmFilterCondition0, 3)

	// Condition 0: IP protocol (UDP or TCP).
	conditions[0] = fwpmFilterCondition0{
		fieldKey:  fwpmConditionIPProtocol,
		matchType: fwpMatchEqual,
	}
	conditions[0].condValue.valueType = fwpUint8
	conditions[0].condValue.value = uint64(proto)

	// Condition 1: remote port is the DNS port.
	conditions[1] = fwpmFilterCondition0{
		fieldKey:  fwpmConditionIPRemotePort,
		matchType: fwpMatchEqual,
	}
	conditions[1].condValue.valueType = fwpUint16
	conditions[1].condValue.value = uint64(dnsPort)

	// Condition 2: remote address equals the exempted server. The 16-byte
	// address is passed by pointer, stored in the value field as an integer.
	conditions[2] = fwpmFilterCondition0{
		fieldKey:  fwpmConditionIPRemoteAddress,
		matchType: fwpMatchEqual,
	}
	conditions[2].condValue.valueType = fwpByteArray16Type
	conditions[2].condValue.value = uint64(uintptr(unsafe.Pointer(ipAddr)))

	filter := fwpmFilter0{
		layerKey:        layerKey,
		subLayerKey:     ctrldSubLayerGUID,
		numFilterConds:  3,
		filterCondition: &conditions[0],
	}
	filter.displayData.name = filterName
	filter.weight.valueType = fwpUint8
	filter.weight.value = 10
	filter.action.actionType = fwpActionPermit

	var filterID uint64
	r1, _, _ := procFwpmFilterAdd0.Call(
		engineHandle,
		uintptr(unsafe.Pointer(&filter)),
		0,
		uintptr(unsafe.Pointer(&filterID)),
	)
	// The address is referenced only through an integer above; keep it and
	// the condition array alive until the call has returned.
	runtime.KeepAlive(ipAddr)
	runtime.KeepAlive(conditions)
	if r1 != 0 {
		return 0, fmt.Errorf("FwpmFilterAdd0 failed: HRESULT 0x%x", r1)
	}
	return filterID, nil
}

// parseIPv6AsBytes parses an IPv6 address string into a 16-byte array for WFP.
// Returns nil if the string is not a valid IPv6 address.
+func parseIPv6AsBytes(ipStr string) *[16]byte { + ip := net.ParseIP(ipStr) + if ip == nil { + return nil + } + ip = ip.To16() + if ip == nil || ip.To4() != nil { + // It's IPv4, not IPv6 + return nil + } + var result [16]byte + copy(result[:], ip) + return &result +} + +// parseIPv4AsUint32 converts an IPv4 string to a uint32 in host byte order for WFP. +func parseIPv4AsUint32(ipStr string) uint32 { + parts := [4]byte{} + n := 0 + val := uint32(0) + for i := 0; i < len(ipStr) && n < 4; i++ { + if ipStr[i] == '.' { + parts[n] = byte(val) + n++ + val = 0 + } else if ipStr[i] >= '0' && ipStr[i] <= '9' { + val = val*10 + uint32(ipStr[i]-'0') + } else { + return 0 + } + } + if n == 3 { + parts[3] = byte(val) + return uint32(parts[0])<<24 | uint32(parts[1])<<16 | uint32(parts[2])<<8 | uint32(parts[3]) + } + return 0 +} + +// ensurePFAnchorActive is a no-op on Windows (WFP handles intercept differently). +func (p *prog) ensurePFAnchorActive() bool { + return false +} + +// checkTunnelInterfaceChanges is a no-op on Windows (WFP handles intercept differently). +func (p *prog) checkTunnelInterfaceChanges() bool { + return false +} + +// pfAnchorRecheckDelay is the delay for deferred pf anchor re-checks. +// Defined here as a stub for Windows (referenced from dns_proxy.go). +const pfAnchorRecheckDelay = 2 * time.Second + +// pfAnchorRecheckDelayLong is the longer delayed re-check for slower VPN teardowns. +const pfAnchorRecheckDelayLong = 4 * time.Second + +// scheduleDelayedRechecks schedules delayed OS resolver and VPN DNS refreshes after +// network change events. While WFP filters don't get wiped like pf anchors, the OS +// resolver and VPN DNS state can still be stale after VPN disconnect (same issue as macOS). 
+func (p *prog) scheduleDelayedRechecks() { + for _, delay := range []time.Duration{pfAnchorRecheckDelay, pfAnchorRecheckDelayLong} { + time.AfterFunc(delay, func() { + if p.dnsInterceptState == nil { + return + } + // Refresh OS resolver — VPN may have finished DNS cleanup since the + // immediate handler ran. + ctx := ctrld.LoggerCtx(context.Background(), p.logger.Load()) + ctrld.InitializeOsResolver(ctx, true) + if p.vpnDNS != nil { + p.vpnDNS.Refresh(ctx) + } + + // NRPT watchdog: some VPN software clears NRPT policy rules on + // connect/disconnect. Re-add our catch-all rule if it was removed. + state, ok := p.dnsInterceptState.(*wfpState) + if ok && state.nrptActive && !nrptCatchAllRuleExists() { + mainLog.Load().Warn().Msg("DNS intercept: NRPT catch-all rule was removed externally — re-adding") + if err := addNRPTCatchAllRule(state.listenerIP); err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: failed to re-add NRPT catch-all rule") + state.nrptActive = false + } else { + flushDNSCache() + mainLog.Load().Info().Msg("DNS intercept: NRPT catch-all rule restored") + } + } + + // WFP watchdog: verify our sublayer still exists. + if ok && state.engineHandle != 0 && !wfpSublayerExists(state.engineHandle) { + mainLog.Load().Warn().Msg("DNS intercept: WFP sublayer was removed externally — re-creating all filters") + _ = p.stopDNSIntercept() + if err := p.startDNSIntercept(); err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: failed to re-create WFP filters") + } + } + }) + } +} + +// nrptHealthMonitor periodically checks that the NRPT catch-all rule is still +// present and re-adds it if removed by VPN software or Group Policy updates. +// In hard mode, it also verifies the WFP sublayer exists and re-initializes +// all filters if they were removed. 
//
// The loop wakes every 30 seconds and exits when state.stopCh is closed. While
// state.nrptActive is false the tick is skipped entirely, including the WFP
// sublayer check.
func (p *prog) nrptHealthMonitor(state *wfpState) {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-state.stopCh:
			return
		case <-ticker.C:
			if !state.nrptActive {
				continue
			}
			// Step 1: Check registry key exists.
			if !nrptCatchAllRuleExists() {
				mainLog.Load().Warn().Msg("DNS intercept: NRPT health check — catch-all rule missing, restoring")
				if err := addNRPTCatchAllRule(state.listenerIP); err != nil {
					mainLog.Load().Error().Err(err).Msg("DNS intercept: failed to restore NRPT catch-all rule")
					// Clearing the flag makes every later tick a no-op
					// (see the nrptActive check above).
					state.nrptActive = false
					continue
				}
				refreshNRPTPolicy()
				flushDNSCache()
				mainLog.Load().Info().Msg("DNS intercept: NRPT catch-all rule restored by health monitor")
				// After restoring, verify it's actually working.
				go p.nrptProbeAndHeal()
				continue
			}

			// Step 2: Registry key exists — verify NRPT is actually routing
			// queries to ctrld (catches the async GP refresh race).
			// NOTE(review): this probe runs directly, outside the
			// nrptProbeRunning guard used by nrptProbeAndHeal — confirm the
			// two cannot overlap on the shared probe slots.
			if !p.probeNRPT() {
				mainLog.Load().Warn().Msg("DNS intercept: NRPT health check — rule present but probe failed, running heal cycle")
				go p.nrptProbeAndHeal()
			}

			// Step 3: In hard mode, also verify WFP sublayer.
			if state.engineHandle != 0 && !wfpSublayerExists(state.engineHandle) {
				mainLog.Load().Warn().Msg("DNS intercept: WFP health check — sublayer missing, re-initializing all filters")
				_ = p.stopDNSIntercept()
				if err := p.startDNSIntercept(); err != nil {
					mainLog.Load().Error().Err(err).Msg("DNS intercept: failed to re-initialize after WFP sublayer loss")
				} else {
					mainLog.Load().Info().Msg("DNS intercept: WFP filters restored by health monitor")
				}
				return // stopDNSIntercept closed our stopCh; startDNSIntercept started a new monitor
			}
		}
	}
}

// pfInterceptMonitor is a no-op on Windows — WFP filters are kernel objects
// and don't suffer from the pf translation state corruption that macOS has.
+func (p *prog) pfInterceptMonitor() {} + +const ( + // nrptProbeDomain is the suffix used for NRPT verification probe queries. + // Probes use "_nrpt-probe-." — ctrld recognizes the + // prefix in the DNS handler and responds immediately without upstream forwarding. + nrptProbeDomain = "nrpt-probe.ctrld.test" + + // nrptProbeTimeout is how long to wait for a single probe query to arrive. + nrptProbeTimeout = 2 * time.Second +) + +// nrptProbeRunning ensures only one NRPT probe sequence runs at a time. +// Prevents the health monitor and startup from overlapping. +var nrptProbeRunning atomic.Bool + +// probeNRPT tests whether the NRPT catch-all rule is actually routing DNS queries +// to ctrld's listener. It sends a DNS query for a synthetic probe domain through +// the Windows DNS Client service (via Go's net.Resolver / GetAddrInfoW). If ctrld +// receives the query on its listener, NRPT is working. +// +// Returns true if NRPT is verified working, false if the probe timed out. +func (p *prog) probeNRPT() bool { + if p.dnsInterceptState == nil { + return true + } + + // Generate unique probe domain to defeat DNS caching. + probeID := fmt.Sprintf("_nrpt-probe-%x.%s", rand.Uint32(), nrptProbeDomain) + + // Register probe so DNS handler can detect and signal it. + // Reuse the same mechanism as macOS pf probes (pfProbeExpected/pfProbeCh). + probeCh := make(chan struct{}, 1) + p.pfProbeExpected.Store(probeID) + p.pfProbeCh.Store(&probeCh) + defer func() { + p.pfProbeExpected.Store("") + p.pfProbeCh.Store((*chan struct{})(nil)) + }() + + mainLog.Load().Debug().Str("domain", probeID).Msg("DNS intercept: sending NRPT verification probe") + + // Use Go's default resolver which calls GetAddrInfoW → DNS Client service → NRPT. + // If NRPT is active, the DNS Client routes this to 127.0.0.1 → ctrld receives it. + // If NRPT isn't loaded, the query goes to interface DNS → times out or NXDOMAIN. 
+ ctx, cancel := context.WithTimeout(context.Background(), nrptProbeTimeout) + defer cancel() + + go func() { + resolver := &net.Resolver{} + // We don't care about the result — only whether ctrld's handler receives it. + _, _ = resolver.LookupHost(ctx, probeID) + }() + + select { + case <-probeCh: + mainLog.Load().Debug().Str("domain", probeID).Msg("DNS intercept: NRPT probe received — interception verified") + return true + case <-ctx.Done(): + mainLog.Load().Debug().Str("domain", probeID).Msg("DNS intercept: NRPT probe timed out — interception not working") + return false + } +} + +// restartDNSClientService restarts the Windows DNS Client (Dnscache) service. +// This forces the DNS Client to fully re-initialize, including re-reading NRPT +// from the registry. This is the nuclear option when RefreshPolicyEx alone isn't +// enough — equivalent to macOS forceReloadPFMainRuleset(). +func restartDNSClientService() { + mainLog.Load().Info().Msg("DNS intercept: restarting DNS Client service (Dnscache) to force NRPT reload") + cmd := exec.Command("net", "stop", "Dnscache", "/y") + if out, err := cmd.CombinedOutput(); err != nil { + mainLog.Load().Debug().Err(err).Str("output", string(out)).Msg("DNS intercept: failed to stop Dnscache (may require SYSTEM privileges)") + // Fall back to PowerShell Restart-Service + cmd2 := exec.Command("powershell", "-Command", "Restart-Service", "Dnscache", "-Force") + if out2, err2 := cmd2.CombinedOutput(); err2 != nil { + mainLog.Load().Warn().Err(err2).Str("output", string(out2)).Msg("DNS intercept: failed to restart Dnscache via PowerShell") + return + } + } else { + // Start it again + cmd3 := exec.Command("net", "start", "Dnscache") + if out3, err3 := cmd3.CombinedOutput(); err3 != nil { + mainLog.Load().Warn().Err(err3).Str("output", string(out3)).Msg("DNS intercept: failed to start Dnscache after stop") + } + } + mainLog.Load().Info().Msg("DNS intercept: DNS Client service restarted") +} + +// nrptProbeAndHeal runs the NRPT 
probe with retries and escalating remediation. +// Called asynchronously after startup and from the health monitor. +// +// Retry sequence (each attempt: GP refresh + paramchange + flush → sleep → probe): +// 1. Immediate probe +// 2. GP refresh + paramchange + flush → 1s → probe +// 3. GP refresh + paramchange + flush → 2s → probe +// 4. GP refresh + paramchange + flush → 4s → probe +// 5. Nuclear: two-phase delete → signal → re-add → probe +func (p *prog) nrptProbeAndHeal() { + if !nrptProbeRunning.CompareAndSwap(false, true) { + mainLog.Load().Debug().Msg("DNS intercept: NRPT probe already running, skipping") + return + } + defer nrptProbeRunning.Store(false) + + mainLog.Load().Info().Msg("DNS intercept: starting NRPT verification probe sequence") + + // Log parent key state for diagnostics. + logNRPTParentKeyState("probe-start") + + // Attempt 1: immediate probe + if p.probeNRPT() { + mainLog.Load().Info().Msg("DNS intercept: NRPT verified working") + return + } + + // Attempts 2-4: GP refresh + paramchange + flush with increasing backoff + delays := []time.Duration{1 * time.Second, 2 * time.Second, 4 * time.Second} + for i, delay := range delays { + attempt := i + 2 + mainLog.Load().Info().Int("attempt", attempt).Str("delay", delay.String()). + Msg("DNS intercept: NRPT probe failed, retrying with GP refresh + paramchange") + logNRPTParentKeyState(fmt.Sprintf("probe-attempt-%d", attempt)) + refreshNRPTPolicy() + sendParamChange() + flushDNSCache() + time.Sleep(delay) + if p.probeNRPT() { + mainLog.Load().Info().Int("attempt", attempt). + Msg("DNS intercept: NRPT verified working") + return + } + } + + // Nuclear option: two-phase delete → re-add cycle. + // DNS Client may have cached a stale "no rules" state. Delete our rule, + // signal DNS Client to forget it, wait, then re-add and signal again. 
+ mainLog.Load().Warn().Msg("DNS intercept: all probes failed — attempting two-phase NRPT recovery (delete → signal → re-add)") + listenerIP := "127.0.0.1" + if state, ok := p.dnsInterceptState.(*wfpState); ok { + listenerIP = state.listenerIP + } + + // Phase 1: Remove our rule and the parent key if now empty. + _ = removeNRPTCatchAllRule() + cleanEmptyNRPTParent() + refreshNRPTPolicy() + sendParamChange() + flushDNSCache() + logNRPTParentKeyState("nuclear-after-delete") + + // Wait for DNS Client to process the deletion. + time.Sleep(1 * time.Second) + + // Phase 2: Re-add the rule. + if err := addNRPTCatchAllRule(listenerIP); err != nil { + mainLog.Load().Error().Err(err).Msg("DNS intercept: failed to re-add NRPT after nuclear recovery") + return + } + refreshNRPTPolicy() + sendParamChange() + flushDNSCache() + logNRPTParentKeyState("nuclear-after-readd") + + // Final probe after recovery. + time.Sleep(1 * time.Second) + if p.probeNRPT() { + mainLog.Load().Info().Msg("DNS intercept: NRPT verified working after two-phase recovery") + return + } + + logNRPTParentKeyState("probe-failed-final") + mainLog.Load().Error().Msg("DNS intercept: NRPT verification failed after all retries including two-phase recovery — " + + "DNS queries may not be routed through ctrld. 
A network interface toggle may be needed.") +} diff --git a/docs/wfp-dns-intercept.md b/docs/wfp-dns-intercept.md new file mode 100644 index 0000000..6b9c3b5 --- /dev/null +++ b/docs/wfp-dns-intercept.md @@ -0,0 +1,449 @@ +# Windows DNS Intercept — Technical Reference + +## Overview + +On Windows, DNS intercept mode uses a two-layer architecture: + +- **`dns` mode (default)**: NRPT only — graceful DNS routing via the Windows DNS Client service +- **`hard` mode**: NRPT + WFP — full enforcement with kernel-level block filters + +This dual-mode design ensures that `dns` mode can never break DNS (at worst, a VPN +overwrites NRPT and queries bypass ctrld temporarily), while `hard` mode provides +the same enforcement guarantees as macOS pf. + +## Architecture: dns vs hard Mode + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ dns mode (NRPT only) │ +│ │ +│ App DNS query → DNS Client service → NRPT lookup │ +│ → "." catch-all matches → forward to 127.0.0.1 (ctrld) │ +│ │ +│ If VPN clears NRPT: health monitor re-adds within 30s │ +│ Worst case: queries go to VPN DNS until NRPT restored │ +│ DNS never breaks — graceful degradation │ +└─────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────┐ +│ hard mode (NRPT + WFP) │ +│ │ +│ App DNS query → DNS Client service → NRPT → 127.0.0.1 (ctrld)│ +│ │ +│ Bypass attempt (raw 8.8.8.8:53) → WFP BLOCK filter │ +│ VPN DNS on private IP → WFP subnet PERMIT filter → allowed │ +│ │ +│ NRPT must be active before WFP starts (atomic guarantee) │ +│ If NRPT fails → WFP not started (avoids DNS blackhole) │ +│ If WFP fails → NRPT rolled back (all-or-nothing) │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## NRPT (Name Resolution Policy Table) + +### What It Does + +NRPT is a Windows feature (originally for DirectAccess) that tells the DNS Client +service to route queries matching specific namespace patterns 
to specific DNS servers. +ctrld adds a catch-all rule that routes ALL DNS to `127.0.0.1`: + +| Registry Value | Type | Value | Purpose | +|---|---|---|---| +| `Name` | REG_MULTI_SZ | `.` | Namespace (`.` = catch-all) | +| `GenericDNSServers` | REG_SZ | `127.0.0.1` | Target DNS server | +| `ConfigOptions` | REG_DWORD | `0x8` | Standard DNS resolution | +| `Version` | REG_DWORD | `0x2` | NRPT rule version 2 | +| `Comment` | REG_SZ | `` | Empty (matches PowerShell behavior) | +| `DisplayName` | REG_SZ | `` | Empty (matches PowerShell behavior) | +| `IPSECCARestriction` | REG_SZ | `` | Empty (matches PowerShell behavior) | + +### Registry Paths — GP vs Local (Critical) + +Windows NRPT has two registry paths with **all-or-nothing** precedence: + +| Path | Name | Mode | +|---|---|---| +| `HKLM\SOFTWARE\Policies\Microsoft\Windows NT\DNSClient\DnsPolicyConfig` | **GP path** | Group Policy mode | +| `HKLM\SYSTEM\CurrentControlSet\Services\Dnscache\Parameters\DnsPolicyConfig` | **Local path** | Local/service store mode | + +**Precedence rule**: If ANY rules exist in the GP path (from IT policy, VPN, MDM, +or our own earlier builds), DNS Client enters "GP mode" and **ignores ALL local-path +rules entirely**. This is not per-rule — it's a binary switch. + +**Consequence**: On non-domain-joined (WORKGROUP) machines, `RefreshPolicyEx` is +unreliable. If we write to the GP path, DNS Client enters GP mode but the rules +never activate — resulting in `Get-DnsClientNrptPolicy` returning empty even though +`Get-DnsClientNrptRule` shows the rule in registry. + +ctrld uses an adaptive strategy (matching [Tailscale's approach](https://github.com/tailscale/tailscale/blob/main/net/dns/nrpt_windows.go)): + +1. **Always write to the local path** using a deterministic GUID key name + (`{B2E9A3C1-7F4D-4A8E-9D6B-5C1E0F3A2B8D}`). This is the baseline that works + on all non-domain machines. +2. **Check if other software has GP NRPT rules** (`otherGPRulesExist()`). 
If + foreign GP rules are present (IT policy, VPN), DNS Client is already in GP mode + and our local rule would be invisible — so we also write to the GP path. +3. **If no foreign GP rules exist**, clean any stale ctrld GP rules and delete + the empty GP parent key. This ensures DNS Client stays in "local mode" where + the local-path rule activates immediately via `paramchange`. + +### VPN Coexistence + +NRPT uses most-specific-match. VPN NRPT rules for specific domains (e.g., +`*.corp.local` → `10.20.30.1`) take priority over ctrld's `.` catch-all. +This means VPN split DNS works naturally — VPN-specific domains go to VPN DNS, +everything else goes to ctrld. No exemptions or special handling needed. + +### DNS Client Notification + +After writing NRPT rules, DNS Client must be notified to reload: + +1. **`paramchange`**: `sc control dnscache paramchange` — signals DNS Client to + re-read configuration. Works for local-path rules on most machines. +2. **`RefreshPolicyEx`**: `RefreshPolicyEx(bMachine=TRUE, dwOptions=RP_FORCE)` from + `userenv.dll` — triggers GP refresh for GP-path rules. Unreliable on non-domain + machines (WORKGROUP). Fallback: `gpupdate /target:computer /force`. +3. **DNS cache flush**: `DnsFlushResolverCache` from `dnsapi.dll` or `ipconfig /flushdns` + — clears stale cached results from before NRPT was active. + +### DNS Cache Flush + +After NRPT changes, stale DNS cache entries could bypass the new routing. ctrld flushes: + +1. **Primary**: `DnsFlushResolverCache` from `dnsapi.dll` +2. **Fallback**: `ipconfig /flushdns` (subprocess) + +### Known Limitation: nslookup + +`nslookup.exe` implements its own DNS resolver and does NOT use the Windows DNS Client +service. It ignores NRPT entirely. Use `Resolve-DnsName` (PowerShell) or `ping` to +verify DNS resolution through NRPT. This is a well-known Windows behavior. 
+ +## WFP (Windows Filtering Platform) — hard Mode Only + +### Filter Stack + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Sublayer: "ctrld DNS Intercept" (weight 0xFFFF — max priority) │ +│ │ +│ ┌─ Permit Filters (weight 10) ─────────────────────────────┐ │ +│ │ • IPv4/UDP to 127.0.0.1:53 → PERMIT │ │ +│ │ • IPv4/TCP to 127.0.0.1:53 → PERMIT │ │ +│ │ • IPv6/UDP to ::1:53 → PERMIT │ │ +│ │ • IPv6/TCP to ::1:53 → PERMIT │ │ +│ │ • RFC1918 + CGNAT subnets:53 → PERMIT (VPN DNS) │ │ +│ │ • VPN DNS exemptions (dynamic) → PERMIT │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─ Block Filters (weight 1) ───────────────────────────────┐ │ +│ │ • All IPv4/UDP to *:53 → BLOCK │ │ +│ │ • All IPv4/TCP to *:53 → BLOCK │ │ +│ │ • All IPv6/UDP to *:53 → BLOCK │ │ +│ │ • All IPv6/TCP to *:53 → BLOCK │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ +│ Filter evaluation: higher weight wins → permits checked first │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Why WFP Can't Work Alone + +WFP operates at the connection authorization layer (`FWPM_LAYER_ALE_AUTH_CONNECT`). +It can only **block** or **permit** connections — it **cannot redirect** them. +Redirection requires kernel-mode callout drivers (`FwpsCalloutRegister` in +`fwpkclnt.lib`) using `FWPM_LAYER_ALE_CONNECT_REDIRECT_V4/V6`, which are not +accessible from userspace. + +Without NRPT, WFP blocks outbound DNS but doesn't tell applications where to send +queries instead — they just see DNS failures. This is why `hard` mode requires NRPT +to be active first, and why WFP is never started if NRPT setup fails. + +### Sublayer Priority + +Weight `0xFFFF` (maximum) ensures ctrld's filters take priority over any other WFP +sublayers from VPN software, endpoint security, or Windows Defender Firewall.
+ +### RFC1918 + CGNAT Subnet Permits + +Static permit filters for private IP ranges (10.0.0.0/8, 172.16.0.0/12, +192.168.0.0/16, 100.64.0.0/10) allow VPN DNS servers on private IPs to work +without dynamic per-server exemptions. This covers Tailscale MagicDNS +(100.100.100.100), corporate VPN DNS (10.x.x.x), and similar. + +### VPN DNS Exemption Updates + +When `vpnDNSManager.Refresh()` discovers VPN DNS servers on public IPs: + +1. Delete all existing VPN permit filters (by stored IDs) +2. For each VPN DNS server IP: + - IPv4: `addWFPPermitIPFilter()` on `ALE_AUTH_CONNECT_V4` + - IPv6: `addWFPPermitIPv6Filter()` on `ALE_AUTH_CONNECT_V6` + - Both UDP and TCP for each IP +3. Store new filter IDs for next cleanup cycle + +**In `dns` mode, VPN DNS exemptions are skipped** — there are no WFP block +filters to exempt from. + +### Session Lifecycle + +**Startup (hard mode):** +``` +1. Add NRPT catch-all rule + GP refresh + DNS flush +2. FwpmEngineOpen0() with RPC_C_AUTHN_DEFAULT (0xFFFFFFFF) +3. Delete stale sublayer (crash recovery) +4. FwpmSubLayerAdd0() — weight 0xFFFF +5. Add 4 localhost permit filters +6. Add 4 block filters +7. Add RFC1918 + CGNAT subnet permits +8. Start NRPT health monitor goroutine +``` + +**Startup (dns mode):** +``` +1. Add NRPT catch-all rule + GP refresh + DNS flush +2. Start NRPT health monitor goroutine +3. (No WFP — done) +``` + +**Shutdown:** +``` +1. Stop NRPT health monitor +2. Remove NRPT catch-all rule + DNS flush +3. (hard mode only) Clean up all WFP filters, sublayer, close engine +``` + +**Crash Recovery:** +On startup, `FwpmSubLayerDeleteByKey0` removes any stale sublayer from a previous +unclean shutdown, including all its child filters (deterministic GUID ensures we +only clean up our own). + +## NRPT Probe and Auto-Heal + +### The Problem: Async GP Refresh Race + +`RefreshPolicyEx` triggers a Group Policy refresh but returns immediately — it does +NOT wait for the DNS Client service to actually reload NRPT from the registry. 
On +cold machines (first boot, fresh install, long sleep), the DNS Client may take +several seconds to process the policy refresh. During this window, NRPT rules exist +in the registry but the DNS Client hasn't loaded them — queries bypass ctrld. + +### The Solution: Active Probing + +After writing NRPT to the registry, ctrld sends a probe DNS query through the +Windows DNS Client path to verify NRPT is actually working: + +1. Generate a unique probe domain: `_nrpt-probe-<random>.nrpt-probe.ctrld.test` +2. Send it via Go's `net.Resolver` (calls `GetAddrInfoW` → DNS Client → NRPT) +3. If NRPT is active, DNS Client routes it to 127.0.0.1 → ctrld receives it +4. ctrld's DNS handler recognizes the probe prefix and signals success +5. If the probe times out (2s), NRPT isn't loaded yet → retry with remediation + +### Startup Probe (Async) + +After NRPT setup, an async goroutine runs the probe-and-heal sequence without +blocking startup: + +``` +Probe attempt 1 (2s timeout) + ├─ Success → "NRPT verified working", done + └─ Timeout → GP refresh + DNS flush, sleep 1s + Probe attempt 2 (2s timeout) + ├─ Success → done + └─ Timeout → Restart DNS Client service (nuclear), sleep 2s + Re-add NRPT + GP refresh + DNS flush + Probe attempt 3 (2s timeout) + ├─ Success → done + └─ Timeout → GP refresh + DNS flush, sleep 4s + Probe attempt 4 (2s timeout) + ├─ Success → done + └─ Timeout → log error, continue +``` + +### DNS Client Restart (Nuclear Option) + +If GP refresh alone isn't enough, ctrld restarts the Windows DNS Client service +(`Dnscache`). This forces the DNS Client to fully re-initialize, including +re-reading all NRPT rules from the registry. This is the equivalent of macOS +`forceReloadPFMainRuleset()`.
+ +**Trade-offs:** +- Briefly interrupts ALL DNS resolution (few hundred ms during restart) +- Clears the system DNS cache (all apps need to re-resolve) +- VPN NRPT rules survive (they're in registry, re-read on restart) +- Enterprise security tools may log the service restart event + +This only fires as attempt #3 after two GP refresh attempts fail — at that point +DNS isn't working through ctrld anyway, so a brief DNS blip is acceptable. + +### Health Monitor Integration + +The 30s periodic health monitor now does actual probing, not just registry checks: + +``` +Every 30s: + ├─ Registry check: nrptCatchAllRuleExists()? + │ ├─ Missing → re-add + GP refresh + flush + probe-and-heal + │ └─ Present → probe to verify it's actually routing + │ ├─ Probe success → OK + │ └─ Probe failure → probe-and-heal cycle + │ + └─ (hard mode only) Check: wfpSublayerExists()? + ├─ Missing → full restart (stopDNSIntercept + startDNSIntercept) + └─ Present → OK +``` + +**Singleton guard:** Only one probe-and-heal sequence runs at a time (atomic bool). +The startup probe and health monitor cannot overlap. + +**Why periodic, not just network-event?** VPN software or Group Policy updates can +clear NRPT at any time, not just during network changes. A 30s periodic check ensures +recovery within a bounded window. + +**Hard mode safety:** The health monitor verifies NRPT before checking WFP. If NRPT +is gone, it's restored first. WFP is never running without NRPT — this prevents +DNS blackholes where WFP blocks everything but NRPT isn't routing to ctrld. + +## DNS Flow Diagrams + +### Normal Resolution (both modes) + +``` +App → DNS Client → NRPT lookup → "." matches → 127.0.0.1 → ctrld + → Control D DoH (port 443, not affected by WFP port-53 rules) + → response flows back +``` + +### VPN Split DNS (both modes) + +``` +App → DNS Client → NRPT lookup: + VPN domain (*.corp.local) → VPN's NRPT rule wins → VPN DNS server + Everything else → ctrld's "." 
catch-all → 127.0.0.1 → ctrld + → VPN domain match → forward to VPN DNS (port 53) + → (hard mode: WFP subnet permit allows private IP DNS) +``` + +### Bypass Attempt (hard mode only) + +``` +App → raw socket to 8.8.8.8:53 → WFP ALE_AUTH_CONNECT → BLOCK +``` + +In `dns` mode, this query would succeed (no WFP) — the tradeoff for never +breaking DNS. + +## Key Differences from macOS (pf) + +| Aspect | macOS (pf) | Windows dns mode | Windows hard mode | +|--------|-----------|------------------|-------------------| +| **Routing** | `rdr` redirect | NRPT policy | NRPT policy | +| **Enforcement** | `route-to` + block rules | None (graceful) | WFP block filters | +| **Can break DNS?** | Yes (pf corruption) | No | Yes (if NRPT lost) | +| **VPN coexistence** | Watchdog + stabilization | NRPT most-specific-match | Same + WFP permits | +| **Bypass protection** | pf catches all packets | None | WFP catches all connections | +| **Recovery** | Probe + auto-heal | Health monitor re-adds | Full restart on sublayer loss | + +## WFP API Notes + +### Struct Layouts + +WFP C API structures are manually defined in Go (`golang.org/x/sys/windows` doesn't +include WFP types). Field alignment must match the C ABI exactly — any mismatch +causes access violations or silent corruption. + +### FWP_DATA_TYPE Enum + +``` +FWP_EMPTY = 0 +FWP_UINT8 = 1 +FWP_UINT16 = 2 +FWP_UINT32 = 3 +FWP_UINT64 = 4 +... +``` + +**⚠️** Some documentation examples incorrectly start at 1. The enum starts at 0 +(`FWP_EMPTY`), making all subsequent values offset by 1 from what you might expect. + +### GC Safety + +When passing Go heap objects to WFP syscalls via `unsafe.Pointer`, use +`runtime.KeepAlive()` to prevent garbage collection during the call: + +```go +conditions := make([]fwpmFilterCondition0, 3) +filter.filterCondition = &conditions[0] +r1, _, _ := procFwpmFilterAdd0.Call(...) 
+runtime.KeepAlive(conditions) +``` + +### Authentication + +`FwpmEngineOpen0` requires `RPC_C_AUTHN_DEFAULT` (0xFFFFFFFF) for the authentication +service parameter. `RPC_C_AUTHN_NONE` (0) returns `ERROR_NOT_SUPPORTED` on some +configurations (e.g., Parallels VMs). + +### Elevation + +WFP requires admin/SYSTEM privileges. `FwpmEngineOpen0` fails with HRESULT 0x32 +when run non-elevated. Services running as SYSTEM have this automatically. + +## Debugging + +### Check NRPT Rules + +```powershell +# PowerShell — show configured NRPT rules (use Get-DnsClientNrptPolicy for what DNS Client actually applies) +Get-DnsClientNrptRule + +# Check registry directly (GP path; the local path is under Services\Dnscache\Parameters\DnsPolicyConfig) +Get-ChildItem "HKLM:\SOFTWARE\Policies\Microsoft\Windows NT\DNSClient\DnsPolicyConfig" +``` + +### Check WFP Filters (hard mode) + +```powershell +# Show all WFP filters (requires admin) — output is XML +netsh wfp show filters + +# Search for ctrld's filters +Select-String "ctrld" filters.xml +``` + +### Verify DNS Resolution + +```powershell +# Use Resolve-DnsName, NOT nslookup (nslookup bypasses NRPT) +Resolve-DnsName example.com +ping example.com + +# If you must use nslookup, specify localhost: +nslookup example.com 127.0.0.1 + +# Force GP refresh (if NRPT not loading) +gpupdate /target:computer /force + +# Verify service registration (use sc.exe: bare `sc` is the Set-Content alias in Windows PowerShell) +sc.exe qc ctrld +``` + +### Service Verification + +After install, verify the Windows service is correctly registered: + +```powershell +# Check binary path and start type +sc.exe qc ctrld + +# Should show: +# BINARY_PATH_NAME: "C:\...\ctrld.exe" run --cd xxxxx --intercept-mode dns +# START_TYPE: AUTO_START +``` + +## Related + +- [DNS Intercept Mode Overview](dns-intercept-mode.md) — cross-platform documentation +- [pf DNS Intercept](pf-dns-intercept.md) — macOS technical reference +- [Microsoft WFP Documentation](https://docs.microsoft.com/en-us/windows/win32/fwp/windows-filtering-platform-start-page) +- [Microsoft NRPT Documentation](https://docs.microsoft.com/en-us/previous-versions/windows/it-pro/windows-server-2012-r2-and-2012/dn593632(v=ws.11)) diff
--git a/scripts/nrpt-diag.ps1 b/scripts/nrpt-diag.ps1 new file mode 100644 index 0000000..230ae94 --- /dev/null +++ b/scripts/nrpt-diag.ps1 @@ -0,0 +1,132 @@ +#Requires -RunAsAdministrator +<# +.SYNOPSIS + NRPT diagnostic script for ctrld DNS intercept troubleshooting. +.DESCRIPTION + Captures the full NRPT state: registry keys (both GP and direct paths), + effective policy, active rules, DNS Client service status, and resolver + config. Run as Administrator. +.EXAMPLE + .\nrpt-diag.ps1 + .\nrpt-diag.ps1 *> nrpt-diag-output.txt +#> + +$ErrorActionPreference = 'SilentlyContinue' + +Write-Host "=== NRPT Diagnostic Report ===" -ForegroundColor Cyan +Write-Host "Date: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')" +Write-Host "Computer: $env:COMPUTERNAME" +Write-Host "OS: $((Get-CimInstance Win32_OperatingSystem).Caption) $((Get-CimInstance Win32_OperatingSystem).BuildNumber)" +Write-Host "" + +# --- 1. DNS Client Service --- +Write-Host "=== 1. DNS Client (Dnscache) Service ===" -ForegroundColor Yellow +$svc = Get-Service Dnscache +Write-Host "Status: $($svc.Status) StartType: $($svc.StartType)" +Write-Host "" + +# --- 2. GP Path (Policy store) --- +$gpPath = "HKLM:\SOFTWARE\Policies\Microsoft\Windows NT\DNSClient\DnsPolicyConfig" +Write-Host "=== 2. GP Path: $gpPath ===" -ForegroundColor Yellow +$gpKey = Get-Item $gpPath 2>$null +if ($gpKey) { + Write-Host "Key EXISTS" + $subkeys = Get-ChildItem $gpPath 2>$null + if ($subkeys) { + foreach ($sk in $subkeys) { + Write-Host "" + Write-Host " Subkey: $($sk.PSChildName)" -ForegroundColor Green + foreach ($prop in $sk.Property) { + $val = $sk.GetValue($prop) + $kind = $sk.GetValueKind($prop) + Write-Host " $prop ($kind) = $val" + } + } + } else { + Write-Host " ** EMPTY (no subkeys) — this blocks NRPT activation! **" -ForegroundColor Red + } +} else { + Write-Host "Key does NOT exist (clean state)" +} +Write-Host "" + +# --- 3.
Direct Path (Service store) --- +$directPath = "HKLM:\SYSTEM\CurrentControlSet\Services\Dnscache\Parameters\DnsPolicyConfig" +Write-Host "=== 3. Direct Path: $directPath ===" -ForegroundColor Yellow +$directKey = Get-Item $directPath 2>$null +if ($directKey) { + Write-Host "Key EXISTS" + $subkeys = Get-ChildItem $directPath 2>$null + if ($subkeys) { + foreach ($sk in $subkeys) { + Write-Host "" + Write-Host " Subkey: $($sk.PSChildName)" -ForegroundColor Green + foreach ($prop in $sk.Property) { + $val = $sk.GetValue($prop) + $kind = $sk.GetValueKind($prop) + Write-Host " $prop ($kind) = $val" + } + } + } else { + Write-Host " ** EMPTY (no subkeys) **" -ForegroundColor Red + } +} else { + Write-Host "Key does NOT exist" +} +Write-Host "" + +# --- 4. Configured NRPT Rules (as stored in the registry; not necessarily applied) --- +Write-Host "=== 4. Get-DnsClientNrptRule ===" -ForegroundColor Yellow +$rules = Get-DnsClientNrptRule 2>$null +if ($rules) { + $rules | Format-List Name, Version, Namespace, NameServers, NameEncoding, DnsSecEnabled +} else { + Write-Host "(none)" +} +Write-Host "" + +# --- 5. Effective NRPT Policy (what DNS Client actually applies) --- +Write-Host "=== 5. Get-DnsClientNrptPolicy ===" -ForegroundColor Yellow +$policy = Get-DnsClientNrptPolicy 2>$null +if ($policy) { + $policy | Format-List Namespace, NameServers, NameEncoding, QueryPolicy +} else { + Write-Host "(none — DNS Client is NOT honoring any NRPT rules)" -ForegroundColor Red +} +Write-Host "" + +# --- 6. Interface DNS servers --- +Write-Host "=== 6. Interface DNS Configuration ===" -ForegroundColor Yellow +Get-DnsClientServerAddress -AddressFamily IPv4 | Where-Object { $_.ServerAddresses } | + Format-Table InterfaceAlias, InterfaceIndex, ServerAddresses -AutoSize +Write-Host "" + +# --- 7. DNS resolution test --- +Write-Host "=== 7.
DNS Resolution Test ===" -ForegroundColor Yellow +Write-Host "Resolve-DnsName example.com (uses DNS Client / NRPT):" +try { + $result = Resolve-DnsName example.com -Type A -DnsOnly -ErrorAction Stop + $result | Format-Table Name, Type, IPAddress -AutoSize +} catch { + Write-Host " FAILED: $_" -ForegroundColor Red +} +Write-Host "" +Write-Host "nslookup example.com 127.0.0.1 (direct to ctrld, bypasses NRPT):" +$ns = nslookup example.com 127.0.0.1 2>&1 +$ns | ForEach-Object { Write-Host " $_" } +Write-Host "" + +# --- 8. Domain join status --- +Write-Host "=== 8. Domain Status ===" -ForegroundColor Yellow +$cs = Get-CimInstance Win32_ComputerSystem +Write-Host "Domain: $($cs.Domain) PartOfDomain: $($cs.PartOfDomain)" +Write-Host "" + +# --- 9. Group Policy NRPT --- +Write-Host "=== 9. GP Result (NRPT section) ===" -ForegroundColor Yellow +Write-Host "(Running gpresult — may take a few seconds...)" +$gp = gpresult /r 2>&1 +$gp | Select-String -Pattern "DNS|NRPT|Policy" | ForEach-Object { Write-Host " $_" } +Write-Host "" + +Write-Host "=== End of Diagnostic Report ===" -ForegroundColor Cyan