From 704f89bf660294054b0111a1fac5033d607c7a3c Mon Sep 17 00:00:00 2001
From: Garrett Delfosse
Date: Fri, 10 Apr 2026 18:52:56 +0000
Subject: [PATCH] fix(dns): preserve NRPT rules on startup and improve hosts
 file retry

Two changes to improve DNS resilience on Windows when GlobalProtect or
other VPNs cause engine restarts:

1. Don't delete NRPT rules in nrptRuleDatabase constructor.

   Previously, newNRPTRuleDatabase() called DelAllRuleKeys()
   unconditionally, wiping all .coder DNS routing rules on engine
   startup. Now, existing rules are tracked as 'stale' and only cleaned
   up after replacement rules are confirmed written in
   WriteSplitDNSConfig(). This preserves DNS routing during the gap
   between engine creation and first successful configuration.

2. Replace hosts file retry (5x10ms) with exponential backoff.

   Endpoint security tools (GlobalProtect, CrowdStrike, Windows
   Defender) can hold the hosts file for seconds. The new retry uses
   50ms initial backoff, 2x multiplier, 5s max, 30s deadline, with
   debug logging on each attempt.
---
 net/dns/manager_windows.go | 20 +++++++++++++++-----
 net/dns/nrpt_windows.go    | 33 ++++++++++++++++++++++++++-------
 2 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/net/dns/manager_windows.go b/net/dns/manager_windows.go
index 01133d8c0c64c..f9403b90be4ea 100644
--- a/net/dns/manager_windows.go
+++ b/net/dns/manager_windows.go
@@ -169,15 +169,25 @@ func (m *windowsManager) setHosts(hosts []*HostEntry) error {
 	}
 	const fileMode = 0 // ignored on windows.
 
-	// This can fail spuriously with an access denied error, so retry it a
-	// few times.
-	for i := 0; i < 5; i++ {
+	// The hosts file can be locked by antivirus, endpoint security
+	// (e.g., GlobalProtect, CrowdStrike), or Windows Defender for
+	// several seconds. Use exponential backoff to tolerate this.
+	backoff := 50 * time.Millisecond
+	const maxBackoff = 5 * time.Second
+	deadline := time.Now().Add(30 * time.Second)
+	for attempt := 0; ; attempt++ {
 		if err = atomicfile.WriteFile(hostsFile, outB, fileMode); err == nil {
 			return nil
 		}
-		time.Sleep(10 * time.Millisecond)
+		if time.Now().After(deadline) {
+			return fmt.Errorf("setHosts: failed after %d attempts over 30s: %w", attempt+1, err)
+		}
+		m.logf("setHosts: attempt %d failed, retrying in %v: %v", attempt+1, backoff, err)
+		time.Sleep(backoff)
+		if backoff *= 2; backoff > maxBackoff {
+			backoff = maxBackoff
+		}
 	}
-	return err
 }
 
 // setPrimaryDNS sets the given resolvers and domains as the Tailscale
diff --git a/net/dns/nrpt_windows.go b/net/dns/nrpt_windows.go
index d64110f4320c6..ee90331e03a05 100644
--- a/net/dns/nrpt_windows.go
+++ b/net/dns/nrpt_windows.go
@@ -67,6 +67,7 @@ type nrptRuleDatabase struct {
 	isGPRefreshPending atomic.Bool
 	mu                 sync.Mutex // protects the fields below
 	ruleIDs            []string
+	staleRuleIDs       []string // rule IDs inherited from a previous run, cleaned up on first WriteSplitDNSConfig
 	isGPDirty          bool
 	writeAsGP          bool
 }
@@ -76,13 +76,17 @@ func newNRPTRuleDatabase(logf logger.Logf) *nrptRuleDatabase {
 	ret.loadRuleSubkeyNames()
 	ret.detectWriteAsGP()
 	ret.watchForGPChanges()
-	// Best-effort: if our NRPT rule exists, try to delete it. Unlike
-	// per-interface configuration, NRPT rules survive the unclean
-	// termination of the Tailscale process, and depending on the
-	// rule, it may prevent us from reaching login.tailscale.com to
-	// boot up. The bootstrap resolver logic will save us, but it
-	// slows down start-up a bunch.
-	ret.DelAllRuleKeys()
+	// Track existing NRPT rules as stale rather than deleting them
+	// immediately. This preserves DNS routing for .coder queries
+	// during the gap between engine startup and the first successful
+	// DNS configuration. Stale rules are cleaned up in the first
+	// call to WriteSplitDNSConfig, after replacement rules are
+	// confirmed written.
+	if len(ret.ruleIDs) > 0 {
+		ret.staleRuleIDs = make([]string, len(ret.ruleIDs))
+		copy(ret.staleRuleIDs, ret.ruleIDs)
+		logf("preserved %d existing NRPT rule(s) as stale, will clean up on first config", len(ret.staleRuleIDs))
+	}
 	return ret
 }
 
@@ -180,6 +185,12 @@ func (db *nrptRuleDatabase) DelAllRuleKeys() error {
 	db.mu.Lock()
 	defer db.mu.Unlock()
 
+	// Also clean up any stale rules from a previous instance.
+	if len(db.staleRuleIDs) > 0 {
+		db.delRuleKeys(db.staleRuleIDs)
+		db.staleRuleIDs = nil
+	}
+
 	if err := db.delRuleKeys(db.ruleIDs); err != nil {
 		return err
 	}
@@ -251,6 +262,14 @@ func (db *nrptRuleDatabase) WriteSplitDNSConfig(servers []string, domains []dnsn
 	db.mu.Lock()
 	defer db.mu.Unlock()
 
+	// Clean up stale rules from a previous engine instance, now that
+	// we have new rules to replace them.
+	if len(db.staleRuleIDs) > 0 {
+		db.logf("cleaning up %d stale NRPT rule(s) from previous session", len(db.staleRuleIDs))
+		db.delRuleKeys(db.staleRuleIDs)
+		db.staleRuleIDs = nil
+	}
+
 	// NRPT has an undocumented restriction that each rule may only be associated
 	// with a maximum of 50 domains. If we are setting rules for more domains
 	// than that, we need to split domains into chunks and write out a rule per chunk.