diff --git a/net/dns/manager_windows.go b/net/dns/manager_windows.go
index 01133d8c0c64c..f9403b90be4ea 100644
--- a/net/dns/manager_windows.go
+++ b/net/dns/manager_windows.go
@@ -169,15 +169,25 @@ func (m *windowsManager) setHosts(hosts []*HostEntry) error {
 	}
 	const fileMode = 0 // ignored on windows.
 
-	// This can fail spuriously with an access denied error, so retry it a
-	// few times.
-	for i := 0; i < 5; i++ {
+	// The hosts file can be locked by antivirus, endpoint security
+	// (e.g., GlobalProtect, CrowdStrike), or Windows Defender for
+	// several seconds. Use exponential backoff to tolerate this.
+	backoff := 50 * time.Millisecond
+	const maxBackoff = 5 * time.Second
+	deadline := time.Now().Add(30 * time.Second)
+	for attempt := 0; ; attempt++ {
 		if err = atomicfile.WriteFile(hostsFile, outB, fileMode); err == nil {
 			return nil
 		}
-		time.Sleep(10 * time.Millisecond)
+		if time.Now().After(deadline) {
+			return fmt.Errorf("setHosts: failed after %d attempts over 30s: %w", attempt+1, err)
+		}
+		m.logf("setHosts: attempt %d failed, retrying in %v: %v", attempt+1, backoff, err)
+		time.Sleep(backoff)
+		if backoff *= 2; backoff > maxBackoff {
+			backoff = maxBackoff
+		}
 	}
-	return err
 }
 
 // setPrimaryDNS sets the given resolvers and domains as the Tailscale
diff --git a/net/dns/nrpt_windows.go b/net/dns/nrpt_windows.go
index d64110f4320c6..ee90331e03a05 100644
--- a/net/dns/nrpt_windows.go
+++ b/net/dns/nrpt_windows.go
@@ -67,6 +67,7 @@ type nrptRuleDatabase struct {
 	isGPRefreshPending atomic.Bool
 	mu                 sync.Mutex // protects the fields below
 	ruleIDs            []string
+	staleRuleIDs       []string // rule IDs inherited from a previous run, cleaned up on first WriteSplitDNSConfig
 	isGPDirty          bool
 	writeAsGP          bool
 }
@@ -76,13 +77,17 @@ func newNRPTRuleDatabase(logf logger.Logf) *nrptRuleDatabase {
 	ret.loadRuleSubkeyNames()
 	ret.detectWriteAsGP()
 	ret.watchForGPChanges()
-	// Best-effort: if our NRPT rule exists, try to delete it. Unlike
-	// per-interface configuration, NRPT rules survive the unclean
-	// termination of the Tailscale process, and depending on the
-	// rule, it may prevent us from reaching login.tailscale.com to
-	// boot up. The bootstrap resolver logic will save us, but it
-	// slows down start-up a bunch.
-	ret.DelAllRuleKeys()
+	// Track existing NRPT rules as stale rather than deleting them
+	// immediately. This preserves DNS routing for .coder queries
+	// during the gap between engine startup and the first successful
+	// DNS configuration. Stale rules are cleaned up in the first
+	// call to WriteSplitDNSConfig, after replacement rules are
+	// confirmed written.
+	if len(ret.ruleIDs) > 0 {
+		ret.staleRuleIDs = make([]string, len(ret.ruleIDs))
+		copy(ret.staleRuleIDs, ret.ruleIDs)
+		logf("preserved %d existing NRPT rule(s) as stale, will clean up on first config", len(ret.staleRuleIDs))
+	}
 	return ret
 }
 
@@ -180,6 +185,12 @@ func (db *nrptRuleDatabase) DelAllRuleKeys() error {
 	db.mu.Lock()
 	defer db.mu.Unlock()
 
+	// Also clean up any stale rules from a previous instance.
+	if len(db.staleRuleIDs) > 0 {
+		db.delRuleKeys(db.staleRuleIDs)
+		db.staleRuleIDs = nil
+	}
+
 	if err := db.delRuleKeys(db.ruleIDs); err != nil {
 		return err
 	}
@@ -251,6 +262,14 @@ func (db *nrptRuleDatabase) WriteSplitDNSConfig(servers []string, domains []dnsname.FQDN) error {
 	db.mu.Lock()
 	defer db.mu.Unlock()
 
+	// Clean up stale rules from a previous engine instance, now that
+	// we have new rules to replace them.
+	if len(db.staleRuleIDs) > 0 {
+		db.logf("cleaning up %d stale NRPT rule(s) from previous session", len(db.staleRuleIDs))
+		db.delRuleKeys(db.staleRuleIDs)
+		db.staleRuleIDs = nil
+	}
+
 	// NRPT has an undocumented restriction that each rule may only be associated
 	// with a maximum of 50 domains. If we are setting rules for more domains
 	// than that, we need to split domains into chunks and write out a rule per chunk.