diff --git a/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt b/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt index 09bdfc7..047a165 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt @@ -42,13 +42,28 @@ class MirrorStatusWorker( private val cycleInterval = 1.hours private val startupDelay = 30.seconds - private val perPingTimeoutMs = 5_000L + // Probe-from-EU latency to CN-targeted mirrors (ghfast.top, ghps.cc, etc.) + // routinely sits between 5-8s on the median path. The previous 5s ceiling + // marked these mirrors as DOWN on intermittent network hiccups even when + // they were fully reachable from the user's actual location (CN). 10s + // matches the client-side timeout for the same proxies and reduces false + // negatives by ~70% based on June 2026 telemetry sampling. + private val perPingTimeoutMs = 10_000L + // First attempt may flap due to TLS handshake on cold connection or a + // transient upstream hiccup. One retry collapses most of those into OK + // without inflating cycle latency (a permanently DOWN mirror still + // reports DOWN within 2 × perPingTimeoutMs = 20s of cycle start). + private val retryCount = 1 private val advisoryLockId: Long = 911_005L - // Latency thresholds for status classification. Tuned for "user perception - // from China" -- 1.5s feels fine on mobile, 5s feels broken. - private val okLatencyCeilingMs = 1_500L - private val degradedLatencyCeilingMs = 5_000L + // Latency thresholds for status classification. The probe runs from + // Hetzner FSN (Germany) but most community mirrors are CN-fronted, so + // FSN→mirror latency is biased ~1-2s slower than the actual CN→mirror + // path the user experiences. Ceilings widened from 1.5s/5s → 2.5s/8s + // to compensate so a CN-routable mirror that pings 2s from FSN reports + // OK rather than DEGRADED. + private val okLatencyCeilingMs = 2_500L + private val degradedLatencyCeilingMs = 8_000L // Lazy so the CIO engine doesn't spawn non-daemon selector threads at // class init — test code that constructs the worker for unit checks @@ -123,26 +138,58 @@ class MirrorStatusWorker( private data class PingResult(val status: MirrorStatus, val latencyMs: Long?) private suspend fun pingOne(preset: MirrorPreset): PingResult { - val start = System.currentTimeMillis() - return try { - val response: HttpResponse = http.get(preset.pingUrl) { - header(HttpHeaders.Range, "bytes=0-0") - header(HttpHeaders.UserAgent, "GithubStoreBackend/1.0 (MirrorStatus)") - header(HttpHeaders.Accept, "*/*") + // Up to (1 + retryCount) attempts. Retry ONLY on network-level + // failures (timeout, DNS, TLS reset) — not on a deterministic HTTP + // error response from the mirror itself. A mirror that returns a + // stable 4xx/5xx is genuinely broken; re-probing it just doubles + // the load on already-bad endpoints. Cold-TLS-handshake flakes + // from FSN to CN-fronted hosts are the only failure mode worth + // retrying — those throw an exception, never return a response. + var lastResult: PingResult = PingResult(MirrorStatus.DOWN, null) + repeat(1 + retryCount) { attempt -> + val start = System.currentTimeMillis() + val networkFailure: Boolean = try { + val response: HttpResponse = http.get(preset.pingUrl) { + header(HttpHeaders.Range, "bytes=0-0") + header(HttpHeaders.UserAgent, "GithubStoreBackend/1.0 (MirrorStatus)") + header(HttpHeaders.Accept, "*/*") + } + val elapsed = System.currentTimeMillis() - start + lastResult = PingResult(classify(response, elapsed), elapsed) + // Any HTTP response — success or error — is a terminal answer. + // The mirror reached us; whatever status it reported is the + // truth and re-probing won't change it. + return lastResult + } catch (e: Exception) { + // Timeout / DNS failure / TLS error. Sentry intentionally + // suppressed: mirror reachability flapping is expected, not + // exceptional. DEBUG so operators can correlate flap rates + // with upstream incidents without polluting INFO. + log.debug( + "Mirror probe network failure: preset={} attempt={} error={}", + preset.id, attempt, e.message, + ) + lastResult = PingResult(MirrorStatus.DOWN, null) + true } - val elapsed = System.currentTimeMillis() - start - val status = when { - !response.status.isSuccess() -> MirrorStatus.DOWN - elapsed <= okLatencyCeilingMs -> MirrorStatus.OK - elapsed <= degradedLatencyCeilingMs -> MirrorStatus.DEGRADED - else -> MirrorStatus.DEGRADED + if (networkFailure && attempt < retryCount) { + // Tiny backoff so a transient flap has time to clear and the + // retry doesn't re-use the same broken socket. + delay(250) } - PingResult(status, elapsed) - } catch (_: Exception) { - // Timeout / DNS failure / TLS error / etc -- all collapse to DOWN. - // No Sentry: mirror reachability flapping is expected, not exceptional. - PingResult(MirrorStatus.DOWN, null) } + return lastResult + } + + private fun classify(response: HttpResponse, elapsedMs: Long): MirrorStatus = when { + !response.status.isSuccess() -> MirrorStatus.DOWN + elapsedMs <= okLatencyCeilingMs -> MirrorStatus.OK + // Any successful response slower than the OK ceiling is DEGRADED. + // Note: with perPingTimeoutMs (10s) > degradedLatencyCeilingMs (8s) + // there is a 2s window where a slow-but-successful response was + // previously impossible (the 5s timeout fired first). It now falls + // here as DEGRADED, which is the intended classification. + else -> MirrorStatus.DEGRADED } private fun acquireAdvisoryLock(): Boolean = transaction {