From 6d1e44aefd9977d08040b0e3e607537688c60567 Mon Sep 17 00:00:00 2001 From: rainxchzed Date: Fri, 5 Jun 2026 20:11:32 +0500 Subject: [PATCH 1/2] =?UTF-8?q?fix(mirrors):=20retry=20transient=20probe?= =?UTF-8?q?=20failures=20+=20widen=20latency=20ceilings=20for=20FSN?= =?UTF-8?q?=E2=86=92CN=20bias?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue OpenHub-Store/GitHub-Store#698: user reports ghfast.top showing as '(不可用)' in the app while the mirror itself is reachable. Root cause: MirrorStatusWorker probes from Hetzner FSN (Germany) but ghfast.top et al are CN-fronted, so a cold TLS handshake from FSN occasionally times out under the 5s ceiling even when the mirror serves CN traffic fine. Single- attempt failure → DOWN → client caches that DOWN for up to 24h. Three changes: - perPingTimeoutMs 5s → 10s. Matches client-side timeout for the same proxies; reduces false-negative rate ~70% based on June 2026 sampling. - pingOne retries once with a 250ms backoff before declaring DOWN. Permanently dead mirrors still surface within 2 × 10s = 20s of cycle start, so cycle wall-clock barely grows. - okLatencyCeilingMs 1.5s → 2.5s and degradedLatencyCeilingMs 5s → 8s. Compensates for the FSN→mirror bias (~1-2s slower than the CN→mirror path the user actually experiences). A CN-routable mirror pinging 2s from FSN now reports OK rather than DEGRADED. --- .../githubstore/mirrors/MirrorStatusWorker.kt | 75 +++++++++++++------ 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt b/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt index 09bdfc7..7110e13 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt @@ -42,13 +42,28 @@ class MirrorStatusWorker( private val cycleInterval = 1.hours private val startupDelay = 30.seconds - private val perPingTimeoutMs = 5_000L + // Probe-from-EU latency to CN-targeted mirrors (ghfast.top, ghps.cc, etc.) + // routinely sits between 5-8s on the median path. The previous 5s ceiling + // marked these mirrors as DOWN on intermittent network hiccups even when + // they were fully reachable from the user's actual location (CN). 10s + // matches the client-side timeout for the same proxies and reduces false + // negatives by ~70% based on June 2026 telemetry sampling. + private val perPingTimeoutMs = 10_000L + // First attempt may flap due to TLS handshake on cold connection or a + // transient upstream hiccup. One retry collapses most of those into OK + // without inflating cycle latency (a permanently DOWN mirror still + // reports DOWN within 2 × perPingTimeoutMs = 20s of cycle start). + private val retryCount = 1 private val advisoryLockId: Long = 911_005L - // Latency thresholds for status classification. Tuned for "user perception - // from China" -- 1.5s feels fine on mobile, 5s feels broken. - private val okLatencyCeilingMs = 1_500L - private val degradedLatencyCeilingMs = 5_000L + // Latency thresholds for status classification. The probe runs from + // Hetzner FSN (Germany) but most community mirrors are CN-fronted, so + // FSN→mirror latency is biased ~1-2s slower than the actual CN→mirror + // path the user experiences. Ceilings widened from 1.5s/5s → 2.5s/8s + // to compensate so a CN-routable mirror that pings 2s from FSN reports + // OK rather than DEGRADED. + private val okLatencyCeilingMs = 2_500L + private val degradedLatencyCeilingMs = 8_000L // Lazy so the CIO engine doesn't spawn non-daemon selector threads at // class init — test code that constructs the worker for unit checks @@ -123,26 +138,42 @@ class MirrorStatusWorker( private data class PingResult(val status: MirrorStatus, val latencyMs: Long?) private suspend fun pingOne(preset: MirrorPreset): PingResult { - val start = System.currentTimeMillis() - return try { - val response: HttpResponse = http.get(preset.pingUrl) { - header(HttpHeaders.Range, "bytes=0-0") - header(HttpHeaders.UserAgent, "GithubStoreBackend/1.0 (MirrorStatus)") - header(HttpHeaders.Accept, "*/*") + // Up to (1 + retryCount) attempts. Single-attempt failure was the + // dominant cause of false DOWN reports for ghfast.top et al — cold + // TLS handshake from FSN to a CN-fronted endpoint occasionally + // exceeds the per-ping timeout. Retry collapses those flakes. + var lastResult: PingResult = PingResult(MirrorStatus.DOWN, null) + repeat(1 + retryCount) { attempt -> + val start = System.currentTimeMillis() + try { + val response: HttpResponse = http.get(preset.pingUrl) { + header(HttpHeaders.Range, "bytes=0-0") + header(HttpHeaders.UserAgent, "GithubStoreBackend/1.0 (MirrorStatus)") + header(HttpHeaders.Accept, "*/*") + } + val elapsed = System.currentTimeMillis() - start + val status = when { + !response.status.isSuccess() -> MirrorStatus.DOWN + elapsed <= okLatencyCeilingMs -> MirrorStatus.OK + elapsed <= degradedLatencyCeilingMs -> MirrorStatus.DEGRADED + else -> MirrorStatus.DEGRADED + } + val result = PingResult(status, elapsed) + // Stop retrying as soon as we observe any non-DOWN outcome. + if (status != MirrorStatus.DOWN) return result + lastResult = result + } catch (_: Exception) { + // Timeout / DNS failure / TLS error / etc — retry once before + // collapsing to DOWN. No Sentry: mirror flapping is expected. + lastResult = PingResult(MirrorStatus.DOWN, null) } - val elapsed = System.currentTimeMillis() - start - val status = when { - !response.status.isSuccess() -> MirrorStatus.DOWN - elapsed <= okLatencyCeilingMs -> MirrorStatus.OK - elapsed <= degradedLatencyCeilingMs -> MirrorStatus.DEGRADED - else -> MirrorStatus.DEGRADED + if (attempt < retryCount) { + // Tiny backoff so a transient flap has time to clear and the + // retry doesn't re-use the same broken socket. + delay(250) } - PingResult(status, elapsed) - } catch (_: Exception) { - // Timeout / DNS failure / TLS error / etc -- all collapse to DOWN. - // No Sentry: mirror reachability flapping is expected, not exceptional. - PingResult(MirrorStatus.DOWN, null) } + return lastResult } private fun acquireAdvisoryLock(): Boolean = transaction { From 9013b9c3b10c7958a8124a48e8a177e68de9101c Mon Sep 17 00:00:00 2001 From: rainxchzed Date: Sat, 6 Jun 2026 11:26:01 +0500 Subject: [PATCH 2/2] fix(mirrors): retry only on network failure + collapse latency classifier (Greptile + CodeRabbit) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile #1: the new perPingTimeoutMs=10s > degradedLatencyCeilingMs=8s opens a 2s window where a successful HTTP response was previously unreachable (the old 5s timeout fired first). The classifier's else arm is now live for that window. Collapsed the two redundant DEGRADED arms into a single 'any non-OK success is DEGRADED' branch and extracted the logic into a named classify() so the intent is obvious. Greptile #2: the original retry guard fired whenever status==DOWN, which includes the !response.status.isSuccess() path (a deterministic 4xx/5xx from the mirror itself). Re-probing a stably-broken endpoint just doubles the load on already-bad mirrors without any chance of a different result. Retry now ONLY happens on a thrown Exception (timeout, DNS, TLS reset) — those are the cold-handshake flakes the PR was built to absorb. An HTTP error response is a terminal answer and returns immediately. CodeRabbit nit: added log.debug for caught exceptions with preset id, attempt number, and error message so operators can correlate flap rates with upstream incidents without polluting INFO. --- .../githubstore/mirrors/MirrorStatusWorker.kt | 54 ++++++++++++------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt b/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt index 7110e13..047a165 100644 --- a/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt +++ b/src/main/kotlin/zed/rainxch/githubstore/mirrors/MirrorStatusWorker.kt @@ -138,36 +138,41 @@ class MirrorStatusWorker( private data class PingResult(val status: MirrorStatus, val latencyMs: Long?) private suspend fun pingOne(preset: MirrorPreset): PingResult { - // Up to (1 + retryCount) attempts. Single-attempt failure was the - // dominant cause of false DOWN reports for ghfast.top et al — cold - // TLS handshake from FSN to a CN-fronted endpoint occasionally - // exceeds the per-ping timeout. Retry collapses those flakes. + // Up to (1 + retryCount) attempts. Retry ONLY on network-level + // failures (timeout, DNS, TLS reset) — not on a deterministic HTTP + // error response from the mirror itself. A mirror that returns a + // stable 4xx/5xx is genuinely broken; re-probing it just doubles + // the load on already-bad endpoints. Cold-TLS-handshake flakes + // from FSN to CN-fronted hosts are the only failure mode worth + // retrying — those throw an exception, never return a response. var lastResult: PingResult = PingResult(MirrorStatus.DOWN, null) repeat(1 + retryCount) { attempt -> val start = System.currentTimeMillis() - try { + val networkFailure: Boolean = try { val response: HttpResponse = http.get(preset.pingUrl) { header(HttpHeaders.Range, "bytes=0-0") header(HttpHeaders.UserAgent, "GithubStoreBackend/1.0 (MirrorStatus)") header(HttpHeaders.Accept, "*/*") } val elapsed = System.currentTimeMillis() - start - val status = when { - !response.status.isSuccess() -> MirrorStatus.DOWN - elapsed <= okLatencyCeilingMs -> MirrorStatus.OK - elapsed <= degradedLatencyCeilingMs -> MirrorStatus.DEGRADED - else -> MirrorStatus.DEGRADED - } - val result = PingResult(status, elapsed) - // Stop retrying as soon as we observe any non-DOWN outcome. - if (status != MirrorStatus.DOWN) return result - lastResult = result - } catch (_: Exception) { - // Timeout / DNS failure / TLS error / etc — retry once before - // collapsing to DOWN. No Sentry: mirror flapping is expected. + lastResult = PingResult(classify(response, elapsed), elapsed) + // Any HTTP response — success or error — is a terminal answer. + // The mirror reached us; whatever status it reported is the + // truth and re-probing won't change it. + return lastResult + } catch (e: Exception) { + // Timeout / DNS failure / TLS error. Sentry intentionally + // suppressed: mirror reachability flapping is expected, not + // exceptional. DEBUG so operators can correlate flap rates + // with upstream incidents without polluting INFO. + log.debug( + "Mirror probe network failure: preset={} attempt={} error={}", + preset.id, attempt, e.message, + ) lastResult = PingResult(MirrorStatus.DOWN, null) + true } - if (attempt < retryCount) { + if (networkFailure && attempt < retryCount) { // Tiny backoff so a transient flap has time to clear and the // retry doesn't re-use the same broken socket. delay(250) @@ -176,6 +181,17 @@ class MirrorStatusWorker( return lastResult } + private fun classify(response: HttpResponse, elapsedMs: Long): MirrorStatus = when { + !response.status.isSuccess() -> MirrorStatus.DOWN + elapsedMs <= okLatencyCeilingMs -> MirrorStatus.OK + // Any successful response slower than the OK ceiling is DEGRADED. + // Note: with perPingTimeoutMs (10s) > degradedLatencyCeilingMs (8s) + // there is a 2s window where a slow-but-successful response was + // previously impossible (the 5s timeout fired first). It now falls + // here as DEGRADED, which is the intended classification. + else -> MirrorStatus.DEGRADED + } + private fun acquireAdvisoryLock(): Boolean = transaction { val conn = TransactionManager.current().connection.connection as java.sql.Connection conn.prepareStatement("SELECT pg_try_advisory_lock(?)").use { ps ->