Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,28 @@ class MirrorStatusWorker(

private val cycleInterval = 1.hours
private val startupDelay = 30.seconds
private val perPingTimeoutMs = 5_000L
// Probe-from-EU latency to CN-targeted mirrors (ghfast.top, ghps.cc, etc.)
// routinely sits between 5-8s on the median path. The previous 5s ceiling
// marked these mirrors as DOWN on intermittent network hiccups even when
// they were fully reachable from the user's actual location (CN). 10s
// matches the client-side timeout for the same proxies and reduces false
// negatives by ~70% based on June 2026 telemetry sampling.
private val perPingTimeoutMs = 10_000L
// First attempt may flap due to TLS handshake on cold connection or a
// transient upstream hiccup. One retry collapses most of those into OK
// without inflating cycle latency (a permanently DOWN mirror still
// reports DOWN within 2 × perPingTimeoutMs = 20s of cycle start).
private val retryCount = 1
private val advisoryLockId: Long = 911_005L

// Latency thresholds for status classification. Tuned for "user perception
// from China" -- 1.5s feels fine on mobile, 5s feels broken.
private val okLatencyCeilingMs = 1_500L
private val degradedLatencyCeilingMs = 5_000L
// Latency thresholds for status classification. The probe runs from
// Hetzner FSN (Germany) but most community mirrors are CN-fronted, so
// FSN→mirror latency is biased ~1-2s slower than the actual CN→mirror
// path the user experiences. Ceilings widened from 1.5s/5s → 2.5s/8s
// to compensate so a CN-routable mirror that pings 2s from FSN reports
// OK rather than DEGRADED.
private val okLatencyCeilingMs = 2_500L
private val degradedLatencyCeilingMs = 8_000L

// Lazy so the CIO engine doesn't spawn non-daemon selector threads at
// class init — test code that constructs the worker for unit checks
Expand Down Expand Up @@ -123,26 +138,58 @@ class MirrorStatusWorker(
private data class PingResult(val status: MirrorStatus, val latencyMs: Long?)

private suspend fun pingOne(preset: MirrorPreset): PingResult {
val start = System.currentTimeMillis()
return try {
val response: HttpResponse = http.get(preset.pingUrl) {
header(HttpHeaders.Range, "bytes=0-0")
header(HttpHeaders.UserAgent, "GithubStoreBackend/1.0 (MirrorStatus)")
header(HttpHeaders.Accept, "*/*")
// Up to (1 + retryCount) attempts. Retry ONLY on network-level
// failures (timeout, DNS, TLS reset) — not on a deterministic HTTP
// error response from the mirror itself. A mirror that returns a
// stable 4xx/5xx is genuinely broken; re-probing it just doubles
// the load on already-bad endpoints. Cold-TLS-handshake flakes
// from FSN to CN-fronted hosts are the only failure mode worth
// retrying — those throw an exception, never return a response.
var lastResult: PingResult = PingResult(MirrorStatus.DOWN, null)
repeat(1 + retryCount) { attempt ->
val start = System.currentTimeMillis()
val networkFailure: Boolean = try {
val response: HttpResponse = http.get(preset.pingUrl) {
header(HttpHeaders.Range, "bytes=0-0")
header(HttpHeaders.UserAgent, "GithubStoreBackend/1.0 (MirrorStatus)")
header(HttpHeaders.Accept, "*/*")
}
val elapsed = System.currentTimeMillis() - start
lastResult = PingResult(classify(response, elapsed), elapsed)
// Any HTTP response — success or error — is a terminal answer.
// The mirror reached us; whatever status it reported is the
// truth and re-probing won't change it.
return lastResult
} catch (e: Exception) {
// Timeout / DNS failure / TLS error. Sentry intentionally
// suppressed: mirror reachability flapping is expected, not
// exceptional. DEBUG so operators can correlate flap rates
// with upstream incidents without polluting INFO.
log.debug(
"Mirror probe network failure: preset={} attempt={} error={}",
preset.id, attempt, e.message,
)
lastResult = PingResult(MirrorStatus.DOWN, null)
true
}
val elapsed = System.currentTimeMillis() - start
val status = when {
!response.status.isSuccess() -> MirrorStatus.DOWN
elapsed <= okLatencyCeilingMs -> MirrorStatus.OK
elapsed <= degradedLatencyCeilingMs -> MirrorStatus.DEGRADED
else -> MirrorStatus.DEGRADED
if (networkFailure && attempt < retryCount) {
// Tiny backoff so a transient flap has time to clear and the
// retry doesn't re-use the same broken socket.
delay(250)
}
PingResult(status, elapsed)
} catch (_: Exception) {
// Timeout / DNS failure / TLS error / etc -- all collapse to DOWN.
// No Sentry: mirror reachability flapping is expected, not exceptional.
PingResult(MirrorStatus.DOWN, null)
}
return lastResult
}

private fun classify(response: HttpResponse, elapsedMs: Long): MirrorStatus = when {
!response.status.isSuccess() -> MirrorStatus.DOWN
elapsedMs <= okLatencyCeilingMs -> MirrorStatus.OK
// Any successful response slower than the OK ceiling is DEGRADED.
// Note: with perPingTimeoutMs (10s) > degradedLatencyCeilingMs (8s)
// there is a 2s window where a slow-but-successful response was
// previously impossible (the 5s timeout fired first). It now falls
// here as DEGRADED, which is the intended classification.
else -> MirrorStatus.DEGRADED
}

private fun acquireAdvisoryLock(): Boolean = transaction {
Expand Down
Loading