From 84fb6dd0956dddc21cbe6dd135cc52d3faa19072 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 26 Jun 2026 15:24:46 -0600 Subject: [PATCH 1/8] site-rules: shared known-behaviors list + transport-failure classification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a curated, evolving per-site routing list (`site_rules`) consulted ABOVE the per-machine `site_cache`, so a client routes correctly on its first visit instead of re-paying the Cronet-fail-then-escalate cost. Seeded from anti-bot vendor knowledge + live failure telemetry (source/confidence tagged), overlaid by `~/.wick/site-rules.json` and refreshed daily from the Worker. Also classify transport failures so a user disconnect is never mistaken for "this site is hard" (which would poison the rules): capture the real Cronet net-error in `on_failed` (previously discarded) and gate every non-definitive cause on a connectivity probe — offline / dns / timeout / reset / refused / unreachable / quic / connect / other. The Worker aggregates `error_kind_dist` alongside `status_dist`, and serves the rules via GET/POST /v1/site-rules. - site_rules.rs: include_str! 
seed + on-disk overlay + once-per-process daily refresh - fetch.rs: rule-aware should_use_cef_first; thread residential flag + selector into CEF; classify_transport_error + connectivity probe (proxy-aware) - analytics.rs: report_transport_error carrying error_kind - cronet: bind Cronet_Error_error_code_get; surface the cause in on_failed - cef.rs: respawn the CEF daemon on a residential-mode mismatch - site_cache.rs: extract shared parent_domain host walk - main.rs: register site_rules; add `wick fetch --json` - worker: error_kind_dist + GET /v1/site-rules (public) + POST /v1/site-rules/:key (auth) Co-Authored-By: Claude Opus 4.8 (1M context) --- rust/data/site-rules.json | 37 +++++ rust/src/analytics.rs | 40 +++++ rust/src/cef.rs | 19 ++- rust/src/cronet/ffi.rs | 4 + rust/src/cronet/mod.rs | 38 ++++- rust/src/fetch.rs | 310 +++++++++++++++++++++++++++++++++----- rust/src/main.rs | 22 +++ rust/src/site_cache.rs | 22 +-- rust/src/site_rules.rs | 237 +++++++++++++++++++++++++++++ worker/src/index.js | 126 +++++++++++++++- 10 files changed, 803 insertions(+), 52 deletions(-) create mode 100644 rust/data/site-rules.json create mode 100644 rust/src/site_rules.rs diff --git a/rust/data/site-rules.json b/rust/data/site-rules.json new file mode 100644 index 0000000..acf95a4 --- /dev/null +++ b/rust/data/site-rules.json @@ -0,0 +1,37 @@ +{ + "version": 1, + "updated_at": "2026-06-26", + "note": "Curated, evolving per-site behavior rules — the shared 'known behaviors' list. Distinct from the per-machine site-cache: these ship with the binary so a brand-new client routes correctly on its FIRST visit instead of re-paying the Cronet-fail-then-escalate cost. Hand-seeded here from anti-bot-vendor knowledge (bench/sites.txt) plus live failure telemetry; the self-improvement harness overwrites entries with measured results and clients refresh this from releases.getwick.dev/v1/site-rules. 
'render' and 'needs_residential' drive fetch.rs; 'vendor'/'confidence'/'source' are advisory metadata. needs_residential is mostly advisory for local users (who lack a residential exit) and actionable for server/Pro deployments + the CEF residential tunnel.", + "rules": { + "x.com": { "render": "cef", "wait_for_selector": "article", "vendor": "x-spa", "confidence": 0.85, "source": "seed" }, + "twitter.com": { "render": "cef", "wait_for_selector": "article", "vendor": "x-spa", "confidence": 0.8, "source": "seed" }, + "instagram.com": { "render": "cef", "vendor": "meta-spa", "confidence": 0.8, "source": "seed" }, + "facebook.com": { "render": "cef", "vendor": "meta-spa", "confidence": 0.75, "source": "seed" }, + "tiktok.com": { "render": "cef", "vendor": "tiktok", "confidence": 0.7, "source": "seed" }, + "linkedin.com": { "render": "cef", "vendor": "akamai", "confidence": 0.8, "source": "seed" }, + "reuters.com": { "render": "cef", "vendor": "bot-managed", "confidence": 0.85, "source": "seed" }, + "bloomberg.com": { "render": "cef", "vendor": "bot-managed", "confidence": 0.8, "source": "seed" }, + "tradingview.com": { "render": "cef", "vendor": "spa", "confidence": 0.8, "source": "seed" }, + "cfr.org": { "render": "cef", "vendor": "cloudflare", "confidence": 0.75, "source": "seed" }, + "finance.yahoo.com": { "render": "cef", "vendor": "bot-managed", "confidence": 0.75, "source": "seed" }, + "economictimes.indiatimes.com": { "render": "cef", "vendor": "bot-managed", "confidence": 0.7, "source": "seed" }, + "discord.com": { "render": "cef", "vendor": "cloudflare", "confidence": 0.75, "source": "seed" }, + "patreon.com": { "render": "cef", "vendor": "cloudflare", "confidence": 0.7, "source": "seed" }, + "chess.com": { "render": "cef", "vendor": "cloudflare", "confidence": 0.7, "source": "seed" }, + "zillow.com": { "render": "cef", "vendor": "perimeterx", "confidence": 0.7, "source": "seed" }, + "realtor.com": { "render": "cef", "vendor": "perimeterx", "confidence": 0.7, 
"source": "seed" }, + "crunchbase.com": { "render": "cef", "vendor": "perimeterx", "confidence": 0.7, "source": "seed" }, + "ticketmaster.com": { "render": "cef", "vendor": "perimeterx", "confidence": 0.65, "source": "seed" }, + "nike.com": { "render": "cef", "vendor": "akamai", "confidence": 0.7, "source": "seed" }, + "adidas.com": { "render": "cef", "vendor": "akamai", "confidence": 0.65, "source": "seed" }, + "indeed.com": { "render": "cef", "needs_residential": true, "vendor": "datadome", "confidence": 0.6, "source": "seed" }, + "glassdoor.com": { "render": "cef", "needs_residential": true, "vendor": "datadome", "confidence": 0.6, "source": "seed" }, + "vinted.com": { "render": "cef", "needs_residential": true, "vendor": "datadome", "confidence": 0.6, "source": "seed" }, + "leroymerlin.fr": { "render": "cef", "needs_residential": true, "vendor": "datadome", "confidence": 0.55, "source": "seed" }, + "mediamarkt.de": { "render": "cef", "needs_residential": true, "vendor": "datadome", "confidence": 0.55, "source": "seed" }, + "apkpure.com": { "render": "cef", "needs_residential": true, "vendor": "apk-mirror", "confidence": 0.5, "source": "seed" }, + "apkcombo.com": { "render": "cef", "needs_residential": true, "vendor": "apk-mirror", "confidence": 0.5, "source": "seed" }, + "apkmirror.com": { "render": "cef", "needs_residential": true, "vendor": "apk-mirror", "confidence": 0.5, "source": "seed" }, + "apk.support": { "render": "cef", "needs_residential": true, "vendor": "apk-mirror", "confidence": 0.5, "source": "seed" } + } +} diff --git a/rust/src/analytics.rs b/rust/src/analytics.rs index 1177613..8cc2665 100644 --- a/rust/src/analytics.rs +++ b/rust/src/analytics.rs @@ -7,6 +7,11 @@ //! - Command events (`install`, `install_cef`, `fetch` dedup'd daily, etc.) //! - Per-fetch records: hostname, strategy, ok, status, timing_ms, //! wick version, OS. +//! - For transport failures (no HTTP response at all): a coarse `error_kind` +//! 
cause — `offline` / `dns` / `timeout` / `reset` / `refused` / +//! `unreachable` / `quic` / `connect` / `other`. Lets the stats page tell a +//! user disconnect apart from a site actively blocking us. No error +//! strings, paths, or IPs — just the bucket. //! //! What is NOT collected: //! - URL paths or query strings, request headers, page content, titles @@ -116,6 +121,41 @@ pub fn report_fetch(ev: FetchEvent) { enqueue(EVENTS_URL, payload.to_string()); } +/// Report a transport-level failure: the request never produced an HTTP +/// response at all (DNS failure, connect refused, TLS reset, timeout, or the +/// user's own network being down). Posts the same `/v1/events` shape as +/// `report_fetch` plus an `error_kind` cause. +/// +/// `error_kind` is the load-bearing addition for the self-improvement loop: +/// it lets the stats page and the probing harness tell a site *actively +/// blocking us* (`reset`, `refused`, `timeout`-while-online) apart from the +/// *user's connection dropping* (`offline`). Without it, a user whose wifi +/// died 25 times reads identically to a genuinely hard site — and would +/// poison the curated rules with a phantom "this site is hard" signal. +pub fn report_transport_error( + host: &str, + strategy: &str, + error_kind: &str, + escalated_from: Option<&str>, + timing_ms: u64, +) { + if is_opted_out() { + return; + } + let payload = json!({ + "host": host, + "strategy": strategy, + "escalated_from": escalated_from, + "ok": false, + "status": 0, + "timing_ms": timing_ms, + "error_kind": error_kind, + "version": env!("CARGO_PKG_VERSION"), + "os": std::env::consts::OS, + }); + enqueue(EVENTS_URL, payload.to_string()); +} + /// Report a fetch failure — legacy endpoint. Still useful for aggregate /// error counts on the KV-backed dashboard. `report_fetch` supersedes it /// for per-host/per-strategy analysis. 
diff --git a/rust/src/cef.rs b/rust/src/cef.rs index 9dfb192..1ecdf9d 100644 --- a/rust/src/cef.rs +++ b/rust/src/cef.rs @@ -20,6 +20,10 @@ struct DaemonProcess { child: Child, stdin: ChildStdin, stdout: BufReader, + /// Whether this daemon was spawned with the residential tunnel + /// (LD_PRELOAD bindwg.so). The daemon is a process-wide singleton, so a + /// later request in the other mode must respawn it — see `ensure_daemon`. + use_residential: bool, } /// Render with default options (no residential tunnel, no selector wait). @@ -97,11 +101,20 @@ fn render_blocking(url: &str, opts: &RenderOptions) -> Result { fn ensure_daemon(use_residential: bool) -> Result<()> { let mut daemon = DAEMON.lock().map_err(|e| anyhow::anyhow!("lock: {}", e))?; - // Check if existing daemon is still alive + // Check if existing daemon is still alive AND in the requested + // residential mode. The daemon is a process-wide singleton whose + // residential tunnel is fixed at spawn (LD_PRELOAD), so reusing one + // started in the other mode would silently route through the wrong exit + // — e.g. a needs_residential site served over the datacenter IP it was + // flagged as blocking. On a mode mismatch, kill and respawn. 
if let Some(ref mut d) = *daemon { match d.child.try_wait() { Ok(Some(_)) => { *daemon = None; } - Ok(None) => return Ok(()), + Ok(None) if d.use_residential == use_residential => return Ok(()), + Ok(None) => { + let _ = d.child.kill(); + *daemon = None; + } Err(_) => { *daemon = None; } } } @@ -183,7 +196,7 @@ fn ensure_daemon(use_residential: bool) -> Result<()> { // Wait for CEF to initialize std::thread::sleep(Duration::from_secs(2)); - *daemon = Some(DaemonProcess { child, stdin, stdout }); + *daemon = Some(DaemonProcess { child, stdin, stdout, use_residential }); Ok(()) } diff --git a/rust/src/cronet/ffi.rs b/rust/src/cronet/ffi.rs index ad9d566..9d8d67f 100644 --- a/rust/src/cronet/ffi.rs +++ b/rust/src/cronet/ffi.rs @@ -118,4 +118,8 @@ extern "C" { // UrlResponseInfo pub fn Cronet_UrlResponseInfo_http_status_code_get(info: Cronet_UrlResponseInfoPtr) -> i32; + + // Error inspection — the net-error cause behind a failed request. + // Returns a Cronet_Error_ERROR_CODE (see include/cronet.idl_c.h). + pub fn Cronet_Error_error_code_get(error: Cronet_ErrorPtr) -> c_int; } diff --git a/rust/src/cronet/mod.rs b/rust/src/cronet/mod.rs index 2da840e..6dd728f 100644 --- a/rust/src/cronet/mod.rs +++ b/rust/src/cronet/mod.rs @@ -286,10 +286,44 @@ unsafe extern "C" fn on_failed( callback: ffi::Cronet_UrlRequestCallbackPtr, _request: ffi::Cronet_UrlRequestPtr, _info: ffi::Cronet_UrlResponseInfoPtr, - _error: ffi::Cronet_ErrorPtr, + error: ffi::Cronet_ErrorPtr, ) { + // Capture the net-error cause BEFORE finish_request tears the request + // down — the error object is owned by Cronet and valid only for the + // duration of this callback. The cause is embedded (in brackets) so + // fetch::classify_transport_error can tell a user disconnect apart from + // a site actively blocking us instead of lumping both into one opaque + // "transport error". 
+ let code_name = if error.is_null() { + "ERROR_OTHER" + } else { + net_error_name(ffi::Cronet_Error_error_code_get(error)) + }; let boxed = finish_request(callback); - complete(&boxed, Err(anyhow::anyhow!("Cronet request failed"))); + complete( + &boxed, + Err(anyhow::anyhow!("cronet request failed [{}]", code_name)), + ); +} + +/// Map a `Cronet_Error_ERROR_CODE` (include/cronet.idl_c.h) to a stable name +/// embedded in the error message. `fetch::classify_transport_error` matches on +/// these (lowercased) to classify the failure cause. +fn net_error_name(code: std::os::raw::c_int) -> &'static str { + match code { + 0 => "ERROR_CALLBACK", + 1 => "ERROR_HOSTNAME_NOT_RESOLVED", + 2 => "ERROR_INTERNET_DISCONNECTED", + 3 => "ERROR_NETWORK_CHANGED", + 4 => "ERROR_TIMED_OUT", + 5 => "ERROR_CONNECTION_CLOSED", + 6 => "ERROR_CONNECTION_TIMED_OUT", + 7 => "ERROR_CONNECTION_REFUSED", + 8 => "ERROR_CONNECTION_RESET", + 9 => "ERROR_ADDRESS_UNREACHABLE", + 10 => "ERROR_QUIC_PROTOCOL_FAILED", + _ => "ERROR_OTHER", + } } unsafe extern "C" fn on_canceled( diff --git a/rust/src/fetch.rs b/rust/src/fetch.rs index d8ad869..a174bb9 100644 --- a/rust/src/fetch.rs +++ b/rust/src/fetch.rs @@ -1,5 +1,5 @@ use anyhow::Result; -use std::time::Instant; +use std::time::{Duration, Instant}; use crate::analytics::{self, FetchEvent}; use crate::captcha; @@ -7,6 +7,7 @@ use crate::engine::Client; use crate::extract::{self, Format}; use crate::robots; use crate::site_cache; +use crate::site_rules; pub struct FetchResult { pub content: String, @@ -86,6 +87,7 @@ pub async fn fetch( ) -> Result { let start = Instant::now(); analytics::ping("fetch"); + site_rules::refresh_if_stale(); let parsed = url::Url::parse(url) .map_err(|e| anyhow::anyhow!("invalid URL: {}", e))?; @@ -130,6 +132,16 @@ pub async fn fetch( let mut cef_installed = crate::cef::is_available(); let cached = site_cache::get(host); + // Curated rule for this host (the shared "known behaviors" list). 
+ // Consulted above the local cache so a brand-new client can route + // correctly on its FIRST visit — pinning CEF-first and/or requesting a + // residential exit before this machine has learned anything itself. + let rule = site_rules::get(host); + let use_residential = rule.as_ref().map(|r| r.needs_residential).unwrap_or(false); + // Effective selector: an explicit caller argument wins; otherwise fall + // back to the rule's selector (e.g. wait for `article` on x.com). + let effective_selector = + wait_for_selector.or_else(|| rule.as_ref().and_then(|r| r.wait_for_selector.as_deref())); // Forced CEF: attempt auto-install before bailing — the user has // explicitly asked for the renderer, so it's the strongest signal // they want CEF. Falls back to a clear error if install isn't @@ -149,10 +161,14 @@ pub async fn fetch( } let cef_first = render == RenderMode::Cef || (render == RenderMode::Auto - && should_use_cef_first(cached.as_ref().map(|s| s.strategy.as_str()), cef_installed)); + && should_use_cef_first( + rule.as_ref().map(|r| r.render.as_str()), + cached.as_ref().map(|s| s.strategy.as_str()), + cef_installed, + )); if cef_first { - match cef_render_with_retry(url, wait_for_selector).await { + match cef_render_with_retry(url, effective_selector, use_residential).await { Ok(html) => { let extracted = extract::extract(&html, &parsed, format)?; let content = append_media(&extracted.content, &html, &parsed); @@ -191,10 +207,12 @@ pub async fn fetch( Ok(r) => r, Err(e) => { let timing_ms = start.elapsed().as_millis() as u64; - analytics::report_fetch(FetchEvent { - host, strategy: "cronet-transport-error", escalated_from, - ok: false, status: 0, timing_ms, - }); + if !analytics::is_opted_out() { + let kind = classify_transport_error(&e).await; + analytics::report_transport_error( + host, "cronet-transport-error", kind, escalated_from, timing_ms, + ); + } return Err(e); } }; @@ -330,7 +348,7 @@ pub async fn fetch( // offer to install CEF on the spot in interactive 
sessions. The // captcha branch above falls through here when CEF could help. if (status == 403 || status == 503) && escalated_from.is_none() { - if let Some(html) = cef_render_or_install(url, wait_for_selector, render).await { + if let Some(html) = cef_render_or_install(url, effective_selector, render, use_residential).await { let extracted = extract::extract(&html, &parsed, format)?; let content = append_media(&extracted.content, &html, &parsed); let timing_ms = start.elapsed().as_millis() as u64; @@ -397,7 +415,7 @@ pub async fn fetch( // second auto-escalation trigger alongside 403/503. let js_shell = is_js_required_shell(&body); if escalated_from.is_none() && js_shell { - if let Some(html) = cef_render_or_install(url, wait_for_selector, render).await { + if let Some(html) = cef_render_or_install(url, effective_selector, render, use_residential).await { let extracted = extract::extract(&html, &parsed, format)?; let content = append_media(&extracted.content, &html, &parsed); let timing_ms = start.elapsed().as_millis() as u64; @@ -462,6 +480,7 @@ async fn cef_render_or_install( url: &str, wait_for_selector: Option<&str>, render: RenderMode, + use_residential: bool, ) -> Option { if render == RenderMode::Cronet { return None; @@ -474,7 +493,7 @@ async fn cef_render_or_install( return None; } } - cef_render_with_retry(url, wait_for_selector).await.ok() + cef_render_with_retry(url, wait_for_selector, use_residential).await.ok() } static USER_DECLINED_INSTALL: std::sync::atomic::AtomicBool = @@ -553,9 +572,10 @@ fn prompt_user_for_install() -> bool { async fn cef_render_with_retry( url: &str, wait_for_selector: Option<&str>, + use_residential: bool, ) -> Result { let opts = crate::cef::RenderOptions { - use_residential: false, + use_residential, wait_for_selector: wait_for_selector.map(|s| s.to_string()), }; match crate::cef::render_with(url, opts.clone()).await { @@ -627,6 +647,7 @@ pub async fn fetch_html( respect_robots: bool, ) -> Result { let start = 
Instant::now(); + site_rules::refresh_if_stale(); let parsed = url::Url::parse(url) .map_err(|e| anyhow::anyhow!("invalid URL: {}", e))?; @@ -650,10 +671,17 @@ pub async fn fetch_html( let cef_installed = crate::cef::is_available(); let cached = site_cache::get(host); - let cef_first = should_use_cef_first(cached.as_ref().map(|s| s.strategy.as_str()), cef_installed); + let rule = site_rules::get(host); + let use_residential = rule.as_ref().map(|r| r.needs_residential).unwrap_or(false); + let effective_selector = rule.as_ref().and_then(|r| r.wait_for_selector.as_deref()); + let cef_first = should_use_cef_first( + rule.as_ref().map(|r| r.render.as_str()), + cached.as_ref().map(|s| s.strategy.as_str()), + cef_installed, + ); if cef_first { - match cef_render_with_retry(url, None).await { + match cef_render_with_retry(url, effective_selector, use_residential).await { Ok(html) => { let timing_ms = start.elapsed().as_millis() as u64; site_cache::record(host, "cef", false, timing_ms); @@ -678,10 +706,12 @@ pub async fn fetch_html( Ok(r) => r, Err(e) => { let timing_ms = start.elapsed().as_millis() as u64; - analytics::report_fetch(FetchEvent { - host, strategy: "cronet-transport-error", escalated_from, - ok: false, status: 0, timing_ms, - }); + if !analytics::is_opted_out() { + let kind = classify_transport_error(&e).await; + analytics::report_transport_error( + host, "cronet-transport-error", kind, escalated_from, timing_ms, + ); + } return Err(e); } }; @@ -696,7 +726,7 @@ pub async fn fetch_html( host, strategy: "cronet-blocked", escalated_from: None, ok: false, status: resp.status, timing_ms, }); - match cef_render_with_retry(url, None).await { + match cef_render_with_retry(url, effective_selector, use_residential).await { Ok(html) => { let timing_ms = start.elapsed().as_millis() as u64; site_cache::record(host, "cef", false, timing_ms); @@ -758,6 +788,7 @@ pub async fn fetch_raw( respect_robots: bool, ) -> Result { let start = Instant::now(); + 
site_rules::refresh_if_stale(); let parsed = url::Url::parse(url) .map_err(|e| anyhow::anyhow!("invalid URL: {}", e))?; @@ -784,10 +815,12 @@ pub async fn fetch_raw( Ok(r) => r, Err(e) => { let timing_ms = start.elapsed().as_millis() as u64; - analytics::report_fetch(FetchEvent { - host, strategy: "cronet-raw-transport-error", escalated_from: None, - ok: false, status: 0, timing_ms, - }); + if !analytics::is_opted_out() { + let kind = classify_transport_error(&e).await; + analytics::report_transport_error( + host, "cronet-raw-transport-error", kind, None, timing_ms, + ); + } return Err(e); } }; @@ -810,6 +843,130 @@ pub async fn fetch_raw( }) } +/// Control endpoint for the connectivity probe. Reaching our own edge is the +/// relevant signal: if we can't reach it, telemetry can't post anyway, so +/// attributing the failure to the user rather than the site is correct. The +/// path is edge-cached (300s), so the probe is cheap. +const CONNECTIVITY_PROBE_URL: &str = "https://releases.getwick.dev/install-pro.sh"; + +/// Short-lived cache of the last connectivity verdict, so a site that +/// hard-blocks many requests in a burst doesn't fan out a probe per failure. +static CONNECTIVITY_CACHE: std::sync::LazyLock>> = + std::sync::LazyLock::new(|| std::sync::Mutex::new(None)); + +/// True when the error is *definitively* the user's own network being gone — +/// the OS told Cronet so. These never need the connectivity probe and must +/// never be attributed to the site. +fn is_definitely_offline(msg_lower: &str) -> bool { + msg_lower.contains("internet_disconnected") || msg_lower.contains("network_changed") +} + +/// True if the message looks like a DNS-resolution failure. 
+fn looks_like_dns(msg_lower: &str) -> bool { + msg_lower.contains("hostname_not_resolved") + || msg_lower.contains("name or service not known") + || msg_lower.contains("failed to lookup address") + || msg_lower.contains("dns error") +} + +/// Best guess at the *site-side* cause of a transport failure, ASSUMING the +/// machine is online — the caller applies the connectivity gate. Pure and +/// unit-testable. +/// +/// Cronet encodes its net-error name in brackets — see `cronet::on_failed`, +/// e.g. `ERROR_CONNECTION_RESET`. The reqwest fallback transport is matched on +/// its typed predicates first, since they're more reliable than the string. +fn candidate_cause(err: &anyhow::Error, msg_lower: &str) -> &'static str { + if let Some(re) = err.downcast_ref::() { + if re.is_connect() { + return "connect"; + } + if re.is_timeout() { + return "timeout"; + } + } + if msg_lower.contains("connection_refused") { + return "refused"; + } + if msg_lower.contains("connection_reset") || msg_lower.contains("connection_closed") { + return "reset"; + } + if msg_lower.contains("address_unreachable") { + return "unreachable"; + } + if msg_lower.contains("quic") { + return "quic"; + } + if looks_like_dns(msg_lower) { + return "dns"; + } + if msg_lower.contains("timed_out") || msg_lower.contains("timeout") || msg_lower.contains("timed out") { + return "timeout"; + } + if msg_lower.contains("connect") { + return "connect"; + } + "other" +} + +/// Classify a transport failure into a coarse cause for telemetry. +/// +/// The connectivity probe is the *universal* user-vs-site gate: a dying local +/// interface can surface as RST / refused / QUIC-failed / unreachable, not +/// only as `internet_disconnected`. So any non-definitive cause is reported as +/// `offline` when this machine can't reach our edge at that moment — only an +/// online machine's failure is attributed to the site. 
This is what keeps a +/// user disconnect from poisoning the curated rules with a phantom "this site +/// is hard" signal. The lone shortcut is the OS-confirmed offline case, which +/// skips the probe (and its latency) entirely. +async fn classify_transport_error(err: &anyhow::Error) -> &'static str { + let msg = err.to_string().to_ascii_lowercase(); + if is_definitely_offline(&msg) { + return "offline"; + } + let candidate = candidate_cause(err, &msg); + if !connectivity_ok().await { + return "offline"; + } + candidate +} + +/// Best-effort check that THIS machine has working off-box connectivity, used +/// to disambiguate a transport failure. Short timeout, cached briefly. On any +/// inability to even build the probe client we assume online, so we never +/// *over*-attribute "offline" (which would silently drop a real site signal). +async fn connectivity_ok() -> bool { + const TTL: Duration = Duration::from_secs(10); + if let Ok(guard) = CONNECTIVITY_CACHE.lock() { + if let Some((when, ok)) = *guard { + if when.elapsed() < TTL { + return ok; + } + } + } + // Probe through the same proxy real fetches use (WICK_PROXY), so a host + // whose only route off-box is a tunnel isn't falsely judged "offline" on + // every ambiguous failure. + let mut builder = reqwest::Client::builder().timeout(Duration::from_secs(2)); + if let Ok(proxy) = std::env::var("WICK_PROXY") { + let proxy = proxy.trim(); + if !proxy.is_empty() { + if let Ok(p) = reqwest::Proxy::all(proxy) { + builder = builder.proxy(p); + } + } + } + let ok = match builder.build() { + // Any HTTP response (even a 4xx) proves we have a route off the box. 
+ Ok(c) => c.get(CONNECTIVITY_PROBE_URL).send().await.is_ok(), + Err(_) => true, + }; + if let Ok(mut guard) = CONNECTIVITY_CACHE.lock() { + *guard = Some((Instant::now(), ok)); + } + ok +} + fn is_challenge(body: &str) -> bool { let lower = body.to_lowercase(); [ @@ -854,11 +1011,29 @@ fn is_js_required_shell(body: &str) -> bool { SIGNALS.iter().any(|s| lower.contains(s)) } -/// Decide whether to try CEF first based on the cached strategy and CEF -/// availability. Pure function so the strategy-selection rule is easy -/// to unit-test without spinning up a fetch pipeline. -fn should_use_cef_first(cached_strategy: Option<&str>, cef_installed: bool) -> bool { - cef_installed && matches!(cached_strategy, Some("cef")) +/// Decide whether to try CEF first, consulting the curated rule (highest +/// authority) then the local cache. Pure function so the precedence rule is +/// easy to unit-test without spinning up a fetch pipeline. +/// +/// - curated rule `"cef"` → CEF-first (when installed), even with no local +/// cache entry — that head start is the whole point of the shared list. +/// - curated rule `"cronet"` → not CEF-first; trust the rule and leave the +/// normal on-block escalation as the only route to CEF. +/// - no curated opinion (no rule, or an unrecognized `render`) → fall back to +/// the local cache, preserving the prior behavior (`"cef"` ⇒ CEF-first). +fn should_use_cef_first( + rule_render: Option<&str>, + cached_strategy: Option<&str>, + cef_installed: bool, +) -> bool { + if !cef_installed { + return false; + } + match rule_render { + Some("cef") => true, + Some("cronet") => false, + _ => matches!(cached_strategy, Some("cef")), + } } #[cfg(test)] @@ -867,25 +1042,29 @@ mod tests { #[test] fn cef_first_only_when_cache_says_cef_and_installed() { - assert!(should_use_cef_first(Some("cef"), true)); + // No curated rule → fall back to the local cache. 
+ assert!(should_use_cef_first(None, Some("cef"), true)); } #[test] fn cef_first_false_when_cef_not_installed() { - assert!(!should_use_cef_first(Some("cef"), false)); + assert!(!should_use_cef_first(None, Some("cef"), false)); + // Not even a curated "cef" rule force-routes when CEF isn't installed + // — Auto mode never auto-installs; escalation prompts instead. + assert!(!should_use_cef_first(Some("cef"), None, false)); } #[test] fn cef_first_false_for_cronet_cache_entry() { - assert!(!should_use_cef_first(Some("cronet"), true)); + assert!(!should_use_cef_first(None, Some("cronet"), true)); } #[test] fn cef_first_false_for_no_cache_entry() { // Default is Cronet-first. CEF is only used when later escalation // logic selects it. - assert!(!should_use_cef_first(None, true)); - assert!(!should_use_cef_first(None, false)); + assert!(!should_use_cef_first(None, None, true)); + assert!(!should_use_cef_first(None, None, false)); } #[test] @@ -893,9 +1072,37 @@ mod tests { // Anything outside the documented set falls through to the default // (Cronet-first). Forward-compatible with future cache-value // changes — won't accidentally route everything through CEF. - assert!(!should_use_cef_first(Some("captcha-auto"), true)); - assert!(!should_use_cef_first(Some("cef_timeout"), true)); - assert!(!should_use_cef_first(Some(""), true)); + assert!(!should_use_cef_first(None, Some("captcha-auto"), true)); + assert!(!should_use_cef_first(None, Some("cef_timeout"), true)); + assert!(!should_use_cef_first(None, Some(""), true)); + } + + #[test] + fn curated_cef_rule_wins_with_no_cache_entry() { + // The shared list's whole purpose: route a first-time visit to CEF + // before this machine has learned anything locally. + assert!(should_use_cef_first(Some("cef"), None, true)); + } + + #[test] + fn curated_cef_rule_overrides_stale_cronet_cache() { + // Curated authority beats a local cache that only ever saw Cronet + // (e.g. 
it never escalated because CEF wasn't installed at the time). + assert!(should_use_cef_first(Some("cef"), Some("cronet"), true)); + } + + #[test] + fn curated_cronet_rule_suppresses_cef_first() { + // A rule that says "cronet works here" should stop a stale local + // "cef" cache entry from forcing the slower CEF-first path. + assert!(!should_use_cef_first(Some("cronet"), Some("cef"), true)); + } + + #[test] + fn unknown_rule_render_falls_back_to_cache() { + // Forward-compat: an unrecognized `render` value is "no opinion". + assert!(should_use_cef_first(Some("future-mode"), Some("cef"), true)); + assert!(!should_use_cef_first(Some("future-mode"), None, true)); } #[test] @@ -989,6 +1196,39 @@ mod tests { assert!(!looks_like_cloudflare_interstitial(body)); } + #[test] + fn offline_is_definite_without_probe() { + // OS-confirmed network loss → "offline" with no probe (and no latency). + assert!(is_definitely_offline("cronet request failed [error_internet_disconnected]")); + assert!(is_definitely_offline("cronet request failed [error_network_changed]")); + // A reset is NOT definitively offline — it goes through the gate. + assert!(!is_definitely_offline("cronet request failed [error_connection_reset]")); + assert!(!is_definitely_offline("cronet request failed [error_timed_out]")); + } + + #[test] + fn candidate_cause_maps_cronet_names() { + // candidate_cause is the online-assumed mapping; the connectivity gate + // is applied separately by classify_transport_error. 
+ let e = |s: &str| anyhow::anyhow!("{}", s); + assert_eq!(candidate_cause(&e("x"), "cronet request failed [error_connection_refused]"), "refused"); + assert_eq!(candidate_cause(&e("x"), "cronet request failed [error_connection_reset]"), "reset"); + assert_eq!(candidate_cause(&e("x"), "cronet request failed [error_connection_closed]"), "reset"); + assert_eq!(candidate_cause(&e("x"), "cronet request failed [error_address_unreachable]"), "unreachable"); + assert_eq!(candidate_cause(&e("x"), "cronet request failed [error_quic_protocol_failed]"), "quic"); + assert_eq!(candidate_cause(&e("x"), "cronet request failed [error_hostname_not_resolved]"), "dns"); + assert_eq!(candidate_cause(&e("x"), "cronet request failed [error_timed_out]"), "timeout"); + // Unmapped Cronet codes (e.g. ERROR_CALLBACK) fall through to "other". + assert_eq!(candidate_cause(&e("x"), "cronet request failed [error_callback]"), "other"); + } + + #[test] + fn dns_message_recognized() { + assert!(looks_like_dns("cronet request failed [error_hostname_not_resolved]")); + assert!(looks_like_dns("dns error: failed to lookup address information")); + assert!(!looks_like_dns("connection reset by peer")); + } + #[test] fn render_mode_parses_synonyms() { assert_eq!(RenderMode::from_str("cef"), RenderMode::Cef); diff --git a/rust/src/main.rs b/rust/src/main.rs index 56f4ff6..eaec53f 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -19,6 +19,7 @@ mod search; mod session; mod setup; mod site_cache; +mod site_rules; use anyhow::Result; use clap::{Parser, Subcommand}; @@ -72,6 +73,12 @@ enum Command { /// Ignore robots.txt restrictions #[arg(long)] no_robots: bool, + /// Emit one structured JSON line ({url, status_code, timing_ms, + /// bytes, title}) instead of page content. Used by the + /// self-improvement probe harness to judge per-strategy success + /// (forced --render + --proxy) deterministically. 
+ #[arg(long)] + json: bool, }, /// Search the web and print results Search { @@ -208,6 +215,7 @@ async fn main() -> Result<()> { render, wait_for_selector, no_robots, + json, } => { let client = engine::Client::new(proxy)?; let parsed_format = extract::Format::from_str(&format); @@ -244,6 +252,20 @@ async fn main() -> Result<()> { ) .await?; + if json { + // One JSON line; success is judged by the harness from + // status_code + bytes. Content is intentionally omitted (the + // harness only needs the outcome, not the page). + let out = serde_json::json!({ + "url": result.url, + "status_code": result.status_code, + "timing_ms": result.timing_ms, + "bytes": result.content.len(), + "title": result.title, + }); + println!("{}", out); + return Ok(()); + } if let Some(title) = &result.title { eprintln!("Title: {}", title); } diff --git a/rust/src/site_cache.rs b/rust/src/site_cache.rs index 4d94620..2afd5eb 100644 --- a/rust/src/site_cache.rs +++ b/rust/src/site_cache.rs @@ -22,21 +22,25 @@ pub struct SiteStrategy { static CACHE: std::sync::LazyLock>> = std::sync::LazyLock::new(|| Mutex::new(load_cache())); +/// Given a host, return its one-level parent domain when that parent still +/// contains a dot (e.g. `www.reuters.com` → `reuters.com`, but `reuters.com` → +/// `None` since `com` isn't a usable key). Shared by the `site_cache` and +/// `site_rules` lookups so the two layers scope hosts identically — note this +/// is a single label strip, not PSL/eTLD+1 normalization. +pub(crate) fn parent_domain(host: &str) -> Option<&str> { + let dot = host.find('.')?; + let parent = &host[dot + 1..]; + parent.contains('.').then_some(parent) +} + /// Get the best strategy for a domain based on past fetches. pub fn get(host: &str) -> Option { let cache = CACHE.lock().ok()?; - // Check exact match, then parent domain (e.g., sub.example.com → example.com) + // Exact match, then one-level parent (e.g. sub.example.com → example.com). 
if let Some(s) = cache.get(host) { return Some(s.clone()); } - // Try parent domain - if let Some(dot) = host.find('.') { - let parent = &host[dot + 1..]; - if parent.contains('.') { - return cache.get(parent).cloned(); - } - } - None + parent_domain(host).and_then(|p| cache.get(p).cloned()) } /// Record the result of a fetch for future use. diff --git a/rust/src/site_rules.rs b/rust/src/site_rules.rs new file mode 100644 index 0000000..7a423b8 --- /dev/null +++ b/rust/src/site_rules.rs @@ -0,0 +1,237 @@ +//! Curated, evolving per-site behavior rules — the shared "known behaviors" +//! list the self-improvement loop maintains. +//! +//! This is **distinct from `site_cache`** (which is per-machine and learned +//! reactively from this machine's own fetches). `site_rules` is a *shared, +//! curated* list of known site behaviors — which transport works, whether +//! datacenter IPs are blocked, what selector to wait for. It lets a +//! brand-new client route correctly on its FIRST visit instead of re-paying +//! the Cronet-fail-then-escalate cost that `site_cache` only avoids after a +//! local miss. +//! +//! Precedence in `fetch.rs` (highest first): +//! forced `RenderMode` → **curated rule (here)** → local `site_cache` → +//! default (Cronet-first with on-block escalation). +//! +//! Sources, in increasing authority when both are present for a host: +//! 1. Bundled seed (`data/site-rules.json`, compiled in via `include_str!`) +//! — always available, works offline / on first run. +//! 2. On-disk overlay (`/site-rules.json`), refreshed from +//! `releases.getwick.dev/v1/site-rules` so the list "constantly evolves" +//! without a reinstall. The overlay wins per-host where present. +//! +//! Only `render`, `needs_residential`, and `wait_for_selector` change fetch +//! behavior; `vendor` / `confidence` / `source` are advisory metadata for the +//! harness and `wick rules` introspection. 
+
+use serde::Deserialize;
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+use std::time::Duration;
+
+/// Bundled seed, embedded at compile time. The overlay (if any) is layered
+/// on top at load.
+const SEED: &str = include_str!("../data/site-rules.json");
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct SiteRule {
+    /// What transport is known to work: `"cef"` or `"cronet"`. Unknown
+    /// values are treated as "no opinion" by the consumer (`fetch.rs`).
+    pub render: String,
+    /// Datacenter IPs are blocked here — a residential exit is needed for a
+    /// reliable fetch. Advisory for clients without a residential transport
+    /// (most local users); acted on by server/Pro deployments and the CEF
+    /// residential tunnel when it's available (see `cef::ensure_daemon`,
+    /// which no-ops the request if no tunnel is present).
+    #[serde(default)]
+    pub needs_residential: bool,
+    /// Optional CSS selector to wait for before dumping the DOM (SPAs that
+    /// hydrate content after first paint). Only meaningful for CEF renders.
+    #[serde(default)]
+    pub wait_for_selector: Option<String>,
+    /// Anti-bot vendor, advisory only (e.g. `"datadome"`, `"cloudflare"`).
+    #[serde(default)]
+    pub vendor: Option<String>,
+    /// 0..1 confidence. Hand-vetted seeds sit ~0.5–0.9; the harness writes
+    /// measured confidence. Advisory in PR1 (every present rule is applied);
+    /// a future threshold can gate on it.
+    #[serde(default)]
+    pub confidence: f32,
+    /// Provenance: `"seed"` | `"measured"` | `"curated"`. Lets the harness
+    /// tell its own measurements apart from hand-seeds it may overwrite.
+    #[serde(default)]
+    pub source: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct RuleFile {
+    #[serde(default)]
+    rules: HashMap<String, SiteRule>,
+}
+
+static RULES: std::sync::LazyLock<HashMap<String, SiteRule>> =
+    std::sync::LazyLock::new(load_rules);
+
+fn load_rules() -> HashMap<String, SiteRule> {
+    // 1. Bundled seed (always present; a parse failure here is a build-time
+    //    bug in the embedded JSON, so fall back to empty rather than panic
+    //    a user's fetch).
+    let mut map = parse(SEED);
+    // 2. On-disk overlay wins per-host where present. Missing/corrupt
+    //    overlay just leaves the seed in place.
+    if let Some(overlay) = read_overlay() {
+        for (host, rule) in parse(&overlay) {
+            map.insert(host, rule);
+        }
+    }
+    map
+}
+
+fn parse(data: &str) -> HashMap<String, SiteRule> {
+    serde_json::from_str::<RuleFile>(data)
+        .map(|f| f.rules)
+        .unwrap_or_default()
+}
+
+fn overlay_path() -> PathBuf {
+    crate::analytics::wick_home().join("site-rules.json")
+}
+
+fn read_overlay() -> Option<String> {
+    std::fs::read_to_string(overlay_path()).ok()
+}
+
+/// Where clients pull the evolving curated rules (served by the Worker; see
+/// `worker/src/index.js` `GET /v1/site-rules`).
+const RULES_URL: &str = "https://releases.getwick.dev/v1/site-rules";
+/// Refresh the on-disk overlay at most once a day.
+const REFRESH_INTERVAL_SECS: u64 = 24 * 3600;
+
+static REFRESH_ONCE: std::sync::Once = std::sync::Once::new();
+
+/// Best-effort daily refresh of the on-disk overlay from the Worker — this is
+/// what lets the curated list "constantly evolve" without a reinstall.
+///
+/// Fire-and-forget: spawns a background thread (at most once per process) and
+/// never blocks or fails the caller. The refreshed file is consumed on the
+/// NEXT process start — `RULES` is a process-lifetime snapshot — which is the
+/// right cadence for a CLI (rules change slowly; a long-lived `wick serve`
+/// picks them up on restart). Honors the telemetry opt-out, since it's a
+/// network callback home.
+pub fn refresh_if_stale() {
+    REFRESH_ONCE.call_once(|| {
+        if crate::analytics::is_opted_out() {
+            return;
+        }
+        let path = overlay_path();
+        if !needs_refresh(&path) {
+            return;
+        }
+        std::thread::Builder::new()
+            .name("wick-rules-refresh".into())
+            .spawn(move || {
+                let _ = refresh_now(&path);
+            })
+            .ok();
+    });
+}
+
+fn needs_refresh(path: &Path) -> bool {
+    match std::fs::metadata(path).and_then(|m| m.modified()) {
+        // Refresh if the overlay is older than the interval, or if the clock
+        // is unreadable (treat as stale rather than never-refresh).
+        Ok(mtime) => mtime
+            .elapsed()
+            .map(|e| e.as_secs() > REFRESH_INTERVAL_SECS)
+            .unwrap_or(true),
+        // Missing / unreadable → first refresh.
+        Err(_) => true,
+    }
+}
+
+/// Fetch the published rules and atomically replace the overlay. Only writes
+/// when the body is a valid rules doc, so a transient 5xx / truncated
+/// response can't clobber a good overlay with garbage. Writing even an empty
+/// `{"rules":{}}` is fine — it just refreshes mtime (suppressing refetch for a
+/// day) and overlays nothing onto the bundled seed.
+fn refresh_now(path: &Path) -> std::io::Result<()> {
+    let io_err = |e| std::io::Error::new(std::io::ErrorKind::Other, e);
+    let client = reqwest::blocking::Client::builder()
+        .timeout(Duration::from_secs(5))
+        .build()
+        .map_err(io_err)?;
+    let resp = client.get(RULES_URL).send().map_err(io_err)?;
+    if !resp.status().is_success() {
+        return Ok(()); // leave the existing overlay / seed in place
+    }
+    let body = resp.text().map_err(io_err)?;
+    // Validate before overwriting — must parse as our schema.
+    if serde_json::from_str::<RuleFile>(&body).is_err() {
+        return Ok(());
+    }
+    if let Some(dir) = path.parent() {
+        std::fs::create_dir_all(dir)?;
+    }
+    // Atomic replace: write a temp file then rename, so a reader never sees a
+    // half-written overlay.
+    let tmp = path.with_extension("json.tmp");
+    std::fs::write(&tmp, body.as_bytes())?;
+    std::fs::rename(&tmp, path)?;
+    Ok(())
+}
+
+/// Look up the curated rule for a host: exact match, then a one-level parent
+/// (so `www.reuters.com` resolves a `reuters.com` rule). Uses the same
+/// `site_cache::parent_domain` walk as the local cache so the two layers
+/// scope hosts identically.
+pub fn get(host: &str) -> Option<SiteRule> {
+    if let Some(r) = RULES.get(host) {
+        return Some(r.clone());
+    }
+    crate::site_cache::parent_domain(host).and_then(|p| RULES.get(p).cloned())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn seed_parses_and_is_nonempty() {
+        // The bundled seed must always parse — a malformed edit to
+        // data/site-rules.json should fail CI here, not silently ship an
+        // empty rule set.
+        let seed = parse(SEED);
+        assert!(!seed.is_empty(), "bundled seed should contain rules");
+        assert!(seed.contains_key("reuters.com"));
+    }
+
+    #[test]
+    fn known_cef_seed_resolves_via_www_parent() {
+        // Telemetry reports `www.reuters.com`; the rule is keyed on the bare
+        // domain and must resolve through the one-level parent walk.
+        let r = get("www.reuters.com").expect("www.reuters.com → reuters.com");
+        assert_eq!(r.render, "cef");
+    }
+
+    #[test]
+    fn exact_subdomain_rule_does_not_overreach() {
+        // `finance.yahoo.com` is keyed precisely; a bare `yahoo.com` (or a
+        // sibling like `mail.yahoo.com`) must NOT inherit its cef rule.
+        assert!(get("finance.yahoo.com").is_some());
+        assert!(get("mail.yahoo.com").is_none());
+        assert!(get("yahoo.com").is_none());
+    }
+
+    #[test]
+    fn residential_flag_seeded_for_datacenter_blockers() {
+        assert!(get("apkmirror.com").map(|r| r.needs_residential).unwrap_or(false));
+        // www. parent walk carries the flag too.
+ assert!(get("www.apkmirror.com").map(|r| r.needs_residential).unwrap_or(false)); + } + + #[test] + fn unknown_host_has_no_rule() { + assert!(get("example.com").is_none()); + assert!(get("news.ycombinator.com").is_none()); + } +} diff --git a/worker/src/index.js b/worker/src/index.js index e021ecb..8423107 100644 --- a/worker/src/index.js +++ b/worker/src/index.js @@ -60,6 +60,21 @@ function isPrivateHost(hostname) { return false; } +/** + * Allowlist of transport-failure causes the client emits (see + * fetch::classify_transport_error). The /v1/events endpoint is + * unauthenticated, so we bound `error_kind_dist` to this fixed set rather + * than a loose regex — otherwise a buggy/malicious client could spray + * arbitrary [a-z_]{1,20} keys and inflate a day's KV value. + */ +const ERROR_KINDS = new Set([ + "offline", "dns", "timeout", "reset", "refused", + "unreachable", "quic", "connect", "other", +]); +function isErrorKind(k) { + return typeof k === "string" && ERROR_KINDS.has(k); +} + export default { async fetch(request, env) { const url = new URL(request.url); @@ -144,6 +159,7 @@ export default { // POST /v1/events with body: // { "host": "nytimes.com", "strategy": "cef", "escalated_from": null|"cronet", // "ok": true, "status": 200, "timing_ms": 1840, + // "error_kind": "reset"|"offline"|..., // only on transport failures // "version": "0.9.2", "os": "macos" } // // Storage model: one KV key per (date, host, strategy) with a merged @@ -186,7 +202,7 @@ export default { // manually edited in the KV dashboard) is treated as "start // fresh" so we never throw during increment or write back a // poisoned value. 
- const existing = { fetches: 0, successes: 0, total_ms: 0, status_dist: {} }; + const existing = { fetches: 0, successes: 0, total_ms: 0, status_dist: {}, error_kind_dist: {} }; const existingRaw = await env.SUBSCRIPTIONS.get(key); if (existingRaw) { try { @@ -211,6 +227,16 @@ export default { } } } + // error_kind_dist: {cause: count} for transport failures. Older + // KV values won't have it; ignore non-objects defensively. + if (parsed.error_kind_dist && typeof parsed.error_kind_dist === "object") { + for (const [kind, count] of Object.entries(parsed.error_kind_dist)) { + const n = Number(count); + if (Number.isFinite(n) && isErrorKind(kind)) { + existing.error_kind_dist[kind] = n; + } + } + } } } catch { /* corrupt JSON — start fresh */ } } @@ -233,6 +259,17 @@ export default { : "0"; existing.status_dist[statusBucket] = (existing.status_dist[statusBucket] || 0) + 1; + // Record the transport-failure cause when present (offline / dns / + // reset / refused / timeout / unreachable / quic / connect / other). + // Only transport-error events carry it; HTTP responses omit it, so + // this stays empty for them. This is what lets the stats page and the + // self-improvement harness exclude user-side "offline" failures from + // "this site is hard" — see analytics::report_transport_error. + if (isErrorKind(body.error_kind)) { + existing.error_kind_dist[body.error_kind] = + (existing.error_kind_dist[body.error_kind] || 0) + 1; + } + await env.SUBSCRIPTIONS.put(key, JSON.stringify(existing), { expirationTtl: 30 * 86400, }); @@ -250,7 +287,7 @@ export default { // v2 added per-row status_dist; bumping the key ensures the // first request after deploy rebuilds the cached payload in the // new shape instead of serving stale v1 JSON for up to 5 minutes. 
- const cacheKey = "stats:summary:v2"; + const cacheKey = "stats:summary:v3"; const cached = await env.SUBSCRIPTIONS.get(cacheKey); if (cached) { return new Response(cached, { @@ -307,7 +344,7 @@ export default { const strategy = rest.slice(lastColon + 1); const aggKey = `${host}|${strategy}`; const cur = agg.get(aggKey) || { - host, strategy, fetches: 0, successes: 0, total_ms: 0, status_dist: {}, + host, strategy, fetches: 0, successes: 0, total_ms: 0, status_dist: {}, error_kind_dist: {}, }; // Coerce each field via Number() and ignore non-finite // values — a stringly-typed stored value (`"1"`) would @@ -329,6 +366,16 @@ export default { } } } + // Merge transport-failure causes across days (same shape as + // status_dist). Pre-error_kind events simply don't contribute. + if (v.error_kind_dist && typeof v.error_kind_dist === "object") { + for (const [kind, count] of Object.entries(v.error_kind_dist)) { + const n = Number(count); + if (Number.isFinite(n) && isErrorKind(kind)) { + cur.error_kind_dist[kind] = (cur.error_kind_dist[kind] || 0) + n; + } + } + } agg.set(aggKey, cur); } cursor = list.list_complete ? undefined : list.cursor; @@ -348,6 +395,10 @@ export default { // of just a red bar. Pre-v2 events won't contribute here, so // historical rows may sum to less than `fetches`. status_dist: r.status_dist || {}, + // Per-cause counts for transport failures, so a row reading 0% + // success can be split into "offline×20" (user's network — ignore) + // vs "reset×20" (the site is actively blocking — a real signal). + error_kind_dist: r.error_kind_dist || {}, })) .sort((a, b) => b.fetches - a.fetches) .slice(0, 500); @@ -369,6 +420,75 @@ export default { }); } + // ── Curated site-rules (the evolving "known behaviors" list) ───────── + // + // GET /v1/site-rules → public, cached 1h. Clients refresh this + // daily into /site-rules.json, + // which overlays the binary's bundled seed + // (see rust/src/site_rules.rs). 
This is how + // the list "constantly evolves" without a + // reinstall. + // POST /v1/site-rules/:key → API-key gated. The self-improvement loop + // publishes a merged rules doc here (seed ∪ + // measured ∪ curated). Body is the full + // { version, rules: { host: {...} } } doc. + if (request.method === "GET" && path === "/v1/site-rules") { + const stored = await env.SUBSCRIPTIONS.get("site-rules:published"); + const body = stored || JSON.stringify({ version: 1, rules: {} }); + return new Response(body, { + headers: { + ...headers, + "Content-Type": "application/json", + "Cache-Control": "public, max-age=3600", + }, + }); + } + + const rulesPubMatch = path.match(/^\/v1\/site-rules\/([^/]+)$/); + if (request.method === "POST" && rulesPubMatch) { + const pubKey = rulesPubMatch[1]; + let keys; + try { keys = JSON.parse(env.API_KEYS || "{}"); } catch { + return new Response("Server error\n", { status: 500, headers }); + } + if (!keys[pubKey] || !keys[pubKey].active) { + return new Response("Invalid API key\n", { status: 403, headers }); + } + let doc; + try { doc = await request.json(); } catch { + return new Response("bad json\n", { status: 400, headers }); + } + if (!doc || typeof doc !== "object" || typeof doc.rules !== "object" || doc.rules === null) { + return new Response("expected { rules: { host: {...} } }\n", { status: 400, headers }); + } + const hostCount = Object.keys(doc.rules).length; + if (hostCount > 5000) { + return new Response("too many rules (max 5000)\n", { status: 400, headers }); + } + // Store verbatim (clients tolerate unknown fields; an unrecognized + // `render` is treated as "no opinion" client-side). Stamp a server + // receive time so clients/operators can see staleness. 
+ const toStore = JSON.stringify({ + version: Number(doc.version) || 1, + published_at: new Date().toISOString(), + rules: doc.rules, + }); + if (toStore.length > 1_000_000) { + return new Response("payload too large (max 1MB)\n", { status: 413, headers }); + } + await env.SUBSCRIPTIONS.put("site-rules:published", toStore); + console.log(JSON.stringify({ + event: "site_rules_publish", + customer: keys[pubKey].customer, + hosts: hostCount, + timestamp: new Date().toISOString(), + })); + return new Response(JSON.stringify({ ok: true, hosts: hostCount }), { + status: 200, + headers: { ...headers, "Content-Type": "application/json" }, + }); + } + // Analytics dashboard — simple KV-based metrics // GET /analytics/:key (requires API key) if (path.match(/^\/analytics\/([^/]+)$/)) { From 64dde95a0e0384e89840a63531bf0b45b70598ac Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 26 Jun 2026 15:24:59 -0600 Subject: [PATCH 2/8] bench: residential self-improvement probe harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the loop from the public stats page back into routing. probe.sh reads /v1/stats/summary, selects genuinely site-side failing hosts (dropping error_kind=offline so user disconnects aren't chased), and probes a cronet | cronet+residential | cef matrix per host via `wick fetch --json`, deriving render + needs_residential. publish-rules.sh merges the measured verdicts with the bundled seed (measured wins, so a measurement CORRECTS an over-aggressive seed) and POSTs to /v1/site-rules. Fixes proxy-providers.sh: oxylabs is HTTP CONNECT on :7777 (:443-only), not SOCKS5 — the old socks5:// URL failed. See bench/PROBE.md for the pipeline, scheduling, and methodology caveats (notably: run from a datacenter VM to detect needs_residential faithfully; --proxy routes Cronet, not CEF). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- bench/PROBE.md | 77 ++++++++++++++ bench/probe.sh | 223 +++++++++++++++++++++++++++++++++++++++ bench/proxy-providers.sh | 5 +- bench/publish-rules.sh | 70 ++++++++++++ 4 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 bench/PROBE.md create mode 100755 bench/probe.sh create mode 100755 bench/publish-rules.sh diff --git a/bench/PROBE.md b/bench/PROBE.md new file mode 100644 index 0000000..3af9467 --- /dev/null +++ b/bench/PROBE.md @@ -0,0 +1,77 @@ +# Wick self-improvement probe harness + +Closes the loop from the public stats page back into Wick's routing: read which +sites Wick is **failing on**, empirically test access methods through a +residential proxy, and publish measured per-site rules that every client picks +up — so the curated "known behaviors" list (`rust/data/site-rules.json` + +`GET /v1/site-rules`) constantly evolves instead of relying on hand-seeds. + +``` + /v1/stats/summary ──→ probe.sh ──→ site-rules.measured.json ──→ publish-rules.sh ──→ POST /v1/site-rules + (failing sites) (matrix) (measured verdicts) (merge w/ seed) (clients refresh) +``` + +## The pipeline + +| stage | script | what it does | +|---|---|---| +| select | `probe.sh` step 1 | Pull `/v1/stats/summary`, aggregate per host, keep **site-side** failing hosts. Drops hosts whose failures are mostly `error_kind="offline"` (the user's own network) — so we never chase phantom "this site is hard" signals. | +| probe | `probe.sh` step 2–3 | Per host, run a matrix via `wick fetch --json`: `cronet` \| `cronet+residential` \| `cef`. Derive `render` (cef only if it beats a cronet failure) and `needs_residential` (residential beats a cronet failure). | +| emit | `probe.sh` step 4 | Write `~/.wick/probe/site-rules.measured.json` — a measured verdict for every host where *some* strategy worked (incl. `render:cronet`, so a measurement can **correct** an over-aggressive seed). 
Key is the host with a leading `www.` stripped, matching the seed convention. | +| publish | `publish-rules.sh` | Merge seed ∪ measured (**measured wins per host**) and `POST /v1/site-rules/:key`. | +| consume | client | `wick` refreshes `GET /v1/site-rules` into `~/.wick/site-rules.json` daily; that overlay overrides the bundled seed (`site_rules.rs`). | + +## Running it + +```bash +# 1. residential creds from Vault (prod tailnet + GCP ADC required) +source <residential-proxy-skill>/scripts/residential-proxy-env.sh # OXY_USER/OXY_PASS, ... + +# 2. ALWAYS probe availability first — it's time-varying +bash <residential-proxy-skill>/scripts/residential-probe.sh US + +# 3. sweep (oxylabs is the reliable US provider; HTTP CONNECT :443-only) +bash bench/probe.sh --provider=oxylabs --country=us --max-hosts=15 +# → ~/.wick/probe/probe-<timestamp>.jsonl (per-host trace) +# → ~/.wick/probe/site-rules.measured.json + +# 4. publish (needs a Worker API key) +WICK_PUBLISH_KEY=<key> bash bench/publish-rules.sh # or --dry-run to preview + +# candidate selection alone (no creds): bash bench/probe.sh --dry-run +``` + +## Scheduling + +Rules change slowly and residential probing has cost, so **weekly** is plenty. +Cron (sources creds at runtime — never bake Vault creds into the job): + +``` +# Sundays 04:00 — sweep the current failing set and republish +0 4 * * 0 source $HOME/.../scripts/residential-proxy-env.sh && \ + bash /abs/path/wick/bench/probe.sh --provider=oxylabs --country=us --max-hosts=25 && \ + WICK_PUBLISH_KEY=$WICK_PUBLISH_KEY bash /abs/path/wick/bench/publish-rules.sh +``` + +## Methodology caveats (read before trusting a single sweep) + +- **The `cronet` baseline cell uses the operator's own IP.** If that IP is clean + (residential / office), `cronet` succeeds and we conclude `needs_residential:false` + — even though the *datacenter*-hosted clients that generate much of the failing + telemetry would need residential.
To detect `needs_residential` faithfully, run + the harness **from a datacenter VM** so the baseline matches the failing + population. (First live sweep, 2026-06-26, ran from a clean US vantage and found + reuters/cfr/tradingview/apkmirror/apkcombo all work on plain Cronet — i.e. the + telemetry failures were vantage-specific or user-side noise, and the hand-seeds + for those hosts were over-aggressive. The loop corrected them to `cronet`.) +- **`--proxy` (SOCKS/HTTP) routes only Cronet, not CEF.** CEF's residential path is + a WireGuard `LD_PRELOAD` (`bindwg.so`) that exists only on tunneled Linux servers, + so the `cef+residential` combination is **not** tested here. `render:cef` and + `needs_residential` are derived as independent signals; a site needing *both* + (e.g. apkpure — DataDome, failed every testable cell) is left to its seed / PR4's + agent. +- **Single residential IP per session, single country.** A site reachable from a + different country/ISP won't show it. Sweep multiple `--country` values for + geo-sensitive targets. +- A `200` under `MIN_OK_BYTES` (default 1000) is treated as a block/challenge shell, + not success (matches `fetch.rs`'s `is_acceptable_render`). diff --git a/bench/probe.sh b/bench/probe.sh new file mode 100755 index 0000000..66d6cef --- /dev/null +++ b/bench/probe.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +# Wick self-improvement probe harness (PR2). +# +# Closes the loop: read which sites Wick is FAILING on (from the public stats +# endpoint), then empirically test access methods against each through a +# residential proxy, and emit measured per-site rules in the site-rules.json +# schema that fetch.rs consumes. +# +# stats → candidates → strategy matrix → winner → site-rules.measured.json +# +# Pipeline: +# 1. 
Pull releases.getwick.dev/v1/stats/summary, aggregate per host, and +# select genuinely SITE-SIDE failing hosts — explicitly dropping hosts +# whose failures are mostly error_kind="offline" (the user's own network), +# so we never chase phantom "this site is hard" signals. +# 2. For each candidate, run a strategy matrix via `wick fetch --json`: +# - cronet (--render cronet) [direct] +# - cronet+residential(--render cronet --proxy ) [datacenter-block test] +# - cef (--render cef) [JS / bot-managed test] +# (cef+residential is NOT tested here: --proxy routes only the Cronet/ +# reqwest engine, not CEF, whose residential path is a WireGuard preload +# on tunneled servers. The rule still combines render:cef + needs_residential +# when both independent signals fire; PR4's agent refines.) +# 3. Decide the winner and derive the rule: +# render = "cef" if cef succeeds AND cronet-direct fails +# needs_residential = true if cronet+residential succeeds AND cronet-direct fails +# 4. Emit measured rules (source:"measured", with sample count + date) and a +# per-host JSONL trace. +# +# Residential creds come from the env (same convention as run.sh / +# proxy-providers.sh). Source the residential-proxy skill's env first: +# source /scripts/residential-proxy-env.sh # exports OXY_USER, ... +# bash bench/probe.sh --provider=oxylabs --country=us +# +# Safe under cron/launchd: serial, per-request timeout, polite sleep. + +set -u # NOT -e: a single failed probe must never kill the sweep. + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROXY_BUILDER="$REPO_DIR/proxy-providers.sh" +STATS_URL="${WICK_STATS_URL:-https://releases.getwick.dev/v1/stats/summary}" + +OUT_DIR="${WICK_PROBE_OUT_DIR:-$HOME/.wick/probe}" +TS="$(date -u +'%Y-%m-%dT%H:%M:%SZ')" +DAY="$(date -u +'%Y-%m-%d')" +RESULTS="$OUT_DIR/probe-$TS.jsonl" +RULES_OUT="$OUT_DIR/site-rules.measured.json" + +# Tunables. 
+PROVIDER="${WICK_PROBE_PROVIDER:-}" +COUNTRY="${WICK_PROBE_COUNTRY:-us}" +MAX_HOSTS="${WICK_PROBE_MAX_HOSTS:-25}" +MIN_FETCHES="${WICK_PROBE_MIN_FETCHES:-4}" # ignore low-volume noise +MAX_SUCCESS_RATE="${WICK_PROBE_MAX_SR:-0.5}" # candidate if site-side SR below this +MIN_OK_BYTES="${WICK_PROBE_MIN_BYTES:-1000}" # a 200 with < this is treated as a block/shell +PER_REQUEST_TIMEOUT="${WICK_PROBE_TIMEOUT:-40}" +SLEEP_BETWEEN="${WICK_PROBE_SLEEP:-2}" +DRY_RUN=0 + +for arg in "$@"; do + case $arg in + --provider=*) PROVIDER="${arg#*=}" ;; + --country=*) COUNTRY="${arg#*=}" ;; + --max-hosts=*) MAX_HOSTS="${arg#*=}" ;; + --dry-run) DRY_RUN=1 ;; + *) echo "WARN: unknown arg ignored: $arg" >&2 ;; + esac +done + +command -v jq >/dev/null || { echo "ERROR: jq required" >&2; exit 1; } +command -v curl >/dev/null || { echo "ERROR: curl required" >&2; exit 1; } +WICK_BIN="${WICK_BIN:-$(command -v wick)}" +if [[ -z "$WICK_BIN" && "$DRY_RUN" -eq 0 ]]; then + echo "ERROR: wick not found on PATH (set WICK_BIN), or pass --dry-run" >&2 + exit 1 +fi +mkdir -p "$OUT_DIR" + +# ── Step 1: candidate selection ───────────────────────────────────────────── +# Aggregate the per-(host,strategy) rows into per-host totals. A host is a +# candidate when it has real volume, a low overall success rate, AND its +# failures are predominantly site-side (offline fraction < 0.5). error_kind_dist +# may be absent until the worker is deployed + clients ship; treat missing as +# offline=0 (so we don't accidentally exclude everything in the meantime). +echo "[$TS] fetching stats: $STATS_URL" >&2 +STATS_JSON="" +for attempt in 1 2 3 4; do + # --retry handles curl's own transient transport errors; the outer loop + # also retries an empty/non-JSON body (a transient edge blip we've seen). 
+ if STATS_JSON="$(curl -s --max-time 30 --retry 2 "$STATS_URL")" \ + && printf '%s' "$STATS_JSON" | jq -e '.rows' >/dev/null 2>&1; then + break + fi + echo " stats fetch attempt $attempt failed; retrying in 3s…" >&2 + STATS_JSON="" + sleep 3 +done +[[ -n "$STATS_JSON" ]] || { echo "ERROR: stats fetch failed after retries" >&2; exit 1; } + +CANDIDATES="$(printf '%s' "$STATS_JSON" | jq -r --argjson minf "$MIN_FETCHES" --argjson maxsr "$MAX_SUCCESS_RATE" ' + [ .rows[] + | { host, fetches, successes, + offline: ((.error_kind_dist // {}).offline // 0) } + ] + | group_by(.host) + | map({ + host: .[0].host, + fetches: (map(.fetches) | add), + successes: (map(.successes) | add), + offline: (map(.offline) | add), + }) + | map(. + { + failures: (.fetches - .successes), + sr: (if .fetches > 0 then (.successes / .fetches) else 1 end), + }) + # real volume, low success, and failures that are mostly NOT user-offline + | map(select(.fetches >= $minf and .sr < $maxsr + and (.failures <= 0 or (.offline / .failures) < 0.5))) + | sort_by(.sr, (-.fetches)) + | .[].host +')" + +mapfile -t HOSTS < <(printf '%s\n' "$CANDIDATES" | grep -v '^\s*$' | head -n "$MAX_HOSTS") + +echo "[$TS] ${#HOSTS[@]} site-side failing candidate host(s) (max=$MAX_HOSTS):" >&2 +printf ' %s\n' "${HOSTS[@]}" >&2 + +if [[ "$DRY_RUN" -eq 1 ]]; then + echo "[$TS] --dry-run: stopping before probing. Matrix per host would be: cronet | cronet+residential | cef" >&2 + exit 0 +fi + +# Residential proxy is required to test the needs_residential signal. +if [[ -z "$PROVIDER" ]]; then + echo "WARN: no --provider set; testing cronet-direct and cef-direct only (cannot derive needs_residential)." >&2 +fi + +# Build a fresh residential proxy URL (new session → new exit IP) per call; the scheme (http:// or socks5://) is chosen per provider by proxy-providers.sh. +build_proxy() { + [[ -z "$PROVIDER" ]] && return 1 + "$PROXY_BUILDER" --provider="$PROVIDER" --country="$COUNTRY" 2>>"$RESULTS.err" +} + +# Run one matrix cell. Echoes "ok <status> <bytes>", "fail-block <status> <bytes>" (non-200 or undersized body), or "fail <exit-code>" (wick itself failed or timed out).
+probe_cell() {
+  local url="$1" render="$2" proxy="$3"   # proxy may be "" for a direct fetch
+  local args=(fetch --json --no-robots --render "$render")
+  [[ -n "$proxy" ]] && args+=(--proxy "$proxy")
+  args+=("$url")
+  local out rc=0
+  # `if ! cmd` leaves $? as the NEGATED status (always 0 in the failure branch),
+  # so take it from `||` instead — keeps e.g. timeout's 124 visible in the trace.
+  out="$(WICK_AUTO_INSTALL_CEF=1 timeout "$PER_REQUEST_TIMEOUT" "$WICK_BIN" "${args[@]}" 2>/dev/null)" || rc=$?
+  if [[ "$rc" -ne 0 ]]; then echo "fail $rc"; return; fi
+  local status bytes   # parsed from the single --json line; default to 0 when absent
+  status="$(printf '%s' "$out" | jq -r '.status_code // 0' 2>/dev/null)"
+  bytes="$(printf '%s' "$out" | jq -r '.bytes // 0' 2>/dev/null)"
+  if [[ "$status" == "200" && "${bytes:-0}" -ge "$MIN_OK_BYTES" ]]; then
+    echo "ok $status $bytes"
+  else
+    echo "fail-block ${status:-0} ${bytes:-0}"
+  fi
+}
+
+# ── Step 2 + 3: matrix + decision ───────────────────────────────────────────
+: > "$RESULTS"
+for host in "${HOSTS[@]}"; do
+  url="https://$host/"
+  cronet="$(probe_cell "$url" cronet "")"; sleep "$SLEEP_BETWEEN"
+  cef="$(probe_cell "$url" cef "")"; sleep "$SLEEP_BETWEEN"
+  cronet_res="n/a"
+  if [[ -n "$PROVIDER" ]]; then
+    if px="$(build_proxy)"; then
+      cronet_res="$(probe_cell "$url" cronet "$px")"; sleep "$SLEEP_BETWEEN"
+    fi
+  fi
+
+  cronet_ok=0; [[ "$cronet" == ok* ]] && cronet_ok=1
+  cef_ok=0; [[ "$cef" == ok* ]] && cef_ok=1
+  cronet_res_ok=0; [[ "$cronet_res" == ok* ]] && cronet_res_ok=1
+
+  # render: cef only when cef rescues a cronet-direct failure.
+  render="cronet"
+  [[ "$cronet_ok" -eq 0 && "$cef_ok" -eq 1 ]] && render="cef"
+  # needs_residential: residential rescues a cronet-direct failure.
+ needs_res="false" + [[ "$cronet_ok" -eq 0 && "$cronet_res_ok" -eq 1 ]] && needs_res="true" + + jq -nc \ + --arg host "$host" --arg render "$render" --argjson needs_res "$needs_res" \ + --arg cronet "$cronet" --arg cef "$cef" --arg cronet_res "$cronet_res" \ + --arg ts "$TS" \ + '{host:$host, render:$render, needs_residential:$needs_res, + cells:{cronet:$cronet, cef:$cef, cronet_residential:$cronet_res}, probed_at:$ts}' \ + | tee -a "$RESULTS" >&2 +done + +# ── Step 4: emit measured rules ───────────────────────────────────────────── +# Emit the measured verdict for every host where SOME strategy worked — including +# render:cronet. That's deliberate: a measurement of "cronet works here" must be +# able to CORRECT an over-aggressive hand-seed (the published overlay overrides +# the bundled seed per host). A host where every cell failed (e.g. apkpure, hard +# even via residential) emits nothing, so its seed stays until we learn a method +# that works. confidence is modest for a single sweep; repeated sweeps / PR4's +# agent raise it. +jq -s --arg day "$DAY" ' + { version: 1, updated_at: $day, + note: "measured by bench/probe.sh", + rules: ( + [ .[] + | select(.cells | to_entries | any(.value | startswith("ok"))) + # Key on the bare host (strip leading www.) to match the seed + # convention plus the client parent-domain walk, so a measurement + # OVERRIDES a same-host seed instead of sitting beside it. 
+ | { key: (.host | sub("^www\\."; "")), + value: { render: .render, needs_residential: .needs_residential, + vendor: "measured", confidence: 0.7, source: "measured", + updated_at: $day } } + ] | from_entries) + }' "$RESULTS" > "$RULES_OUT" + +echo "[$TS] wrote $(jq '.rules | length' "$RULES_OUT") measured rule(s) → $RULES_OUT" >&2 +echo "[$TS] per-host trace → $RESULTS" >&2 diff --git a/bench/proxy-providers.sh b/bench/proxy-providers.sh index cd8e78c..0bd8026 100755 --- a/bench/proxy-providers.sh +++ b/bench/proxy-providers.sh @@ -135,9 +135,12 @@ country_name_for() { case "$PROVIDER" in oxylabs) + # Oxylabs residential is an HTTP CONNECT proxy on :7777 and is + # :443-only — SOCKS5 (and non-443 dest ports) return 403/errors. All + # Wick fetch targets are https, so CONNECT-to-443 is exactly right. require OXY_USER OXY_PASS login="customer-${OXY_USER}-cc-${CC}-sessid-$(session_id 10)-sesstime-10" - echo "socks5://${login}:${OXY_PASS}@pr.oxylabs.io:7777" + echo "http://${login}:${OXY_PASS}@pr.oxylabs.io:7777" ;; brightdata) # BD's SOCKS5 endpoint runs on a different port than HTTP (33335). diff --git a/bench/publish-rules.sh b/bench/publish-rules.sh new file mode 100755 index 0000000..467b6c1 --- /dev/null +++ b/bench/publish-rules.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Publish merged site-rules to the Worker, closing the self-improvement loop: +# +# probe.sh → site-rules.measured.json ─┐ +# bundled seed (rust/data) ────────────┴─(merge)→ POST /v1/site-rules/:key +# → clients refresh daily +# +# Merge policy: measured rules WIN per host over the seed, so a measurement can +# *correct* an over-aggressive hand-seed (e.g. the probe finding that a site +# the seed flagged render:cef actually works on cronet). The seed supplies the +# long tail the harness hasn't probed yet. +# +# Auth: set WICK_PUBLISH_KEY to a Worker API key (the loop's identity). Use +# --dry-run to print the merged doc without publishing. 
+ +set -u + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SEED="${WICK_SEED_RULES:-$REPO_DIR/../rust/data/site-rules.json}" +MEASURED="${WICK_MEASURED_RULES:-${WICK_PROBE_OUT_DIR:-$HOME/.wick/probe}/site-rules.measured.json}" +URL="${WICK_RULES_PUBLISH_URL:-https://releases.getwick.dev/v1/site-rules}" + +DRY_RUN=0 +for arg in "$@"; do + case $arg in + --dry-run) DRY_RUN=1 ;; + *) echo "WARN: unknown arg ignored: $arg" >&2 ;; + esac +done + +command -v jq >/dev/null || { echo "ERROR: jq required" >&2; exit 1; } +command -v curl >/dev/null || { echo "ERROR: curl required" >&2; exit 1; } +[[ -f "$SEED" ]] || { echo "ERROR: seed not found: $SEED" >&2; exit 1; } + +# Measured file is optional — first run, or a sweep that learned nothing new. +measured_rules='{}' +if [[ -f "$MEASURED" ]]; then + measured_rules="$(jq -c '.rules // {}' "$MEASURED" 2>/dev/null || echo '{}')" +fi + +# Merge: seed first, measured second (so measured wins on key collision). +MERGED="$(jq -nc \ + --slurpfile seed "$SEED" \ + --argjson measured "$measured_rules" \ + '{ version: 1, + rules: (($seed[0].rules // {}) + $measured) }')" + +n_seed="$(jq '.rules | length' "$SEED")" +n_measured="$(printf '%s' "$measured_rules" | jq 'length')" +n_merged="$(printf '%s' "$MERGED" | jq '.rules | length')" +echo "merge: seed=$n_seed + measured=$n_measured → $n_merged host(s)" >&2 + +if [[ "$DRY_RUN" -eq 1 ]]; then + printf '%s\n' "$MERGED" | jq . 
+ echo "(--dry-run: not published)" >&2 + exit 0 +fi + +: "${WICK_PUBLISH_KEY:?set WICK_PUBLISH_KEY to a Worker API key (or pass --dry-run)}" + +resp="$(curl -s -w '\n%{http_code}' -X POST "$URL/$WICK_PUBLISH_KEY" \ + -H 'content-type: application/json' --data-binary "$MERGED")" +code="$(printf '%s' "$resp" | tail -n1)" +body="$(printf '%s' "$resp" | sed '$d')" +if [[ "$code" == "200" ]]; then + echo "published: $body" >&2 +else + echo "ERROR: publish failed (HTTP $code): $body" >&2 + exit 1 +fi From 339950dae5c0596f2c78b286d798709bd9ff9295 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 26 Jun 2026 15:34:01 -0600 Subject: [PATCH 3/8] review: address Copilot feedback on #9 - worker: only count error_kind on transport failures (ok !== true), so a client can't skew the offline fraction by attaching it to OK events - fetch/main: bridge the --proxy CLI arg into WICK_PROXY so connectivity_ok probes through the configured proxy (a proxied-only host was misclassified "offline") - site_rules: Windows-safe overlay replace (rename won't overwrite on Windows) - cronet: fix stale doc reference (classify_transport_error / candidate_cause) - bench/probe.sh: scheme-agnostic proxy wording (oxylabs is HTTP CONNECT, not SOCKS5) Co-Authored-By: Claude Opus 4.8 (1M context) --- bench/probe.sh | 5 +++-- rust/src/cronet/mod.rs | 4 ++-- rust/src/main.rs | 9 +++++++++ rust/src/site_rules.rs | 8 +++++++- worker/src/index.js | 5 ++++- 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/bench/probe.sh b/bench/probe.sh index 66d6cef..4989dbc 100755 --- a/bench/probe.sh +++ b/bench/probe.sh @@ -15,7 +15,7 @@ # so we never chase phantom "this site is hard" signals. # 2. 
For each candidate, run a strategy matrix via `wick fetch --json`: # - cronet (--render cronet) [direct] -# - cronet+residential(--render cronet --proxy ) [datacenter-block test] +# - cronet+residential(--render cronet --proxy ) [datacenter-block test] # - cef (--render cef) [JS / bot-managed test] # (cef+residential is NOT tested here: --proxy routes only the Cronet/ # reqwest engine, not CEF, whose residential path is a WireGuard preload @@ -135,7 +135,8 @@ if [[ -z "$PROVIDER" ]]; then echo "WARN: no --provider set; testing cronet-direct and cef-direct only (cannot derive needs_residential)." >&2 fi -# Build a fresh residential SOCKS URL (new session → new exit IP) per call. +# Build a fresh residential proxy URL (new session → new exit IP) per call. +# Scheme is provider-specific (oxylabs = HTTP CONNECT, others = SOCKS5). build_proxy() { [[ -z "$PROVIDER" ]] && return 1 "$PROXY_BUILDER" --provider="$PROVIDER" --country="$COUNTRY" 2>>"$RESULTS.err" diff --git a/rust/src/cronet/mod.rs b/rust/src/cronet/mod.rs index 6dd728f..5c1c69a 100644 --- a/rust/src/cronet/mod.rs +++ b/rust/src/cronet/mod.rs @@ -307,8 +307,8 @@ unsafe extern "C" fn on_failed( } /// Map a `Cronet_Error_ERROR_CODE` (include/cronet.idl_c.h) to a stable name -/// embedded in the error message. `fetch::error_kind_from_message` matches on -/// these (lowercased) to classify the failure cause. +/// embedded in the error message. `fetch::classify_transport_error` (via +/// `candidate_cause`) matches on these (lowercased) to classify the cause. 
fn net_error_name(code: std::os::raw::c_int) -> &'static str { match code { 0 => "ERROR_CALLBACK", diff --git a/rust/src/main.rs b/rust/src/main.rs index eaec53f..cc4170e 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -174,6 +174,15 @@ async fn main() -> Result<()> { let cli = Cli::parse(); let proxy = cli.proxy.as_deref(); + // Canonicalize the resolved proxy into WICK_PROXY so the connectivity + // probe (fetch::connectivity_ok) routes the same way real fetches do — + // clap fills this field from a `--proxy` CLI arg without exporting the + // env var, so a proxied-only host would otherwise probe direct and + // misclassify a site failure as "offline". Safe here: main start, no + // threads spawned yet. + if let Some(p) = proxy { + std::env::set_var("WICK_PROXY", p); + } match cli.command { Command::Serve { mcp: true, .. } => { diff --git a/rust/src/site_rules.rs b/rust/src/site_rules.rs index 7a423b8..cc66ae2 100644 --- a/rust/src/site_rules.rs +++ b/rust/src/site_rules.rs @@ -176,7 +176,13 @@ fn refresh_now(path: &Path) -> std::io::Result<()> { // half-written overlay. let tmp = path.with_extension("json.tmp"); std::fs::write(&tmp, body.as_bytes())?; - std::fs::rename(&tmp, path)?; + // Unix `rename` atomically replaces the destination; Windows `rename` + // errors if it already exists, which would silently stop refresh once the + // overlay was written. Fall back to a direct overwrite there. + if std::fs::rename(&tmp, path).is_err() { + std::fs::write(path, body.as_bytes())?; + let _ = std::fs::remove_file(&tmp); + } Ok(()) } diff --git a/worker/src/index.js b/worker/src/index.js index 8423107..a7da397 100644 --- a/worker/src/index.js +++ b/worker/src/index.js @@ -265,7 +265,10 @@ export default { // this stays empty for them. This is what lets the stats page and the // self-improvement harness exclude user-side "offline" failures from // "this site is hard" — see analytics::report_transport_error. 
- if (isErrorKind(body.error_kind)) { + // Only a transport FAILURE carries a cause — never an OK response. The + // endpoint is unauthenticated, so reject error_kind on ok events so a + // client can't skew the offline fraction by attaching it to successes. + if (body.ok !== true && isErrorKind(body.error_kind)) { existing.error_kind_dist[body.error_kind] = (existing.error_kind_dist[body.error_kind] || 0) + 1; } From 81facd9e79f108885ebc5dfce84e46b0a80db269 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 26 Jun 2026 15:42:26 -0600 Subject: [PATCH 4/8] review: address Copilot round-2 feedback on #9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fetch --json: rename `bytes` → `content_bytes` and document it as the extracted-content size (a challenge/JS shell extracts to near nothing, so a small value flags a block) — not bytes-on-wire. Updated bench/probe.sh. - worker: gate error_kind on statusBucket==="0" as well — a cause means "no HTTP response at all", so an HTTP error (e.g. 403) must not carry one. - worker: reject arrays in doc.rules validation (typeof [] === "object" would otherwise let an array through as a rules map). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- bench/probe.sh | 6 ++++-- rust/src/main.rs | 13 +++++++++---- worker/src/index.js | 14 +++++++++----- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/bench/probe.sh b/bench/probe.sh index 4989dbc..eeb7778 100755 --- a/bench/probe.sh +++ b/bench/probe.sh @@ -52,7 +52,7 @@ COUNTRY="${WICK_PROBE_COUNTRY:-us}" MAX_HOSTS="${WICK_PROBE_MAX_HOSTS:-25}" MIN_FETCHES="${WICK_PROBE_MIN_FETCHES:-4}" # ignore low-volume noise MAX_SUCCESS_RATE="${WICK_PROBE_MAX_SR:-0.5}" # candidate if site-side SR below this -MIN_OK_BYTES="${WICK_PROBE_MIN_BYTES:-1000}" # a 200 with < this is treated as a block/shell +MIN_OK_BYTES="${WICK_PROBE_MIN_BYTES:-1000}" # a 200 with < this many bytes of extracted content = block/shell PER_REQUEST_TIMEOUT="${WICK_PROBE_TIMEOUT:-40}" SLEEP_BETWEEN="${WICK_PROBE_SLEEP:-2}" DRY_RUN=0 @@ -155,7 +155,9 @@ probe_cell() { fi local status bytes status="$(printf '%s' "$out" | jq -r '.status_code // 0' 2>/dev/null)" - bytes="$(printf '%s' "$out" | jq -r '.bytes // 0' 2>/dev/null)" + # content_bytes = extracted-content size; a challenge/JS shell extracts to + # near nothing, so a small value below means a block (not bytes-on-wire). + bytes="$(printf '%s' "$out" | jq -r '.content_bytes // 0' 2>/dev/null)" if [[ "$status" == "200" && "${bytes:-0}" -ge "$MIN_OK_BYTES" ]]; then echo "ok $status $bytes" else diff --git a/rust/src/main.rs b/rust/src/main.rs index cc4170e..41e2eb0 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -74,9 +74,10 @@ enum Command { #[arg(long)] no_robots: bool, /// Emit one structured JSON line ({url, status_code, timing_ms, - /// bytes, title}) instead of page content. Used by the - /// self-improvement probe harness to judge per-strategy success - /// (forced --render + --proxy) deterministically. + /// content_bytes, title}) instead of page content. content_bytes is + /// the extracted-content size (a block/challenge shell extracts to + /// near nothing). 
Used by the self-improvement probe harness to judge + /// per-strategy success (forced --render + --proxy) deterministically. #[arg(long)] json: bool, }, @@ -269,7 +270,11 @@ async fn main() -> Result<()> { "url": result.url, "status_code": result.status_code, "timing_ms": result.timing_ms, - "bytes": result.content.len(), + // Size of the EXTRACTED content (default markdown), not the + // raw HTML — this is the harness's "did we get usable + // content" signal: a challenge/JS shell extracts to near + // nothing, so a small content_bytes flags a block. + "content_bytes": result.content.len(), "title": result.title, }); println!("{}", out); diff --git a/worker/src/index.js b/worker/src/index.js index a7da397..df52379 100644 --- a/worker/src/index.js +++ b/worker/src/index.js @@ -265,10 +265,11 @@ export default { // this stays empty for them. This is what lets the stats page and the // self-improvement harness exclude user-side "offline" failures from // "this site is hard" — see analytics::report_transport_error. - // Only a transport FAILURE carries a cause — never an OK response. The - // endpoint is unauthenticated, so reject error_kind on ok events so a - // client can't skew the offline fraction by attaching it to successes. - if (body.ok !== true && isErrorKind(body.error_kind)) { + // error_kind means "no HTTP response at all" (transport failure), so it + // only counts when the event is not ok AND has no status (statusBucket + // "0"). The endpoint is unauthenticated; this stops a client from + // attaching a cause to an HTTP error (e.g. 403) and skewing the stats. 
+ if (body.ok !== true && statusBucket === "0" && isErrorKind(body.error_kind)) { existing.error_kind_dist[body.error_kind] = (existing.error_kind_dist[body.error_kind] || 0) + 1; } @@ -461,7 +462,10 @@ export default { try { doc = await request.json(); } catch { return new Response("bad json\n", { status: 400, headers }); } - if (!doc || typeof doc !== "object" || typeof doc.rules !== "object" || doc.rules === null) { + if (!doc || typeof doc !== "object" || typeof doc.rules !== "object" + || doc.rules === null || Array.isArray(doc.rules)) { + // typeof [] === "object", so arrays must be rejected explicitly — + // rules must be a host→rule map, not a list. return new Response("expected { rules: { host: {...} } }\n", { status: 400, headers }); } const hostCount = Object.keys(doc.rules).length; From e4293212735986d9d4c7f803bf2fd680dd1a26e0 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 26 Jun 2026 15:48:48 -0600 Subject: [PATCH 5/8] review: default SiteRule.render so a partial overlay entry can't disable all rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A published overlay entry missing `render` (manual edit, residential-only rule, partial doc) made serde_json fail the whole-file parse, silently dropping EVERY overlay rule back to the seed. `render` is now #[serde(default)] — an empty value is "no opinion" (same as no rule) per should_use_cef_first. Adds a test. Addresses Copilot round-3 feedback on #9. Co-Authored-By: Claude Opus 4.8 (1M context) --- rust/src/site_rules.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/rust/src/site_rules.rs b/rust/src/site_rules.rs index cc66ae2..853acab 100644 --- a/rust/src/site_rules.rs +++ b/rust/src/site_rules.rs @@ -35,8 +35,12 @@ const SEED: &str = include_str!("../data/site-rules.json"); #[derive(Debug, Clone, Deserialize)] pub struct SiteRule { - /// What transport is known to work: `"cef"` or `"cronet"`. 
Unknown - /// values are treated as "no opinion" by the consumer (`fetch.rs`). + /// What transport is known to work: `"cef"` or `"cronet"`. Defaulted so a + /// single overlay entry missing `render` (manual edit, a residential-only + /// rule, a partial doc) can't fail the whole-file parse and silently + /// disable every overlay rule — an empty/unknown value is "no opinion" to + /// the consumer (`fetch.rs::should_use_cef_first`), the same as no rule. + #[serde(default)] pub render: String, /// Datacenter IPs are blocked here — a residential exit is needed for a /// reliable fetch. Advisory for clients without a residential transport @@ -240,4 +244,20 @@ mod tests { assert!(get("example.com").is_none()); assert!(get("news.ycombinator.com").is_none()); } + + #[test] + fn rule_missing_render_still_parses() { + // A partial / residential-only entry must NOT fail the whole-file + // parse and silently disable every overlay rule. render defaults to + // "" (no opinion), the rest of the entry still applies. + let doc = r#"{"version":1,"rules":{ + "a.com":{"needs_residential":true}, + "b.com":{"render":"cef"} + }}"#; + let m = parse(doc); + assert_eq!(m.len(), 2, "a malformed entry must not drop the whole file"); + assert_eq!(m["a.com"].render, ""); + assert!(m["a.com"].needs_residential); + assert_eq!(m["b.com"].render, "cef"); + } } From a89f5acf75ef71185d5023590f4f02301e29ddff Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 26 Jun 2026 15:56:24 -0600 Subject: [PATCH 6/8] review: store the EFFECTIVE residential mode on the CEF daemon DaemonProcess.use_residential stored the *requested* mode, but the LD_PRELOAD tunnel only applies when WireGuard is up AND bindwg is present. Compute the effective mode once (want_residential) and use it for the reuse check, the preload decision, and the stored flag. 
Fixes two issues: (a) a daemon spawned non-residential because the tunnel was down would never switch to residential when the tunnel later came up, and (b) non-residential requests triggered needless respawns against a daemon that was already effectively non-residential. Addresses Copilot round-4 feedback on #9. Co-Authored-By: Claude Opus 4.8 (1M context) --- rust/src/cef.rs | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/rust/src/cef.rs b/rust/src/cef.rs index 1ecdf9d..35e5825 100644 --- a/rust/src/cef.rs +++ b/rust/src/cef.rs @@ -20,9 +20,11 @@ struct DaemonProcess { child: Child, stdin: ChildStdin, stdout: BufReader, - /// Whether this daemon was spawned with the residential tunnel - /// (LD_PRELOAD bindwg.so). The daemon is a process-wide singleton, so a - /// later request in the other mode must respawn it — see `ensure_daemon`. + /// Whether this daemon was *effectively* spawned with the residential + /// tunnel (LD_PRELOAD bindwg.so actually applied — i.e. requested AND the + /// WireGuard interface + bindwg were present). The daemon is a + /// process-wide singleton, so a later request whose effective mode differs + /// must respawn it — see `ensure_daemon`. use_residential: bool, } @@ -99,18 +101,26 @@ fn render_blocking(url: &str, opts: &RenderOptions) -> Result { } fn ensure_daemon(use_residential: bool) -> Result<()> { + // EFFECTIVE residential mode: the tunnel only actually applies when a + // WireGuard interface is up AND bindwg is present. Track and compare the + // effective mode (not the requested one) so we neither (a) keep a + // non-residential daemon when residential was requested but the tunnel was + // down at spawn and later comes up, nor (b) respawn needlessly when the + // effective mode already matches a non-residential request. 
+ let want_residential = + use_residential && wireguard_active() && std::path::Path::new(BINDWG_PATH).exists(); + let mut daemon = DAEMON.lock().map_err(|e| anyhow::anyhow!("lock: {}", e))?; - // Check if existing daemon is still alive AND in the requested - // residential mode. The daemon is a process-wide singleton whose - // residential tunnel is fixed at spawn (LD_PRELOAD), so reusing one - // started in the other mode would silently route through the wrong exit - // — e.g. a needs_residential site served over the datacenter IP it was - // flagged as blocking. On a mode mismatch, kill and respawn. + // Reuse the existing daemon only if it's alive AND already in the + // effective residential mode we need. The daemon is a process-wide + // singleton whose tunnel is fixed at spawn (LD_PRELOAD), so reusing one in + // the wrong mode would route through the wrong exit. On a mismatch, kill + // and respawn. if let Some(ref mut d) = *daemon { match d.child.try_wait() { Ok(Some(_)) => { *daemon = None; } - Ok(None) if d.use_residential == use_residential => return Ok(()), + Ok(None) if d.use_residential == want_residential => return Ok(()), Ok(None) => { let _ = d.child.kill(); *daemon = None; @@ -171,7 +181,7 @@ fn ensure_daemon(use_residential: bool) -> Result<()> { }; cmd.env("LD_LIBRARY_PATH", &lib_path); - if use_residential && wireguard_active() && std::path::Path::new(BINDWG_PATH).exists() { + if want_residential { // Append rather than clobber any existing LD_PRELOAD. 
let preload = match std::env::var("LD_PRELOAD") { Ok(existing) if !existing.trim().is_empty() => { @@ -196,7 +206,7 @@ fn ensure_daemon(use_residential: bool) -> Result<()> { // Wait for CEF to initialize std::thread::sleep(Duration::from_secs(2)); - *daemon = Some(DaemonProcess { child, stdin, stdout, use_residential }); + *daemon = Some(DaemonProcess { child, stdin, stdout, use_residential: want_residential }); Ok(()) } From e1aa21868642824c8a66590db3f8c4f71f8336b1 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 26 Jun 2026 16:04:01 -0600 Subject: [PATCH 7/8] review: avoid env mutation for probe proxy; macOS timeout fallback - fetch/main: record the resolved proxy in a OnceLock that connectivity_ok reads, instead of std::env::set_var. set_var is unsound under the multi-thread Tokio runtime (it can race env reads on worker threads), and the prior "no threads spawned yet" rationale was wrong (#[tokio::main] spawns workers before the body runs). - bench/probe.sh: resolve `timeout` vs `gtimeout` (macOS ships neither by default) and run without a per-request timeout + warn if absent, rather than failing the whole sweep on a default Mac. Addresses Copilot round-5 feedback on #9. Co-Authored-By: Claude Opus 4.8 (1M context) --- bench/probe.sh | 21 ++++++++++++++++++--- rust/src/fetch.rs | 25 +++++++++++++++++++++---- rust/src/main.rs | 16 +++++++--------- 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/bench/probe.sh b/bench/probe.sh index eeb7778..17be391 100755 --- a/bench/probe.sh +++ b/bench/probe.sh @@ -76,6 +76,16 @@ if [[ -z "$WICK_BIN" && "$DRY_RUN" -eq 0 ]]; then fi mkdir -p "$OUT_DIR" +# Resolve a timeout command: GNU coreutils ships `timeout`; macOS only has it +# as `gtimeout` (after `brew install coreutils`). Without either, run with no +# per-request timeout (and warn) rather than failing the whole sweep — the +# README documents macOS/launchd usage, so a hard dependency on `timeout` +# would break the default Mac. 
+TIMEOUT_BIN="$(command -v timeout 2>/dev/null || command -v gtimeout 2>/dev/null || true)" +if [[ -z "$TIMEOUT_BIN" && "$DRY_RUN" -eq 0 ]]; then + echo "WARN: no 'timeout'/'gtimeout' on PATH — running without a per-request timeout (macOS: brew install coreutils)" >&2 +fi + # ── Step 1: candidate selection ───────────────────────────────────────────── # Aggregate the per-(host,strategy) rows into per-host totals. A host is a # candidate when it has real volume, a low overall success rate, AND its @@ -148,9 +158,14 @@ probe_cell() { local args=(fetch --json --no-robots --render "$render") [[ -n "$proxy" ]] && args+=(--proxy "$proxy") args+=("$url") - local out - if ! out="$(WICK_AUTO_INSTALL_CEF=1 timeout "$PER_REQUEST_TIMEOUT" "$WICK_BIN" "${args[@]}" 2>/dev/null)"; then - echo "fail $?" + local out rc + if [[ -n "$TIMEOUT_BIN" ]]; then + out="$(WICK_AUTO_INSTALL_CEF=1 "$TIMEOUT_BIN" "$PER_REQUEST_TIMEOUT" "$WICK_BIN" "${args[@]}" 2>/dev/null)"; rc=$? + else + out="$(WICK_AUTO_INSTALL_CEF=1 "$WICK_BIN" "${args[@]}" 2>/dev/null)"; rc=$? + fi + if [[ $rc -ne 0 ]]; then + echo "fail $rc" return fi local status bytes diff --git a/rust/src/fetch.rs b/rust/src/fetch.rs index a174bb9..058fc34 100644 --- a/rust/src/fetch.rs +++ b/rust/src/fetch.rs @@ -854,6 +854,18 @@ const CONNECTIVITY_PROBE_URL: &str = "https://releases.getwick.dev/install-pro.s static CONNECTIVITY_CACHE: std::sync::LazyLock>> = std::sync::LazyLock::new(|| std::sync::Mutex::new(None)); +/// The proxy real fetches use, recorded once by `main` from the resolved +/// `--proxy` / `WICK_PROXY`. The connectivity probe reads it so it routes +/// identically. A `OnceLock` rather than process-env mutation, which is +/// unsound under the multi-thread Tokio runtime (`set_var` can race env reads +/// on worker threads). +static EFFECTIVE_PROXY: std::sync::OnceLock> = std::sync::OnceLock::new(); + +/// Record the effective proxy at startup. Idempotent — later calls are no-ops. 
+pub fn set_proxy(proxy: Option<&str>) { + let _ = EFFECTIVE_PROXY.set(proxy.map(|s| s.to_string())); +} + /// True when the error is *definitively* the user's own network being gone — /// the OS told Cronet so. These never need the connectivity probe and must /// never be attributed to the site. @@ -944,11 +956,16 @@ async fn connectivity_ok() -> bool { } } } - // Probe through the same proxy real fetches use (WICK_PROXY), so a host - // whose only route off-box is a tunnel isn't falsely judged "offline" on - // every ambiguous failure. + // Probe through the same proxy real fetches use, so a host whose only + // route off-box is a tunnel isn't falsely judged "offline" on every + // ambiguous failure. Prefer the proxy main recorded; fall back to the env + // for entry points that didn't record one (read-only — no env mutation). let mut builder = reqwest::Client::builder().timeout(Duration::from_secs(2)); - if let Ok(proxy) = std::env::var("WICK_PROXY") { + let proxy_url = EFFECTIVE_PROXY + .get() + .and_then(|p| p.clone()) + .or_else(|| std::env::var("WICK_PROXY").ok()); + if let Some(proxy) = proxy_url { let proxy = proxy.trim(); if !proxy.is_empty() { if let Ok(p) = reqwest::Proxy::all(proxy) { diff --git a/rust/src/main.rs b/rust/src/main.rs index 41e2eb0..0e30a38 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -175,15 +175,13 @@ async fn main() -> Result<()> { let cli = Cli::parse(); let proxy = cli.proxy.as_deref(); - // Canonicalize the resolved proxy into WICK_PROXY so the connectivity - // probe (fetch::connectivity_ok) routes the same way real fetches do — - // clap fills this field from a `--proxy` CLI arg without exporting the - // env var, so a proxied-only host would otherwise probe direct and - // misclassify a site failure as "offline". Safe here: main start, no - // threads spawned yet. 
- if let Some(p) = proxy { - std::env::set_var("WICK_PROXY", p); - } + // Record the resolved proxy so the connectivity probe + // (fetch::connectivity_ok) routes the same way real fetches do — clap + // fills this from a `--proxy` CLI arg without exporting WICK_PROXY, so a + // proxied-only host would otherwise probe direct and misclassify a site + // failure as "offline". Recorded in a OnceLock rather than mutated into + // process env, which is unsound under the multi-thread Tokio runtime. + fetch::set_proxy(proxy); match cli.command { Command::Serve { mcp: true, .. } => { From 2e061b0606b0432d1c13743670fb725b81355ec5 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Sat, 27 Jun 2026 06:56:01 -0600 Subject: [PATCH 8/8] probe: add explicit sort_by(.host) before group_by MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Belt-and-suspenders / clarity for the host aggregation. jq's group_by already sorts by the key internally (so the candidate totals were correct — verified live), but the explicit sort makes intent obvious and closes the review thread. Co-Authored-By: Claude Opus 4.8 (1M context) --- bench/probe.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bench/probe.sh b/bench/probe.sh index 17be391..a1ecf3f 100755 --- a/bench/probe.sh +++ b/bench/probe.sh @@ -112,6 +112,10 @@ CANDIDATES="$(printf '%s' "$STATS_JSON" | jq -r --argjson minf "$MIN_FETCHES" -- | { host, fetches, successes, offline: ((.error_kind_dist // {}).offline // 0) } ] + # group_by already sorts by the key internally in jq, so this sort_by is + # belt-and-suspenders: it makes the host-grouping intent explicit and is + # robust to any future jq change. + | sort_by(.host) | group_by(.host) | map({ host: .[0].host,