From 9d55fd9660e57f1e76d478a9e653dfe4d04cc6ea Mon Sep 17 00:00:00 2001 From: ehsan shariati Date: Wed, 17 Jun 2026 02:30:22 -0400 Subject: [PATCH] fix(client): v1->v7 migration tolerates a gc'd (410 Gone) backup source Large-file uploads failed on GC-damaged buckets: a large upload tips the forest past the v7 sharding threshold, triggering the v1->v7 migration, whose first step is a server-side copy_object of the v1 index to a backup key. A server-side COPY must read the source, but that index object's backing CID was garbage-collected (one-off manual `ipfs repo gc`; HEAD returns the ETag, a content read 410s) -> copy fails -> migration defers -> upload fails after all chunks already uploaded. Fix (Option B, advisor + gemini endorsed): treat a 410/Gone backup copy as "source content already gone, nothing to back up" -> skip the backup and proceed. v7 is rebuilt faithfully from the already-loaded in-memory v1 forest (monolithic = whole-or-nothing, no entry dropped); with auto-gc off the fresh v7 nodes persist; once migrated the bucket never hits this path again. Every OTHER copy error still defers -- a transient must not masquerade as gc'd. Safe because: the v1 source is already gc'd (no restore point could exist anyway); the backup is read only by try_v1_backup_fallback, which returns None gracefully when absent and triggers only if a future v7 manifest is unreadable. - error.rs: ClientError::is_gone() -- narrow match on Gone/HTTP410/410 only. - encryption.rs: migrate_v1_to_v7_internal Step 4 skips backup on is_gone(), defers on every other error. - test: is_gone_matches_only_410_gone (narrow-boundary guard). Workspace 0.6.11 -> 0.6.12. Migration-completes-despite-410 validated on the live gateway (real large-upload retry); the migration path is real-server-only (advisory lock + heartbeat), so it isn't wiremock-mockable. Co-Authored-By: Claude Opus 4.8 --- Cargo.lock | 12 +++---- Cargo.toml | 2 +- crates/fula-client/src/encryption.rs | 32 ++++++++++++++--- crates/fula-client/src/error.rs | 52 +++++++++++++++++++++++++++- 4 files changed, 85 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c73bcaf..9b89c52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1736,7 +1736,7 @@ dependencies = [ [[package]] name = "fula-api" -version = "0.6.11" +version = "0.6.12" dependencies = [ "anyhow", "axum", @@ -1765,7 +1765,7 @@ dependencies = [ [[package]] name = "fula-blockstore" -version = "0.6.11" +version = "0.6.12" dependencies = [ "anyhow", "async-trait", @@ -1803,7 +1803,7 @@ dependencies = [ [[package]] name = "fula-cli" -version = "0.6.11" +version = "0.6.12" dependencies = [ "anyhow", "async-trait", @@ -1857,7 +1857,7 @@ dependencies = [ [[package]] name = "fula-client" -version = "0.6.11" +version = "0.6.12" dependencies = [ "anyhow", "async-trait", @@ -1900,7 +1900,7 @@ dependencies = [ [[package]] name = "fula-core" -version = "0.6.11" +version = "0.6.12" dependencies = [ "anyhow", "async-trait", @@ -1935,7 +1935,7 @@ dependencies = [ [[package]] name = "fula-crypto" -version = "0.6.11" +version = "0.6.12" dependencies = [ "aes-gcm", "anyhow", diff --git a/Cargo.toml b/Cargo.toml index f4075d3..bbc0ec2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,7 +79,7 @@ name = "encrypted_upload_test" path = "examples/encrypted_upload_test.rs" [workspace.package] -version = "0.6.11" +version = "0.6.12" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/functionland/fula-api" diff --git a/crates/fula-client/src/encryption.rs b/crates/fula-client/src/encryption.rs index 0710aae..b0c575d 100644 --- a/crates/fula-client/src/encryption.rs +++ b/crates/fula-client/src/encryption.rs @@ -6207,11 +6207,33 @@ impl EncryptedClient { V1_BACKUP_PREFIX, chrono::Utc::now().timestamp_millis(), ); - if let Err(e) = self.inner.copy_object(bucket, &index_key, bucket, &backup_key).await { - release_best_effort(lock_token.clone()).await; - return Ok(MigrationOutcome::DeferredTransientError { - reason: format!("v1 backup COPY failed: {}", e), - }); + match self.inner.copy_object(bucket, &index_key, bucket, &backup_key).await { + Ok(_) => {} + // The v1 index object's backing content was garbage-collected + // (gc-orphaned CID → 410 Gone): a server-side COPY can't read it, + // so there is no v1 content to back up and the restore point can't + // exist regardless. Skip the backup and proceed — v7 is rebuilt + // faithfully from the in-memory `v1_forest` (monolithic = + // whole-or-nothing, so no entry is dropped), and with auto-gc off + // the fresh v7 nodes won't be collected. The backup only matters if + // a FUTURE v7 manifest becomes unreadable, and `try_v1_backup_fallback` + // already returns None gracefully when no backup exists. + Err(e) if e.is_gone() => { + tracing::warn!( + %bucket, + %backup_key, + error = %e, + "v1 backup COPY hit 410 Gone (source content gc'd); skipping backup, proceeding with migration" + ); + } + // Every OTHER copy error (transient 5xx, throttling, auth, network) + // is a real failure — defer so a retry can make the restore point. + Err(e) => { + release_best_effort(lock_token.clone()).await; + return Ok(MigrationOutcome::DeferredTransientError { + reason: format!("v1 backup COPY failed: {}", e), + }); + } } // ── Step 5: Build the v7 forest in memory ────────────────────────── diff --git a/crates/fula-client/src/error.rs b/crates/fula-client/src/error.rs index 7fce7b2..2428e1b 100644 --- a/crates/fula-client/src/error.rs +++ b/crates/fula-client/src/error.rs @@ -320,6 +320,24 @@ impl ClientError { pub fn is_cache_error(&self) -> bool { matches!(self, Self::BlockTooLarge { .. } | Self::BlockCache(_)) } + + /// Check if this is an HTTP 410 Gone error. + /// + /// Distinct from not-found (404): a 410 means the object's metadata still + /// exists but its backing content is unretrievable — on this gateway, + /// that's a garbage-collected IPFS CID (a server-side content read like + /// `copy_object` fails, though `head_object` still returns the ETag). + /// + /// The v1→v7 forest migration uses this to treat a backup `copy_object` + /// that 410s as "the source content is already gone, so there is nothing + /// to back up" and proceed (rebuilding v7 from the in-memory forest), + /// rather than deferring as if it were a transient failure. The match is + /// deliberately NARROW — every other error (transient 5xx, throttling, + /// auth, precondition) must still be treated as a real failure. + pub fn is_gone(&self) -> bool { + matches!(self, Self::S3Error { code, .. } + if code == "Gone" || code == "HTTP410" || code == "410") + } } fn extract_xml_element(xml: &str, element: &str) -> Option { @@ -350,7 +368,7 @@ mod tests { "#; let error = ClientError::from_s3_xml(xml, 404); - + match error { ClientError::S3Error { code, message, request_id } => { assert_eq!(code, "NoSuchKey"); @@ -360,4 +378,36 @@ mod tests { _ => panic!("Expected S3Error"), } } + + #[test] + fn is_gone_matches_only_410_gone() { + let s3 = |code: &str| ClientError::S3Error { + code: code.to_string(), + message: String::new(), + request_id: None, + }; + // 410 Gone == the object's metadata exists but its backing content is + // unretrievable (gc-orphaned CID). The v1->v7 migration treats this as + // "no server-side content to back up", NOT a transient failure. + assert!(s3("Gone").is_gone()); + assert!(s3("HTTP410").is_gone()); + assert!(s3("410").is_gone()); + assert!(ClientError::from_s3_xml("Gone", 410).is_gone()); + // A 410 body with no falls back to "HTTP410". + assert!(ClientError::from_s3_xml("", 410).is_gone()); + + // CRITICAL: must NOT match anything else — every other copy error must + // still defer/fail (a transient masquerading as "gc'd" is the one way + // the skip-the-backup fix turns dangerous). + assert!(!s3("NoSuchKey").is_gone()); + assert!(!s3("NoSuchBucket").is_gone()); + assert!(!s3("PreconditionFailed").is_gone()); + assert!(!s3("HTTP412").is_gone()); + assert!(!s3("InternalError").is_gone()); + assert!(!s3("HTTP500").is_gone()); + assert!(!s3("SlowDown").is_gone()); + assert!(!ClientError::from_s3_xml("InternalError", 500).is_gone()); + assert!(!ClientError::NotFound { bucket: "b".into(), key: "k".into() }.is_gone()); + assert!(!ClientError::BucketNotFound("b".into()).is_gone()); + } }