From ff0d3ab0ecd3cedf849e069f0cbad301ea42f9d4 Mon Sep 17 00:00:00 2001 From: Jeonghwan Lee Date: Tue, 9 Jun 2026 08:41:17 +0900 Subject: [PATCH] fix(bootstrap): retry transient manifest fetch failures at boot FetchManifest did a single HTTP GET, so a transient GitHub CDN failure (e.g. a 504) hard-failed the daemon at boot even though artifact downloads already retry via downloadWithRetry. This wraps the manifest fetch in the same bounded exponential-backoff retry (3 attempts, backoff 5s -> 15s -> 45s, ctx-cancel aware), reusing the existing download retry constants. Only the network fetch is retried; deterministic parse/version errors fail fast. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/bootstrap/manifest.go | 41 ++++++++++++++++++++++++++--- internal/bootstrap/manifest_test.go | 40 ++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/internal/bootstrap/manifest.go b/internal/bootstrap/manifest.go index 9f7a05c..01472d7 100644 --- a/internal/bootstrap/manifest.go +++ b/internal/bootstrap/manifest.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "io" + "log/slog" "net/http" "os" "time" @@ -106,10 +107,44 @@ func FetchManifest(ctx context.Context) (*Manifest, error) { if url == "" { return nil, ErrManifestURLMissing } - return fetchManifestFrom(ctx, url) + + // Only the network fetch is retried (reusing the download retry budget): + // a transient GitHub CDN 504 at boot must not hard-fail the daemon, but a + // deterministic parse/version error is not worth re-fetching for. ctx + // cancellation aborts immediately without burning the remaining attempts. + var ( + body []byte + lastErr error + ) + backoff := downloadRetryBackoff + for attempt := 1; attempt <= maxDownloadAttempts; attempt++ { + if attempt > 1 { + slog.Warn("retrying manifest fetch", + "attempt", attempt, + "max", maxDownloadAttempts, + "after", lastErr, + "backoff", backoff.String()) + select { + case <-time.After(backoff): + case <-ctx.Done(): + return nil, fmt.Errorf("manifest fetch retry aborted: %w", ctx.Err()) + } + backoff *= retryBackoffMultiplier + } + body, lastErr = fetchManifestBody(ctx, url) + if lastErr == nil { + return parseManifest(body) + } + if ctx.Err() != nil { + return nil, lastErr + } + } + return nil, fmt.Errorf("manifest fetch failed after %d attempts: %w", maxDownloadAttempts, lastErr) } -func fetchManifestFrom(ctx context.Context, url string) (*Manifest, error) { +// fetchManifestBody performs a single manifest GET and returns the raw +// body. It is the retryable unit wrapped by FetchManifest. +func fetchManifestBody(ctx context.Context, url string) ([]byte, error) { fctx, cancel := context.WithTimeout(ctx, manifestFetchTimeout) defer cancel() req, err := http.NewRequestWithContext(fctx, http.MethodGet, url, nil) @@ -131,7 +166,7 @@ func fetchManifestFrom(ctx context.Context, url string) (*Manifest, error) { if int64(len(body)) > manifestMaxBytes { return nil, fmt.Errorf("manifest: body exceeds %d bytes", manifestMaxBytes) } - return parseManifest(body) + return body, nil } func parseManifest(body []byte) (*Manifest, error) { diff --git a/internal/bootstrap/manifest_test.go b/internal/bootstrap/manifest_test.go index 3701758..402c0e2 100644 --- a/internal/bootstrap/manifest_test.go +++ b/internal/bootstrap/manifest_test.go @@ -6,8 +6,18 @@ import ( "net/http/httptest" "strings" "testing" + "time" ) +// fastRetry compresses the retry backoff so tests that exercise the retry +// path don't sleep for real seconds. It restores the original on cleanup. +func fastRetry(t *testing.T) { + t.Helper() + orig := downloadRetryBackoff + downloadRetryBackoff = time.Millisecond + t.Cleanup(func() { downloadRetryBackoff = orig }) +} + const sampleManifest = `{ "version": 1, "runed_version": "v9.9.9", @@ -145,6 +155,7 @@ func TestFetchManifest_HTTP(t *testing.T) { } func TestFetchManifest_HTTPError(t *testing.T) { + fastRetry(t) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { http.Error(w, "boom", http.StatusInternalServerError) })) @@ -156,3 +167,32 @@ func TestFetchManifest_HTTPError(t *testing.T) { t.Errorf("expected HTTP 500 error, got %v", err) } } + +// TestFetchManifest_RetriesTransient verifies a transient 5xx (e.g. a +// GitHub CDN 504) is retried and the fetch ultimately succeeds. +func TestFetchManifest_RetriesTransient(t *testing.T) { + fastRetry(t) + var attempts int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + attempts++ + if attempts < 3 { + http.Error(w, "gateway timeout", http.StatusGatewayTimeout) + return + } + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(sampleManifest)) + })) + defer srv.Close() + t.Setenv(EnvManifest, srv.URL) + + m, err := FetchManifest(t.Context()) + if err != nil { + t.Fatalf("FetchManifest after transient 504s: %v", err) + } + if m.Version != 1 { + t.Errorf("Version = %d, want 1", m.Version) + } + if attempts != 3 { + t.Errorf("attempts = %d, want 3 (two 504s then success)", attempts) + } +}