diff --git a/bin/rune b/bin/rune index 8f50bb8..14e3d18 100755 --- a/bin/rune +++ b/bin/rune @@ -86,9 +86,11 @@ if [ -z "$RUNE_VERSION" ]; then token="${GITHUB_TOKEN:-${GH_TOKEN:-}}" if [ -n "$token" ]; then body="$(curl --fail --silent --show-error --location --connect-timeout 10 --max-time 20 \ + --retry 3 --retry-delay 2 \ --header "Authorization: Bearer $token" "$api" || true)" else - body="$(curl --fail --silent --show-error --location --connect-timeout 10 --max-time 20 "$api" || true)" + body="$(curl --fail --silent --show-error --location --connect-timeout 10 --max-time 20 \ + --retry 3 --retry-delay 2 "$api" || true)" fi RUNE_VERSION="$(printf '%s' "$body" \ @@ -126,8 +128,10 @@ mkdir -p "$(dirname "$TARGET")" TMP="$(mktemp "$(dirname "$TARGET")/.rune-bootstrap-XXXXXX")" SUMS="$(mktemp -t rune-bootstrap-sums-XXXXXX)" -curl --fail --silent --show-error --location --connect-timeout 10 --max-time 120 "$RELEASE_BASE/$ASSET" -o "$TMP" -curl --fail --silent --show-error --location --connect-timeout 10 --max-time 30 "$RELEASE_BASE/checksums.txt" -o "$SUMS" +# --retry rides out transient GitHub CDN failures (504, timeouts) instead +# of aborting the whole bootstrap on the first blip. +curl --fail --silent --show-error --location --connect-timeout 10 --max-time 120 --retry 3 --retry-delay 2 "$RELEASE_BASE/$ASSET" -o "$TMP" +curl --fail --silent --show-error --location --connect-timeout 10 --max-time 30 --retry 3 --retry-delay 2 "$RELEASE_BASE/checksums.txt" -o "$SUMS" EXPECTED="$(grep " $ASSET\$" "$SUMS" | cut -d' ' -f1)" if [ -z "$EXPECTED" ]; then diff --git a/internal/bootstrap/download.go b/internal/bootstrap/download.go index c4c7e68..339f57b 100644 --- a/internal/bootstrap/download.go +++ b/internal/bootstrap/download.go @@ -30,6 +30,53 @@ func newDownloadTransport() *http.Transport { type ProgressFunc func(downloaded, total int64) +// maxDownloadAttempts caps retries for a single network fetch (manifest +// or artifact). A transient blip — most notably a GitHub CDN 504 during +// install — recovers within a couple of attempts; beyond that the +// failure is most likely persistent (wrong SHA, missing asset) where +// further retries only add latency. +const maxDownloadAttempts = 3 + +// downloadRetryBackoff is the initial wait between attempts; it is +// multiplied by retryBackoffMultiplier each retry so a transient blip +// recovers quickly while a server-side cold-start has time to warm up. +// var so tests can compress the real waits. +var downloadRetryBackoff = 2 * time.Second + +const retryBackoffMultiplier = 3 + +// withRetry runs fn up to maxDownloadAttempts times with bounded +// exponential backoff so a transient network failure (e.g. a GitHub CDN +// 504) surfaces as a retry rather than a hard install failure. ctx +// cancellation aborts immediately without burning the remaining +// attempts. logf may be nil. +func withRetry(ctx context.Context, logf func(string, ...any), label string, fn func() error) error { + var lastErr error + backoff := downloadRetryBackoff + for attempt := 1; attempt <= maxDownloadAttempts; attempt++ { + if attempt > 1 { + if logf != nil { + logf("retrying %s (attempt %d/%d) after %s: %v", label, attempt, maxDownloadAttempts, backoff, lastErr) + } + select { + case <-time.After(backoff): + case <-ctx.Done(): + return fmt.Errorf("%s: retry aborted: %w", label, ctx.Err()) + } + backoff *= retryBackoffMultiplier + } + if err := fn(); err != nil { + if ctx.Err() != nil { + return err + } + lastErr = err + continue + } + return nil + } + return fmt.Errorf("%s: failed after %d attempts: %w", label, maxDownloadAttempts, lastErr) +} + func DownloadAndVerify(ctx context.Context, spec ArtifactSpec, destPath string, progress ProgressFunc) error { if spec.URL == "" || spec.SHA256 == "" { return errors.New("download: spec missing url or sha256") diff --git a/internal/bootstrap/install.go b/internal/bootstrap/install.go index dcca241..74c6b0a 100644 --- a/internal/bootstrap/install.go +++ b/internal/bootstrap/install.go @@ -76,7 +76,7 @@ func Install(ctx context.Context, opts InstallOptions) (*Result, error) { // Fetch manifest logf("[1/%d] manifest: fetching from %s", total, opts.ManifestURL) - manifest, err := FetchManifest(ctx, opts.ManifestURL) + manifest, err := FetchManifest(ctx, opts.ManifestURL, logf) if err != nil { r.Failed[StepManifest] = err.Error() r.Status = "partial" @@ -123,7 +123,7 @@ func Install(ctx context.Context, opts InstallOptions) (*Result, error) { } logf("[%d/%d] %s (%d bytes): downloading...", stepNum, total, in.step, in.spec.Size) - if err := installArtifact(ctx, paths, in.spec, in.dest, opts.Progress); err != nil { + if err := installArtifact(ctx, paths, in.spec, in.dest, opts.Progress, logf); err != nil { r.Failed[in.step] = err.Error() r.Status = "partial" return r, err @@ -198,10 +198,12 @@ func onlySkipped(r *Result) bool { // spec.Extract == "" : raw binaries are downloaded directly to dest // spec.Extract == "tar.gz": archive is extracted into dest // Others treat as error -func installArtifact(ctx context.Context, paths *Paths, spec ArtifactSpec, dest string, progress ProgressFunc) error { +func installArtifact(ctx context.Context, paths *Paths, spec ArtifactSpec, dest string, progress ProgressFunc, logf func(string, ...any)) error { switch spec.Extract { case "": // raw binaries - if err := DownloadAndVerify(ctx, spec, dest, progress); err != nil { + if err := withRetry(ctx, logf, "download "+filepath.Base(dest), func() error { + return DownloadAndVerify(ctx, spec, dest, progress) + }); err != nil { return err } if err := os.Chmod(dest, binaryMode); err != nil { @@ -212,7 +214,9 @@ func installArtifact(ctx context.Context, paths *Paths, spec ArtifactSpec, dest case "tar.gz": // Tarball archive // Download archive under paths.Cache first tarPath := filepath.Join(paths.Cache, filepath.Base(dest)+".tar.gz") - if err := DownloadAndVerify(ctx, spec, tarPath, progress); err != nil { + if err := withRetry(ctx, logf, "download "+filepath.Base(dest), func() error { + return DownloadAndVerify(ctx, spec, tarPath, progress) + }); err != nil { return err } defer os.Remove(tarPath) diff --git a/internal/bootstrap/manifest.go b/internal/bootstrap/manifest.go index 3469551..0296c4b 100644 --- a/internal/bootstrap/manifest.go +++ b/internal/bootstrap/manifest.go @@ -54,7 +54,7 @@ var ( ErrNoArtifactForPlatform = errors.New("manifest: no artifacts for this platform") ) -func FetchManifest(ctx context.Context, manifestURL string) (*Manifest, error) { +func FetchManifest(ctx context.Context, manifestURL string, logf func(string, ...any)) (*Manifest, error) { if v := os.Getenv(envManifest); v != "" { manifestURL = v } @@ -62,6 +62,37 @@ func FetchManifest(ctx context.Context, manifestURL string) (*Manifest, error) { return nil, errors.New("manifest: no URL provided (default missing; set RUNE_MANIFEST?)") } + // Only the network fetch is retried; a transient GitHub CDN 504 should + // not abort install, but a parse/version error below is deterministic + // and retrying it would just waste the backoff budget. + var body []byte + if err := withRetry(ctx, logf, "fetch manifest", func() error { + var ferr error + body, ferr = fetchManifestBody(ctx, manifestURL) + return ferr + }); err != nil { + return nil, err + } + + var m Manifest + dec := json.NewDecoder(bytes.NewReader(body)) + dec.DisallowUnknownFields() + if err := dec.Decode(&m); err != nil { + return nil, fmt.Errorf("manifest: parse JSON: %w", err) + } + if m.Version != ManifestVersion { + return nil, fmt.Errorf("%w: got %d, want %d", ErrUnsupportedManifestVersion, m.Version, ManifestVersion) + } + if len(m.Platforms) == 0 { + return nil, errors.New("manifest: platforms is empty") + } + + return &m, nil +} + +// fetchManifestBody performs a single manifest GET and returns the raw +// body. It is the retryable unit wrapped by FetchManifest's withRetry. +func fetchManifestBody(ctx context.Context, manifestURL string) ([]byte, error) { ctx, cancel := context.WithTimeout(ctx, defaultManifestFetchTimeout) defer cancel() @@ -88,20 +119,7 @@ func FetchManifest(ctx context.Context, manifestURL string) (*Manifest, error) { return nil, fmt.Errorf("manifest: body exceeds %d bytes", maxBody) } - var m Manifest - dec := json.NewDecoder(bytes.NewReader(body)) - dec.DisallowUnknownFields() - if err := dec.Decode(&m); err != nil { - return nil, fmt.Errorf("manifest: parse JSON: %w", err) - } - if m.Version != ManifestVersion { - return nil, fmt.Errorf("%w: got %d, want %d", ErrUnsupportedManifestVersion, m.Version, ManifestVersion) - } - if len(m.Platforms) == 0 { - return nil, errors.New("manifest: platforms is empty") - } - - return &m, nil + return body, nil } func (m *Manifest) ArtifactsForCurrentPlatform() (PlatformArtifacts, error) { diff --git a/internal/bootstrap/manifest_test.go b/internal/bootstrap/manifest_test.go index c1fdfd1..972efbb 100644 --- a/internal/bootstrap/manifest_test.go +++ b/internal/bootstrap/manifest_test.go @@ -8,8 +8,18 @@ import ( "net/http/httptest" "strings" "testing" + "time" ) +// fastRetry compresses the retry backoff so tests that exercise the retry +// path don't sleep for real seconds. It restores the original on cleanup. +func fastRetry(t *testing.T) { + t.Helper() + orig := downloadRetryBackoff + downloadRetryBackoff = time.Millisecond + t.Cleanup(func() { downloadRetryBackoff = orig }) +} + func fullManifestJSON(extraFields string) string { return fmt.Sprintf(`{ "version": 1, @@ -31,7 +41,7 @@ func TestFetchManifest_HappyPath(t *testing.T) { })) defer srv.Close() - m, err := FetchManifest(context.Background(), srv.URL) + m, err := FetchManifest(context.Background(), srv.URL, nil) if err != nil { t.Fatalf("FetchManifest: %v", err) } @@ -66,7 +76,7 @@ func TestFetchManifest_EnvOverride(t *testing.T) { defer srv.Close() t.Setenv(envManifest, srv.URL) - if _, err := FetchManifest(context.Background(), "https://default/manifest.json"); err != nil { + if _, err := FetchManifest(context.Background(), "https://default/manifest.json", nil); err != nil { t.Fatalf("FetchManifest: %v", err) } if !served { @@ -81,7 +91,7 @@ func TestFetchManifest_RejectsUnsupportedVersion(t *testing.T) { })) defer srv.Close() - _, err := FetchManifest(context.Background(), srv.URL) + _, err := FetchManifest(context.Background(), srv.URL, nil) if !errors.Is(err, ErrUnsupportedManifestVersion) { t.Fatalf("want ErrUnsupportedManifestVersion, got %v", err) } @@ -94,7 +104,7 @@ func TestFetchManifest_RejectsEmptyPlatforms(t *testing.T) { })) defer srv.Close() - _, err := FetchManifest(context.Background(), srv.URL) + _, err := FetchManifest(context.Background(), srv.URL, nil) if err == nil || !strings.Contains(err.Error(), "platforms is empty") { t.Errorf("want empty-platforms error, got %v", err) } @@ -106,24 +116,53 @@ func TestFetchManifest_RejectsUnknownFields(t *testing.T) { })) defer srv.Close() - _, err := FetchManifest(context.Background(), srv.URL) + _, err := FetchManifest(context.Background(), srv.URL, nil) if err == nil || !strings.Contains(err.Error(), "unknown field") { t.Errorf("want unknown-field rejection, got %v", err) } } func TestFetchManifest_HTTPError(t *testing.T) { + fastRetry(t) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { http.Error(w, "not found", http.StatusNotFound) })) defer srv.Close() - _, err := FetchManifest(context.Background(), srv.URL) + _, err := FetchManifest(context.Background(), srv.URL, nil) if err == nil || !strings.Contains(err.Error(), "404") { t.Errorf("want HTTP 404, got %v", err) } } +// TestFetchManifest_RetriesTransient verifies a transient 5xx (e.g. a +// GitHub CDN 504) is retried and the fetch ultimately succeeds. +func TestFetchManifest_RetriesTransient(t *testing.T) { + fastRetry(t) + var attempts int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + attempts++ + if attempts < 3 { + http.Error(w, "gateway timeout", http.StatusGatewayTimeout) + return + } + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(fullManifestJSON(""))) + })) + defer srv.Close() + + m, err := FetchManifest(context.Background(), srv.URL, nil) + if err != nil { + t.Fatalf("FetchManifest after transient 504s: %v", err) + } + if m.Version != 1 { + t.Errorf("Version = %d, want 1", m.Version) + } + if attempts != 3 { + t.Errorf("attempts = %d, want 3 (two 504s then success)", attempts) + } +} + func TestArtifactsForCurrentPlatform_NotFound(t *testing.T) { m := &Manifest{ Version: 1,