From fa77c6d2281cf5fc40f76915b124573ccba6186b Mon Sep 17 00:00:00 2001 From: Elizabeth Worstell Date: Thu, 7 May 2026 16:31:35 -0700 Subject: [PATCH 1/2] fix(metadatadb): avoid lazy-creating namespace under read lock Query held RLock while m.ns() wrote to m.state on first access for a namespace, racing with concurrent Apply on other namespaces. Read the map directly; queryState handles a nil namespace map. --- internal/metadatadb/memory.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/metadatadb/memory.go b/internal/metadatadb/memory.go index 9a32a37..b42af46 100644 --- a/internal/metadatadb/memory.go +++ b/internal/metadatadb/memory.go @@ -46,8 +46,13 @@ func (m *MemoryBackend) Apply(_ context.Context, namespace string, ops ...Op) er func (m *MemoryBackend) Query(_ context.Context, namespace string, q ReadOp, target any) error { m.mu.RLock() - defer m.mu.RUnlock() - result := queryState(m.ns(namespace), q) + // Do not lazy-create the namespace under a read lock — that races with + // concurrent Apply on a different namespace. queryState handles a nil + // map by returning zero values, which is the right answer for a + // namespace that has never been written. + ns := m.state[namespace] + result := queryState(ns, q) + m.mu.RUnlock() return errors.Wrap(jsonUnmarshalInto(result, target), "memory query") } From 84b4d6936502b0916bf61dbfc9cd0329f0753d5d Mon Sep 17 00:00:00 2001 From: Elizabeth Worstell Date: Thu, 7 May 2026 16:31:46 -0700 Subject: [PATCH 2/2] feat(git): gate readiness on prewarming the most-cloned repos Add prewarm-most-cloned-repos to the git strategy. When > 0, /_readiness stays false until that many of the most-cloned repos (per the per-repo histogram, summed over the last 14 days) are cloned or freshened locally. Empty histogram, nil metadata store, or per-repo failures are logged and treated as complete so a misconfig or one bad upstream cannot brick a deploy. Missing SetMetadataStore wiring keeps readiness closed. --- internal/strategy/git/git.go | 107 ++++++++++++++++++++++++++++-- internal/strategy/git/git_test.go | 63 ++++++++++++++++++ 2 files changed, 164 insertions(+), 6 deletions(-) diff --git a/internal/strategy/git/git.go b/internal/strategy/git/git.go index 9f0c27a..9c2c2c1 100644 --- a/internal/strategy/git/git.go +++ b/internal/strategy/git/git.go @@ -42,8 +42,11 @@ type Config struct { RepackInterval time.Duration `hcl:"repack-interval,optional" help:"How often to run full repack. 0 disables." default:"0"` ZstdThreads int `hcl:"zstd-threads,optional" help:"Threads for zstd compression/decompression (0 = all CPU cores)." default:"0"` BundleCacheTTL time.Duration `hcl:"bundle-cache-ttl,optional" help:"TTL of cached server-side git bundles." default:"2h"` + PrewarmMostClonedRepos int `hcl:"prewarm-most-cloned-repos,optional" help:"Number of most-cloned repositories to warm before /_readiness reports ready. 0 disables the gate." default:"0"` } +const prewarmPopularityWindowDays = 14 + type Strategy struct { config Config cache cache.Cache @@ -61,7 +64,8 @@ type Strategy struct { deferredRestoreOnce sync.Map // keyed by upstream URL, ensures at most one deferred restore per repo metrics *gitMetrics repoCounts *RepoCounts - ready atomic.Bool + existingWarmed atomic.Bool + prewarmComplete atomic.Bool } func New( @@ -140,8 +144,13 @@ func New( if err := s.warmExistingRepos(warmCtx); err != nil { logger.WarnContext(warmCtx, "Failed to warm existing repos", "error", err) } - s.ready.Store(true) - logger.InfoContext(warmCtx, "Git strategy ready") + s.existingWarmed.Store(true) + if s.config.PrewarmMostClonedRepos <= 0 { + logger.InfoContext(warmCtx, "Git strategy ready") + } else { + logger.InfoContext(warmCtx, "Existing repos warmed; waiting for prewarm pass before reporting ready", + "prewarm_most_cloned_repos", s.config.PrewarmMostClonedRepos) + } }() s.proxy = &httputil.ReverseProxy{ @@ -185,19 +194,35 @@ var _ strategy.Strategy = (*Strategy)(nil) var _ strategy.Readier = (*Strategy)(nil) var _ strategy.MetadataConsumer = (*Strategy)(nil) -// Ready reports whether startup warm-up has completed. +// Ready reports whether the strategy is ready to serve traffic. With +// PrewarmMostClonedRepos > 0, the strategy is ready only after both the +// existing-mirror warm-up and the prewarm pass have completed. func (s *Strategy) Ready() bool { - return s.ready.Load() + if !s.existingWarmed.Load() { + return false + } + if s.config.PrewarmMostClonedRepos > 0 && !s.prewarmComplete.Load() { + return false + } + return true } // SetMetadataStore enables the per-repo clone histogram and schedules its // daily reaper. Called by config.Load after the metadata backend is built. +// When PrewarmMostClonedRepos > 0, also kicks off the background prewarm pass +// that gates Ready(). func (s *Strategy) SetMetadataStore(store *metadatadb.Store) { + logger := logging.FromContext(s.ctx) if store == nil { + if s.config.PrewarmMostClonedRepos > 0 { + logger.WarnContext(s.ctx, "prewarm-most-cloned-repos is set but no metadata store was provided; "+ + "reporting ready without prewarm (misconfiguration)") + s.prewarmComplete.Store(true) + } return } s.repoCounts = NewRepoCounts(store.Namespace("git")) - logging.FromContext(s.ctx).InfoContext(s.ctx, "Per-repo clone histogram enabled", + logger.InfoContext(s.ctx, "Per-repo clone histogram enabled", "retention_days", s.repoCounts.retentionDays) s.scheduler.SubmitPeriodicJob("repo-counts-reaper", "reap-repo-counts", defaultRepoCountsReapInterval, func(ctx context.Context) error { if deleted := s.repoCounts.Reap(); deleted > 0 { @@ -205,6 +230,76 @@ func (s *Strategy) SetMetadataStore(store *metadatadb.Store) { } return nil }) + if s.config.PrewarmMostClonedRepos > 0 { + go func() { + warmCtx := context.WithoutCancel(s.ctx) + s.prewarmMostCloned(warmCtx) + s.prewarmComplete.Store(true) + logging.FromContext(warmCtx).InfoContext(warmCtx, "Prewarm pass complete; git strategy ready") + }() + } +} + +// prewarmMostCloned blocks until the existing-repo warm-up has finished, then +// fetches (or clones) the most-cloned repositories from the histogram so that +// the local mirror is warm before the node starts serving traffic. +// +// Per-repo failures are logged but do not block readiness: a single broken +// upstream must not permanently brick a deploy. Same for an empty histogram +// (e.g. brand-new metadata backend) — we log a warning and consider the pass +// complete. +func (s *Strategy) prewarmMostCloned(ctx context.Context) { + logger := logging.FromContext(ctx) + for !s.existingWarmed.Load() { + t := time.NewTimer(100 * time.Millisecond) + select { + case <-ctx.Done(): + t.Stop() + return + case <-t.C: + } + } + top := s.repoCounts.TopRepos(prewarmPopularityWindowDays, s.config.PrewarmMostClonedRepos) + if len(top) == 0 { + logger.WarnContext(ctx, "prewarm-most-cloned-repos is set but the clone histogram is empty; reporting ready") + return + } + logger.InfoContext(ctx, "Prewarming most-cloned repositories for readiness gate", "count", len(top)) + for _, rc := range top { + if ctx.Err() != nil { + return + } + start := time.Now() + if err := s.prewarmRepo(ctx, rc.Repo); err != nil { + logger.ErrorContext(ctx, "Failed to prewarm repo for readiness gate", + "upstream", rc.Repo, "clone_count", rc.Count, "duration", time.Since(start), "error", err) + continue + } + logger.InfoContext(ctx, "Prewarmed repo for readiness gate", + "upstream", rc.Repo, "clone_count", rc.Count, "duration", time.Since(start)) + } +} + +// prewarmRepo ensures the mirror for upstreamURL is present locally and +// freshened with at least one fetch. If the mirror does not yet exist it +// triggers a clone (using the same snapshot-restore-then-fetch path as a +// request-driven clone); otherwise it does a lenient fetch so a slow upstream +// cannot block readiness indefinitely. +func (s *Strategy) prewarmRepo(ctx context.Context, upstreamURL string) error { + repo, err := s.cloneManager.GetOrCreate(ctx, upstreamURL) + if err != nil { + return errors.Wrapf(err, "resolve clone for %s", upstreamURL) + } + if repo.State() != gitclone.StateReady { + if err := s.ensureCloneReady(ctx, repo); err != nil { + return errors.Wrapf(err, "clone %s", upstreamURL) + } + return nil + } + if err := repo.FetchLenient(ctx, s.cloneManager.Config().CloneTimeout); err != nil { + return errors.Wrapf(err, "fetch %s", upstreamURL) + } + return nil } func (s *Strategy) warmExistingRepos(ctx context.Context) error { diff --git a/internal/strategy/git/git_test.go b/internal/strategy/git/git_test.go index e2a4ff2..c550fa0 100644 --- a/internal/strategy/git/git_test.go +++ b/internal/strategy/git/git_test.go @@ -344,6 +344,69 @@ func TestSetMetadataStore(t *testing.T) { s.SetMetadataStore(store) } +// TestReadyGate_Prewarm_NoStore verifies that the strategy still reports +// ready when prewarm-most-cloned-repos is set but no metadata store was +// supplied — the misconfiguration is logged but must not brick the deploy. +func TestReadyGate_Prewarm_NoStore(t *testing.T) { + _, ctx := logging.Configure(context.Background(), logging.Config{}) + + mux := newTestMux() + cm := gitclone.NewManagerProvider(ctx, gitclone.Config{ + MirrorRoot: filepath.Join(t.TempDir(), "clones"), + FetchInterval: 15, + }, nil) + s, err := git.New(ctx, git.Config{PrewarmMostClonedRepos: 10}, newTestScheduler(ctx, t), nil, mux, cm, + func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + assert.NoError(t, err) + + s.SetMetadataStore(nil) + + waitForReady(t, s) +} + +// TestReadyGate_Prewarm_EmptyHistogram verifies that the strategy reports +// ready when prewarm-most-cloned-repos is set, a metadata store is supplied, +// but the histogram is empty (e.g. brand-new region). +func TestReadyGate_Prewarm_EmptyHistogram(t *testing.T) { + _, ctx := logging.Configure(context.Background(), logging.Config{}) + + mux := newTestMux() + cm := gitclone.NewManagerProvider(ctx, gitclone.Config{ + MirrorRoot: filepath.Join(t.TempDir(), "clones"), + FetchInterval: 15, + }, nil) + s, err := git.New(ctx, git.Config{PrewarmMostClonedRepos: 10}, newTestScheduler(ctx, t), nil, mux, cm, + func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + assert.NoError(t, err) + + store := metadatadb.New(ctx, metadatadb.NewMemoryBackend()) + s.SetMetadataStore(store) + + waitForReady(t, s) +} + +// TestReadyGate_Prewarm_NoStoreCall verifies that with prewarm-most-cloned-repos +// set, the strategy never reports ready until the metadata store wiring contract +// is honoured. This guards against silent regressions where SetMetadataStore is +// dropped from the call chain. +func TestReadyGate_Prewarm_NoStoreCall(t *testing.T) { + _, ctx := logging.Configure(context.Background(), logging.Config{}) + + mux := newTestMux() + cm := gitclone.NewManagerProvider(ctx, gitclone.Config{ + MirrorRoot: filepath.Join(t.TempDir(), "clones"), + FetchInterval: 15, + }, nil) + s, err := git.New(ctx, git.Config{PrewarmMostClonedRepos: 10}, newTestScheduler(ctx, t), nil, mux, cm, + func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + assert.NoError(t, err) + + // Give the existing-warm goroutine ample time to complete; Ready() must + // still report false because the prewarm pass has not run. + time.Sleep(200 * time.Millisecond) + assert.False(t, s.Ready(), "strategy must not report ready while prewarm-most-cloned-repos is set and SetMetadataStore has not been called") +} + func TestParseGitRefs(t *testing.T) { _, ctx := logging.Configure(context.Background(), logging.Config{}) _ = ctx