diff --git a/.gitignore b/.gitignore index c553d7b..2af1b1f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Binaries /bin/ /artemis +/loadgen *.exe *.test *.out diff --git a/cmd/artemis/gcwire.go b/cmd/artemis/gcwire.go new file mode 100644 index 0000000..7f515fe --- /dev/null +++ b/cmd/artemis/gcwire.go @@ -0,0 +1,122 @@ +package main + +import ( + "fmt" + "strings" + "time" + + "github.com/freeCodeCamp/artemis/internal/backfill" + "github.com/freeCodeCamp/artemis/internal/config" + "github.com/freeCodeCamp/artemis/internal/gc" + "github.com/freeCodeCamp/artemis/internal/handler" + "github.com/freeCodeCamp/artemis/internal/pg" + "github.com/freeCodeCamp/artemis/internal/r2" + "github.com/freeCodeCamp/artemis/internal/registry/valkey" +) + +var ( + _ handler.SiteChangeEmitter = (*pg.Repo)(nil) + _ handler.TombstoneStore = (*pg.Repo)(nil) + _ handler.RepoStore = (*pg.RepoQueue)(nil) + _ backfill.Lister = (*r2.Client)(nil) + _ backfill.Indexer = (*pg.Repo)(nil) + _ pg.SitesSource = (*valkey.Store)(nil) +) + +func openRepoQueue(pgDB *pg.DB) (handler.RepoStore, error) { + if pgDB == nil { + return nil, fmt.Errorf("repo-creation feature requires DATABASE_URL") + } + return pg.NewRepoQueue(pgDB), nil +} + +const deployIDToken = "-" + +type gcLayout struct { + sitePrefix func(site string) string + deployPrefix func(site, id string) string + trashPrefix func(site, id string) string +} + +func newGCLayout(format, trashBase string) (gcLayout, error) { + idx := strings.Index(format, deployIDToken) + if idx < 0 { + return gcLayout{}, fmt.Errorf("DEPLOY_PREFIX_FORMAT %q must contain %s", format, deployIDToken) + } + head := format[:idx] + tail := format[idx+len(deployIDToken):] + slash := strings.IndexByte(head, '/') + if slash < 0 { + return gcLayout{}, fmt.Errorf("DEPLOY_PREFIX_FORMAT %q must contain '/' after the site segment", format) + } + subPath := head[slash+1:] + if trashBase == "" { + trashBase = "_trash/" + } + return gcLayout{ + sitePrefix: func(site 
string) string { return site + "/" + subPath }, + deployPrefix: func(site, id string) string { + p := site + "/" + subPath + id + tail + if !strings.HasSuffix(p, "/") { + p += "/" + } + return p + }, + trashPrefix: func(site, id string) string { return trashBase + site + "/" + id + "/" }, + }, nil +} + +func gcPolicy(c config.CleanupConfig) gc.Policy { + return gc.Policy{ + RecentKeep: c.RecentKeep, + Grace: c.Grace, + Retention: time.Duration(c.RetentionDays) * 24 * time.Hour, + ServeCacheTTL: c.ServeCacheTTL, + } +} + +type gcWiring struct { + Repo *pg.Repo + SiteGC *gc.SiteGC + Reconciler *gc.Reconciler + Purge *gc.TombstonePurge +} + +func newGCWiring(cfg *config.Config, repo *pg.Repo, r2c *r2.Client, metrics *gc.Metrics) (*gcWiring, error) { + layout, err := newGCLayout(cfg.DeployPrefixFormat, cfg.Cleanup.TrashPrefix) + if err != nil { + return nil, err + } + return &gcWiring{ + Repo: repo, + SiteGC: &gc.SiteGC{ + Store: repo, + Mover: r2c, + Policy: gcPolicy(cfg.Cleanup), + BlastCap: cfg.Cleanup.BlastCap, + DeployPrefix: layout.deployPrefix, + TrashPrefix: layout.trashPrefix, + Now: time.Now, + Metrics: metrics, + }, + Reconciler: &gc.Reconciler{ + Lister: r2c, + Store: repo, + Mover: r2c, + Grace: cfg.Cleanup.Grace, + SitePrefix: layout.sitePrefix, + DeployPrefix: layout.deployPrefix, + TrashPrefix: layout.trashPrefix, + Now: time.Now, + Metrics: metrics, + }, + Purge: &gc.TombstonePurge{ + Store: repo, + Deleter: r2c, + Recovery: time.Duration(cfg.Cleanup.RecoveryDays) * 24 * time.Hour, + TrashBase: cfg.Cleanup.TrashPrefix, + Now: time.Now, + Metrics: metrics, + }, + }, nil +} diff --git a/cmd/artemis/gcwire_test.go b/cmd/artemis/gcwire_test.go new file mode 100644 index 0000000..1bdf600 --- /dev/null +++ b/cmd/artemis/gcwire_test.go @@ -0,0 +1,130 @@ +package main + +import ( + "testing" + "time" + + "github.com/freeCodeCamp/artemis/internal/config" + "github.com/freeCodeCamp/artemis/internal/pg" + "github.com/freeCodeCamp/artemis/internal/r2" + 
"github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestOpenRepoQueue_RequiresDatabase(t *testing.T) { + q, err := openRepoQueue(nil) + require.Error(t, err, "repo feature without a database must be rejected at boot") + require.Nil(t, q) +} + +func TestOpenRepoQueue_IsPostgresBacked(t *testing.T) { + q, err := openRepoQueue(&pg.DB{}) + require.NoError(t, err) + _, ok := q.(*pg.RepoQueue) + assert.True(t, ok, "repo queue must be backed by pg.RepoQueue") +} + +func TestBootWiringProdLayout(t *testing.T) { + cases := []struct { + name string + format string + trashBase string + site string + id string + wantSite string + wantDeploy string + wantTrash string + }{ + { + name: "default-dev-layout", + format: "/deploys/-/", + trashBase: "_trash/", + site: "www", + id: "20260101-000000-abc1234", + wantSite: "www/deploys/", + wantDeploy: "www/deploys/20260101-000000-abc1234/", + wantTrash: "_trash/www/20260101-000000-abc1234/", + }, + { + name: "prod-dirname-layout", + format: ".freecode.camp/deploys/-/", + trashBase: "_trash/", + site: "www.freecode.camp", + id: "20260101-000000-abc1234", + wantSite: "www.freecode.camp/deploys/", + wantDeploy: "www.freecode.camp/deploys/20260101-000000-abc1234/", + wantTrash: "_trash/www.freecode.camp/20260101-000000-abc1234/", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + l, err := newGCLayout(tc.format, tc.trashBase) + require.NoError(t, err) + assert.Equal(t, tc.wantSite, l.sitePrefix(tc.site), "sitePrefix") + assert.Equal(t, tc.wantDeploy, l.deployPrefix(tc.site, tc.id), "deployPrefix") + assert.Equal(t, tc.wantTrash, l.trashPrefix(tc.site, tc.id), "trashPrefix") + }) + } +} + +func TestBootWiring_LayoutRejectsBadFormat(t *testing.T) { + _, err := newGCLayout("/deploys/", "_trash/") + require.Error(t, err, "format without the deploy-id token must be rejected") +} + +func TestNewGCWiring_PlumbsBlastCapAndPrefixes(t *testing.T) { + cfg := &config.Config{ + 
DeployPrefixFormat: "/deploys/-/", + Cleanup: config.CleanupConfig{ + BlastCap: 5, + RetentionDays: 7, + RecoveryDays: 3, + TrashPrefix: "_trash/", + }, + } + repo := &pg.Repo{} + r2c := &r2.Client{} + + w, err := newGCWiring(cfg, repo, r2c, nil) + require.NoError(t, err) + require.NotNil(t, w) + + assert.Same(t, repo, w.Repo, "repo must be plumbed through") + assert.Equal(t, 5, w.SiteGC.BlastCap, "BlastCap=0 would disable the mass-delete safety cap") + assert.Equal(t, 7*24*time.Hour, w.SiteGC.Policy.Retention, "policy retention must derive from RetentionDays") + assert.Equal(t, "_trash/", w.Purge.TrashBase, "purge must scan the configured trash base") + assert.Equal(t, 3*24*time.Hour, w.Purge.Recovery, "recovery window must derive from RecoveryDays") + + require.NotNil(t, w.SiteGC.DeployPrefix) + require.NotNil(t, w.SiteGC.TrashPrefix) + require.NotNil(t, w.Reconciler.SitePrefix) + require.NotNil(t, w.Reconciler.DeployPrefix) + + assert.Equal(t, "www/deploys/id/", w.SiteGC.DeployPrefix("www", "id"), + "a wrong deploy-prefix closure would mass-move the wrong R2 prefix") + assert.Equal(t, "_trash/www/id/", w.SiteGC.TrashPrefix("www", "id")) + assert.Equal(t, "www/deploys/", w.Reconciler.SitePrefix("www")) +} + +func TestNewGCWiring_RejectsBadFormat(t *testing.T) { + cfg := &config.Config{ + DeployPrefixFormat: "/deploys/", + Cleanup: config.CleanupConfig{BlastCap: 5, TrashPrefix: "_trash/"}, + } + w, err := newGCWiring(cfg, &pg.Repo{}, &r2.Client{}, nil) + require.Error(t, err, "a format missing the deploy-id token must fail boot wiring, not produce a degenerate prefix fn") + require.Nil(t, w) +} + +func TestGCPolicyFromConfig(t *testing.T) { + p := gcPolicy(config.CleanupConfig{ + RecentKeep: 3, + Grace: time.Hour, + RetentionDays: 7, + ServeCacheTTL: 15 * time.Second, + }) + assert.Equal(t, 3, p.RecentKeep) + assert.Equal(t, time.Hour, p.Grace) + assert.Equal(t, 7*24*time.Hour, p.Retention) + assert.Equal(t, 15*time.Second, p.ServeCacheTTL) +} diff --git 
a/cmd/artemis/gcworkflows.go b/cmd/artemis/gcworkflows.go new file mode 100644 index 0000000..341c460 --- /dev/null +++ b/cmd/artemis/gcworkflows.go @@ -0,0 +1,113 @@ +package main + +import ( + "context" + "errors" + "log/slog" + "time" + + "github.com/freeCodeCamp/artemis/internal/observability" + "github.com/freeCodeCamp/artemis/internal/pg" + "github.com/freeCodeCamp/artemis/internal/worker" +) + +const ( + topicSiteReconcile = "site.reconcile" + cronTombstonePurge = "0 3 * * *" + relayInterval = 5 * time.Second +) + +func runRelayLoop(ctx context.Context, relay *worker.Relay, interval time.Duration, metrics *worker.Metrics) { + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + n, err := relay.RunOnce(ctx) + metrics.ObserveRelay(n, err) + if err != nil { + slog.Error("relay.run", "err", err) + observability.CaptureBackground("relay.run", err) + } + } + } +} + +func observeWorkflow(metrics *worker.Metrics, name string, fn worker.Handler) worker.Handler { + return func(ctx context.Context, input map[string]any) error { + err := fn(ctx, input) + outcome := "ok" + if err != nil { + outcome = "failed" + } + metrics.ObserveRun(name, outcome) + return err + } +} + +func gcWorkflowDefs(gcw *gcWiring, dryRun bool, metrics *worker.Metrics) []worker.WorkflowDef { + return []worker.WorkflowDef{ + { + Name: worker.WorkflowGCSite, + ConcurrencyKey: worker.ConcurrencyKeySite, + EventTriggers: []string{pg.TopicSiteChanged}, + Handler: observeWorkflow(metrics, worker.WorkflowGCSite, func(ctx context.Context, input map[string]any) error { + site, err := siteFromInput(input) + if err != nil { + return err + } + if _, err := gcw.SiteGC.Run(ctx, site, dryRun); err != nil { + observability.CaptureBackground("gc.site.run", err) + return err + } + return nil + }), + }, + { + Name: worker.WorkflowTombstonePurge, + Cron: []string{cronTombstonePurge}, + Handler: observeWorkflow(metrics, 
worker.WorkflowTombstonePurge, func(ctx context.Context, _ map[string]any) error { + if _, err := gcw.Purge.Run(ctx, dryRun); err != nil { + observability.CaptureBackground("tombstone.purge", err) + return err + } + return nil + }), + }, + { + Name: worker.WorkflowReconcile, + ConcurrencyKey: worker.ConcurrencyKeySite, + EventTriggers: []string{topicSiteReconcile}, + Handler: observeWorkflow(metrics, worker.WorkflowReconcile, func(ctx context.Context, input map[string]any) error { + site, err := siteFromInput(input) + if err != nil { + return err + } + if _, err := gcw.Reconciler.ReconcileSite(ctx, site); err != nil { + observability.CaptureBackground("reconcile.run", err) + return err + } + return nil + }), + }, + } +} + +func siteFromInput(input map[string]any) (string, error) { + s, ok := input["site"].(string) + if !ok || s == "" { + return "", errors.New("workflow input missing site") + } + return s, nil +} + +func registerGCWorkflows(rt *worker.Runtime, gcw *gcWiring, dryRun bool, metrics *worker.Metrics) error { + for _, def := range gcWorkflowDefs(gcw, dryRun, metrics) { + if err := rt.Register(def); err != nil { + return err + } + } + return nil +} diff --git a/cmd/artemis/gcworkflows_test.go b/cmd/artemis/gcworkflows_test.go new file mode 100644 index 0000000..80943ba --- /dev/null +++ b/cmd/artemis/gcworkflows_test.go @@ -0,0 +1,125 @@ +package main + +import ( + "context" + "errors" + "testing" + + "github.com/freeCodeCamp/artemis/internal/gc" + "github.com/freeCodeCamp/artemis/internal/pg" + "github.com/freeCodeCamp/artemis/internal/worker" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGCWorkflowDefs(t *testing.T) { + gcw := &gcWiring{SiteGC: &gc.SiteGC{}, Purge: &gc.TombstonePurge{}, Reconciler: &gc.Reconciler{}} + defs := gcWorkflowDefs(gcw, true, nil) + require.Len(t, defs, 3) + + byName := 
map[string]worker.WorkflowDef{} + for _, d := range defs { + byName[d.Name] = d + require.NotNilf(t, d.Handler, "%s handler must be set", d.Name) + } + + gcSite := byName[worker.WorkflowGCSite] + assert.Equal(t, worker.ConcurrencyKeySite, gcSite.ConcurrencyKey, "gc-site serialized per site (V3)") + assert.Equal(t, []string{pg.TopicSiteChanged}, gcSite.EventTriggers, "gc-site triggered by the outbox topic") + + purge := byName[worker.WorkflowTombstonePurge] + assert.Empty(t, purge.ConcurrencyKey, "tombstone-purge is global") + assert.NotEmpty(t, purge.Cron, "tombstone-purge is scheduled") + + rec := byName[worker.WorkflowReconcile] + assert.Equal(t, worker.ConcurrencyKeySite, rec.ConcurrencyKey, "reconcile serialized per site") +} + +func TestSiteFromInput(t *testing.T) { + s, err := siteFromInput(map[string]any{"site": "www.freecode.camp"}) + require.NoError(t, err) + assert.Equal(t, "www.freecode.camp", s) + + _, err = siteFromInput(map[string]any{}) + require.Error(t, err, "missing site rejected") + _, err = siteFromInput(map[string]any{"site": ""}) + require.Error(t, err, "empty site rejected") +} + +func TestObserveWorkflow_RecordsOutcome(t *testing.T) { + cases := []struct { + name string + inner error + wantErr bool + wantRuns float64 + wantFailures float64 + outcome string + }{ + {name: "failed-run-bumps-failures", inner: errors.New("boom"), wantErr: true, wantRuns: 1, wantFailures: 1, outcome: "failed"}, + {name: "ok-run-leaves-failures-zero", inner: nil, wantErr: false, wantRuns: 1, wantFailures: 0, outcome: "ok"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + m := worker.NewMetrics(prometheus.NewRegistry()) + wrapped := observeWorkflow(m, worker.WorkflowGCSite, func(context.Context, map[string]any) error { + return tc.inner + }) + + err := wrapped(context.Background(), nil) + if tc.wantErr { + require.Error(t, err, "wrapper must propagate the inner error") + } else { + require.NoError(t, err) + } + + assert.Equal(t, tc.wantRuns, 
+ testutil.ToFloat64(m.WorkflowRuns.WithLabelValues(worker.WorkflowGCSite, tc.outcome)), + "runs{outcome=%s} must be recorded", tc.outcome) + assert.Equal(t, tc.wantFailures, + testutil.ToFloat64(m.WorkflowFailures.WithLabelValues(worker.WorkflowGCSite)), + "WorkflowFailures is the alerting signal; failed runs must bump it, ok runs must not") + }) + } +} + +func TestGCWorkflowHandlers_RejectMissingSite(t *testing.T) { + gcw := &gcWiring{SiteGC: &gc.SiteGC{}, Reconciler: &gc.Reconciler{}, Purge: &gc.TombstonePurge{}} + defs := gcWorkflowDefs(gcw, true, nil) + byName := map[string]worker.WorkflowDef{} + for _, d := range defs { + byName[d.Name] = d + } + + cases := []struct { + name string + workflow string + input map[string]any + }{ + {name: "gc-site-empty-input", workflow: worker.WorkflowGCSite, input: map[string]any{}}, + {name: "reconcile-empty-site", workflow: worker.WorkflowReconcile, input: map[string]any{"site": ""}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + def := byName[tc.workflow] + require.NotNil(t, def.Handler, "%s handler must be set", tc.workflow) + err := def.Handler(context.Background(), tc.input) + require.ErrorContains(t, err, "missing site", + "the siteFromInput guard must short-circuit before SiteGC.Run on a nil/empty site, or a mass-move could target the wrong prefix") + }) + } +} + +type captureEngine struct{ defs []worker.WorkflowDef } + +func (c *captureEngine) Register(d worker.WorkflowDef) error { c.defs = append(c.defs, d); return nil } +func (c *captureEngine) Start(context.Context) error { return nil } +func (c *captureEngine) Stop(context.Context) error { return nil } + +func TestRegisterGCWorkflows(t *testing.T) { + gcw := &gcWiring{SiteGC: &gc.SiteGC{}, Purge: &gc.TombstonePurge{}, Reconciler: &gc.Reconciler{}} + rt := worker.NewRuntime(&captureEngine{}) + require.NoError(t, registerGCWorkflows(rt, gcw, false, nil)) + assert.Len(t, rt.Registered(), 3) +} diff --git a/cmd/artemis/main.go 
b/cmd/artemis/main.go index a636c8c..4ffdd63 100644 --- a/cmd/artemis/main.go +++ b/cmd/artemis/main.go @@ -19,15 +19,22 @@ import ( "time" "github.com/freeCodeCamp/artemis/internal/auth" + "github.com/freeCodeCamp/artemis/internal/backfill" "github.com/freeCodeCamp/artemis/internal/config" + "github.com/freeCodeCamp/artemis/internal/gc" "github.com/freeCodeCamp/artemis/internal/githubapp" "github.com/freeCodeCamp/artemis/internal/handler" + "github.com/freeCodeCamp/artemis/internal/hatchet" "github.com/freeCodeCamp/artemis/internal/observability" + "github.com/freeCodeCamp/artemis/internal/pg" "github.com/freeCodeCamp/artemis/internal/r2" + "github.com/freeCodeCamp/artemis/internal/registry" "github.com/freeCodeCamp/artemis/internal/registry/valkey" - repovalkey "github.com/freeCodeCamp/artemis/internal/reporequest/valkey" "github.com/freeCodeCamp/artemis/internal/server" + "github.com/freeCodeCamp/artemis/internal/teamcache" + "github.com/freeCodeCamp/artemis/internal/worker" "github.com/prometheus/client_golang/prometheus" + "github.com/redis/go-redis/v9" ) // Build-time identity, injected via -ldflags "-X main.version=... -X main.commit=...". 
@@ -89,7 +96,18 @@ func run() error { rootCtx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) defer stop() - registryStore, registryReader, registryCleanup, err := openRegistry(rootCtx, cfg) + pgDB, pgCleanup, err := openPostgres(rootCtx, cfg) + if err != nil { + return fmt.Errorf("open postgres: %w", err) + } + defer pgCleanup() + if pgDB != nil { + slog.Info("postgres: connected, migrations applied") + } else { + slog.Info("postgres disabled (DATABASE_URL unset); deploy-only mode, GC off") + } + + registryWriter, registryReader, registryHealth, registryCleanup, err := openRegistry(rootCtx, cfg, pgDB) if err != nil { return fmt.Errorf("open registry: %w", err) } @@ -107,11 +125,18 @@ func run() error { return fmt.Errorf("init r2: %w", err) } + githubTeamCache, teamCacheCleanup, err := openTeamCache(rootCtx, cfg) + if err != nil { + return fmt.Errorf("open team cache: %w", err) + } + defer teamCacheCleanup() + // GitHub identity client. ghClient := auth.NewGitHubClient(auth.GitHubClientConfig{ - APIBase: cfg.GitHub.APIBase, - Org: cfg.GitHub.Org, - CacheTTL: cfg.GitHub.MembershipCacheTTL, + APIBase: cfg.GitHub.APIBase, + Org: cfg.GitHub.Org, + CacheTTL: cfg.GitHub.MembershipCacheTTL, + TeamCache: githubTeamCache, }) // JWT signer. @@ -125,7 +150,7 @@ func run() error { // routes left unmounted. repoGH probes membership in the Universe org // (cfg.Repo.Org), distinct from ghClient's site-registry org. 
var ( - repoStore *repovalkey.Store + repoStore handler.RepoStore repoGH *auth.GitHubClient appClient *githubapp.Client ) @@ -143,14 +168,10 @@ func run() error { if err != nil { return fmt.Errorf("init github app client: %w", err) } - repoStore, err = repovalkey.New(rootCtx, repovalkey.Config{ - Addr: cfg.Registry.Valkey.Addr, - Password: cfg.Registry.Valkey.Password, - }) + repoStore, err = openRepoQueue(pgDB) if err != nil { return fmt.Errorf("open repo-request store: %w", err) } - defer func() { _ = repoStore.Close() }() repoGH = auth.NewGitHubClient(auth.GitHubClientConfig{ APIBase: cfg.GitHub.APIBase, Org: cfg.Repo.Org, @@ -171,21 +192,77 @@ func run() error { metricsReg := prometheus.NewRegistry() metrics := handler.NewMetrics(metricsReg) handler.SetMetrics(metrics) + workerMetrics := worker.NewMetrics(metricsReg) registryReader.SetOnRefreshError(func(err error) { metrics.RegistryRefreshFailures.Inc() observability.CaptureBackground("registry.refresh", err) }) + var gcw *gcWiring + if pgDB != nil { + gcw, err = newGCWiring(cfg, pg.NewRepo(pgDB), r2Client, gc.NewMetrics(metricsReg)) + if err != nil { + return fmt.Errorf("wire gc: %w", err) + } + slog.Info("gc: wired", + "siteGCReady", gcw.SiteGC != nil, + "blastCap", cfg.Cleanup.BlastCap, + "retentionDays", cfg.Cleanup.RetentionDays, + "dryRun", cfg.Cleanup.DryRun, + ) + } + + var pgRepo *pg.Repo + if gcw != nil { + pgRepo = gcw.Repo + } + + if cfg.BackfillOnBoot { + if pgRepo == nil { + return fmt.Errorf("BACKFILL_ON_BOOT set but DATABASE_URL is unset") + } + res, err := (&backfill.Backfill{Lister: r2Client, Indexer: pgRepo, Now: time.Now}).Run(rootCtx) + if err != nil { + return fmt.Errorf("backfill: %w", err) + } + slog.Info("backfill complete (one-shot)", + "sites", res.Sites, "deploys", res.Deploys, "aliases", res.Aliases) + return nil + } + + var hatchetAdapter *hatchet.Adapter + workerErrCh := make(chan error, 1) + if gcw != nil && cfg.Hatchet.Addr != "" { + hatchetAdapter = hatchet.New(hatchet.Config{ 
+ Token: cfg.Hatchet.ClientToken, + Addr: cfg.Hatchet.Addr, + WorkerName: "artemis", + }) + workerRuntime := worker.NewRuntime(hatchetAdapter) + if err := registerGCWorkflows(workerRuntime, gcw, cfg.Cleanup.DryRun, workerMetrics); err != nil { + return fmt.Errorf("register gc workflows: %w", err) + } + go func() { + slog.Info("worker: starting", "addr", cfg.Hatchet.Addr) + workerErrCh <- workerRuntime.Start(rootCtx) + }() + + relay := &worker.Relay{Source: pgRepo, Publisher: hatchetAdapter, Batch: 100, Now: time.Now} + go runRelayLoop(rootCtx, relay, relayInterval, workerMetrics) + slog.Info("outbox relay: started", "interval", relayInterval) + } + h := &handler.Handlers{ GH: ghClient, JWT: signer, Sites: registryReader, - Registry: registryStore, - Health: registryStore, + Registry: registryWriter, + Health: registryHealth, R2: r2Client, AliasProductionFmt: cfg.Aliases.ProductionKeyFormat, AliasPreviewFmt: cfg.Aliases.PreviewKeyFormat, DeployPrefix: deployPrefix, + TrashPrefixBase: cfg.Cleanup.TrashPrefix, UploadMaxBytes: cfg.UploadMaxBytes, RegistryAuthzTeam: cfg.Registry.AuthzTeam, RepoOrg: cfg.Repo.Org, @@ -205,6 +282,14 @@ func run() error { h.GitHubApp = appClient } + if pgRepo != nil { + h.Outbox = pgRepo + h.Tombstones = pgRepo + } + if pgDB != nil { + h.PGHealth = pgDB + } + addr := ":" + strconv.Itoa(cfg.Port) srv := &http.Server{ Addr: addr, @@ -228,6 +313,10 @@ func run() error { slog.Info("artemis: shutdown signal received") case err := <-errCh: return fmt.Errorf("listen: %w", err) + case err := <-workerErrCh: + if err != nil { + return fmt.Errorf("worker: %w", err) + } } shutdownCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second) @@ -239,24 +328,74 @@ func run() error { return nil } -// openRegistry constructs the Valkey-backed registry store + reader. -// The store is the Writer surface used by /api/site/{register,update, -// delete}; the reader is the Reader surface used by every read-side -// handler. 
Cleanup MUST be called on shutdown to close the connection. -func openRegistry(ctx context.Context, cfg *config.Config) (*valkey.Store, *valkey.Reader, func(), error) { +func openPostgres(ctx context.Context, cfg *config.Config) (*pg.DB, func(), error) { + if !cfg.GCEnabled() { + return nil, func() {}, nil + } + db, err := pg.New(ctx, pg.Config{DatabaseURL: cfg.DatabaseURL}) + if err != nil { + return nil, nil, fmt.Errorf("connect: %w", err) + } + if err := pg.Migrate(ctx, db.Pool); err != nil { + db.Close() + return nil, nil, fmt.Errorf("migrate: %w", err) + } + return db, db.Close, nil +} + +// openRegistry constructs the registry Writer, read-side Reader, and +// health probe. When pgDB is non-nil, pg.RegistryStore is the +// source-of-truth (Writer + Reader source) and Valkey is the +// OnChange-published cache-front transport; otherwise Valkey is the +// source-of-truth. Cleanup MUST be called on shutdown. +func openRegistry(ctx context.Context, cfg *config.Config, pgDB *pg.DB) (registry.Writer, *valkey.Reader, *valkey.Store, func(), error) { store, err := valkey.New(ctx, valkey.Config{ Addr: cfg.Registry.Valkey.Addr, Password: cfg.Registry.Valkey.Password, }) if err != nil { - return nil, nil, nil, fmt.Errorf("valkey: %w", err) + return nil, nil, nil, nil, fmt.Errorf("valkey: %w", err) } - reader, err := valkey.NewReader(ctx, store, valkey.DefaultRefreshFallback) + + var ( + writer registry.Writer = store + source valkey.SitesSource = store + ) + if pgDB != nil { + pgReg := pg.NewRegistryStore(pgDB).WithOnChange(valkey.PublishOnChange(ctx, store)) + imported, err := pgReg.Import(ctx, store) + if err != nil { + _ = store.Close() + return nil, nil, nil, nil, fmt.Errorf("registry import: %w", err) + } + if imported > 0 { + slog.Info("registry import complete (one-shot)", "sites", imported) + } + writer = pgReg + source = pgReg + } + + reader, err := valkey.NewReaderFromSource(ctx, source, store, valkey.DefaultRefreshFallback) if err != nil { _ = store.Close() - 
return nil, nil, nil, fmt.Errorf("valkey reader: %w", err) + return nil, nil, nil, nil, fmt.Errorf("valkey reader: %w", err) + } + return writer, reader, store, func() { _ = store.Close() }, nil +} + +func openTeamCache(ctx context.Context, cfg *config.Config) (auth.TeamCache, func(), error) { + if cfg.Registry.Valkey.Addr == "" { + return nil, func() {}, nil + } + client := redis.NewClient(&redis.Options{ + Addr: cfg.Registry.Valkey.Addr, + Password: cfg.Registry.Valkey.Password, + }) + if err := client.Ping(ctx).Err(); err != nil { + _ = client.Close() + return nil, func() {}, fmt.Errorf("teamcache ping %s: %w", cfg.Registry.Valkey.Addr, err) } - return store, reader, func() { _ = store.Close() }, nil + return teamcache.New(client, cfg.GitHub.MembershipCacheTTL), func() { _ = client.Close() }, nil } func parseLogLevel(level string) slog.Level { diff --git a/cmd/artemis/main_test.go b/cmd/artemis/main_test.go new file mode 100644 index 0000000..2ca41e1 --- /dev/null +++ b/cmd/artemis/main_test.go @@ -0,0 +1,48 @@ +package main + +import ( + "context" + "testing" + + "github.com/freeCodeCamp/artemis/internal/config" + "github.com/stretchr/testify/require" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/modules/postgres" +) + +func TestBootMigrations(t *testing.T) { + ctx := context.Background() + + db, cleanup, err := openPostgres(ctx, &config.Config{}) + require.NoError(t, err, "empty DATABASE_URL must not error") + require.Nil(t, db, "no DATABASE_URL -> no pool (deploy-only mode)") + require.NotNil(t, cleanup, "cleanup must be safe to call when gated off") + cleanup() + + testcontainers.SkipIfProviderIsNotHealthy(t) + + container, err := postgres.Run(ctx, "postgres:16-alpine", + postgres.WithDatabase("artemis_test"), + postgres.WithUsername("artemis"), + postgres.WithPassword("artemis"), + postgres.BasicWaitStrategies(), + ) + require.NoError(t, err) + t.Cleanup(func() { _ = container.Terminate(ctx) }) + + connStr, 
err := container.ConnectionString(ctx, "sslmode=disable") + require.NoError(t, err) + + db, cleanup, err = openPostgres(ctx, &config.Config{DatabaseURL: connStr}) + require.NoError(t, err) + require.NotNil(t, db, "DATABASE_URL set -> pool opened") + t.Cleanup(cleanup) + + for _, table := range []string{"deploys", "aliases", "tombstones", "outbox", "schema_migrations"} { + var exists bool + require.NoError(t, db.Pool.QueryRow(ctx, + "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = $1)", + table).Scan(&exists)) + require.Truef(t, exists, "table %q must exist after boot migrations", table) + } +} diff --git a/cmd/artemis/relayloop_test.go b/cmd/artemis/relayloop_test.go new file mode 100644 index 0000000..dab89df --- /dev/null +++ b/cmd/artemis/relayloop_test.go @@ -0,0 +1,112 @@ +package main + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + "github.com/freeCodeCamp/artemis/internal/pg" + "github.com/freeCodeCamp/artemis/internal/worker" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/require" +) + +type fakeOutbox struct { + mu sync.Mutex + events []pg.OutboxEvent + marked []int64 +} + +func (f *fakeOutbox) FetchUnpublished(_ context.Context, limit int) ([]pg.OutboxEvent, error) { + f.mu.Lock() + defer f.mu.Unlock() + out := make([]pg.OutboxEvent, 0, len(f.events)) + for _, e := range f.events { + if len(out) >= limit { + break + } + out = append(out, e) + } + return out, nil +} + +func (f *fakeOutbox) MarkPublished(_ context.Context, ids []int64, _ time.Time) error { + f.mu.Lock() + defer f.mu.Unlock() + f.marked = append(f.marked, ids...) 
+ f.events = nil + return nil +} + +type fakePublisher struct { + mu sync.Mutex + n int +} + +func (p *fakePublisher) Publish(context.Context, string, []byte) error { + p.mu.Lock() + p.n++ + p.mu.Unlock() + return nil +} + +func (p *fakePublisher) count() int { + p.mu.Lock() + defer p.mu.Unlock() + return p.n +} + +func TestRelayLoop(t *testing.T) { + src := &fakeOutbox{events: []pg.OutboxEvent{ + {ID: 1, Topic: pg.TopicSiteChanged, Payload: []byte(`{"site":"www.freecode.camp"}`)}, + }} + pub := &fakePublisher{} + relay := &worker.Relay{Source: src, Publisher: pub, Batch: 10, Now: time.Now} + metrics := worker.NewMetrics(prometheus.NewRegistry()) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { runRelayLoop(ctx, relay, time.Millisecond, metrics); close(done) }() + + require.Eventually(t, func() bool { return pub.count() >= 1 }, 2*time.Second, time.Millisecond, + "relay loop must drain the outbox on tick") + require.Eventually(t, func() bool { return testutil.ToFloat64(metrics.RelayPublished) >= 1 }, 2*time.Second, time.Millisecond, + "relay loop must record published rows on /metrics") + + cancel() + select { + case <-done: + case <-time.After(time.Second): + t.Fatal("runRelayLoop must return when ctx is cancelled") + } +} + +type erroringOutbox struct{} + +func (erroringOutbox) FetchUnpublished(context.Context, int) ([]pg.OutboxEvent, error) { + return nil, errors.New("db down") +} + +func (erroringOutbox) MarkPublished(context.Context, []int64, time.Time) error { return nil } + +func TestRelayLoop_FailedTickBumpsFailures(t *testing.T) { + relay := &worker.Relay{Source: erroringOutbox{}, Publisher: &fakePublisher{}, Batch: 10, Now: time.Now} + metrics := worker.NewMetrics(prometheus.NewRegistry()) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { runRelayLoop(ctx, relay, time.Millisecond, metrics); close(done) }() + + require.Eventually(t, func() bool { 
return testutil.ToFloat64(metrics.RelayFailures) >= 1 }, 2*time.Second, time.Millisecond, + "a relay RunOnce error must bump RelayFailures so a stalled outbox alerts") + + cancel() + select { + case <-done: + case <-time.After(time.Second): + t.Fatal("runRelayLoop must return when ctx is cancelled even after error ticks") + } +} diff --git a/cmd/loadgen/main.go b/cmd/loadgen/main.go new file mode 100644 index 0000000..c26fa70 --- /dev/null +++ b/cmd/loadgen/main.go @@ -0,0 +1,259 @@ +//go:build load + +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "os" + "sort" + "sync" + "sync/atomic" + "time" + + "github.com/freeCodeCamp/artemis/internal/gc" + "github.com/freeCodeCamp/artemis/internal/pg" + "github.com/freeCodeCamp/artemis/internal/worker" +) + +type config struct { + dsn string + sites int + deploysPerSite int + concurrency int + relayBatch int + keep bool +} + +type stageResult struct { + Stage string `json:"stage"` + Ops int `json:"ops"` + Millis float64 `json:"millis"` + OpsPerSec float64 `json:"ops_per_sec"` + P50Micros float64 `json:"p50_micros"` + P95Micros float64 `json:"p95_micros"` + P99Micros float64 `json:"p99_micros"` + MaxMicros float64 `json:"max_micros"` + Errors int `json:"errors"` +} + +type report struct { + StartedAt string `json:"started_at"` + Sites int `json:"sites"` + DeploysPerSite int `json:"deploys_per_site"` + Concurrency int `json:"concurrency"` + RelayBatch int `json:"relay_batch"` + PoolMaxConns int32 `json:"pool_max_conns"` + TotalDeploys int `json:"total_deploys"` + Stages []stageResult `json:"stages"` +} + +func main() { + cfg := parseFlags() + + ctx := context.Background() + db, err := pg.New(ctx, pg.Config{DatabaseURL: cfg.dsn}) + if err != nil { + fatal("connect: %v", err) + } + defer db.Close() + + if err := pg.Migrate(ctx, db.Pool); err != nil { + fatal("migrate: %v", err) + } + if !cfg.keep { + if err := truncate(ctx, db); err != nil { + fatal("truncate: %v", err) + } + } + + rep := report{ + 
StartedAt: time.Now().UTC().Format(time.RFC3339), + Sites: cfg.sites, + DeploysPerSite: cfg.deploysPerSite, + Concurrency: cfg.concurrency, + RelayBatch: cfg.relayBatch, + PoolMaxConns: db.Pool.Config().MaxConns, + TotalDeploys: cfg.sites * cfg.deploysPerSite, + } + + reg := pg.NewRegistryStore(db) + repo := pg.NewRepo(db) + + rep.Stages = append(rep.Stages, runRegister(ctx, cfg, reg)) + rep.Stages = append(rep.Stages, runDeploys(ctx, cfg, repo)) + rep.Stages = append(rep.Stages, runOutboxEnqueue(ctx, cfg, repo)) + rep.Stages = append(rep.Stages, runRelay(ctx, cfg, repo)) + rep.Stages = append(rep.Stages, runGCPlan(ctx, cfg, repo)) + + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + if err := enc.Encode(rep); err != nil { + fatal("encode: %v", err) + } +} + +func parseFlags() config { + var cfg config + flag.StringVar(&cfg.dsn, "dsn", envOr("LOADGEN_DATABASE_URL", "postgres://artemis:artemis@localhost:55433/artemis?sslmode=disable"), "postgres DSN") + flag.IntVar(&cfg.sites, "sites", 500, "number of sites to register") + flag.IntVar(&cfg.deploysPerSite, "deploys-per-site", 40, "deploys upserted per site") + flag.IntVar(&cfg.concurrency, "concurrency", 16, "worker goroutines driving PG") + flag.IntVar(&cfg.relayBatch, "relay-batch", 100, "outbox relay batch size") + flag.BoolVar(&cfg.keep, "keep", false, "skip TRUNCATE before the run") + flag.Parse() + return cfg +} + +func runRegister(ctx context.Context, cfg config, reg *pg.RegistryStore) stageResult { + teams := []string{"staff"} + return drive("register", cfg.sites, cfg.concurrency, func(i int) error { + _, err := reg.Register(ctx, siteSlug(i), teams, "loadgen") + return err + }) +} + +func runDeploys(ctx context.Context, cfg config, repo *pg.Repo) stageResult { + total := cfg.sites * cfg.deploysPerSite + base := time.Now().Add(-90 * 24 * time.Hour) + return drive("deploy_upsert", total, cfg.concurrency, func(i int) error { + site := siteSlug(i % cfg.sites) + seq := i / cfg.sites + id := 
fmt.Sprintf("%d-%08x", base.Add(time.Duration(seq)*time.Hour).Unix(), i) + return repo.UpsertDeploy(ctx, site, id, base.Add(time.Duration(seq)*time.Hour), 1<<20, true, "active") + }) +} + +func runOutboxEnqueue(ctx context.Context, cfg config, repo *pg.Repo) stageResult { + return drive("outbox_enqueue", cfg.sites, cfg.concurrency, func(i int) error { + return repo.EnqueueSiteChanged(ctx, siteSlug(i)) + }) +} + +func runRelay(ctx context.Context, cfg config, repo *pg.Repo) stageResult { + relay := &worker.Relay{Source: repo, Publisher: nopPublisher{}, Batch: cfg.relayBatch, Now: time.Now} + start := time.Now() + published := 0 + var samples []time.Duration + errs := 0 + for { + t0 := time.Now() + n, err := relay.RunOnce(ctx) + samples = append(samples, time.Since(t0)) + if err != nil { + errs++ + break + } + published += n + if n == 0 { + break + } + } + return summarize("relay_drain", published, errs, time.Since(start), samples) +} + +func runGCPlan(ctx context.Context, cfg config, repo *pg.Repo) stageResult { + g := &gc.SiteGC{ + Store: repo, + Mover: nopMover{}, + Policy: gc.Policy{RecentKeep: 10, Grace: 24 * time.Hour, Retention: 30 * 24 * time.Hour, ServeCacheTTL: time.Hour}, + BlastCap: 1000, + DeployPrefix: func(site, id string) string { return site + "/deploys/" + id + "/" }, + TrashPrefix: func(site, id string) string { return "_trash/" + site + "/" + id + "/" }, + Now: time.Now, + } + return drive("gc_plan_dryrun", cfg.sites, cfg.concurrency, func(i int) error { + _, err := g.Run(ctx, siteSlug(i), true) + return err + }) +} + +func drive(stage string, n, concurrency int, op func(i int) error) stageResult { + if concurrency < 1 { + concurrency = 1 + } + samples := make([]time.Duration, n) + var errs atomic.Int64 + var next atomic.Int64 + next.Store(-1) + + start := time.Now() + var wg sync.WaitGroup + for w := 0; w < concurrency; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for { + i := int(next.Add(1)) + if i >= n { + return + } + t0 := time.Now() 
+ if err := op(i); err != nil { + errs.Add(1) + } + samples[i] = time.Since(t0) + } + }() + } + wg.Wait() + return summarize(stage, n, int(errs.Load()), time.Since(start), samples) +} + +func summarize(stage string, ops, errs int, wall time.Duration, samples []time.Duration) stageResult { + millis := float64(wall.Nanoseconds()) / 1e6 + r := stageResult{Stage: stage, Ops: ops, Millis: round2(millis), Errors: errs} + if millis > 0 { + r.OpsPerSec = round2(float64(ops) / (millis / 1000)) + } + if len(samples) > 0 { + sorted := append([]time.Duration(nil), samples...) + sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] }) + r.P50Micros = micros(pct(sorted, 0.50)) + r.P95Micros = micros(pct(sorted, 0.95)) + r.P99Micros = micros(pct(sorted, 0.99)) + r.MaxMicros = micros(sorted[len(sorted)-1]) + } + return r +} + +func pct(sorted []time.Duration, p float64) time.Duration { + if len(sorted) == 0 { + return 0 + } + idx := int(p * float64(len(sorted)-1)) + return sorted[idx] +} + +func micros(d time.Duration) float64 { return round2(float64(d.Nanoseconds()) / 1e3) } + +func round2(f float64) float64 { return float64(int64(f*100+0.5)) / 100 } + +func truncate(ctx context.Context, db *pg.DB) error { + _, err := db.Pool.Exec(ctx, `TRUNCATE sites, deploys, aliases, tombstones, outbox RESTART IDENTITY CASCADE`) + return err +} + +func siteSlug(i int) string { return fmt.Sprintf("loadgen-site-%06d.freecode.camp", i) } + +type nopPublisher struct{} + +func (nopPublisher) Publish(context.Context, string, []byte) error { return nil } + +type nopMover struct{} + +func (nopMover) MovePrefix(context.Context, string, string) (int, error) { return 0, nil } + +func envOr(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} + +func fatal(format string, args ...any) { + fmt.Fprintf(os.Stderr, format+"\n", args...) 
+ os.Exit(1) +} diff --git a/docker-compose.yml b/docker-compose.yml index 5578718..4509d52 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,18 @@ name: artemis-local services: + postgres: + image: postgres:17-alpine + environment: + POSTGRES_USER: artemis + POSTGRES_PASSWORD: artemis + POSTGRES_DB: artemis + healthcheck: + test: ["CMD-SHELL", "pg_isready -U artemis -d artemis"] + interval: 2s + timeout: 3s + retries: 30 + valkey: image: valkey/valkey:8-alpine healthcheck: @@ -34,8 +46,12 @@ services: build: context: . dockerfile: Dockerfile.fakegithub + network_mode: "service:artemis" + depends_on: + artemis: + condition: service_started environment: - FAKE_GH_ADDR: ":9001" + FAKE_GH_ADDR: "127.0.0.1:9001" FAKE_GH_ORG: "freeCodeCamp-Universe" FAKE_GH_USER: "smoke-bot" FAKE_GH_TEAMS: "staff,apollo-11-approvers" @@ -47,12 +63,12 @@ services: build: context: . depends_on: + postgres: + condition: service_healthy valkey: condition: service_healthy minio-setup: condition: service_completed_successfully - fakegithub: - condition: service_started environment: PORT: "8080" LOG_LEVEL: "debug" @@ -61,7 +77,8 @@ services: R2_SECRET_ACCESS_KEY: "minioadmin" R2_BUCKET: "universe-static-apps-01" VALKEY_ADDR: "valkey:6379" - GH_API_BASE: "http://fakegithub:9001" + DATABASE_URL: "postgres://artemis:artemis@postgres:5432/artemis?sslmode=disable" + GH_API_BASE: "http://127.0.0.1:9001" GH_ORG: "freeCodeCamp-Universe" GH_REPO_ORG: "freeCodeCamp-Universe" GH_CLIENT_ID: "Iv1.fakelocalclientid" @@ -69,5 +86,6 @@ services: GH_APP_ID: "123" GH_APP_INSTALLATION_ID: "456" GH_APP_PRIVATE_KEY: "${GH_APP_PRIVATE_KEY:-}" + REPO_APPROVE_AUTHZ_TEAM: "apollo-11-approvers" ports: - "8080:8080" diff --git a/docs/design/0001-durable-execution-model.md b/docs/design/0001-durable-execution-model.md new file mode 100644 index 0000000..07c469d --- /dev/null +++ b/docs/design/0001-durable-execution-model.md @@ -0,0 +1,167 @@ +# Local ADR 0001 — Durable execution model + deploy retention 
GC + +> **Status:** Accepted (direction locked) · **Date:** 2026-06-01 **Amends:** ADR-016 §"interaction-agnostic, synchronous, no background tasks" + ADR-017 §Pillar (artemis stateless→stateful) · **Lifted →** Universe **ADR-020 — Platform Durable Execution + Windmill Role Reframe** (Proposed, 2026-06-02; cross-ref `~/DEV/fCC-U/Architecture/decisions/020-durable-execution.md`). ADR-020 generalises this design to platform-ops and EXPANDS scope beyond the cleanup cron — see §13 note. **Reasoning trail / research:** `.scratchpad/2026-06-01-retention-gc-design.md` (GC flow research, 23 verified claims), dossier `.scratchpad/dossier/2026-06-01-artemis-retention-gc/`. **Platform alignment:** see §13 (ADR-001/008/017/019 drift). + +______________________________________________________________________ + +## 1. Context + +Moving deploy-retention GC off the Windmill cron into artemis surfaced a deeper truth: **every state-mutating artemis operation (deploy, promote, rollback, delete, GC, purge) is a durable, multi-step, crash-prone, concurrent saga**, not a request/response. Today they run synchronously in HTTP handlers with ad-hoc coordination. That mismatch is the root of the races, the orphan classes, and the "where does the sweep run" problem. + +Target scale: **10s of thousands of sites → millions of deploys** (Vercel/Netlify-class). At that scale full-bucket-scan GC is impossible (event-driven incremental is mandatory) and in-RAM state is untenable (need disk-durable, queryable metadata). + +Operator constraints (locked): + +- **No infra-primitive coupling** — app owns scheduling, coordination, reclaim. Object store stays the dumb vendor-neutral S3 subset (ADR-016: "swap R2 for MinIO = config change, not code"). +- **Self-hosted OSS only**, no SaaS, no feature paywall. +- **artemis owns its own data** — dedicated Postgres, not the platform's shared DB. + +## 2. 
Decision — the architecture + +```text + commands (CLI / CI / curl) + │ HTTP API (public contract, unchanged) + ▼ + ┌───────────────────────────────────────────────────────────┐ + │ artemis (HTTP + Hatchet workers, same binary/deployment) │ + │ handler ──tx──▶ Postgres(artemis-owned): metadata+outbox │ + │ Hatchet ───────▶ durable workflows, fair-sched by site │ + │ activities ────▶ R2 (bytes: dumb S3 put/get/list/delete) │ + │ hot reads ─────▶ Valkey (cache only, reconstructable) │ + └───────────────────────────────────────────────────────────┘ + + end users ──▶ CF ──▶ Caddy r2_alias (gxy-cassiopeia, RO token) ──▶ R2 [serve plane, independent] +``` + +| Plane | Owns | Authoritative for | +| ---------------------------------------- | ------------------------ | ------------------------------------------------------------- | +| **R2** `universe-static-apps-01` | bytes | object existence (dumb, swappable S3) | +| **Postgres** (artemis-dedicated) | metadata + jobs + outbox | deploy index, alias pointers, lifecycle state, workflow state | +| **Hatchet** (on artemis's Postgres, MIT) | durable execution | retries, timers, fair-scheduling, event triggers | +| **Valkey** | hot cache | nothing — pure speed layer, loss-safe | +| **Caddy `r2_alias`** | serve | independent; reads R2 directly, never calls artemis | + +Engine **Hatchet** (verified MIT, Postgres-native, no engine paywall — gating is hosting/SLA/RPS only). Chosen centerpiece feature: **concurrency / fair-scheduling with dynamic keys** — `key = site` gives structural per-tenant isolation (one site's burst cannot starve another's GC/deploy), replacing all hand-rolled per-site serialization. Composes with **event triggers** (event-driven incremental GC) and **dynamic rate-limit keys** (protect the shared GitHub App quota per tenant). + +## 3. Data & trust ownership + +- **artemis owns a dedicated Postgres** (bounded context — no shared-DB coupling with Apollo or other services). 
Same custody class as the R2 admin token + Valkey: provisioned to artemis, creds in `infra-secrets/management/artemis.env.enc` (sops+age). +- **Engine, phased.** **M1 = app-bundled single-instance Postgres** (Hatchet's own chart PG or a simple PG subchart) — matching the **live precedent on gxy-management/backoffice** (Windmill's bundled `postgresql` subchart, Outline's `postgres:16-alpine`; **CNPG operator is not deployed anywhere yet**). M1 backup = Windmill-style `postgres-rclone` CronJob → R2. **Later = CNPG sweep** (platform-wide, operator-managed, when multiple PG instances get standardized) folds artemis's PG in + adds the formal stateful-pillar backup floor. +- **Stateful-pillar trajectory.** ADR-017 lists artemis as a *stateless* pillar. Any in-cluster primary store makes it stateful — but M1's bundled-PG + rclone→R2 backup follows the **already-blessed Windmill pattern** (pragmatic, no new ceremony). The **formal ADR-019 carve-out** (CNPG T1/T2 + per-galaxy `management-cnpg-backups` + RPO ≤ 5 min / RTO ≤ 60 min + restore drill) lands at the CNPG sweep, gating artemis-PG **GA** — not M1 build. See §13 D1. +- **Hatchet co-locates** on artemis's Postgres (own schema) — required so the outbox relay + app tables share one instance for the §6 single-tx outbox. +- **HA scope = artemis only.** A Postgres outage pauses *new deploys + GC* only — the **serve plane (Caddy + R2) keeps serving every site** (ADR-016 consequence, verified in serve code §8). + +## 4. Retention GC policy — safety invariants (engine-independent) + +Policy rides *on top of* the engine; it does not change with the substrate. Retain predicate = **reachability from live aliases + keep-N + grace**, NOT raw age (industry-verified: Docker/Harbor/ Vercel/Netlify all converge; age can't identify in-use objects under pointer indirection). 
+ +| id | invariant (never violate) | +| --- | --------------------------------------------------------------------------------------------------------------- | +| V1 | a deploy any alias targets is never deleted; **re-checked immediately before delete** (TOCTOU) | +| V2 | newest `recentKeep` (3) per site never deleted, any age (rollback floor) | +| V3 | deploy younger than `graceMs` never deleted; `graceMs ≥ JWT_TTL` (max upload→finalize) | +| V4 | no flow deletes bytes another is writing (grace + finalize-marker reachability) | +| V5 | every delete is a tombstone move first; byte-reclaim is a later app-driven purge pass | +| V6 | sweep aborts a site (deletes nothing) if its plan exceeds the blast-cap; plan persisted pre-delete | +| V7 | per-site mutations serialized + fairly scheduled (Hatchet concurrency key = site) | +| V8 | alias mutations are last-writer-safe (single-writer-per-site via V7; optimistic re-read) | +| V9 | same store state + same `now()` → identical delete set (pure predicate, injectable clock) | +| V10 | every activity idempotent: re-run deletes nothing new; tombstone/delete of gone prefix = no-op | +| V11 | a deploy is not deleted within `serve_cache_ttl` (15s) of losing alias status; `graceMs ≥ serve_cache_ttl` (§8) | + +Orphan class (never-finalized / aborted upload): finalize writes a `_artemis_meta.json` marker; a prefix **without** a marker past `graceMs` is an orphan → reclaimed fast (separate from the 7d retention for completed deploys). Reclaim is two-phase: tombstone → `_trash/<site>/<deployID>/`, a later GC pass purges tombstones past the recovery window. **App-driven, not R2 lifecycle** (§11). + +## 5. 
Engine invariants (durability layer) + +| id | invariant | +| --- | --------------------------------------------------------------------------------------------------- | +| E1 | every activity idempotent (no non-idempotent side effect) — the keystone making at-least-once safe | +| E2 | all events for one site process in per-site order (Hatchet concurrency key) | +| E3 | R2 authoritative for bytes; Postgres authoritative for metadata; Valkey reconstructable | +| E4 | every event-GC gap (orphan, missed event) is closed by the reconciliation drift-audit | +| E5 | no operation holds a lock across a crash; durability is the engine's, resume is replay-from-journal | +| E6 | a workflow exceeding max attempts dead-letters + alerts; never blocks its concurrency key | + +## 6. Transactional integrity — outbox-row + relay + +Hatchet events are sent via its API (not inside the app's Postgres tx), so we use the **transactional outbox**: a mutation's handler, in **one Postgres tx**, writes the metadata change **and** an `outbox` row; a relay worker reads new outbox rows and emits the Hatchet event at-least-once; idempotent consumers (E1) give exact-once effect. This closes the dual-write / event-loss / leak class (Harbor's 74 TB storage-orphan lesson) *at the metadata layer* — stronger than grace-window patching. Possible because artemis owns the Postgres (outbox + app tables, one instance, §3). + +## 7. Event-driven incremental GC + +Steady-state GC is **never** a full-bucket scan. A successful finalize/promote/rollback emits `site.changed{site}` (via outbox) → a **debounced** per-site GC workflow (Hatchet `concurrency:{key:site}` + debounce) evaluates retention for that site only → O(changed sites). A low-cadence **reconciliation** workflow does a sharded R2 ↔ Postgres drift audit (orphan bytes w/ no row → tombstone; row w/ no bytes → alert) — DR + the event-miss backstop (E4), never the steady-state path. + +## 8. 
Serve plane coupling (verified) + +`infra/docker/images/caddy-s3/modules/r2alias/r2alias.go`: `<name>.freecode.camp` → Caddy → in-process **alias LRU (TTL 15s, 10k entries)** → miss = R2 `GetObject <site>/<alias>` → deployID → path-rewrite `<site>/deploys/<deployID>/` → `file_server` streams R2 bytes. Caddy holds a **separate read-only R2 token**, never calls artemis (serve plane independent). 404s negative-cached; R2 5xx → `503 Retry-After:30`, never cached. + +Consequence = **V11**: a just-superseded deploy may still be served from a 15s-stale cache; deleting it inside that window 404s in-flight requests. Covered by keep-N + grace (1h ≫ 15s), now explicit. + +## 9. Valkey role — hot cache only (justified, not load-bearing) + +With Postgres as durable truth, Valkey is **optional speed**. **Operator directive: the gxy-management Valkey stays artemis-EXCLUSIVE** — it is already a dedicated instance in its own `valkey` namespace, AOF on PVC, **NetworkPolicy-locked to artemis pods** (CiliumNetworkPolicy, `valkey.valkey.svc.cluster.local:6379`). No other component shares it. Caddy or any other stack component needing a hot cache provisions **its own** Valkey instance (e.g. a cassiopeia-local Valkey for a serve-plane alias cache) — **never** this one. + +artemis-only Valkey use, justified: + +- **Shared GitHub team-membership cache** across artemis replicas (today in-process) → protects the shared GitHub App quota. +- Registry + repo-queue hot reads (today's use) — may stay on Valkey as cache fronting Postgres, or fold into Postgres; decide at M2. + +Loss = cache-cold (slower), never wrong. (A serve-plane Caddy alias cache is a **separate** instance on cassiopeia — out of artemis scope, noted for the platform, not this dossier.) + +## 10. Migration (phased — prove engine on new surface first) + +- **M1** — Stand up artemis-owned Postgres + Hatchet; model `deploys`/`aliases`/`tombstones`/`outbox` tables; run **retention GC + manual delete + purge** as Hatchet workflows (concurrency key=site). 
Backfill index from a one-time R2 scan. **Retire Windmill cron.** Deploy hot path untouched. +- **M2** — Emit `site.changed` via outbox from existing deploy/promote/rollback handlers (additive) → event-driven incremental GC (§7). Migrate registry + repo-queue off Valkey to Postgres. +- **M3** — (optional) move deploy/promote/rollback execution onto workflows; upload streaming likely stays synchronous, finalize becomes a durable workflow. +- **M4** — Reconciliation drift-audit workflow + observability (Hatchet/queue metrics, drift counts); optional Valkey serve-cache + GH-membership cache. + +## 11. Alternatives rejected + +| Considered | Rejected because | +| -------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| k8s CronJob for scheduling | infra-primitive coupling; app must own scheduling | +| R2 object-lifecycle for reclaim | app correctness would depend on a bucket policy; R2 lifecycle is **prefix+age(days) only, no tag filters** (verified) — can't target tombstones in place | +| R2 conditional-write (`If-None-Match`/`If-Match`) for lock/CAS | vendor-specific S3 extension; breaks ADR-016 vendor-neutrality | +| In-process Valkey-Streams hand-rolled runtime | RAM-bound; can't be durable metadata SOT at 10k+ sites; reinvents a DB's job/lock/ordering primitives | +| Temporal | full engine, but heaviest ops (multi-service cluster + DB[+ES]) for lean gxy-management | +| River (Postgres job-queue lib) | viable lighter floor, but a job-queue, not event-driven; Hatchet chosen for fair-scheduling keys + event triggers + DAG | +| Inngest | excellent event DX, but self-hosted **server is SSPL** (source-available); Hatchet is MIT | +| Asynq (Valkey) + Postgres | splits queue (Valkey) from store (Postgres) → reintroduces dual-write, loses §6 transactional integrity | +| Shared platform Postgres | violates 
artemis data-ownership / bounded context (§3) | + +## 12. Open questions + +- **Postgres HA posture** on gxy-management: single + PITR backups (MVP) vs replicated operator — artemis mutation availability now depends on it (serve plane does not). +- **Hatchet self-host footprint** on gxy-management (API server + engine + dashboard) — confirm the ops weight is acceptable vs River-floor fallback. +- **Platform-wide Hatchet**: adopt as shared orchestration for Apollo (constellations) + repo-mgmt too, or artemis-scoped first? (Multi-lang SDKs make platform-wide viable.) +- **M1 scope**: GC/delete/purge only, or also migrate registry/repo-queue to Postgres in M1? +- Platform Postgres scout (2026-06-01, corrected): **two live PG instances, both app-bundled single-node** — **Windmill** (`postgresql` subchart on gxy-management + `postgres-rclone`→R2 backup) and **Outline** (`postgres:16-alpine` on ops-backoffice-tools). **CNPG operator NOT deployed anywhere** (Veritas, the first CNPG user, still future). Apollo has no SQL DB. → artemis provisions **net-new bundled single-node PG** matching the Windmill/Outline precedent; joins the future CNPG sweep. (Earlier "none live" note was wrong — it missed Windmill's bundled subchart.) + +______________________________________________________________________ + +## 13. Platform alignment & drift (ADR-001 / 008 / 017 / 019) + +Audited this plan against the Universe Architecture ADRs. Building blocks + placement (verified): artemis = **gxy-management** (P1 control-plane brain); caddy-s3 serve plane = **gxy-cassiopeia** (P2); Valkey platform-svc = **gxy-management** (backs artemis registry, ADR-008 2026-05-25); **Windmill** = gxy-management, **permanent**, the sanctioned platform workflow engine (provisions DNS/OIDC/DB, constellation teardown, the cleanup cron we're replacing). Static bucket `universe-static-apps-01` on R2 (→ Ceph RGW only on bare metal; gxy-management is cloud-forever, so artemis stays on R2). 
Observability = Vector→ClickHouse, vmagent→VictoriaMetrics, **GlitchTip** (Sentry-compat), Grafana (ADR-015). + +> **2026-06-02 (ADR-020):** Universe ADR-020 (Proposed) EXTENDS the scope of the Windmill→artemis transfer beyond the cleanup cron analysed below. Constellation provisioning (DNS / OIDC / DB) + teardown ALSO transfer to artemis durable-exec (Hatchet, concurrency key = constellation; governance only, impl deferred). Windmill is demoted from "permanent P1 platform-ops shepherd" to staff/interactive tooling — it stays physically on gxy-management for now, relocation to gxy-backoffice deferred (not retired). Apollo Chat full-code app stays on Windmill (out of scope). The §13 analysis below (D2 + the ✓ "cleanup cron only" row) remains the historical 2026-06-01 record; ADR-020 §Scope supersedes the "Windmill stays permanent, only the cron retired" framing. + +| # | Finding | Severity | Resolution | +| --- | ----------------------------------------------------------------------------------------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| D1 | **artemis is an ADR-017 *stateless* pillar**. In-cluster PG makes it stateful. | **drift — but not an M1 blocker** | **M1**: bundled single-node PG + `postgres-rclone`→R2, the already-blessed **Windmill precedent** — no new ceremony. **GA (CNPG sweep)**: formal ADR-019 carve-out — CNPG + T1/T2 floor + `management-cnpg-backups` R2 + restore drill; amend ADR-017 §Pillar + ADR-008. (§3) | +| D2 | **Two workflow engines** — Hatchet alongside the sanctioned Windmill (P1, permanent). 
| **coherence — needs ADR justification** | ADR-020 must position them: Windmill = platform-ops automation (provisioning/teardown); Hatchet = service-level durable execution (artemis sagas, later Apollo). Precedent: Windmill already bundles its own PG on gxy-management, so a 2nd engine+DB there isn't novel. | +| D3 | **gxy-management P1 charter = "thin and reliable"** (ADR-019); adding CNPG+Hatchet thickens it. | tension — accept w/ note | Mitigated by the Windmill+PG precedent already on P1. Keep artemis CNPG small (local-path, replicas for HA), headroom-aware. | +| D4 | **ADR number**: doc said ADR-018 (taken=early-access; 019=cassiopeia). | doc bug | → **ADR-020** (fixed in header). | +| D5 | **Postgres engine**: doc implied generic PG/Patroni. | align | Use **CNPG** (ADR-008 sanctioned), local-path PV, CNPG-native HA — matches Veritas (§3). | +| D6 | **Per-galaxy backup bucket** convention (never shared). | align | New `management-cnpg-backups` R2 bucket; T3 cross-vendor shipper runs on an orthogonal-blast-radius galaxy (per ADR-019, cassiopeia's shipper runs on mgmt — for a mgmt pillar, pick cassiopeia/external). | +| ✓ | R2 = dumb swappable S3 (ADR-008 "swap R2→Ceph RGW = config change") | **aligned** | Our dumb-S3 port stance matches exactly. | +| ✓ | Valkey hot-cache on gxy-management | **aligned** | Already a platform svc there; our cache role fits. (Caddy alias-cache would be cassiopeia-local Valkey — cross-galaxy, flag in M4.) | +| ✓ | Observability slog→Vector / Sentry→GlitchTip / /metrics→vmagent | **aligned** | artemis already wired; Hatchet metrics ride the same. Hatchet's own dashboard is an extra UI vs Grafana/HyperDX — minor. | +| ✓ | Retiring Windmill **cleanup cron** (not Windmill itself) | **aligned** | Windmill stays for platform-ops; only its `cleanup_old_deploys` flow is boneyard'd. 
| + +**Net:** the plan is *architecturally sound* but lands two platform decisions that need ADR-020 + amendments to ADR-017/008 before GA: **(D1) artemis becomes a stateful pillar**, and **(D2) Hatchet joins Windmill as a second, role-distinct engine.** Neither is a blocker — both have precedent (Veritas for stateful-on-cloud-via-CNPG; Windmill+PG for engine+DB on gxy-management) — but both must be ratified, not assumed. The dossier carries these as explicit tasks. + +### Appendix — verified-fact citations (fetched 2026-06-01) + +- Hatchet MIT, Postgres-only, no `ee/` dir, engine features un-paywalled — `gh api repos/hatchet-dev/hatchet/license`; pricing gates hosting/SLA/RPS/retention/HIPAA only. +- R2 lifecycle = prefix + age(days), no tag filters, has AbortIncompleteMultipartUpload — developers.cloudflare.com/r2/buckets/object-lifecycles. +- Serve plane: Caddy `r2_alias` 15s LRU, separate RO token — `infra/docker/images/caddy-s3/modules/r2alias/r2alias.go`. +- GC patterns (reachability-not-age, registry-GC race, grace window, soft-delete) — Docker/Distribution #3045, Harbor #10167/#23199, Vercel/Netlify retention docs, Git GC; full set in the research scratchpad. diff --git a/docs/design/0002-scalability-capacity.md b/docs/design/0002-scalability-capacity.md new file mode 100644 index 0000000..2ca19d8 --- /dev/null +++ b/docs/design/0002-scalability-capacity.md @@ -0,0 +1,176 @@ +# Local design 0002 -- Scalability posture + capacity envelope (R14) + +> **Status:** Measured baseline (2026-06-04) -- numbers below come from the gated load harness `cmd/loadgen` (build tag `load`) and direct PG/Valkey probes, NOT from synthetic estimates. Re-run `just loadgen` to refresh. **Invariant:** R14 (dossier `artemis-durable-exec-cutover`) -- per-site Hatchet concurrency bounds fan-out at target scale (10k sites -> millions of historical deploys); PG pool + Valkey cache sized + documented. **Engine bound cite:** `cd3b012` (T12 real-Hatchet integration suite). 
+ +______________________________________________________________________ + +## 1. Target scale + +Local ADR 0001 fixes the design target at **10s of thousands of sites -> millions of deploys** (Vercel/Netlify-class). For capacity planning this doc uses a concrete reference point: + +- **10,000 sites** +- **300 deploys/site retained in the index** (active + recently tombstoned) -> **3,000,000 deploy rows** + +The serve plane (Caddy + R2) is untouched by artemis scale -- every site keeps serving from R2 even when PG/Hatchet are down (ADR 0001 sections 3 and 8). This doc covers only the control plane: PG metadata, the outbox relay, the Hatchet worker fan-out, and the Valkey registry cache. + +## 2. How the numbers were measured + +`cmd/loadgen` drives the real store code paths -- `pg.RegistryStore.Register`, `pg.Repo.UpsertDeploy`, `pg.Repo.EnqueueSiteChanged`, `worker.Relay.RunOnce`, and `gc.SiteGC.Run` (dry-run, no-op Mover) -- against a throwaway Postgres and reports per-stage throughput and latency percentiles as JSON. The R2 Mover and the Hatchet Publisher are stubbed because they are not the control-plane scale bound (R2 is the serve plane; the per-site engine bound is covered separately in section 5). + +Reference run environment: + +- Host: darwin/arm64, `NumCPU=10`, Go 1.26.3 +- PG: `postgres:17-alpine` (PostgreSQL 17.10), `max_connections=200`, `shared_buffers=256MB`, single container +- Workload: `-sites 500 -deploys-per-site 40` = **20,000 deploy rows** + +This is a laptop single-node Postgres over the Docker network loopback, NOT the bundled production StatefulSet. Treat the absolute throughput as a conservative floor (production PG on real disk with a warm cache does better); treat the **ratios and the per-row sizing as the portable result**. + +## 3. 
Measured throughput + +Two runs, default pool vs an enlarged pool, both at 20,000 deploy rows, zero errors across every stage: + +| stage | default pool (10 conns, 16 goroutines) | enlarged pool (25 conns, 24 goroutines) | +| ---------------- | -------------------------------------- | --------------------------------------- | +| `register` | 3,895 ops/s | 4,937 ops/s | +| `deploy_upsert` | 5,760 ops/s (20k rows in 3.47s) | 10,309 ops/s (20k rows in 1.94s) | +| `outbox_enqueue` | 4,232 ops/s | 7,426 ops/s | +| `relay_drain` | 39,869 rows/s (batch 100) | 58,095 rows/s (batch 100) | +| `gc_plan_dryrun` | 10,266 sites/s | 13,944 sites/s | + +Latency at the default pool (microseconds): + +| stage | p50 | p99 | max | +| ---------------- | ----- | ----- | ------ | +| `register` | 3,723 | 9,738 | 12,566 | +| `deploy_upsert` | 2,611 | 7,343 | 27,705 | +| `outbox_enqueue` | 3,347 | 8,274 | 9,336 | +| `relay_drain` | 1,612 | 3,370 | 4,215 | +| `gc_plan_dryrun` | 1,388 | 4,355 | 4,920 | + +### What this means for the target scale + +- **Backfill of 3M deploy rows** (one-shot `BACKFILL_ON_BOOT`) at the measured default-pool `deploy_upsert` rate of 5,760 rows/s = **~8.7 minutes**; at the enlarged-pool rate of 10,309 rows/s = **~4.9 minutes**. Backfill is a single cold run, so even the conservative figure is acceptable. Formula: `seconds = rows / upsert_ops_per_sec`. +- **Steady-state event fan-out:** every site mutation enqueues one outbox row, drained by the relay every 5s (`relayInterval`, `cmd/artemis/gcworkflows.go`). A 100-row batch drains in ~2.5ms (39,869 rows/s); even a burst of 10,000 queued events drains in `10000 / 39869 = ~0.25s` of relay work, far inside one tick. The relay is never the bottleneck at this scale. +- **GC sweep of 10,000 sites** (the scheduled per-site pass) plans at 10,266 sites/s = **under 1s** of planning work for the whole fleet. Actual reclaim is gated by R2 MovePrefix latency and `CLEANUP_BLAST_CAP`, not PG. + +## 4. 
Postgres pool sizing + +### What the code sets today + +`pg.New` (`internal/pg/pg.go`) calls `pgxpool.New(ctx, DatabaseURL)` with **no explicit `MaxConns`**. pgx v5 therefore defaults to `max(4, runtime.NumCPU())` (`pgxpool/pool.go`). On the production pod the connection cap is whatever the container sees as `NumCPU`, floored at 4. On the 10-core reference host the load harness reported `pool_max_conns: 10` -- confirming the default path. + +### Why the default under-provisions, and the fix + +The default ties the pool to CPU count, but artemis's PG concurrency is driven by **three independent producers per pod**: HTTP handlers, the Hatchet worker callbacks, and the outbox relay loop. The measured runs show the cost: at the default 10-conn pool `deploy_upsert` tops out at 5,760 ops/s; widening to 25 conns nearly doubles it to 10,309 ops/s with no error increase. The pool, not PG, was the limiter. + +No code change is required to tune it: **pgx honours `pool_max_conns` as a DSN query parameter**. The harness proved this -- a DSN ending in `...?sslmode=disable&pool_max_conns=25` reported `pool_max_conns: 25` and the enlarged-pool throughput above. Operators set it in `DATABASE_URL`: + +```bash +DATABASE_URL=postgres://artemis:pw@artemis-postgresql:5432/artemis?sslmode=disable&pool_max_conns=20 +``` + +### Per-replica multiplication at N >= 2 + +Total PG connections = `pool_max_conns x replica_count` (R13: N >= 2 stateless replicas). The bundled PG StatefulSet must size `max_connections` above that ceiling plus headroom for the Hatchet engine's own pool and for admin/backup sessions. 
Recommended starting point: + +| knob | value | rationale | +| ----------------------------- | ------------ | ------------------------------------------------------ | +| `pool_max_conns` (per pod) | 15-20 | ~2x the worst measured limiter, room to spare | +| artemis replicas (R13/HPA) | 2-6 | T29 HPA bound | +| artemis PG conns (worst case) | 20 x 6 = 120 | pool x max replicas | +| Hatchet engine conns | ~20 | separate role/db on the same instance (ADR 0001 / T13) | +| PG `max_connections` | 200 | 120 + 20 + ~60 headroom (admin, backup, autovac) | + +The reference harness ran PG with `max_connections=200`, which comfortably holds this envelope. Below ~120 effective `max_connections` the fleet risks `too many clients` at full replica scale -- that is the first hard cliff (section 6). + +## 5. Hatchet per-site concurrency bound + +The fan-out safety property is enforced by the engine, not by PG throughput. The adapter (`internal/hatchet/adapter.go`) registers every per-site workflow with: + +```go +types.Concurrency{ + Expression: "input.site", + MaxRuns: 1, + LimitStrategy: GroupRoundRobin, +} +``` + +`MaxRuns=1` keyed on `input.site` means **at most one workflow run executes per site at any instant**; `GroupRoundRobin` fairly interleaves distinct sites so a hot site cannot starve the rest. This is the property that makes 10k sites x millions of deploys tractable: concurrency is bounded per key, never globally unbounded, and the engine queues same-site events rather than running them concurrently. + +This is empirically validated, not assumed. The T12 gated integration suite (`cd3b012`) ran against real `hatchet-lite v0.88.1`: + +- `TestR3SameSiteNeverConcurrent` -- three events for one site, observed peak concurrency `<= 1`. +- `TestR3DistinctSitesRunConcurrent` -- two distinct sites both start and run concurrently (the round-robin does not serialise the whole fleet). +- `TestR4` (poison) -- a dead-lettering workflow never blocks its own key. 
+ +Because the bound is per key, adding sites adds independent queues; it does not raise the concurrency any single site sees. The engine, its own Postgres, and worker slot count -- not artemis's metadata pool -- govern aggregate workflow throughput, and are sized in the infra chart (T13/T14). + +## 6. Valkey cache envelope + +Valkey is a reconstructable cache, not a source of truth (ADR 0001). The registry cache holds one hash per site plus a `sites:all` index set (`internal/registry/valkey/store.go`): `site:` with fields `teams`, `created_at`, `updated_at`, `created_by`. + +Measured against `valkey/valkey:8-alpine`, representative rows (two teams, RFC3339 timestamps): + +| population | `used_memory` | delta over empty | +| ------------------ | --------------------- | --------------------- | +| empty | 955,096 B (0.91 MB) | -- | +| 10,000 sites + set | 3,851,624 B (3.67 MB) | 2,896,528 B (2.76 MB) | + +That is **~290 bytes/site** of resident memory (dataset portion ~263 B/site). The full 10k-site registry cache fits in **under 3 MB** on top of the ~1 MB Valkey baseline. Extrapolation is linear in site count (one hash + one set member per site); deploy count does NOT enter the registry cache. Formula: `valkey_bytes ~= 1_000_000 + 290 * sites`. + +The auth `teamcache` (`internal/teamcache`) is a separate key space bounded by distinct GitHub logins seen within the membership TTL, not by site count; at fCC staff cardinality (hundreds of logins) it is negligible. A 256 MB Valkey `maxmemory` -- already over 80x the projected registry envelope -- leaves ample room; Valkey stays artemis-exclusive and NetworkPolicy-locked. + +## 7. 
PG storage envelope + +Measured table sizes after the 20,000-row run (heap + indexes, `pg_total_relation_size`): + +| table | rows | total bytes | bytes/row | +| --------- | ------ | ----------- | ------------------------------------- | +| `deploys` | 20,000 | 6,529,024 | **326.5** | +| `sites` | 500 | 344,064 | 688 (inflated at low row count) | +| `outbox` | 500 | 212,992 | 426 (transient -- drained + prunable) | + +The deploy row is the dominant term at scale. At **326.5 bytes/row** including the `deploys_site_mtime_idx` index, **3,000,000 deploy rows ~= 980 MB**. Add the sites table (10k x ~500 B heap ~= 5 MB) and tombstones (bounded by the recovery window) and the artemis metadata DB lands **comfortably under 2 GB** at full target scale -- well inside a single bundled StatefulSet PVC with backup headroom (T13/T17). Formula: `deploys_bytes ~= 327 * deploy_rows`. + +## 8. Known cliffs + headroom + +| cliff | trigger | mitigation | +| ------------------------ | ------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `too many clients` on PG | `pool_max_conns x replicas + hatchet` > `max_connections` | size `max_connections` >= 200 (section 4); cap `pool_max_conns` | +| outbox unbounded growth | relay stalled (PG/engine down) -> rows never marked published | published rows are prunable; relay is at-least-once and resumes (R5); `/readyz` degraded surfaces the stall | +| relay batch starvation | sustained enqueue rate > drain-per-tick | drain rate (~40k rows/s) is ~10,000x the realistic enqueue rate; raise `Batch` (currently 100) or shorten `relayInterval` if ever needed | +| hot-site queue backlog | one site mutated faster than its single workflow drains | by design (MaxRuns=1); GroupRoundRobin keeps other sites moving; surfaced via `artemis_worker_queue_depth{workflow}` | +| Valkey eviction | `maxmemory` set below 
registry envelope | envelope is < 3 MB / 10k sites; keep `maxmemory` >= 64 MB (~20x headroom) |
diff --git a/go.mod b/go.mod index 848a420..b96078e 100644 --- a/go.mod +++ b/go.mod @@ -13,13 +13,25 @@ require ( github.com/getsentry/sentry-go/slog v0.46.2 github.com/go-chi/chi/v5 v5.2.5 github.com/golang-jwt/jwt/v5 v5.3.1 + github.com/google/uuid v1.6.0 + github.com/hatchet-dev/hatchet v0.88.1 + github.com/jackc/pgx/v5 v5.9.2 github.com/prometheus/client_golang v1.23.2 github.com/redis/go-redis/v9 v9.19.0 github.com/stretchr/testify v1.11.1 + github.com/testcontainers/testcontainers-go v0.42.0 + github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 golang.org/x/sync v0.20.0 ) require ( + cel.dev/expr v0.25.1 // indirect + dario.cat/mergo v1.0.2 // indirect + github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/antlr4-go/antlr/v4 v4.13.1 // indirect + github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.9 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.22 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 // indirect @@ -34,19 +46,121 @@ require ( github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.20 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.42.0 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/cockroachdb/errors v1.13.0 // indirect + github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect + github.com/cockroachdb/redact v1.1.5 // indirect + github.com/containerd/errdefs v1.0.0 // indirect + github.com/containerd/errdefs/pkg v0.3.0 // indirect + github.com/containerd/log v0.1.0 // indirect + github.com/containerd/platforms v0.2.1 // indirect + 
github.com/cpuguy83/dockercfg v0.3.2 // indirect + github.com/creasty/defaults v1.8.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/go-connections v0.7.0 // indirect + github.com/docker/go-units v0.5.0 // indirect + github.com/ebitengine/purego v0.10.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/gabriel-vasile/mimetype v1.4.13 // indirect + github.com/getkin/kin-openapi v0.135.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-ole/go-ole v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-playground/locales v0.14.1 // indirect + github.com/go-playground/universal-translator v0.18.1 // indirect + github.com/go-playground/validator/v10 v10.30.2 // indirect + github.com/go-viper/mapstructure/v2 v2.4.0 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/google/cel-go v0.28.0 // indirect + github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect + github.com/jackc/puddle/v2 v2.2.2 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/klauspost/compress v1.18.5 // indirect + github.com/kr/pretty v0.3.1 // indirect + github.com/kr/text v0.2.0 // indirect github.com/kylelemons/godebug v1.1.0 // indirect + github.com/labstack/echo/v4 v4.15.1 // indirect + github.com/labstack/gommon v0.4.2 // indirect + github.com/leodido/go-urn v1.4.0 // indirect + github.com/lufia/plan9stats 
v0.0.0-20250827001030-24949be3fa54 // indirect + github.com/magiconair/properties v1.8.10 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/mattn/go-colorable v0.1.14 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/moby/go-archive v0.2.0 // indirect + github.com/moby/moby/api v1.54.1 // indirect + github.com/moby/moby/client v0.4.0 // indirect + github.com/moby/patternmatcher v0.6.1 // indirect + github.com/moby/sys/sequential v0.6.0 // indirect + github.com/moby/sys/user v0.4.0 // indirect + github.com/moby/sys/userns v0.1.0 // indirect + github.com/moby/term v0.5.2 // indirect + github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/oapi-codegen/runtime v1.4.0 // indirect + github.com/oasdiff/yaml v0.0.9 // indirect + github.com/oasdiff/yaml3 v0.0.9 // indirect + github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/opencontainers/image-spec v1.1.1 // indirect + github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/perimeterx/marshmallow v1.1.5 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect - github.com/prometheus/procfs v0.16.1 // indirect + github.com/prometheus/procfs v0.19.2 // indirect + github.com/robfig/cron/v3 v3.0.1 // indirect + github.com/rogpeppe/go-internal v1.14.1 // indirect + github.com/rs/zerolog v1.35.1 // indirect + github.com/sagikazarmark/locafero v0.11.0 // indirect + github.com/shirou/gopsutil/v4 v4.26.3 // indirect + github.com/sirupsen/logrus v1.9.4 // indirect + 
github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect + github.com/spf13/afero v1.15.0 // indirect + github.com/spf13/cast v1.10.0 // indirect + github.com/spf13/pflag v1.0.10 // indirect + github.com/spf13/viper v1.21.0 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + github.com/tklauser/go-sysconf v0.3.16 // indirect + github.com/tklauser/numcpus v0.11.0 // indirect + github.com/valyala/bytebufferpool v1.0.0 // indirect + github.com/valyala/fasttemplate v1.2.2 // indirect + github.com/woodsbury/decimal128 v1.3.0 // indirect github.com/yuin/gopher-lua v1.1.1 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 // indirect + go.opentelemetry.io/otel v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect + go.opentelemetry.io/otel/metric v1.43.0 // indirect + go.opentelemetry.io/otel/sdk v1.43.0 // indirect + go.opentelemetry.io/otel/trace v1.43.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/atomic v1.11.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/sys v0.35.0 // indirect - golang.org/x/text v0.28.0 // indirect - google.golang.org/protobuf v1.36.8 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/crypto v0.50.0 // indirect + golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect + golang.org/x/net v0.52.0 // indirect + golang.org/x/sys v0.43.0 // indirect + golang.org/x/text v0.36.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d // indirect + google.golang.org/grpc v1.80.0 // indirect + google.golang.org/protobuf v1.36.11 // 
indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index a718f78..99d8529 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,22 @@ +cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= +cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= +dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= +dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= +github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/alicebob/miniredis/v2 v2.37.0 h1:RheObYW32G1aiJIj81XVt78ZHJpHonHLHW7OLIshq68= github.com/alicebob/miniredis/v2 v2.37.0/go.mod h1:TcL7YfarKPGDAthEtl5NBeHZfeUQj6OXMm/+iu5cLMM= +github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= +github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= +github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ= +github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk= github.com/aws/aws-sdk-go-v2 v1.41.6 
h1:1AX0AthnBQzMx1vbmir3Y4WsnJgiydmnJjiLu+LvXOg= github.com/aws/aws-sdk-go-v2 v1.41.6/go.mod h1:dy0UzBIfwSeot4grGvY1AqFWN5zgziMmWGzysDnHFcQ= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.9 h1:adBsCIIpLbLmYnkQU+nAChU5yhVTvu5PerROm+/Kq2A= @@ -38,14 +55,59 @@ github.com/aws/smithy-go v1.25.1 h1:J8ERsGSU7d+aCmdQur5Txg6bVoYelvQJgtZehD12GkI= github.com/aws/smithy-go v1.25.1/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/cockroachdb/errors v1.13.0 h1:BoCcJeiP9hpBJDETkX19qi8Tb8So37srSsp3stTaDMQ= +github.com/cockroachdb/errors v1.13.0/go.mod h1:bjxt/4E5+OyuAnacpTIU9rn2mzPu1VlthvHP+xpROq0= +github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b 
h1:r6VH0faHjZeQy818SGhaone5OnYfxFR/+AzdY3sf5aE= +github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs= +github.com/cockroachdb/redact v1.1.5 h1:u1PMllDkdFfPWaNGMyLD1+so+aq3uUItthCFqzwPJ30= +github.com/cockroachdb/redact v1.1.5/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= +github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= +github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= +github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= +github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= +github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= +github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA= +github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= +github.com/creasty/defaults v1.8.0 h1:z27FJxCAa0JKt3utc0sCImAEb+spPucmKoOdLHvHYKk= +github.com/creasty/defaults v1.8.0/go.mod h1:iGzKe6pbEHnpMPtfDXZEr0NVxWnPTjb1bbDy08fPzYM= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
+github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/go-connections v0.7.0 h1:6SsRfJddP22WMrCkj19x9WKjEDTB+ahsdiGYf0mN39c= +github.com/docker/go-connections v0.7.0/go.mod h1:no1qkHdjq7kLMGUXYAduOhYPSJxxvgWBh7ogVvptn3Q= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/gabriel-vasile/mimetype v1.4.13 h1:46nXokslUBsAJE/wMsp5gtO500a4F3Nkz9Ufpk2AcUM= +github.com/gabriel-vasile/mimetype v1.4.13/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s= +github.com/getkin/kin-openapi v0.135.0 h1:751SjYfbiwqukYuVjwYEIKNfrSwS5YpA7DZnKSwQgtg= +github.com/getkin/kin-openapi v0.135.0/go.mod h1:6dd5FJl6RdX4usBtFBaQhk9q62Yb2J0Mk5IhUO/QqFI= github.com/getsentry/sentry-go v0.46.2 h1:1jhYwrKGa3sIpo/y5iDNXS5wDoT7I1KNzMHrnK6ojns= github.com/getsentry/sentry-go v0.46.2/go.mod h1:evVbw2qotNUdYG8KxXbAdjOQWWvWIwKxpjdZZIvcIPw= github.com/getsentry/sentry-go/slog v0.46.2 h1:LvlIgQtGPWrzXSwnuyid4lCcRPJA+32CdHz6E2Zy4iE= @@ -54,12 +116,72 @@ github.com/go-chi/chi/v5 v5.2.5 
h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug= github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0= github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= +github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= +github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= +github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= +github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= +github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= +github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= +github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= +github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY= +github.com/go-playground/validator/v10 v10.30.2 
h1:JiFIMtSSHb2/XBUbWM4i/MpeQm9ZK2xqPNk8vgvu5JQ= +github.com/go-playground/validator/v10 v10.30.2/go.mod h1:mAf2pIOVXjTEBrwUMGKkCWKKPs9NheYGabeB04txQSc= +github.com/go-test/deep v1.0.8 h1:TDsG77qcSprGbC6vTN8OuXp5g+J+b5Pcguhf7Zt61VM= +github.com/go-test/deep v1.0.8/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= +github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= +github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/cel-go v0.28.0 h1:KjSWstCpz/MN5t4a8gnGJNIYUsJRpdi/r97xWDphIQc= +github.com/google/cel-go v0.28.0/go.mod h1:X0bD6iVNR8pkROSOoHVdgTkzmRcosof7WQqCD6wcMc8= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= -github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.3 h1:B+8ClL/kCQkRiU82d9xajRPKYMrB7E0MbtzWVi1K4ns= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.3/go.mod h1:NbCUVmiS4foBGBHOYlCT25+YmGpJ32dZPi75pGEUpj4= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 
h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/hatchet-dev/hatchet v0.88.1 h1:Hu157t6pi9+HNcak6HAlkrq2LmSWcF50uQZ/DK8T/z4= +github.com/hatchet-dev/hatchet v0.88.1/go.mod h1:mcOO2ia03lAbTdG/ubLXFNXQJPltXUmhzM0E9TV4jhU= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgx/v5 v5.9.2 h1:3ZhOzMWnR4yJ+RW1XImIPsD1aNSz4T4fyP7zlQb56hw= +github.com/jackc/pgx/v5 v5.9.2/go.mod h1:mal1tBGAFfLHvZzaYh77YS/eC6IX9OWbRV1QIIM0Jn4= +github.com/jackc/pgxlisten v0.0.0-20241106001234-1d6f6656415c h1:bTgmg761ac9Ki27HoLx8IBvc+T+Qj6eptBpKahKIRT4= +github.com/jackc/pgxlisten v0.0.0-20241106001234-1d6f6656415c/go.mod h1:N4E1APLOYrbM11HH5kdqAjDa8RJWVwD3JqWpvH22h64= +github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= +github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/josharian/intern 
v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= +github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -68,48 +190,232 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/labstack/echo/v4 v4.15.1 h1:S9keusg26gZpjMmPqB5hOEvNKnmd1lNmcHrbbH2lnFs= +github.com/labstack/echo/v4 v4.15.1/go.mod h1:xmw1clThob0BSVRX1CRQkGQ/vjwcpOMjQZSZa9fKA/c= +github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0= +github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= +github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= +github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= +github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= +github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lufia/plan9stats v0.0.0-20250827001030-24949be3fa54 
h1:mFWunSatvkQQDhpdyuFAYwyAan3hzCuma+Pz8sqvOfg= +github.com/lufia/plan9stats v0.0.0-20250827001030-24949be3fa54/go.mod h1:autxFIvghDt3jPTLoqZ9OZ7s9qTGNAWmYCjVFWPX/zg= +github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE= +github.com/magiconair/properties v1.8.10/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= +github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mdelapenya/tlscert v0.2.0 h1:7H81W6Z/4weDvZBNOfQte5GpIMo0lGYEeWbkGp5LJHI= +github.com/mdelapenya/tlscert v0.2.0/go.mod h1:O4njj3ELLnJjGdkN7M/vIVCpZ+Cf0L6muqOG4tLSl8o= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= +github.com/moby/go-archive v0.2.0 h1:zg5QDUM2mi0JIM9fdQZWC7U8+2ZfixfTYoHL7rWUcP8= +github.com/moby/go-archive v0.2.0/go.mod h1:mNeivT14o8xU+5q1YnNrkQVpK+dnNe/K6fHqnTg4qPU= +github.com/moby/moby/api v1.54.1 h1:TqVzuJkOLsgLDDwNLmYqACUuTehOHRGKiPhvH8V3Nn4= +github.com/moby/moby/api v1.54.1/go.mod h1:+RQ6wluLwtYaTd1WnPLykIDPekkuyD/ROWQClE83pzs= +github.com/moby/moby/client v0.4.0 h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjIw= +github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g= +github.com/moby/patternmatcher v0.6.1 
h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= +github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= +github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= +github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= +github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= +github.com/moby/sys/user v0.4.0/go.mod h1:bG+tYYYJgaMtRKgEmuueC0hJEAZWwtIbZTB+85uoHjs= +github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= +github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= +github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= +github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= +github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9M+97sNutRR1RKhG96O6jWumTTnw= +github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/oapi-codegen/runtime v1.4.0 h1:KLOSFOp7UzkbS7Cs1ms6NBEKYr0WmH2wZG0KKbd2er4= +github.com/oapi-codegen/runtime v1.4.0/go.mod h1:5sw5fxCDmnOzKNYmkVNF8d34kyUeejJEY8HNT2WaPec= +github.com/oasdiff/yaml v0.0.9 h1:zQOvd2UKoozsSsAknnWoDJlSK4lC0mpmjfDsfqNwX48= +github.com/oasdiff/yaml v0.0.9/go.mod h1:8lvhgJG4xiKPj3HN5lDow4jZHPlx1i7dIwzkdAo6oAM= +github.com/oasdiff/yaml3 v0.0.9 h1:rWPrKccrdUm8J0F3sGuU+fuh9+1K/RdJlWF7O/9yw2g= +github.com/oasdiff/yaml3 v0.0.9/go.mod h1:y5+oSEHCPT/DGrS++Wc/479ERge0zTFxaF8PbGKcg2o= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= 
+github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= +github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= +github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= +github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= +github.com/perimeterx/marshmallow v1.1.5 h1:a2LALqQ1BlHM8PZblsDdidgv1mWi1DgC2UmX50IvK2s= +github.com/perimeterx/marshmallow v1.1.5/go.mod h1:dsXbUu8CRzfYP5a87xpp0xq9S3u0Vchtcl8we9tYaXw= github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod 
h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= +github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= github.com/redis/go-redis/v9 v9.19.0 h1:XPVaaPSnG6RhYf7p+rmSa9zZfeVAnWsH5h3lxthOm/k= github.com/redis/go-redis/v9 v9.19.0/go.mod h1:v/M13XI1PVCDcm01VtPFOADfZtHf8YW3baQf57KlIkA= -github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= -github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/rs/zerolog v1.35.1 h1:m7xQeoiLIiV0BCEY4Hs+j2NG4Gp2o2KPKmhnnLiazKI= +github.com/rs/zerolog v1.35.1/go.mod h1:EjML9kdfa/RMA7h/6z6pYmq1ykOuA8/mjWaEvGI+jcw= +github.com/sagikazarmark/locafero v0.11.0 h1:1iurJgmM9G3PA/I+wWYIOw/5SyBtxapeHDcg+AAIFXc= +github.com/sagikazarmark/locafero v0.11.0/go.mod h1:nVIGvgyzw595SUSUE6tvCp3YYTeHs15MvlmU87WwIik= +github.com/shirou/gopsutil/v4 v4.26.3 h1:2ESdQt90yU3oXF/CdOlRCJxrP+Am1aBYubTMTfxJ1qc= +github.com/shirou/gopsutil/v4 v4.26.3/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= +github.com/sirupsen/logrus v1.9.4 
h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= +github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 h1:+jumHNA0Wrelhe64i8F6HNlS8pkoyMv5sreGx2Ry5Rw= +github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8/go.mod h1:3n1Cwaq1E1/1lhQhtRK2ts/ZwZEhjcQeJQ1RuC6Q/8U= +github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= +github.com/spf13/afero v1.15.0/go.mod h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg= +github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY= +github.com/spf13/cast v1.10.0/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.21.0 h1:x5S+0EU27Lbphp4UKm1C+1oQO+rKx36vfCoaVebLFSU= +github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjbTCAY= +github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4= +github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/testcontainers/testcontainers-go v0.42.0 
h1:He3IhTzTZOygSXLJPMX7n44XtK+qhjat1nI9cneBbUY= +github.com/testcontainers/testcontainers-go v0.42.0/go.mod h1:vZjdY1YmUA1qEForxOIOazfsrdyORJAbhi0bp8plN30= +github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 h1:GCbb1ndrF7OTDiIvxXyItaDab4qkzTFJ48LKFdM7EIo= +github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0/go.mod h1:IRPBaI8jXdrNfD0e4Zm7Fbcgaz5shKxOQv4axiL09xs= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= +github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= +github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= +github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= +github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo= +github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= +github.com/woodsbury/decimal128 v1.3.0 h1:8pffMNWIlC0O5vbyHWFZAt5yWvWcrHA+3ovIIjVWss0= +github.com/woodsbury/decimal128 v1.3.0/go.mod h1:C5UTmyTjW3JftjUFzOVhC20BEQa2a4ZKOB5I6Zjb+ds= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M= github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod 
h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= github.com/zeebo/xxh3 v1.1.0 h1:s7DLGDK45Dyfg7++yxI0khrfwq9661w9EN78eP/UZVs= github.com/zeebo/xxh3 v1.1.0/go.mod h1:IisAie1LELR4xhVinxWS5+zf1lA4p0MW4T+w+W07F5s= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 h1:OyrsyzuttWTSur2qN/Lm0m2a8yqyIjUVBZcxFPuXq2o= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0/go.mod h1:C2NGBr+kAB4bk3xtMXfZ94gqFDtg/GkI7e9zqGh5Beg= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0/go.mod h1:2lmweYCiHYpEjQ/lSJBYhj9jP1zvCvQW4BqL9dnT7FQ= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= 
+go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= +golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= +golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa h1:Zt3DZoOFFYkKhDT3v7Lm9FDMEV06GpzjG2jrqW+QTE0= +golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA= +golang.org/x/mod v0.2.0/go.mod 
h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= -golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= -google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= -google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= +golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= +golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors 
v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d h1:wT2n40TBqFY6wiwazVK9/iTWbsQrgk5ZfCSVFLO9LQA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= +gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA= +pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= +pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= diff 
--git a/internal/auth/github.go b/internal/auth/github.go index d60ab74..6acc093 100644 --- a/internal/auth/github.go +++ b/internal/auth/github.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "io" + "log/slog" "net/http" "net/url" "strings" @@ -17,6 +18,11 @@ import ( "golang.org/x/sync/singleflight" ) +type TeamCache interface { + Get(ctx context.Context, login string) ([]string, bool, error) + Set(ctx context.Context, login string, teams []string) error +} + // GitHubClientConfig configures the GitHub REST client used for identity // validation and team-membership probes. type GitHubClientConfig struct { @@ -25,15 +31,17 @@ type GitHubClientConfig struct { CacheTTL time.Duration // membership + identity cache TTL (default 5 min) HTTPClient *http.Client // optional override (testability) Now func() time.Time + TeamCache TeamCache } // GitHubClient validates GitHub bearer tokens against /user and probes // team memberships in the configured org. Both call paths are cached for // CacheTTL to absorb steady-state pressure on the GitHub REST API. 
type GitHubClient struct { - cfg GitHubClientConfig - httpClient *http.Client - now func() time.Time + cfg GitHubClientConfig + httpClient *http.Client + now func() time.Time + teamCacheDurable TeamCache mu sync.Mutex userCache map[string]userCacheEntry // hashToken(raw) → cached login @@ -82,12 +90,13 @@ func NewGitHubClient(cfg GitHubClientConfig) *GitHubClient { cfg.Now = time.Now } return &GitHubClient{ - cfg: cfg, - httpClient: cfg.HTTPClient, - now: cfg.Now, - userCache: make(map[string]userCacheEntry), - teamCache: make(map[teamCacheKey]teamCacheEntry), - userTeamsCache: make(map[string]userTeamsCacheEntry), + cfg: cfg, + httpClient: cfg.HTTPClient, + now: cfg.Now, + teamCacheDurable: cfg.TeamCache, + userCache: make(map[string]userCacheEntry), + teamCache: make(map[teamCacheKey]teamCacheEntry), + userTeamsCache: make(map[string]userTeamsCacheEntry), } } @@ -148,7 +157,7 @@ func (c *GitHubClient) fetchUser(ctx context.Context, cacheKey, token string) (s } defer resp.Body.Close() - body, _ := io.ReadAll(resp.Body) + body, readErr := io.ReadAll(resp.Body) switch { case resp.StatusCode == http.StatusOK: @@ -172,6 +181,9 @@ func (c *GitHubClient) fetchUser(ctx context.Context, cacheKey, token string) (s return "", fmt.Errorf("github /user: unexpected status %d: %s", resp.StatusCode, string(body)) } + if readErr != nil { + return "", fmt.Errorf("github /user: read body: %w", readErr) + } var u struct { Login string `json:"login"` } @@ -273,17 +285,21 @@ func (c *GitHubClient) fetchTeamMembership(ctx context.Context, token, user, tea } defer resp.Body.Close() - body, _ := io.ReadAll(resp.Body) + body, readErr := io.ReadAll(resp.Body) var member bool switch { case resp.StatusCode == http.StatusOK: + if readErr != nil { + return false, fmt.Errorf("github team membership: read body: %w", readErr) + } var m struct { State string `json:"state"` } - if err := json.Unmarshal(body, &m); err == nil { - member = m.State == "active" + if err := json.Unmarshal(body, &m); err 
!= nil { + return false, fmt.Errorf("github team membership: parse: %w", err) } + member = m.State == "active" case resp.StatusCode == http.StatusNotFound: member = false case resp.StatusCode == http.StatusForbidden && isRateLimited(resp): @@ -328,7 +344,7 @@ func (c *GitHubClient) UserTeams(ctx context.Context, token string) ([]string, e return append([]string(nil), entry.teams...), nil } c.mu.Unlock() - return c.fetchUserTeams(ctx, cacheKey, token) + return c.userTeamsThroughDurableCache(ctx, cacheKey, token) }) if err != nil { return nil, err @@ -336,6 +352,39 @@ func (c *GitHubClient) UserTeams(ctx context.Context, token string) ([]string, e return v.([]string), nil } +func (c *GitHubClient) userTeamsThroughDurableCache(ctx context.Context, cacheKey, token string) ([]string, error) { + if c.teamCacheDurable == nil { + return c.fetchUserTeams(ctx, cacheKey, token) + } + login, err := c.ValidateToken(ctx, token) + if err != nil { + return nil, err + } + if teams, hit, err := c.teamCacheDurable.Get(ctx, login); err != nil { + return nil, err + } else if hit { + c.storeUserTeams(cacheKey, teams) + return append([]string(nil), teams...), nil + } + teams, err := c.fetchUserTeams(ctx, cacheKey, token) + if err != nil { + return nil, err + } + if err := c.teamCacheDurable.Set(ctx, login, teams); err != nil { + slog.Warn("durable team cache write failed", "login", login, "err", err) + } + return teams, nil +} + +func (c *GitHubClient) storeUserTeams(cacheKey string, teams []string) { + c.mu.Lock() + c.userTeamsCache[cacheKey] = userTeamsCacheEntry{ + teams: append([]string(nil), teams...), + expires: c.now().Add(c.cfg.CacheTTL), + } + c.mu.Unlock() +} + func (c *GitHubClient) fetchUserTeams(ctx context.Context, cacheKey, token string) ([]string, error) { var teams []string page := 1 @@ -353,7 +402,7 @@ func (c *GitHubClient) fetchUserTeams(ctx context.Context, cacheKey, token strin if err != nil { return nil, fmt.Errorf("github: %w", err) } - body, _ := 
io.ReadAll(resp.Body) + body, readErr := io.ReadAll(resp.Body) _ = resp.Body.Close() switch { @@ -371,6 +420,9 @@ func (c *GitHubClient) fetchUserTeams(ctx context.Context, cacheKey, token strin return nil, fmt.Errorf("github /user/teams: unexpected status %d: %s", resp.StatusCode, string(body)) } + if readErr != nil { + return nil, fmt.Errorf("github /user/teams: read body: %w", readErr) + } var pageTeams []struct { Slug string `json:"slug"` Organization struct { diff --git a/internal/auth/github_errorpaths_test.go b/internal/auth/github_errorpaths_test.go new file mode 100644 index 0000000..13e700b --- /dev/null +++ b/internal/auth/github_errorpaths_test.go @@ -0,0 +1,271 @@ +package auth + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type failingGetTeamCache struct { + err error +} + +func (f failingGetTeamCache) Get(context.Context, string) ([]string, bool, error) { + return nil, false, f.err +} + +func (failingGetTeamCache) Set(context.Context, string, []string) error { return nil } + +func TestUserTeams_DurableGetError_SurfacesNotRefetches(t *testing.T) { + getErr := errors.New("valkey down") + + teamsCalls := atomic.Int32{} + mux := http.NewServeMux() + mux.HandleFunc("/user", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"login":"alice"}`)) + }) + mux.HandleFunc("/user/teams", func(w http.ResponseWriter, r *http.Request) { + teamsCalls.Add(1) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[{"slug":"staff","organization":{"login":"freeCodeCamp"}}]`)) + }) + server := httptest.NewServer(mux) + defer server.Close() + + c := NewGitHubClient(GitHubClientConfig{ + APIBase: server.URL, + Org: "freeCodeCamp", + CacheTTL: time.Minute, + TeamCache: failingGetTeamCache{err: getErr}, + }) + + teams, err := 
c.UserTeams(context.Background(), "ghp_x") + require.Error(t, err, "a durable-cache Get error must fail auth, not silently re-fetch") + require.Nil(t, teams) + assert.ErrorIs(t, err, getErr, + "the Get failure must propagate to the caller so auth fails closed") + assert.EqualValues(t, 0, teamsCalls.Load(), + "a durable Get error must NOT fall through to a fresh GitHub /user/teams fetch") +} + +func TestFetchTeamMembership_StatusClassification(t *testing.T) { + tests := []struct { + name string + configure func(w http.ResponseWriter) + assertErr func(t *testing.T, err error) + expectCache bool + }{ + { + name: "rate limit is transient", + configure: func(w http.ResponseWriter) { + w.Header().Set("X-RateLimit-Remaining", "0") + w.WriteHeader(http.StatusForbidden) + }, + assertErr: func(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + assert.True(t, IsGitHubRateLimited(err), + "403 + X-RateLimit-Remaining:0 must map to rate-limited, got %v", err) + }, + }, + { + name: "5xx is unavailable", + configure: func(w http.ResponseWriter) { + w.WriteHeader(http.StatusBadGateway) + }, + assertErr: func(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + assert.True(t, IsGitHubUnavailable(err), + "502 must map to upstream-unavailable, got %v", err) + }, + }, + { + name: "unexpected status is a generic error", + configure: func(w http.ResponseWriter) { + w.WriteHeader(http.StatusTeapot) + }, + assertErr: func(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + assert.False(t, IsGitHubRateLimited(err)) + assert.False(t, IsGitHubUnavailable(err)) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + memberCalls := atomic.Int32{} + mux := http.NewServeMux() + mux.HandleFunc("/user", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`{"login":"alice"}`)) + }) + mux.HandleFunc("/orgs/", func(w http.ResponseWriter, r *http.Request) { + memberCalls.Add(1) + tt.configure(w) + }) + server := 
httptest.NewServer(mux) + defer server.Close() + + c := NewGitHubClient(GitHubClientConfig{ + APIBase: server.URL, Org: "freeCodeCamp", CacheTTL: time.Minute, + }) + + _, err := c.IsTeamMember(context.Background(), "ghp_x", "alice", "team-eng") + tt.assertErr(t, err) + + _, _ = c.IsTeamMember(context.Background(), "ghp_x", "alice", "team-eng") + assert.EqualValues(t, 2, memberCalls.Load(), + "a transient/unexpected membership status must NOT be cached; second call re-probes") + }) + } +} + +func TestFetchUserTeams_StatusClassification(t *testing.T) { + tests := []struct { + name string + configure func(w http.ResponseWriter) + assertErr func(t *testing.T, err error) + }{ + { + name: "401 is unauthenticated", + configure: func(w http.ResponseWriter) { + w.WriteHeader(http.StatusUnauthorized) + }, + assertErr: func(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + assert.True(t, IsGitHubUnauthenticated(err), "got %v", err) + }, + }, + { + name: "plain 403 is unauthenticated", + configure: func(w http.ResponseWriter) { + w.WriteHeader(http.StatusForbidden) + }, + assertErr: func(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + assert.True(t, IsGitHubUnauthenticated(err), "got %v", err) + assert.False(t, IsGitHubRateLimited(err)) + }, + }, + { + name: "403 + rate-limit header is rate-limited", + configure: func(w http.ResponseWriter) { + w.Header().Set("X-RateLimit-Remaining", "0") + w.WriteHeader(http.StatusForbidden) + }, + assertErr: func(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + assert.True(t, IsGitHubRateLimited(err), "got %v", err) + assert.False(t, IsGitHubUnauthenticated(err)) + }, + }, + { + name: "5xx is unavailable", + configure: func(w http.ResponseWriter) { + w.WriteHeader(http.StatusBadGateway) + }, + assertErr: func(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + assert.True(t, IsGitHubUnavailable(err), "got %v", err) + }, + }, + { + name: "unexpected status is a generic error", + 
configure: func(w http.ResponseWriter) { + w.WriteHeader(http.StatusTeapot) + }, + assertErr: func(t *testing.T, err error) { + t.Helper() + require.Error(t, err) + assert.False(t, IsGitHubRateLimited(err)) + assert.False(t, IsGitHubUnavailable(err)) + assert.False(t, IsGitHubUnauthenticated(err)) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/user", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`{"login":"alice"}`)) + }) + mux.HandleFunc("/user/teams", func(w http.ResponseWriter, r *http.Request) { + tt.configure(w) + }) + server := httptest.NewServer(mux) + defer server.Close() + + c := NewGitHubClient(GitHubClientConfig{ + APIBase: server.URL, Org: "freeCodeCamp", CacheTTL: time.Minute, + }) + + _, err := c.UserTeams(context.Background(), "ghp_x") + tt.assertErr(t, err) + }) + } +} + +func TestFetchUser_404CachesNegative(t *testing.T) { + userCalls := atomic.Int32{} + mux := http.NewServeMux() + mux.HandleFunc("/user", func(w http.ResponseWriter, r *http.Request) { + userCalls.Add(1) + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{"message":"Not Found"}`)) + }) + server := httptest.NewServer(mux) + defer server.Close() + + c := NewGitHubClient(GitHubClientConfig{ + APIBase: server.URL, Org: "freeCodeCamp", CacheTTL: time.Minute, + }) + + _, err := c.ValidateToken(context.Background(), "ghp_x") + require.Error(t, err) + assert.True(t, IsGitHubUnauthenticated(err), + "404 on /user must map to a cached unauthenticated negative, got %v", err) + + for i := 0; i < 3; i++ { + _, err := c.ValidateToken(context.Background(), "ghp_x") + require.Error(t, err) + assert.True(t, IsGitHubUnauthenticated(err)) + } + assert.EqualValues(t, 1, userCalls.Load(), + "404 negative must be cached; repeat calls must not re-hit upstream") +} + +func TestFetchUser_UnexpectedStatus_GenericError(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/user", 
func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusTeapot) + _, _ = w.Write([]byte(`{"message":"teapot"}`)) + }) + server := httptest.NewServer(mux) + defer server.Close() + + c := NewGitHubClient(GitHubClientConfig{ + APIBase: server.URL, Org: "freeCodeCamp", CacheTTL: time.Minute, + }) + + _, err := c.ValidateToken(context.Background(), "ghp_x") + require.Error(t, err) + assert.False(t, IsGitHubUnauthenticated(err), + "an unexpected /user status must not be classified as a cacheable auth negative") + assert.False(t, IsGitHubRateLimited(err)) + assert.False(t, IsGitHubUnavailable(err)) +} diff --git a/internal/auth/github_test.go b/internal/auth/github_test.go index 5046ad1..4a00847 100644 --- a/internal/auth/github_test.go +++ b/internal/auth/github_test.go @@ -2,6 +2,7 @@ package auth import ( "context" + "errors" "net/http" "net/http/httptest" "regexp" @@ -10,6 +11,9 @@ import ( "testing" "time" + "github.com/alicebob/miniredis/v2" + "github.com/freeCodeCamp/artemis/internal/teamcache" + "github.com/redis/go-redis/v9" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -124,6 +128,21 @@ func TestGitHubClient_Cache_HitsAndExpires(t *testing.T) { assert.EqualValues(t, 2, gh.userCalls.Load(), "expected cache to expire and refresh") } +func TestGitHubClient_TeamMembership_MalformedBodyIsTransient(t *testing.T) { + gh := newFakeGH() + defer gh.Close() + gh.memberStatus = 200 + gh.memberBody = `{"state":` // truncated JSON + + c := NewGitHubClient(GitHubClientConfig{APIBase: gh.server.URL, Org: "freeCodeCamp", CacheTTL: time.Minute}) + + _, err := c.IsTeamMember(context.Background(), "ghp_test", "alice", "team-eng") + require.Error(t, err, "a 200 with an unparseable body must surface as a transient error, not a silent non-member") + + _, _ = c.IsTeamMember(context.Background(), "ghp_test", "alice", "team-eng") + assert.EqualValues(t, 2, gh.memberCalls.Load(), "a parse failure must not be cached as a membership 
denial") +} + func TestGitHubClient_TeamMembership_Active(t *testing.T) { gh := newFakeGH() defer gh.Close() @@ -511,3 +530,123 @@ func TestGitHubClient_AuthorizeForSite_NoTeams(t *testing.T) { require.NoError(t, err) assert.False(t, ok) } + +func TestAuthUsesTeamCache(t *testing.T) { + mr, err := miniredis.Run() + require.NoError(t, err) + defer mr.Close() + rdb := redis.NewClient(&redis.Options{Addr: mr.Addr()}) + defer func() { _ = rdb.Close() }() + tc := teamcache.New(rdb, time.Minute) + + userCalls := atomic.Int32{} + teamsCalls := atomic.Int32{} + mux := http.NewServeMux() + mux.HandleFunc("/user", func(w http.ResponseWriter, r *http.Request) { + userCalls.Add(1) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"login":"alice"}`)) + }) + mux.HandleFunc("/user/teams", func(w http.ResponseWriter, r *http.Request) { + teamsCalls.Add(1) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[{"slug":"staff","organization":{"login":"freeCodeCamp"}}]`)) + }) + server := httptest.NewServer(mux) + defer server.Close() + + newClient := func() *GitHubClient { + return NewGitHubClient(GitHubClientConfig{ + APIBase: server.URL, + Org: "freeCodeCamp", + CacheTTL: time.Minute, + TeamCache: tc, + }) + } + + for i := 0; i < 3; i++ { + c := newClient() + teams, err := c.UserTeams(context.Background(), "ghp_alice") + require.NoError(t, err) + assert.Equal(t, []string{"staff"}, teams) + } + + assert.EqualValues(t, 1, teamsCalls.Load(), + "durable teamcache must absorb repeated lookups across fresh clients (replicas/restarts): GitHub /user/teams hit once") +} + +type failingSetTeamCache struct{} + +func (failingSetTeamCache) Get(context.Context, string) ([]string, bool, error) { + return nil, false, nil +} + +func (failingSetTeamCache) Set(context.Context, string, []string) error { + return errors.New("valkey down") +} + +func TestAuthTeamCacheSetFailureNonFatal(t *testing.T) { + mux := http.NewServeMux() + 
mux.HandleFunc("/user", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"login":"alice"}`)) + }) + mux.HandleFunc("/user/teams", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[{"slug":"staff","organization":{"login":"freeCodeCamp"}}]`)) + }) + server := httptest.NewServer(mux) + defer server.Close() + + c := NewGitHubClient(GitHubClientConfig{ + APIBase: server.URL, + Org: "freeCodeCamp", + CacheTTL: time.Minute, + TeamCache: failingSetTeamCache{}, + }) + + teams, err := c.UserTeams(context.Background(), "ghp_alice") + require.NoError(t, err, + "durable cache write failure must not discard successfully fetched teams") + assert.Equal(t, []string{"staff"}, teams) +} + +func TestAuthTeamCacheRespectsTTL(t *testing.T) { + mr, err := miniredis.Run() + require.NoError(t, err) + defer mr.Close() + rdb := redis.NewClient(&redis.Options{Addr: mr.Addr()}) + defer func() { _ = rdb.Close() }() + tc := teamcache.New(rdb, time.Minute) + + teamsCalls := atomic.Int32{} + mux := http.NewServeMux() + mux.HandleFunc("/user", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"login":"alice"}`)) + }) + mux.HandleFunc("/user/teams", func(w http.ResponseWriter, r *http.Request) { + teamsCalls.Add(1) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[{"slug":"staff","organization":{"login":"freeCodeCamp"}}]`)) + }) + server := httptest.NewServer(mux) + defer server.Close() + + mk := func() *GitHubClient { + return NewGitHubClient(GitHubClientConfig{ + APIBase: server.URL, Org: "freeCodeCamp", CacheTTL: time.Minute, TeamCache: tc, + }) + } + + _, err = mk().UserTeams(context.Background(), "ghp_alice") + require.NoError(t, err) + assert.EqualValues(t, 1, teamsCalls.Load()) + + mr.FastForward(2 * time.Minute) + + _, err = 
mk().UserTeams(context.Background(), "ghp_alice") + require.NoError(t, err) + assert.EqualValues(t, 2, teamsCalls.Load(), + "expired durable entry must not be served stale; re-probe GitHub after TTL") +} diff --git a/internal/backfill/backfill.go b/internal/backfill/backfill.go new file mode 100644 index 0000000..589c6ef --- /dev/null +++ b/internal/backfill/backfill.go @@ -0,0 +1,107 @@ +package backfill + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/freeCodeCamp/artemis/internal/gc" + "github.com/freeCodeCamp/artemis/internal/r2" +) + +type Lister interface { + ListSites(ctx context.Context) ([]string, error) + ListPrefix(ctx context.Context, prefix string) ([]string, error) + GetAlias(ctx context.Context, key string) (string, error) +} + +type Indexer interface { + UpsertDeploy(ctx context.Context, site, id string, mtime time.Time, bytes int64, hasMarker bool, state string) error + UpsertAlias(ctx context.Context, site, name, deployID string, updatedAt time.Time) error +} + +type Backfill struct { + Lister Lister + Indexer Indexer + Now func() time.Time +} + +type Result struct { + Sites int + Deploys int + Aliases int +} + +func (b *Backfill) Run(ctx context.Context) (Result, error) { + var res Result + sites, err := b.Lister.ListSites(ctx) + if err != nil { + return res, fmt.Errorf("backfill: list sites: %w", err) + } + + for _, site := range sites { + res.Sites++ + deploysPrefix := site + "/deploys/" + keys, err := b.Lister.ListPrefix(ctx, deploysPrefix) + if err != nil { + return res, fmt.Errorf("backfill: list %s: %w", site, err) + } + + markers := map[string]bool{} + seen := map[string]struct{}{} + var order []string + for _, k := range keys { + rest := strings.TrimPrefix(k, deploysPrefix) + seg := rest + if i := strings.IndexByte(rest, '/'); i >= 0 { + seg = rest[:i] + } + if seg == "" { + continue + } + if _, ok := seen[seg]; !ok { + seen[seg] = struct{}{} + order = append(order, seg) + } + if rest == seg+"/"+gc.MarkerObjectName { + 
markers[seg] = true + } + } + + for _, id := range order { + if err := b.Indexer.UpsertDeploy(ctx, site, id, parseDeployMtime(id, b.Now()), 0, markers[id], "active"); err != nil { + return res, fmt.Errorf("backfill: index deploy %s/%s: %w", site, id, err) + } + res.Deploys++ + } + + for _, mode := range []string{"production", "preview"} { + v, err := b.Lister.GetAlias(ctx, site+"/"+mode) + if err != nil { + if r2.IsNotFound(err) { + continue + } + return res, fmt.Errorf("backfill: alias %s/%s: %w", site, mode, err) + } + v = strings.TrimSpace(v) + if v == "" { + continue + } + if err := b.Indexer.UpsertAlias(ctx, site, mode, v, b.Now()); err != nil { + return res, fmt.Errorf("backfill: index alias %s/%s: %w", site, mode, err) + } + res.Aliases++ + } + } + return res, nil +} + +func parseDeployMtime(id string, fallback time.Time) time.Time { + if len(id) >= 15 { + if t, err := time.Parse("20060102-150405", id[:15]); err == nil { + return t.UTC() + } + } + return fallback +} diff --git a/internal/backfill/backfill_test.go b/internal/backfill/backfill_test.go new file mode 100644 index 0000000..5ed5af4 --- /dev/null +++ b/internal/backfill/backfill_test.go @@ -0,0 +1,119 @@ +package backfill + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/r2" +) + +type fakeLister struct { + sites []string + byPfx map[string][]string + aliases map[string]string +} + +func (f *fakeLister) ListSites(context.Context) ([]string, error) { return f.sites, nil } + +func (f *fakeLister) ListPrefix(_ context.Context, prefix string) ([]string, error) { + return f.byPfx[prefix], nil +} + +func (f *fakeLister) GetAlias(_ context.Context, key string) (string, error) { + v, ok := f.aliases[key] + if !ok { + return "", r2.ErrNotFound + } + return v, nil +} + +type idxDeploy struct { + site, id string + mtime time.Time + hasMarker bool +} + +type idxAlias struct { + site, name, 
deployID string +} + +type fakeIndexer struct { + deploys []idxDeploy + aliases []idxAlias +} + +func (f *fakeIndexer) UpsertDeploy(_ context.Context, site, id string, mtime time.Time, _ int64, hasMarker bool, _ string) error { + f.deploys = append(f.deploys, idxDeploy{site, id, mtime, hasMarker}) + return nil +} + +func (f *fakeIndexer) UpsertAlias(_ context.Context, site, name, deployID string, _ time.Time) error { + f.aliases = append(f.aliases, idxAlias{site, name, deployID}) + return nil +} + +func TestBackfill(t *testing.T) { + lister := &fakeLister{ + sites: []string{"www", "learn"}, + byPfx: map[string][]string{ + "www/deploys/": { + "www/deploys/20260420-141522-abc1234/index.html", + "www/deploys/20260420-141522-abc1234/_artemis_meta.json", + "www/deploys/20260101-090000-old0001/index.html", + }, + "learn/deploys/": { + "learn/deploys/20260515-120000-def5678/index.html", + }, + }, + aliases: map[string]string{ + "www/production": "20260420-141522-abc1234", + "www/preview": "20260101-090000-old0001", + "learn/preview": "20260515-120000-def5678", + }, + } + idx := &fakeIndexer{} + b := &Backfill{Lister: lister, Indexer: idx, Now: func() time.Time { + return time.Date(2026, 6, 2, 0, 0, 0, 0, time.UTC) + }} + + res, err := b.Run(context.Background()) + require.NoError(t, err) + + assert.Equal(t, 2, res.Sites) + assert.Equal(t, 3, res.Deploys, "two www deploys + one learn deploy (marker is not its own deploy)") + assert.Equal(t, 3, res.Aliases, "www prod+preview, learn preview (learn prod absent)") + + byID := map[string]idxDeploy{} + for _, d := range idx.deploys { + byID[d.id] = d + } + require.Contains(t, byID, "20260420-141522-abc1234") + assert.True(t, byID["20260420-141522-abc1234"].hasMarker, "deploy with _artemis_meta.json marked completed") + assert.False(t, byID["20260101-090000-old0001"].hasMarker, "deploy without marker is an orphan") + assert.Equal(t, time.Date(2026, 4, 20, 14, 15, 22, 0, time.UTC), byID["20260420-141522-abc1234"].mtime, + "mtime 
parsed from deploy-id timestamp") +} + +func TestBackfill_AliasKeyIsR2DirRelative(t *testing.T) { + dir := "www.freecode.camp" + lister := &fakeLister{ + sites: []string{dir}, + byPfx: map[string][]string{dir + "/deploys/": {dir + "/deploys/20260420-141522-abc1234/index.html"}}, + aliases: map[string]string{ + dir + "/production": "20260420-141522-abc1234", + dir + "/preview": "20260420-141522-abc1234", + }, + } + idx := &fakeIndexer{} + b := &Backfill{Lister: lister, Indexer: idx, Now: func() time.Time { return time.Date(2026, 6, 2, 0, 0, 0, 0, time.UTC) }} + + res, err := b.Run(context.Background()) + require.NoError(t, err) + assert.Equal(t, 1, res.Deploys) + assert.Equal(t, 2, res.Aliases, + "alias key is the R2-dir-relative literal /; the dir from ListSites already carries the .freecode.camp suffix, so the slug-templated ALIAS_*_KEY_FORMAT must NOT be re-applied") +} diff --git a/internal/config/config.go b/internal/config/config.go index 863cb7b..6ab2961 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,6 +11,8 @@ package config import ( "fmt" "log/slog" + "net" + "net/url" "os" "regexp" "strconv" @@ -37,6 +39,26 @@ type Config struct { Registry RegistryConfig Repo RepoConfig Sentry SentryConfig + DatabaseURL string + BackfillOnBoot bool + Hatchet HatchetConfig + Cleanup CleanupConfig +} + +type HatchetConfig struct { + ClientToken string + Addr string +} + +type CleanupConfig struct { + RetentionDays int + RecentKeep int + Grace time.Duration + BlastCap int + TrashPrefix string + RecoveryDays int + DryRun bool + ServeCacheTTL time.Duration } // SentryConfig holds the optional Sentry error-monitoring + tracing @@ -151,6 +173,12 @@ func (r RepoConfig) Enabled() bool { } const ( + serveCacheTTL = 15 * time.Second + defaultCleanupRetentionDays = 7 + defaultCleanupRecentKeep = 3 + defaultCleanupGrace = time.Hour + defaultCleanupRecoveryDays = 7 + defaultCleanupTrashPrefix = "_trash/" minSigningKeyBytes = 32 defaultRegistryAuthzTeam = 
"staff" defaultRepoOrg = "freeCodeCamp-Universe" @@ -203,6 +231,14 @@ func Load() (*Config, error) { Sentry: SentryConfig{ TracesSampleRate: defaultSentryTracesSampleRate, }, + Cleanup: CleanupConfig{ + RetentionDays: defaultCleanupRetentionDays, + RecentKeep: defaultCleanupRecentKeep, + Grace: defaultCleanupGrace, + TrashPrefix: defaultCleanupTrashPrefix, + RecoveryDays: defaultCleanupRecoveryDays, + ServeCacheTTL: serveCacheTTL, + }, } if v, ok := os.LookupEnv("PORT"); ok { @@ -299,6 +335,17 @@ func Load() (*Config, error) { cfg.Sentry.Debug = v == "1" || strings.EqualFold(v, "true") } + cfg.DatabaseURL = os.Getenv("DATABASE_URL") + if v := os.Getenv("BACKFILL_ON_BOOT"); v != "" { + cfg.BackfillOnBoot = v == "1" || strings.EqualFold(v, "true") + } + cfg.Hatchet.ClientToken = os.Getenv("HATCHET_CLIENT_TOKEN") + cfg.Hatchet.Addr = os.Getenv("HATCHET_ADDR") + + if err := loadCleanup(&cfg.Cleanup); err != nil { + return nil, err + } + if err := cfg.validate(); err != nil { return nil, err } @@ -347,10 +394,13 @@ func (c *Config) validate() error { if err := validateDeployPrefixFormat(c.DeployPrefixFormat); err != nil { return err } + if err := validateGitHubAPIBase(c.GitHub.APIBase); err != nil { + return err + } if c.Registry.Valkey.Addr == "" { return missing("VALKEY_ADDR") } - if c.Registry.AuthzTeam == "" { + if strings.TrimSpace(c.Registry.AuthzTeam) == "" { return fmt.Errorf("REGISTRY_AUTHZ_TEAM must not be empty") } if c.Repo.Org == "" { @@ -385,6 +435,12 @@ func (c *Config) validate() error { if c.Sentry.TracesSampleRate < 0 || c.Sentry.TracesSampleRate > 1 { return fmt.Errorf("invalid SENTRY_TRACES_SAMPLE_RATE %v: must be in [0,1]", c.Sentry.TracesSampleRate) } + if c.Cleanup.Grace < c.JWT.TTL { + return fmt.Errorf("CLEANUP_GRACE (%s) must be >= JWT TTL (%s): an in-flight upload must never be GC'd before its deploy session expires", c.Cleanup.Grace, c.JWT.TTL) + } + if c.Cleanup.Grace < c.Cleanup.ServeCacheTTL { + return fmt.Errorf("CLEANUP_GRACE (%s) must 
be >= serve-cache TTL (%s): a just-superseded deploy must outlive the Caddy alias cache", c.Cleanup.Grace, c.Cleanup.ServeCacheTTL) + } return nil } @@ -406,6 +462,83 @@ func validateDeployPrefixFormat(fmtStr string) error { return nil } +func validateGitHubAPIBase(raw string) error { + u, err := url.Parse(raw) + if err != nil { + return fmt.Errorf("invalid GH_API_BASE %q: %w", raw, err) + } + if u.Host == "" || (u.Scheme != "http" && u.Scheme != "https") { + return fmt.Errorf("invalid GH_API_BASE %q: must be an absolute http(s) URL", raw) + } + if u.User != nil { + return fmt.Errorf("invalid GH_API_BASE %q: must not embed credentials", raw) + } + if u.Scheme == "http" && !isLoopbackHost(u.Hostname()) { + return fmt.Errorf("invalid GH_API_BASE %q: plaintext http is allowed only for loopback hosts; a bearer token must never traverse cleartext to a remote", raw) + } + return nil +} + +func isLoopbackHost(host string) bool { + if strings.EqualFold(host, "localhost") { + return true + } + if ip := net.ParseIP(host); ip != nil { + return ip.IsLoopback() + } + return false +} + +func (c *Config) GCEnabled() bool { return c.DatabaseURL != "" } + +func loadCleanup(c *CleanupConfig) error { + if v, ok := os.LookupEnv("CLEANUP_RETENTION_DAYS"); ok { + n, err := strconv.Atoi(v) + if err != nil || n <= 0 { + return fmt.Errorf("invalid CLEANUP_RETENTION_DAYS %q: must be positive integer (days)", v) + } + c.RetentionDays = n + } + if v, ok := os.LookupEnv("CLEANUP_RECENT_KEEP"); ok { + n, err := strconv.Atoi(v) + if err != nil || n < 0 { + return fmt.Errorf("invalid CLEANUP_RECENT_KEEP %q: must be non-negative integer", v) + } + c.RecentKeep = n + } + if v, ok := os.LookupEnv("CLEANUP_GRACE"); ok { + d, err := time.ParseDuration(v) + if err != nil || d <= 0 { + return fmt.Errorf("invalid CLEANUP_GRACE %q: must be a positive Go duration (e.g. 
1h)", v) + } + c.Grace = d + } + if v, ok := os.LookupEnv("CLEANUP_BLAST_CAP"); ok { + n, err := strconv.Atoi(v) + if err != nil || n < 0 { + return fmt.Errorf("invalid CLEANUP_BLAST_CAP %q: must be non-negative integer (0 disables)", v) + } + c.BlastCap = n + } + if v, ok := os.LookupEnv("CLEANUP_TRASH_PREFIX"); ok && v != "" { + if !strings.HasSuffix(v, "/") { + v += "/" + } + c.TrashPrefix = v + } + if v, ok := os.LookupEnv("CLEANUP_RECOVERY_DAYS"); ok { + n, err := strconv.Atoi(v) + if err != nil || n <= 0 { + return fmt.Errorf("invalid CLEANUP_RECOVERY_DAYS %q: must be positive integer (days)", v) + } + c.RecoveryDays = n + } + if v, ok := os.LookupEnv("CLEANUP_DRY_RUN"); ok { + c.DryRun = v == "1" || strings.EqualFold(v, "true") + } + return nil +} + // getEnv returns the env var value or empty string. validate() then // surfaces any missing required vars with a uniform error message; // using empty string here lets validate() be the single source of diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 31c456e..89278dd 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -58,6 +58,46 @@ func TestLoad_AllDefaults(t *testing.T) { assert.Empty(t, cfg.Registry.Valkey.Password) } +func TestLoad_GitHubAPIBaseValidation(t *testing.T) { + valid := []string{ + "", // unset -> default https://api.github.com + "https://api.github.com", // canonical + "https://ghe.corp.example", // GitHub Enterprise + "http://127.0.0.1:8080", // loopback recording proxy + "http://localhost:3000", // loopback by name + "http://[::1]:9090", // loopback v6 + } + for _, base := range valid { + t.Run("valid/"+base, func(t *testing.T) { + for k, v := range requiredEnv() { + t.Setenv(k, v) + } + t.Setenv("GH_API_BASE", base) + _, err := Load() + require.NoError(t, err) + }) + } + + invalid := []string{ + "http://evil.example.com", // cleartext to a remote -> bearer exfil + "http://api.github.com", // cleartext downgrade of canonical 
host + "https://user:pass@gh.example", // embedded credentials + "ftp://gh.example", // non-http scheme + "gh.example.com", // no scheme/host + "://broken", // unparseable + } + for _, base := range invalid { + t.Run("invalid/"+base, func(t *testing.T) { + for k, v := range requiredEnv() { + t.Setenv(k, v) + } + t.Setenv("GH_API_BASE", base) + _, err := Load() + require.Error(t, err, "GH_API_BASE %q must be rejected", base) + }) + } +} + func TestLoad_OverridesViaEnv(t *testing.T) { for k, v := range requiredEnv() { t.Setenv(k, v) @@ -94,9 +134,79 @@ func TestLoad_OverridesViaEnv(t *testing.T) { assert.Equal(t, "secret-pw", cfg.Registry.Valkey.Password) } -// TestLoad_UploadMaxBytes_RejectsNonPositive — env var is additive but -// when set must be a positive integer. Empty/absent → default; explicit -// "0" or negative → boot-time error. +func TestConfigLoad(t *testing.T) { + for k, v := range requiredEnv() { + t.Setenv(k, v) + } + cfg, err := Load() + require.NoError(t, err) + + assert.False(t, cfg.GCEnabled(), "no DATABASE_URL -> GC disabled") + assert.Equal(t, 7, cfg.Cleanup.RetentionDays) + assert.Equal(t, 3, cfg.Cleanup.RecentKeep) + assert.Equal(t, time.Hour, cfg.Cleanup.Grace) + assert.Equal(t, 0, cfg.Cleanup.BlastCap) + assert.Equal(t, "_trash/", cfg.Cleanup.TrashPrefix) + assert.Equal(t, 7, cfg.Cleanup.RecoveryDays) + assert.False(t, cfg.Cleanup.DryRun) + assert.Equal(t, 15*time.Second, cfg.Cleanup.ServeCacheTTL) +} + +func TestConfigLoad_Overrides(t *testing.T) { + for k, v := range requiredEnv() { + t.Setenv(k, v) + } + t.Setenv("DATABASE_URL", "postgres://artemis@pg/artemis") + t.Setenv("HATCHET_CLIENT_TOKEN", "ht-token") + t.Setenv("HATCHET_ADDR", "hatchet.svc:7077") + t.Setenv("CLEANUP_RETENTION_DAYS", "14") + t.Setenv("CLEANUP_RECENT_KEEP", "5") + t.Setenv("CLEANUP_GRACE", "2h") + t.Setenv("CLEANUP_BLAST_CAP", "100") + t.Setenv("CLEANUP_TRASH_PREFIX", "_graveyard") + t.Setenv("CLEANUP_RECOVERY_DAYS", "30") + t.Setenv("CLEANUP_DRY_RUN", "true") + + 
cfg, err := Load() + require.NoError(t, err) + + assert.True(t, cfg.GCEnabled()) + assert.Equal(t, "postgres://artemis@pg/artemis", cfg.DatabaseURL) + assert.Equal(t, "ht-token", cfg.Hatchet.ClientToken) + assert.Equal(t, "hatchet.svc:7077", cfg.Hatchet.Addr) + assert.Equal(t, 14, cfg.Cleanup.RetentionDays) + assert.Equal(t, 5, cfg.Cleanup.RecentKeep) + assert.Equal(t, 2*time.Hour, cfg.Cleanup.Grace) + assert.Equal(t, 100, cfg.Cleanup.BlastCap) + assert.Equal(t, "_graveyard/", cfg.Cleanup.TrashPrefix, "trailing slash normalized in") + assert.Equal(t, 30, cfg.Cleanup.RecoveryDays) + assert.True(t, cfg.Cleanup.DryRun) +} + +func TestConfigLoad_GraceBelowJWTTTLFails(t *testing.T) { + for k, v := range requiredEnv() { + t.Setenv(k, v) + } + t.Setenv("JWT_TTL_SECONDS", "3600") + t.Setenv("CLEANUP_GRACE", "30m") + + _, err := Load() + require.Error(t, err) + assert.Contains(t, err.Error(), "CLEANUP_GRACE") +} + +func TestConfigLoad_GraceBelowServeCacheTTLFails(t *testing.T) { + for k, v := range requiredEnv() { + t.Setenv(k, v) + } + t.Setenv("JWT_TTL_SECONDS", "5") + t.Setenv("CLEANUP_GRACE", "10s") + + _, err := Load() + require.Error(t, err) + assert.Contains(t, err.Error(), "serve-cache") +} + func TestLoad_UploadMaxBytes_RejectsNonPositive(t *testing.T) { for _, bad := range []string{"0", "-1", "not-a-number", ""} { t.Run("v="+bad, func(t *testing.T) { @@ -105,11 +215,6 @@ func TestLoad_UploadMaxBytes_RejectsNonPositive(t *testing.T) { } t.Setenv("UPLOAD_MAX_BYTES", bad) _, err := Load() - if bad == "" { - // empty is treated as set-but-blank; ParseInt on "" → error - require.Error(t, err) - return - } require.Error(t, err) assert.Contains(t, err.Error(), "UPLOAD_MAX_BYTES") }) @@ -208,17 +313,28 @@ func TestLoad_AcceptsValidDeployPrefix(t *testing.T) { assert.Equal(t, "/custom/-/sub/", cfg.DeployPrefixFormat) } -func TestLoad_RegistryAuthzTeamRejectsBlank(t *testing.T) { +func TestLoad_RegistryAuthzTeamRejectsWhitespace(t *testing.T) { + for k, v := range 
requiredEnv() { + t.Setenv(k, v) + } + t.Setenv("REGISTRY_AUTHZ_TEAM", " ") + _, err := Load() + require.Error(t, err) + assert.Contains(t, err.Error(), "REGISTRY_AUTHZ_TEAM") +} + +func TestValidate_RegistryAuthzTeamRejectsBlank(t *testing.T) { for k, v := range requiredEnv() { t.Setenv(k, v) } - // Setting REGISTRY_AUTHZ_TEAM to an empty string keeps the default - // (the env-loader only overrides when v != ""), so this case - // exercises validate() against an explicitly cleared default. - t.Setenv("REGISTRY_AUTHZ_TEAM", " ") // whitespace-only is treated as content; validate accepts; the assertion below covers the unset path cfg, err := Load() require.NoError(t, err) - assert.Equal(t, " ", cfg.Registry.AuthzTeam) + require.Equal(t, "staff", cfg.Registry.AuthzTeam) + + cfg.Registry.AuthzTeam = "" + err = cfg.validate() + require.Error(t, err) + assert.Contains(t, err.Error(), "REGISTRY_AUTHZ_TEAM") } // captureSlog redirects slog.Default() to a buffer for the duration of diff --git a/internal/gc/errorpath_test.go b/internal/gc/errorpath_test.go new file mode 100644 index 0000000..38ed4e3 --- /dev/null +++ b/internal/gc/errorpath_test.go @@ -0,0 +1,186 @@ +package gc + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type errMover struct { + err error + moves [][2]string +} + +func (m *errMover) MovePrefix(_ context.Context, src, dst string) (int, error) { + m.moves = append(m.moves, [2]string{src, dst}) + return 0, m.err +} + +func expiredOne() []Tombstone { + return []Tombstone{{Site: "www", ID: "d-expired", TrashedAt: ago(8 * 24 * time.Hour), Bytes: 100}} +} + +type errDeleter struct { + err error + deleted []string +} + +func (d *errDeleter) DeletePrefix(_ context.Context, prefix string) (int, error) { + d.deleted = append(d.deleted, prefix) + return 0, d.err +} + +type errClearReaper struct { + *fakeReaper + clearErr error +} + +func (r *errClearReaper) 
ClearTombstone(ctx context.Context, site, id string) error { + if r.clearErr != nil { + return r.clearErr + } + return r.fakeReaper.ClearTombstone(ctx, site, id) +} + +func newErrPurge(reaper TombstoneReaper, del Deleter) *TombstonePurge { + return &TombstonePurge{ + Store: reaper, + Deleter: del, + Recovery: 7 * 24 * time.Hour, + TrashBase: "_trash/", + Now: func() time.Time { return testNow }, + } +} + +type errReconcileStore struct { + deploys map[string][]Deploy + aliases map[string]struct{} + aliasErrOnCall int + aliasCalls int + pruneErr error + tombstoned []string + pruned []string +} + +func (s *errReconcileStore) DeploysForSite(_ context.Context, site string) ([]Deploy, error) { + return s.deploys[site], nil +} + +func (s *errReconcileStore) AliasTargets(_ context.Context, _ string) (map[string]struct{}, time.Time, error) { + s.aliasCalls++ + if s.aliasErrOnCall != 0 && s.aliasCalls == s.aliasErrOnCall { + return nil, time.Time{}, errors.New("pg read failed") + } + return s.aliases, time.Time{}, nil +} + +func (s *errReconcileStore) UpsertDeploy(_ context.Context, _, _ string, _ time.Time, _ int64, _ bool, _ string) error { + return nil +} + +func (s *errReconcileStore) RecordTombstone(_ context.Context, _, id string, _ int64) error { + s.tombstoned = append(s.tombstoned, id) + return nil +} + +func (s *errReconcileStore) PruneDeploy(_ context.Context, _, id string) error { + if s.pruneErr != nil { + return s.pruneErr + } + s.pruned = append(s.pruned, id) + return nil +} + +func TestGC_TombstoneRecordFailurePropagates(t *testing.T) { + store := &fakeStore{ + deploys: map[string][]Deploy{"www": sixOld()}, + targetsSeq: []map[string]struct{}{{}}, + tombstoneErr: errors.New("pg down"), + } + mover := &fakeMover{} + + res, err := newSiteGC(store, mover).Run(context.Background(), "www", false) + + require.ErrorContains(t, err, "record tombstone") + assert.Empty(t, res.Tombstoned, "a failed PG tombstone is not reported as reclaimed") + assert.EqualValues(t, 0, 
res.BytesReclaimed, "no bytes accounted for an unrecorded tombstone") + require.Len(t, mover.moves, 1, "the R2 move ran before the PG write failed, leaving orphaned bytes the retry must reclaim") + assert.Empty(t, store.tombstoned) +} + +func TestGC_MoveFailureAbortsBeforeTombstone(t *testing.T) { + mover := &errMover{err: errors.New("r2 5xx")} + store := &fakeStore{ + deploys: map[string][]Deploy{"www": sixOld()}, + targetsSeq: []map[string]struct{}{{}}, + } + + res, err := newSiteGC(store, mover).Run(context.Background(), "www", false) + + require.ErrorContains(t, err, "tombstone-move") + assert.Empty(t, store.tombstoned, "no PG tombstone when the R2 move failed (V1/V5)") + assert.Empty(t, res.Tombstoned) + require.Len(t, mover.moves, 1, "aborts on the first failed move, never proceeding to the next deploy") +} + +func TestTombstonePurge_ClearFailurePersistsRowForRetry(t *testing.T) { + reaper := &errClearReaper{ + fakeReaper: &fakeReaper{tombstones: expiredOne()}, + clearErr: errors.New("pg down"), + } + del := &fakeDeleter{} + + _, err := newErrPurge(reaper, del).Run(context.Background(), false) + + require.ErrorContains(t, err, "clear") + assert.Equal(t, []string{"_trash/www/d-expired/"}, del.deleted, "R2 delete still happened; row left for idempotent retry (V10)") + assert.Len(t, reaper.tombstones, 1, "the tombstone row survives a failed clear so a re-run safely re-deletes") +} + +func TestTombstonePurge_DeleteFailureAbortsBeforeClear(t *testing.T) { + del := &errDeleter{err: errors.New("r2 down")} + reaper := &fakeReaper{tombstones: expiredOne()} + + res, err := newErrPurge(reaper, del).Run(context.Background(), false) + + require.ErrorContains(t, err, "delete") + assert.Empty(t, reaper.cleared, "PG row not cleared when R2 delete failed") + assert.Empty(t, res.Purged, "nothing reported reclaimed when the R2 delete failed") + assert.EqualValues(t, 0, res.BytesReclaimed) +} + +func TestReconcile_ReReadAliasFailureAbortsBeforeTombstone(t *testing.T) { + orphan 
:= ts(2 * time.Hour) + lister := &fakeReconcileLister{keys: []string{"www/deploys/" + orphan + "/index.html"}} + store := &errReconcileStore{ + deploys: map[string][]Deploy{}, + aliases: map[string]struct{}{}, + aliasErrOnCall: 2, + } + mover := &errMover{} + + _, err := newReconciler(lister, store, mover).ReconcileSite(context.Background(), "www") + + require.ErrorContains(t, err, "re-read aliases before tombstone") + assert.Empty(t, mover.moves, "no move when the safety re-read failed (V1)") + assert.Empty(t, store.tombstoned, "no tombstone recorded when the re-read errored") +} + +func TestReconcile_PruneFailurePropagates(t *testing.T) { + lister := &fakeReconcileLister{keys: nil} + store := &errReconcileStore{ + deploys: map[string][]Deploy{"www": {{ID: "ghost", Mtime: ago(time.Hour)}}}, + aliases: map[string]struct{}{}, + pruneErr: errors.New("pg down"), + } + + report, err := newReconciler(lister, store, &fakeMover{}).ReconcileSite(context.Background(), "www") + + require.ErrorContains(t, err, "prune ghost") + assert.Empty(t, report.PGPruned, "a failed PruneDeploy is not reported as pruned") + assert.Empty(t, store.pruned) +} diff --git a/internal/gc/gcsite.go b/internal/gc/gcsite.go new file mode 100644 index 0000000..9f5cff4 --- /dev/null +++ b/internal/gc/gcsite.go @@ -0,0 +1,106 @@ +package gc + +import ( + "context" + "fmt" + "log/slog" + "time" +) + +type Store interface { + DeploysForSite(ctx context.Context, site string) ([]Deploy, error) + AliasTargets(ctx context.Context, site string) (targets map[string]struct{}, lastChange time.Time, err error) + Tombstone(ctx context.Context, site string, d Deploy) error +} + +type Mover interface { + MovePrefix(ctx context.Context, src, dst string) (int, error) +} + +type SiteGC struct { + Store Store + Mover Mover + Policy Policy + BlastCap int + DeployPrefix func(site, id string) string + TrashPrefix func(site, id string) string + Now func() time.Time + Metrics *Metrics +} + +type GCResult struct { + Site 
string + Planned []string + Tombstoned []string + SkippedAliased []string + BytesReclaimed int64 + Aborted bool + AbortReason string + DryRun bool +} + +func (g *SiteGC) Run(ctx context.Context, site string, dryRun bool) (GCResult, error) { + res := GCResult{Site: site, DryRun: dryRun} + + deploys, err := g.Store.DeploysForSite(ctx, site) + if err != nil { + return res, fmt.Errorf("gc %s: load deploys: %w", site, err) + } + targets, lastChange, err := g.Store.AliasTargets(ctx, site) + if err != nil { + return res, fmt.Errorf("gc %s: load aliases: %w", site, err) + } + + plan := PlanSite(site, RetainInput{ + Deploys: deploys, + AliasTargets: targets, + LastAliasChange: lastChange, + Now: g.Now(), + }, g.Policy, g.BlastCap) + + for _, d := range plan.Delete { + res.Planned = append(res.Planned, d.ID) + } + if plan.Aborted { + res.Aborted = true + res.AbortReason = plan.Reason + g.Metrics.run(WorkflowGCSiteLabel, "aborted") + slog.Warn("gc.site.aborted", "site", site, "planned", len(res.Planned), "reason", plan.Reason) + return res, nil + } + if dryRun { + g.Metrics.run(WorkflowGCSiteLabel, "dry-run") + slog.Info("gc.site.dry-run", "site", site, "planned", len(res.Planned)) + return res, nil + } + + fresh, _, err := g.Store.AliasTargets(ctx, site) + if err != nil { + return res, fmt.Errorf("gc %s: re-read aliases: %w", site, err) + } + for _, d := range plan.Delete { + if _, nowAliased := fresh[d.ID]; nowAliased { + res.SkippedAliased = append(res.SkippedAliased, d.ID) + continue + } + src := g.DeployPrefix(site, d.ID) + dst := g.TrashPrefix(site, d.ID) + if _, err := g.Mover.MovePrefix(ctx, src, dst); err != nil { + return res, fmt.Errorf("gc %s: tombstone-move %s: %w", site, d.ID, err) + } + if err := g.Store.Tombstone(ctx, site, d); err != nil { + return res, fmt.Errorf("gc %s: record tombstone %s: %w", site, d.ID, err) + } + res.Tombstoned = append(res.Tombstoned, d.ID) + res.BytesReclaimed += d.Bytes + } + + g.Metrics.tombstoned(len(res.Tombstoned)) + 
g.Metrics.run(WorkflowGCSiteLabel, "ok") + slog.Info("gc.site.done", "site", site, + "planned", len(res.Planned), + "tombstoned", len(res.Tombstoned), + "skippedAliased", len(res.SkippedAliased), + "bytes", res.BytesReclaimed) + return res, nil +} diff --git a/internal/gc/gcsite_test.go b/internal/gc/gcsite_test.go new file mode 100644 index 0000000..9f3df78 --- /dev/null +++ b/internal/gc/gcsite_test.go @@ -0,0 +1,182 @@ +package gc + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type fakeStore struct { + deploys map[string][]Deploy + targetsSeq []map[string]struct{} + lastChange time.Time + aliasCalls int + tombstoned []string + tombstoneErr error +} + +func (s *fakeStore) DeploysForSite(_ context.Context, site string) ([]Deploy, error) { + return s.deploys[site], nil +} + +func (s *fakeStore) AliasTargets(_ context.Context, _ string) (map[string]struct{}, time.Time, error) { + idx := s.aliasCalls + s.aliasCalls++ + if idx >= len(s.targetsSeq) { + idx = len(s.targetsSeq) - 1 + } + if idx < 0 { + return map[string]struct{}{}, s.lastChange, nil + } + return s.targetsSeq[idx], s.lastChange, nil +} + +func (s *fakeStore) Tombstone(_ context.Context, site string, d Deploy) error { + if s.tombstoneErr != nil { + return s.tombstoneErr + } + s.tombstoned = append(s.tombstoned, site+"/"+d.ID) + return nil +} + +type fakeMover struct { + moves [][2]string +} + +func (m *fakeMover) MovePrefix(_ context.Context, src, dst string) (int, error) { + m.moves = append(m.moves, [2]string{src, dst}) + return 1, nil +} + +func newSiteGC(store Store, mover Mover) *SiteGC { + return &SiteGC{ + Store: store, + Mover: mover, + Policy: testPolicy(), + BlastCap: 0, + DeployPrefix: func(site, id string) string { return site + "/deploys/" + id + "/" }, + TrashPrefix: func(site, id string) string { return "_trash/" + site + "/" + id + "/" }, + Now: func() time.Time { return testNow }, + } +} + 
+func sixOld() []Deploy { + return oldDeploys(6, 100) +} + +func TestGC_AliasPinned(t *testing.T) { + ds := sixOld() + aliased := ds[len(ds)-1].ID + store := &fakeStore{ + deploys: map[string][]Deploy{"www": ds}, + targetsSeq: []map[string]struct{}{aliasSet(aliased)}, + } + mover := &fakeMover{} + res, err := newSiteGC(store, mover).Run(context.Background(), "www", false) + require.NoError(t, err) + + assert.NotContains(t, res.Tombstoned, aliased, "aliased deploy never tombstoned (V1)") + for _, m := range mover.moves { + assert.NotContains(t, m[0], aliased, "no move of an aliased deploy") + } +} + +func TestGC_PromoteMidRun(t *testing.T) { + ds := sixOld() + victim := ds[len(ds)-1].ID + store := &fakeStore{ + deploys: map[string][]Deploy{"www": ds}, + targetsSeq: []map[string]struct{}{ + {}, // plan-time: nothing aliased + aliasSet(victim), // TOCTOU re-read: alias moved onto victim + }, + } + mover := &fakeMover{} + res, err := newSiteGC(store, mover).Run(context.Background(), "www", false) + require.NoError(t, err) + + assert.Contains(t, res.Planned, victim, "victim was in the plan") + assert.Contains(t, res.SkippedAliased, victim, "TOCTOU re-check skips a deploy aliased mid-run (V1)") + assert.NotContains(t, res.Tombstoned, victim) + for _, m := range mover.moves { + assert.NotContains(t, m[0], victim) + } +} + +func TestGC_InflightProtected(t *testing.T) { + ds := []Deploy{ + {ID: "n1", Mtime: ago(10 * time.Minute), Bytes: 1, HasMarker: true}, + {ID: "n2", Mtime: ago(20 * time.Minute), Bytes: 1, HasMarker: true}, + {ID: "n3", Mtime: ago(30 * time.Minute), Bytes: 1, HasMarker: true}, + {ID: "uploading", Mtime: ago(2 * time.Minute), Bytes: 1, HasMarker: false}, + } + store := &fakeStore{deploys: map[string][]Deploy{"www": ds}, targetsSeq: []map[string]struct{}{{}}} + mover := &fakeMover{} + res, err := newSiteGC(store, mover).Run(context.Background(), "www", false) + require.NoError(t, err) + + assert.NotContains(t, res.Tombstoned, "uploading", "in-flight 
(young, no marker) deploy protected (V4)") + assert.Empty(t, mover.moves) +} + +func TestGC_Idempotent(t *testing.T) { + ds := sixOld() + store := &fakeStore{deploys: map[string][]Deploy{"www": ds}, targetsSeq: []map[string]struct{}{{}}} + mover := &fakeMover{} + g := newSiteGC(store, mover) + + res1, err := g.Run(context.Background(), "www", false) + require.NoError(t, err) + require.Len(t, res1.Tombstoned, 3) + + store.deploys["www"] = nil + res2, err := g.Run(context.Background(), "www", false) + require.NoError(t, err) + assert.Empty(t, res2.Tombstoned, "re-run after reclaim tombstones nothing new (V10)") +} + +func TestGC_PerSiteScoped(t *testing.T) { + store := &fakeStore{ + deploys: map[string][]Deploy{"www": sixOld(), "learn": sixOld()}, + targetsSeq: []map[string]struct{}{{}}, + } + mover := &fakeMover{} + _, err := newSiteGC(store, mover).Run(context.Background(), "www", false) + require.NoError(t, err) + + for _, m := range mover.moves { + assert.True(t, strings.HasPrefix(m[0], "www/"), "GC of www must only touch www prefixes (V7 site scope)") + } + for _, ts := range store.tombstoned { + assert.True(t, strings.HasPrefix(ts, "www/"), "tombstones scoped to the target site") + } +} + +func TestGC_DryRun(t *testing.T) { + store := &fakeStore{deploys: map[string][]Deploy{"www": sixOld()}, targetsSeq: []map[string]struct{}{{}}} + mover := &fakeMover{} + res, err := newSiteGC(store, mover).Run(context.Background(), "www", true) + require.NoError(t, err) + + assert.Len(t, res.Planned, 3, "dry-run still computes the plan") + assert.Empty(t, res.Tombstoned, "dry-run mutates nothing") + assert.Empty(t, mover.moves) + assert.Empty(t, store.tombstoned) +} + +func TestGC_BlastCapAborts(t *testing.T) { + store := &fakeStore{deploys: map[string][]Deploy{"www": oldDeploys(10, 1)}, targetsSeq: []map[string]struct{}{{}}} + mover := &fakeMover{} + g := newSiteGC(store, mover) + g.BlastCap = 5 + res, err := g.Run(context.Background(), "www", false) + require.NoError(t, err) 
+ + assert.True(t, res.Aborted) + assert.Empty(t, res.Tombstoned, "aborted plan mutates nothing (V6)") + assert.Empty(t, mover.moves) +} diff --git a/internal/gc/marker.go b/internal/gc/marker.go new file mode 100644 index 0000000..6ab14ad --- /dev/null +++ b/internal/gc/marker.go @@ -0,0 +1,3 @@ +package gc + +const MarkerObjectName = "_artemis_meta.json" diff --git a/internal/gc/metrics.go b/internal/gc/metrics.go new file mode 100644 index 0000000..5d13825 --- /dev/null +++ b/internal/gc/metrics.go @@ -0,0 +1,66 @@ +package gc + +import "github.com/prometheus/client_golang/prometheus" + +const ( + WorkflowGCSiteLabel = "gc-site" + WorkflowTombstonePurgeLabel = "tombstone-purge" +) + +type Metrics struct { + DeploysTombstoned prometheus.Counter + BytesReclaimed prometheus.Counter + Runs *prometheus.CounterVec + Drift *prometheus.CounterVec +} + +func NewMetrics(reg prometheus.Registerer) *Metrics { + m := &Metrics{ + DeploysTombstoned: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "artemis_gc_deploys_tombstoned_total", + Help: "Count of deploys soft-deleted (moved to _trash) by retention GC, manual delete, and site purge.", + }), + BytesReclaimed: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "artemis_gc_bytes_reclaimed_total", + Help: "Bytes hard-reclaimed from _trash by the tombstone-purge pass past the recovery window.", + }), + Runs: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "artemis_gc_runs_total", + Help: "Count of GC workflow runs, labelled by workflow and outcome.", + }, []string{"workflow", "outcome"}), + Drift: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "artemis_gc_drift_total", + Help: "Reconcile drift events, labelled by kind (reindexed, orphan, pruned, aliased_missing).", + }, []string{"kind"}), + } + reg.MustRegister(m.DeploysTombstoned, m.BytesReclaimed, m.Runs, m.Drift) + return m +} + +func (m *Metrics) drift(kind string, n int) { + if m == nil || n == 0 { + return + } + 
m.Drift.WithLabelValues(kind).Add(float64(n)) +} + +func (m *Metrics) tombstoned(n int) { + if m == nil { + return + } + m.DeploysTombstoned.Add(float64(n)) +} + +func (m *Metrics) reclaimed(bytes int64) { + if m == nil { + return + } + m.BytesReclaimed.Add(float64(bytes)) +} + +func (m *Metrics) run(workflow, outcome string) { + if m == nil { + return + } + m.Runs.WithLabelValues(workflow, outcome).Inc() +} diff --git a/internal/gc/metrics_test.go b/internal/gc/metrics_test.go new file mode 100644 index 0000000..924d6f3 --- /dev/null +++ b/internal/gc/metrics_test.go @@ -0,0 +1,44 @@ +package gc + +import ( + "context" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestMetrics(t *testing.T) { + reg := prometheus.NewRegistry() + m := NewMetrics(reg) + + store := &fakeStore{deploys: map[string][]Deploy{"www": sixOld()}, targetsSeq: []map[string]struct{}{{}}} + g := newSiteGC(store, &fakeMover{}) + g.Metrics = m + _, err := g.Run(context.Background(), "www", false) + require.NoError(t, err) + + assert.EqualValues(t, 3, testutil.ToFloat64(m.DeploysTombstoned), "3 deploys tombstoned by gc-site") + assert.EqualValues(t, 1, testutil.ToFloat64(m.Runs.WithLabelValues("gc-site", "ok"))) + + reaper := &fakeReaper{tombstones: []Tombstone{ + {Site: "www", ID: "d", TrashedAt: ago(8 * 24 * time.Hour), Bytes: 500}, + }} + p := newPurge(reaper, &fakeDeleter{}) + p.Metrics = m + _, err = p.Run(context.Background(), false) + require.NoError(t, err) + + assert.EqualValues(t, 500, testutil.ToFloat64(m.BytesReclaimed), "bytes reclaimed by tombstone-purge") + assert.EqualValues(t, 1, testutil.ToFloat64(m.Runs.WithLabelValues("tombstone-purge", "ok"))) +} + +func TestMetrics_NilSafe(t *testing.T) { + store := &fakeStore{deploys: map[string][]Deploy{"www": sixOld()}, targetsSeq: []map[string]struct{}{{}}} + g := 
newSiteGC(store, &fakeMover{}) + _, err := g.Run(context.Background(), "www", false) + require.NoError(t, err, "nil Metrics must not panic") +} diff --git a/internal/gc/plan.go b/internal/gc/plan.go new file mode 100644 index 0000000..eed24f2 --- /dev/null +++ b/internal/gc/plan.go @@ -0,0 +1,29 @@ +package gc + +import "fmt" + +type Plan struct { + Site string + Delete []Deploy + TotalBytes int64 + Aborted bool + Reason string +} + +func PlanSite(site string, in RetainInput, p Policy, blastCap int) Plan { + _, del := Retain(in, p) + + var total int64 + for _, d := range del { + total += d.Bytes + } + + plan := Plan{Site: site, Delete: del, TotalBytes: total} + if blastCap > 0 && len(del) > blastCap { + plan.Aborted = true + plan.Reason = fmt.Sprintf("delete plan of %d exceeds blast-cap %d", len(del), blastCap) + plan.Delete = nil + plan.TotalBytes = 0 + } + return plan +} diff --git a/internal/gc/plan_test.go b/internal/gc/plan_test.go new file mode 100644 index 0000000..a81042b --- /dev/null +++ b/internal/gc/plan_test.go @@ -0,0 +1,47 @@ +package gc + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func oldDeploys(n int, eachBytes int64) []Deploy { + out := make([]Deploy, n) + for i := range out { + out[i] = Deploy{ + ID: string(rune('a'+i)) + "-old", + Mtime: ago(time.Duration(30+i) * 24 * time.Hour), + Bytes: eachBytes, + HasMarker: true, + } + } + return out +} + +func TestPlanSite_KeepN(t *testing.T) { + plan := PlanSite("www", RetainInput{Deploys: oldDeploys(6, 100), Now: testNow}, testPolicy(), 0) + + assert.Len(t, plan.Delete, 3, "6 old deploys, keepN=3 -> 3 deletable (V2)") + assert.False(t, plan.Aborted) + assert.EqualValues(t, 300, plan.TotalBytes, "bytes summed across delete set") +} + +func TestGC_BlastCap(t *testing.T) { + under := PlanSite("www", RetainInput{Deploys: oldDeploys(6, 10), Now: testNow}, testPolicy(), 5) + assert.False(t, under.Aborted, "3 deletes under cap=5") + assert.Len(t, under.Delete, 3) + + over := 
PlanSite("www", RetainInput{Deploys: oldDeploys(10, 10), Now: testNow}, testPolicy(), 5) + assert.True(t, over.Aborted, "7 deletes over cap=5 -> abort (V6)") + assert.Empty(t, over.Delete, "aborted plan deletes nothing") + assert.EqualValues(t, 0, over.TotalBytes) + assert.Contains(t, over.Reason, "blast-cap") +} + +func TestPlanSite_BlastCapDisabled(t *testing.T) { + plan := PlanSite("www", RetainInput{Deploys: oldDeploys(20, 1), Now: testNow}, testPolicy(), 0) + assert.False(t, plan.Aborted, "blastCap=0 disables the cap") + assert.Len(t, plan.Delete, 17) +} diff --git a/internal/gc/reconcile.go b/internal/gc/reconcile.go new file mode 100644 index 0000000..5d93b07 --- /dev/null +++ b/internal/gc/reconcile.go @@ -0,0 +1,167 @@ +package gc + +import ( + "context" + "fmt" + "log/slog" + "strings" + "time" +) + +type ReconcileLister interface { + ListPrefix(ctx context.Context, prefix string) ([]string, error) +} + +type ReconcileStore interface { + DeploysForSite(ctx context.Context, site string) ([]Deploy, error) + AliasTargets(ctx context.Context, site string) (map[string]struct{}, time.Time, error) + UpsertDeploy(ctx context.Context, site, id string, mtime time.Time, bytes int64, hasMarker bool, state string) error + RecordTombstone(ctx context.Context, site, id string, bytes int64) error + PruneDeploy(ctx context.Context, site, id string) error +} + +type Reconciler struct { + Lister ReconcileLister + Store ReconcileStore + Mover Mover + Grace time.Duration + SitePrefix func(site string) string + DeployPrefix func(site, id string) string + TrashPrefix func(site, id string) string + Now func() time.Time + Metrics *Metrics +} + +type DriftReport struct { + Site string + Reindexed []string + OrphanTombstoned []string + PGPruned []string + AliasedMissing []string +} + +type r2Deploy struct { + hasMarker bool + mtime time.Time +} + +func (rc *Reconciler) ReconcileSite(ctx context.Context, site string) (DriftReport, error) { + report := DriftReport{Site: site} + + 
keys, err := rc.Lister.ListPrefix(ctx, rc.SitePrefix(site)) + if err != nil { + return report, fmt.Errorf("reconcile %s: list r2: %w", site, err) + } + sitePrefix := rc.SitePrefix(site) + r2 := map[string]*r2Deploy{} + for _, k := range keys { + rest := strings.TrimPrefix(k, sitePrefix) + id := rest + if i := strings.IndexByte(rest, '/'); i >= 0 { + id = rest[:i] + } + if id == "" { + continue + } + d, ok := r2[id] + if !ok { + d = &r2Deploy{mtime: parseDeployTime(id, rc.Now())} + r2[id] = d + } + if rest == id+"/"+MarkerObjectName { + d.hasMarker = true + } + } + + pgDeploys, err := rc.Store.DeploysForSite(ctx, site) + if err != nil { + return report, fmt.Errorf("reconcile %s: load pg: %w", site, err) + } + pg := map[string]struct{}{} + for _, d := range pgDeploys { + pg[d.ID] = struct{}{} + } + aliases, _, err := rc.Store.AliasTargets(ctx, site) + if err != nil { + return report, fmt.Errorf("reconcile %s: load aliases: %w", site, err) + } + + for id, info := range r2 { + if _, indexed := pg[id]; indexed { + continue + } + if _, aliased := aliases[id]; aliased { + report.AliasedMissing = append(report.AliasedMissing, id) + slog.Error("reconcile.aliased_unindexed", "site", site, "deployId", id, + "detail", "alias targets a deploy with no PG row; reindex, never tombstone (V1)") + if info.hasMarker { + if err := rc.Store.UpsertDeploy(ctx, site, id, info.mtime, 0, true, "active"); err != nil { + return report, fmt.Errorf("reconcile %s: reindex aliased %s: %w", site, id, err) + } + report.Reindexed = append(report.Reindexed, id) + } + continue + } + switch { + case info.hasMarker: + if err := rc.Store.UpsertDeploy(ctx, site, id, info.mtime, 0, true, "active"); err != nil { + return report, fmt.Errorf("reconcile %s: reindex %s: %w", site, id, err) + } + report.Reindexed = append(report.Reindexed, id) + case rc.Now().Sub(info.mtime) >= rc.Grace: + nowAliases, _, err := rc.Store.AliasTargets(ctx, site) + if err != nil { + return report, fmt.Errorf("reconcile %s: re-read 
aliases before tombstone %s: %w", site, id, err) + } + if _, nowAliased := nowAliases[id]; nowAliased { + report.AliasedMissing = append(report.AliasedMissing, id) + slog.Error("reconcile.aliased_raced", "site", site, "deployId", id, + "detail", "alias appeared after snapshot read; skip tombstone (V1)") + continue + } + if _, err := rc.Mover.MovePrefix(ctx, rc.DeployPrefix(site, id), rc.TrashPrefix(site, id)); err != nil { + return report, fmt.Errorf("reconcile %s: tombstone orphan %s: %w", site, id, err) + } + if err := rc.Store.RecordTombstone(ctx, site, id, 0); err != nil { + return report, fmt.Errorf("reconcile %s: record orphan %s: %w", site, id, err) + } + report.OrphanTombstoned = append(report.OrphanTombstoned, id) + } + } + + for id := range pg { + if _, present := r2[id]; present { + continue + } + if _, aliased := aliases[id]; aliased { + report.AliasedMissing = append(report.AliasedMissing, id) + slog.Error("reconcile.aliased_bytes_missing", "site", site, "deployId", id, + "detail", "alias targets a deploy whose R2 bytes are gone") + continue + } + if err := rc.Store.PruneDeploy(ctx, site, id); err != nil { + return report, fmt.Errorf("reconcile %s: prune %s: %w", site, id, err) + } + report.PGPruned = append(report.PGPruned, id) + } + + rc.Metrics.drift("reindexed", len(report.Reindexed)) + rc.Metrics.drift("orphan", len(report.OrphanTombstoned)) + rc.Metrics.drift("pruned", len(report.PGPruned)) + rc.Metrics.drift("aliased_missing", len(report.AliasedMissing)) + slog.Info("reconcile.site.done", "site", site, + "reindexed", len(report.Reindexed), + "orphanTombstoned", len(report.OrphanTombstoned), + "pgPruned", len(report.PGPruned), + "aliasedMissing", len(report.AliasedMissing)) + return report, nil +} + +func parseDeployTime(id string, fallback time.Time) time.Time { + if len(id) >= 15 { + if t, err := time.Parse("20060102-150405", id[:15]); err == nil { + return t.UTC() + } + } + return fallback +} diff --git a/internal/gc/reconcile_test.go 
b/internal/gc/reconcile_test.go new file mode 100644 index 0000000..9df1e75 --- /dev/null +++ b/internal/gc/reconcile_test.go @@ -0,0 +1,200 @@ +package gc + +import ( + "context" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type fakeReconcileStore struct { + deploys map[string][]Deploy + aliases map[string]struct{} + aliasesAfter map[string]struct{} + aliasCalls int + reindexed []string + tombstoned []string + pruned []string +} + +func (s *fakeReconcileStore) DeploysForSite(_ context.Context, site string) ([]Deploy, error) { + return s.deploys[site], nil +} +func (s *fakeReconcileStore) AliasTargets(_ context.Context, _ string) (map[string]struct{}, time.Time, error) { + s.aliasCalls++ + if s.aliasesAfter != nil && s.aliasCalls >= 2 { + return s.aliasesAfter, time.Time{}, nil + } + return s.aliases, time.Time{}, nil +} +func (s *fakeReconcileStore) UpsertDeploy(_ context.Context, _, id string, _ time.Time, _ int64, _ bool, _ string) error { + s.reindexed = append(s.reindexed, id) + return nil +} +func (s *fakeReconcileStore) RecordTombstone(_ context.Context, _, id string, _ int64) error { + s.tombstoned = append(s.tombstoned, id) + return nil +} +func (s *fakeReconcileStore) PruneDeploy(_ context.Context, _, id string) error { + s.pruned = append(s.pruned, id) + return nil +} + +func newReconciler(lister ReconcileLister, store ReconcileStore, mover Mover) *Reconciler { + return &Reconciler{ + Lister: lister, + Store: store, + Mover: mover, + Grace: time.Hour, + SitePrefix: func(site string) string { return site + "/deploys/" }, + DeployPrefix: func(site, id string) string { return site + "/deploys/" + id + "/" }, + TrashPrefix: func(site, id string) string { return "_trash/" + site + "/" + id + "/" }, + Now: func() time.Time { return testNow }, + } +} + +type fakeReconcileLister struct{ keys 
[]string } + +func (f *fakeReconcileLister) ListPrefix(context.Context, string) ([]string, error) { + return f.keys, nil +} + +func ts(d time.Duration) string { + return testNow.Add(-d).UTC().Format("20060102-150405") + "-sha1234" +} + +func TestReconcile_Orphan(t *testing.T) { + orphan := ts(2 * time.Hour) + lister := &fakeReconcileLister{keys: []string{"www/deploys/" + orphan + "/index.html"}} + store := &fakeReconcileStore{deploys: map[string][]Deploy{}, aliases: map[string]struct{}{}} + mover := &fakeMover{} + + report, err := newReconciler(lister, store, mover).ReconcileSite(context.Background(), "www") + require.NoError(t, err) + + assert.Equal(t, []string{orphan}, report.OrphanTombstoned, "no-marker, past-grace, unindexed R2 prefix -> tombstoned (E4)") + assert.Equal(t, []string{orphan}, store.tombstoned) + require.Len(t, mover.moves, 1) +} + +func TestReconcile_Rebuild(t *testing.T) { + completed := ts(2 * time.Hour) + lister := &fakeReconcileLister{keys: []string{ + "www/deploys/" + completed + "/index.html", + "www/deploys/" + completed + "/" + MarkerObjectName, + }} + store := &fakeReconcileStore{deploys: map[string][]Deploy{}, aliases: map[string]struct{}{}} + mover := &fakeMover{} + + report, err := newReconciler(lister, store, mover).ReconcileSite(context.Background(), "www") + require.NoError(t, err) + + assert.Equal(t, []string{completed}, report.Reindexed, "marked-complete R2 deploy missing from PG -> re-indexed (E3)") + assert.Empty(t, report.OrphanTombstoned, "a completed deploy is never tombstoned by reconcile") + assert.Empty(t, mover.moves) +} + +func TestReconcile_InflightSkipped(t *testing.T) { + young := ts(5 * time.Minute) + lister := &fakeReconcileLister{keys: []string{"www/deploys/" + young + "/index.html"}} + store := &fakeReconcileStore{deploys: map[string][]Deploy{}, aliases: map[string]struct{}{}} + mover := &fakeMover{} + + report, err := newReconciler(lister, store, mover).ReconcileSite(context.Background(), "www") + 
require.NoError(t, err) + assert.Empty(t, report.OrphanTombstoned, "no-marker but within grace -> in-flight, left alone") + assert.Empty(t, store.tombstoned) +} + +func TestReconcile_PrunesStalePGRow(t *testing.T) { + lister := &fakeReconcileLister{keys: []string{}} + store := &fakeReconcileStore{ + deploys: map[string][]Deploy{"www": {{ID: "ghost", Mtime: ago(time.Hour)}}}, + aliases: map[string]struct{}{}, + } + report, err := newReconciler(lister, store, &fakeMover{}).ReconcileSite(context.Background(), "www") + require.NoError(t, err) + assert.Equal(t, []string{"ghost"}, report.PGPruned, "PG row with no R2 bytes pruned") +} + +func TestReconcile_AliasedMissingNotPruned(t *testing.T) { + lister := &fakeReconcileLister{keys: []string{}} + store := &fakeReconcileStore{ + deploys: map[string][]Deploy{"www": {{ID: "live", Mtime: ago(time.Hour)}}}, + aliases: map[string]struct{}{"live": {}}, + } + report, err := newReconciler(lister, store, &fakeMover{}).ReconcileSite(context.Background(), "www") + require.NoError(t, err) + assert.Empty(t, report.PGPruned, "an aliased deploy whose bytes vanished is alerted, never silently pruned") + assert.Equal(t, []string{"live"}, report.AliasedMissing) +} + +func TestReconcile_AliasedOrphanNotTombstoned(t *testing.T) { + id := ts(2 * time.Hour) + lister := &fakeReconcileLister{keys: []string{"www/deploys/" + id + "/index.html"}} + store := &fakeReconcileStore{ + deploys: map[string][]Deploy{}, + aliases: map[string]struct{}{id: {}}, + } + mover := &fakeMover{} + + report, err := newReconciler(lister, store, mover).ReconcileSite(context.Background(), "www") + require.NoError(t, err) + + assert.NotContains(t, report.OrphanTombstoned, id, + "an alias-pinned deploy is never tombstoned even when unindexed + marker-less + past grace (V1)") + assert.Empty(t, mover.moves, "no R2 move of an aliased deploy") + assert.Empty(t, store.tombstoned) + assert.Contains(t, report.AliasedMissing, id, "surfaced as drift to alert on instead") +} + 
+func TestReconcile_AliasRaceAfterSnapshotNotTombstoned(t *testing.T) { + id := ts(2 * time.Hour) + lister := &fakeReconcileLister{keys: []string{"www/deploys/" + id + "/index.html"}} + store := &fakeReconcileStore{ + deploys: map[string][]Deploy{}, + aliases: map[string]struct{}{}, + aliasesAfter: map[string]struct{}{id: {}}, + } + mover := &fakeMover{} + + report, err := newReconciler(lister, store, mover).ReconcileSite(context.Background(), "www") + require.NoError(t, err) + + assert.Empty(t, mover.moves, "deploy aliased after the snapshot read must not be tombstoned (V1 TOCTOU)") + assert.Empty(t, store.tombstoned) + assert.NotContains(t, report.OrphanTombstoned, id) +} + +func TestReconcile_DriftMetrics(t *testing.T) { + reg := prometheus.NewRegistry() + m := NewMetrics(reg) + orphan := ts(2 * time.Hour) + lister := &fakeReconcileLister{keys: []string{"www/deploys/" + orphan + "/index.html"}} + store := &fakeReconcileStore{deploys: map[string][]Deploy{"www": {{ID: "ghost", Mtime: ago(time.Hour)}}}, aliases: map[string]struct{}{}} + rc := newReconciler(lister, store, &fakeMover{}) + rc.Metrics = m + + _, err := rc.ReconcileSite(context.Background(), "www") + require.NoError(t, err) + assert.EqualValues(t, 1, testutil.ToFloat64(m.Drift.WithLabelValues("orphan"))) + assert.EqualValues(t, 1, testutil.ToFloat64(m.Drift.WithLabelValues("pruned"))) +} + +func TestReconcile_ConsistentNoDrift(t *testing.T) { + id := ts(2 * time.Hour) + lister := &fakeReconcileLister{keys: []string{"www/deploys/" + id + "/index.html"}} + store := &fakeReconcileStore{ + deploys: map[string][]Deploy{"www": {{ID: id, Mtime: ago(2 * time.Hour)}}}, + aliases: map[string]struct{}{}, + } + report, err := newReconciler(lister, store, &fakeMover{}).ReconcileSite(context.Background(), "www") + require.NoError(t, err) + assert.Empty(t, report.Reindexed) + assert.Empty(t, report.OrphanTombstoned) + assert.Empty(t, report.PGPruned) +} diff --git a/internal/gc/retain.go b/internal/gc/retain.go new 
file mode 100644 index 0000000..1652177 --- /dev/null +++ b/internal/gc/retain.go @@ -0,0 +1,70 @@ +package gc + +import ( + "sort" + "time" +) + +type Deploy struct { + ID string + Mtime time.Time + Bytes int64 + HasMarker bool +} + +type Policy struct { + RecentKeep int + Grace time.Duration + Retention time.Duration + ServeCacheTTL time.Duration +} + +type RetainInput struct { + Deploys []Deploy + AliasTargets map[string]struct{} + LastAliasChange time.Time + Now time.Time +} + +func Retain(in RetainInput, p Policy) (keep, del []Deploy) { + ordered := make([]Deploy, len(in.Deploys)) + copy(ordered, in.Deploys) + sort.SliceStable(ordered, func(i, j int) bool { + if !ordered[i].Mtime.Equal(ordered[j].Mtime) { + return ordered[i].Mtime.After(ordered[j].Mtime) + } + return ordered[i].ID > ordered[j].ID + }) + + freshAliasMove := !in.LastAliasChange.IsZero() && + in.Now.Sub(in.LastAliasChange) < p.ServeCacheTTL + + for rank, d := range ordered { + if retainDeploy(d, rank, freshAliasMove, in.AliasTargets, in.Now, p) { + keep = append(keep, d) + } else { + del = append(del, d) + } + } + return keep, del +} + +func retainDeploy(d Deploy, rank int, freshAliasMove bool, aliases map[string]struct{}, now time.Time, p Policy) bool { + if _, aliased := aliases[d.ID]; aliased { + return true + } + if rank < p.RecentKeep { + return true + } + if freshAliasMove { + return true + } + age := now.Sub(d.Mtime) + if age < p.Grace { + return true + } + if d.HasMarker && age < p.Retention { + return true + } + return false +} diff --git a/internal/gc/retain_test.go b/internal/gc/retain_test.go new file mode 100644 index 0000000..710115a --- /dev/null +++ b/internal/gc/retain_test.go @@ -0,0 +1,152 @@ +package gc + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var testNow = time.Date(2026, 6, 2, 12, 0, 0, 0, time.UTC) + +func testPolicy() Policy { + return Policy{ + RecentKeep: 3, + Grace: time.Hour, + Retention: 7 
* 24 * time.Hour, + ServeCacheTTL: 15 * time.Second, + } +} + +func ago(d time.Duration) time.Time { return testNow.Add(-d) } + +func delIDs(ds []Deploy) []string { + out := make([]string, len(ds)) + for i, d := range ds { + out[i] = d.ID + } + return out +} + +func aliasSet(ids ...string) map[string]struct{} { + m := make(map[string]struct{}, len(ids)) + for _, id := range ids { + m[id] = struct{}{} + } + return m +} + +func TestRetain_AliasPinned(t *testing.T) { + deploys := []Deploy{ + {ID: "d-old", Mtime: ago(30 * 24 * time.Hour), HasMarker: true}, + {ID: "d-n1", Mtime: ago(3 * time.Hour), HasMarker: true}, + {ID: "d-n2", Mtime: ago(2 * time.Hour), HasMarker: true}, + {ID: "d-n3", Mtime: ago(1 * time.Hour), HasMarker: true}, + {ID: "d-n4", Mtime: ago(30 * time.Minute), HasMarker: true}, + } + _, del := Retain(RetainInput{ + Deploys: deploys, + AliasTargets: aliasSet("d-old"), + Now: testNow, + }, testPolicy()) + + assert.NotContains(t, delIDs(del), "d-old", "aliased deploy must never be deleted (V1)") +} + +func TestRetain_KeepN(t *testing.T) { + deploys := []Deploy{ + {ID: "d1", Mtime: ago(10 * 24 * time.Hour), HasMarker: true}, + {ID: "d2", Mtime: ago(20 * 24 * time.Hour), HasMarker: true}, + {ID: "d3", Mtime: ago(30 * 24 * time.Hour), HasMarker: true}, + {ID: "d4", Mtime: ago(40 * 24 * time.Hour), HasMarker: true}, + {ID: "d5", Mtime: ago(50 * 24 * time.Hour), HasMarker: true}, + } + keep, del := Retain(RetainInput{Deploys: deploys, Now: testNow}, testPolicy()) + + assert.ElementsMatch(t, []string{"d1", "d2", "d3"}, delIDs(keep), + "newest recentKeep=3 retained at any age (V2)") + assert.ElementsMatch(t, []string{"d4", "d5"}, delIDs(del)) +} + +func TestRetain_Grace(t *testing.T) { + deploys := []Deploy{ + {ID: "n1", Mtime: ago(time.Hour), HasMarker: true}, + {ID: "n2", Mtime: ago(2 * time.Hour), HasMarker: true}, + {ID: "n3", Mtime: ago(3 * time.Hour), HasMarker: true}, + {ID: "young-orphan", Mtime: ago(30 * time.Minute), HasMarker: false}, + {ID: 
"old-orphan", Mtime: ago(4 * time.Hour), HasMarker: false}, + } + keep, del := Retain(RetainInput{Deploys: deploys, Now: testNow}, testPolicy()) + + assert.Contains(t, delIDs(keep), "young-orphan", "deploy younger than grace retained (V3)") + assert.Contains(t, delIDs(del), "old-orphan", "orphan past grace, unaliased, beyond keepN is reclaimed") +} + +func TestRetain_ServeCacheSafe(t *testing.T) { + deploys := []Deploy{ + {ID: "n1", Mtime: ago(time.Hour), HasMarker: true}, + {ID: "n2", Mtime: ago(2 * time.Hour), HasMarker: true}, + {ID: "n3", Mtime: ago(3 * time.Hour), HasMarker: true}, + {ID: "just-superseded", Mtime: ago(30 * 24 * time.Hour), HasMarker: true}, + } + + _, delFresh := Retain(RetainInput{ + Deploys: deploys, + LastAliasChange: testNow.Add(-5 * time.Second), + Now: testNow, + }, testPolicy()) + assert.NotContains(t, delIDs(delFresh), "just-superseded", + "no delete within serve_cache_ttl of an alias move (V11)") + + _, delLater := Retain(RetainInput{ + Deploys: deploys, + LastAliasChange: testNow.Add(-30 * time.Second), + Now: testNow, + }, testPolicy()) + assert.Contains(t, delIDs(delLater), "just-superseded", + "past serve_cache_ttl the superseded deploy is collectable") +} + +func TestPlan_Deterministic(t *testing.T) { + deploys := []Deploy{ + {ID: "a", Mtime: ago(8 * 24 * time.Hour), HasMarker: true}, + {ID: "b", Mtime: ago(8 * 24 * time.Hour), HasMarker: true}, + {ID: "c", Mtime: ago(9 * 24 * time.Hour), HasMarker: true}, + {ID: "d", Mtime: ago(10 * 24 * time.Hour), HasMarker: true}, + {ID: "e", Mtime: ago(11 * 24 * time.Hour), HasMarker: true}, + } + in := RetainInput{Deploys: deploys, Now: testNow} + + _, del1 := Retain(in, testPolicy()) + _, del2 := Retain(in, testPolicy()) + assert.Equal(t, delIDs(del1), delIDs(del2), + "same store state + same now -> identical, stably-ordered delete set (V9)") +} + +func TestRetain_CompletedRetentionWindow(t *testing.T) { + deploys := []Deploy{ + {ID: "n1", Mtime: ago(time.Hour), HasMarker: true}, + {ID: 
"n2", Mtime: ago(2 * time.Hour), HasMarker: true}, + {ID: "n3", Mtime: ago(3 * time.Hour), HasMarker: true}, + {ID: "within-7d", Mtime: ago(3 * 24 * time.Hour), HasMarker: true}, + {ID: "beyond-7d", Mtime: ago(8 * 24 * time.Hour), HasMarker: true}, + } + keep, del := Retain(RetainInput{Deploys: deploys, Now: testNow}, testPolicy()) + + assert.Contains(t, delIDs(keep), "within-7d", "completed deploy inside retention window retained") + assert.Contains(t, delIDs(del), "beyond-7d", "completed deploy past retention, unaliased, beyond keepN is collectable") +} + +func TestRetain_OrphanFastReclaim(t *testing.T) { + deploys := []Deploy{ + {ID: "n1", Mtime: ago(10 * time.Minute), HasMarker: true}, + {ID: "n2", Mtime: ago(20 * time.Minute), HasMarker: true}, + {ID: "n3", Mtime: ago(30 * time.Minute), HasMarker: true}, + {ID: "orphan-2h", Mtime: ago(2 * time.Hour), HasMarker: false}, + } + _, del := Retain(RetainInput{Deploys: deploys, Now: testNow}, testPolicy()) + + require.Contains(t, delIDs(del), "orphan-2h", + "orphan past grace reclaimed fast, no 7d retention wait") +} diff --git a/internal/gc/tombstone.go b/internal/gc/tombstone.go new file mode 100644 index 0000000..6c2b04b --- /dev/null +++ b/internal/gc/tombstone.go @@ -0,0 +1,81 @@ +package gc + +import ( + "context" + "fmt" + "log/slog" + "time" +) + +type Tombstone struct { + Site string + ID string + TrashedAt time.Time + Bytes int64 +} + +type TombstoneReaper interface { + ExpiredTombstones(ctx context.Context, before time.Time) ([]Tombstone, error) + ClearTombstone(ctx context.Context, site, id string) error +} + +type Deleter interface { + DeletePrefix(ctx context.Context, prefix string) (int, error) +} + +type TombstonePurge struct { + Store TombstoneReaper + Deleter Deleter + Recovery time.Duration + TrashBase string + Now func() time.Time + Metrics *Metrics +} + +type PurgeResult struct { + Purged []string + BytesReclaimed int64 + DryRun bool +} + +func (p *TombstonePurge) trashPrefix(t Tombstone) string 
{ + base := p.TrashBase + if base == "" { + base = "_trash/" + } + if t.ID == "" { + return base + t.Site + "/" + } + return base + t.Site + "/" + t.ID + "/" +} + +func (p *TombstonePurge) Run(ctx context.Context, dryRun bool) (PurgeResult, error) { + res := PurgeResult{DryRun: dryRun} + cutoff := p.Now().Add(-p.Recovery) + expired, err := p.Store.ExpiredTombstones(ctx, cutoff) + if err != nil { + return res, fmt.Errorf("tombstone-purge: list expired: %w", err) + } + for _, t := range expired { + label := t.Site + "/" + t.ID + if dryRun { + res.Purged = append(res.Purged, label) + continue + } + if _, err := p.Deleter.DeletePrefix(ctx, p.trashPrefix(t)); err != nil { + return res, fmt.Errorf("tombstone-purge: delete %s: %w", label, err) + } + if err := p.Store.ClearTombstone(ctx, t.Site, t.ID); err != nil { + return res, fmt.Errorf("tombstone-purge: clear %s: %w", label, err) + } + res.Purged = append(res.Purged, label) + res.BytesReclaimed += t.Bytes + } + + if !dryRun { + p.Metrics.reclaimed(res.BytesReclaimed) + p.Metrics.run(WorkflowTombstonePurgeLabel, "ok") + } + slog.Info("gc.tombstone-purge.done", "purged", len(res.Purged), "bytes", res.BytesReclaimed, "dryRun", dryRun) + return res, nil +} diff --git a/internal/gc/tombstone_test.go b/internal/gc/tombstone_test.go new file mode 100644 index 0000000..9ed9b6a --- /dev/null +++ b/internal/gc/tombstone_test.go @@ -0,0 +1,107 @@ +package gc + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type fakeReaper struct { + tombstones []Tombstone + cleared []string +} + +func (f *fakeReaper) ExpiredTombstones(_ context.Context, before time.Time) ([]Tombstone, error) { + var out []Tombstone + for _, t := range f.tombstones { + if t.TrashedAt.Before(before) { + out = append(out, t) + } + } + return out, nil +} + +func (f *fakeReaper) ClearTombstone(_ context.Context, site, id string) error { + f.cleared = append(f.cleared, site+"/"+id) + for 
i, t := range f.tombstones { + if t.Site == site && t.ID == id { + f.tombstones = append(f.tombstones[:i], f.tombstones[i+1:]...) + break + } + } + return nil +} + +type fakeDeleter struct { + deleted []string +} + +func (f *fakeDeleter) DeletePrefix(_ context.Context, prefix string) (int, error) { + f.deleted = append(f.deleted, prefix) + return 1, nil +} + +func newPurge(reaper *fakeReaper, del *fakeDeleter) *TombstonePurge { + return &TombstonePurge{ + Store: reaper, + Deleter: del, + Recovery: 7 * 24 * time.Hour, + TrashBase: "_trash/", + Now: func() time.Time { return testNow }, + } +} + +func TestTombstonePurge(t *testing.T) { + reaper := &fakeReaper{tombstones: []Tombstone{ + {Site: "www", ID: "d-expired", TrashedAt: ago(8 * 24 * time.Hour), Bytes: 100}, + {Site: "www", ID: "d-fresh", TrashedAt: ago(1 * 24 * time.Hour), Bytes: 50}, + }} + del := &fakeDeleter{} + res, err := newPurge(reaper, del).Run(context.Background(), false) + require.NoError(t, err) + + assert.Equal(t, []string{"www/d-expired"}, res.Purged, "only tombstones past the recovery window are hard-reclaimed (V5)") + assert.Equal(t, []string{"_trash/www/d-expired/"}, del.deleted) + assert.Equal(t, []string{"www/d-expired"}, reaper.cleared) + assert.EqualValues(t, 100, res.BytesReclaimed) +} + +func TestTombstonePurge_SitePurgeTrashLayout(t *testing.T) { + reaper := &fakeReaper{tombstones: []Tombstone{ + {Site: "gone", ID: "", TrashedAt: ago(10 * 24 * time.Hour), Bytes: 0}, + }} + del := &fakeDeleter{} + _, err := newPurge(reaper, del).Run(context.Background(), false) + require.NoError(t, err) + assert.Equal(t, []string{"_trash/gone/"}, del.deleted, "empty id -> whole-site trash prefix") +} + +func TestTombstonePurge_DryRun(t *testing.T) { + reaper := &fakeReaper{tombstones: []Tombstone{ + {Site: "www", ID: "d-expired", TrashedAt: ago(8 * 24 * time.Hour)}, + }} + del := &fakeDeleter{} + res, err := newPurge(reaper, del).Run(context.Background(), true) + require.NoError(t, err) + + assert.Equal(t, 
[]string{"www/d-expired"}, res.Purged) + assert.Empty(t, del.deleted, "dry-run reclaims nothing") + assert.Empty(t, reaper.cleared) +} + +func TestTombstonePurge_Idempotent(t *testing.T) { + reaper := &fakeReaper{tombstones: []Tombstone{ + {Site: "www", ID: "d-expired", TrashedAt: ago(8 * 24 * time.Hour)}, + }} + del := &fakeDeleter{} + p := newPurge(reaper, del) + + _, err := p.Run(context.Background(), false) + require.NoError(t, err) + res2, err := p.Run(context.Background(), false) + require.NoError(t, err) + assert.Empty(t, res2.Purged, "re-run after reclaim finds no expired tombstones (V10)") +} diff --git a/internal/handler/deploy.go b/internal/handler/deploy.go index ed670be..0884df3 100644 --- a/internal/handler/deploy.go +++ b/internal/handler/deploy.go @@ -2,12 +2,15 @@ package handler import ( "errors" + "fmt" "log/slog" "mime" "net/http" "path" "strings" + "time" + "github.com/freeCodeCamp/artemis/internal/gc" "github.com/freeCodeCamp/artemis/internal/r2" "github.com/go-chi/chi/v5" ) @@ -202,12 +205,21 @@ func (h *Handlers) DeployFinalize(w http.ResponseWriter, r *http.Request) { return } + markerKey := prefix + gc.MarkerObjectName + meta := fmt.Sprintf(`{"site":%q,"deployId":%q,"mode":%q,"finalizedAt":%q}`, + claims.Site, deployID, mode, time.Now().UTC().Format(time.RFC3339)) + if err := h.R2.PutObject(r.Context(), markerKey, strings.NewReader(meta), "application/json", int64(len(meta))); err != nil { + writeUpstreamError(w, r, http.StatusBadGateway, "r2_put_failed", "r2.put.marker.finalize", err) + return + } + aliasKey := h.aliasKey(claims.Site, mode) if err := h.R2.PutAlias(r.Context(), aliasKey, deployID); err != nil { writeUpstreamError(w, r, http.StatusBadGateway, "r2_put_failed", "r2.put.alias.finalize", err) return } + h.emitSiteChanged(r.Context(), claims.Site) slog.Info("deploy.finalize.live", "site", claims.Site, "deployId", deployID, "mode", mode, "reqID", RequestIDFromContext(r.Context())) writeJSON(w, http.StatusOK, map[string]any{ 
"url": h.publicURL(claims.Site, mode), diff --git a/internal/handler/deploy_delete.go b/internal/handler/deploy_delete.go new file mode 100644 index 0000000..ad5c589 --- /dev/null +++ b/internal/handler/deploy_delete.go @@ -0,0 +1,74 @@ +package handler + +import ( + "log/slog" + "net/http" + "strings" + + "github.com/freeCodeCamp/artemis/internal/r2" + "github.com/go-chi/chi/v5" +) + +func (h *Handlers) SiteDeployDelete(w http.ResponseWriter, r *http.Request) { + site := chi.URLParam(r, "site") + if err := h.requireSiteAuthz(w, r, site); err != nil { + return + } + deployID := chi.URLParam(r, "deployId") + if !deployIDPattern.MatchString(deployID) { + writeError(w, http.StatusBadRequest, "bad_request", "deployId is not a valid artemis deploy id") + return + } + + for _, mode := range []string{"production", "preview"} { + cur, err := h.R2.GetAlias(r.Context(), h.aliasKey(site, mode)) + if err != nil && !r2.IsNotFound(err) { + writeUpstreamError(w, r, http.StatusBadGateway, "r2_get_failed", "r2.get.alias.delete", err) + return + } + if strings.TrimSpace(cur) == deployID { + writeJSON(w, http.StatusConflict, map[string]any{ + "error": map[string]string{ + "code": "deploy_aliased", + "message": "deploy is the target of a live alias; promote or roll back before deleting", + }, + "site": site, + "deployId": deployID, + "alias": mode, + }) + return + } + } + + if h.Tombstones == nil { + writeError(w, http.StatusServiceUnavailable, "unavailable", "tombstone store not configured") + return + } + + moved, err := h.R2.MovePrefix(r.Context(), h.deployPrefix(site, deployID), h.trashPrefix(site, deployID)) + if err != nil { + writeUpstreamError(w, r, http.StatusBadGateway, "r2_move_failed", "r2.move.tombstone", err) + return + } + if err := h.Tombstones.RecordTombstone(r.Context(), site, deployID, 0); err != nil { + writeUpstreamError(w, r, http.StatusBadGateway, "tombstone_record_failed", "pg.tombstone.record", err) + return + } + + slog.Info("site.deploy.tombstoned", "site", 
site, "deployId", deployID, "moved", moved, + "reqID", RequestIDFromContext(r.Context())) + writeJSON(w, http.StatusOK, map[string]any{ + "site": site, + "deployId": deployID, + "status": "tombstoned", + "moved": moved, + }) +} + +func (h *Handlers) trashPrefix(site, id string) string { + base := h.TrashPrefixBase + if base == "" { + base = "_trash/" + } + return base + site + "/" + id + "/" +} diff --git a/internal/handler/deploy_delete_test.go b/internal/handler/deploy_delete_test.go new file mode 100644 index 0000000..521ebbe --- /dev/null +++ b/internal/handler/deploy_delete_test.go @@ -0,0 +1,115 @@ +package handler + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type fakeTombstones struct { + recorded []string + err error +} + +func (f *fakeTombstones) RecordTombstone(_ context.Context, site, id string, _ int64) error { + if f.err != nil { + return f.err + } + f.recorded = append(f.recorded, site+"/"+id) + return nil +} + +func authedGH() *fakeGH { + return &fakeGH{ + tokenLogins: map[string]string{"tok": "alice"}, + userTeams: map[string]map[string]bool{"alice": {"team-eng": true}}, + } +} + +func callDeployDelete(h *Handlers, site, deployID string) *httptest.ResponseRecorder { + return withSiteRoute(http.MethodDelete, "/api/site/{site}/deploys/{deployId}", + "/api/site/"+site+"/deploys/"+deployID, nil, + contextWithLogin(context.Background(), "alice", "tok"), + h.SiteDeployDelete, + ) +} + +func TestDelete_Tombstone(t *testing.T) { + deployID := "20260420-141522-abc1234" + store := newFakeR2() + prefix := "www/deploys/" + deployID + "/" + store.objects[prefix+"index.html"] = []byte("hi") + store.objects[prefix+"app.js"] = []byte("js") + + h, _ := newTestHandlers(t, authedGH(), standardSites(), store) + tomb := &fakeTombstones{} + h.Tombstones = tomb + + w := callDeployDelete(h, "www", deployID) + require.Equal(t, http.StatusOK, w.Code, 
w.Body.String()) + + store.mu.Lock() + _, srcGone := store.objects[prefix+"index.html"] + _, inTrash := store.objects["_trash/www/"+deployID+"/index.html"] + store.mu.Unlock() + assert.False(t, srcGone, "deploy bytes moved out of the live prefix") + assert.True(t, inTrash, "bytes moved to _trash (tombstone, not hard delete) (V5)") + assert.Equal(t, []string{"www/" + deployID}, tomb.recorded, "tombstone recorded in store") +} + +func TestDelete_AliasedConflict(t *testing.T) { + deployID := "20260420-141522-abc1234" + store := newFakeR2() + store.objects["www/deploys/"+deployID+"/index.html"] = []byte("hi") + store.aliases["www/production"] = deployID + + h, _ := newTestHandlers(t, authedGH(), standardSites(), store) + tomb := &fakeTombstones{} + h.Tombstones = tomb + + w := callDeployDelete(h, "www", deployID) + require.Equal(t, http.StatusConflict, w.Code, w.Body.String()) + assert.Contains(t, w.Body.String(), "deploy_aliased") + + store.mu.Lock() + _, stillLive := store.objects["www/deploys/"+deployID+"/index.html"] + store.mu.Unlock() + assert.True(t, stillLive, "an aliased deploy is never moved/deleted (V1)") + assert.Empty(t, tomb.recorded, "no tombstone recorded for an aliased deploy") +} + +func TestDelete_PreviewAliasedConflict(t *testing.T) { + deployID := "20260420-141522-abc1234" + store := newFakeR2() + store.objects["www/deploys/"+deployID+"/index.html"] = []byte("hi") + store.aliases["www/preview"] = deployID + + h, _ := newTestHandlers(t, authedGH(), standardSites(), store) + h.Tombstones = &fakeTombstones{} + + w := callDeployDelete(h, "www", deployID) + require.Equal(t, http.StatusConflict, w.Code, w.Body.String()) + assert.Contains(t, w.Body.String(), "preview") +} + +func TestDelete_BadDeployID(t *testing.T) { + store := newFakeR2() + h, _ := newTestHandlers(t, authedGH(), standardSites(), store) + h.Tombstones = &fakeTombstones{} + + w := callDeployDelete(h, "www", "not-a-valid-id") + require.Equal(t, http.StatusBadRequest, w.Code, 
w.Body.String()) +} + +func TestDelete_Unauthorized(t *testing.T) { + store := newFakeR2() + h, _ := newTestHandlers(t, authedGH(), standardSites(), store) + h.Tombstones = &fakeTombstones{} + + w := callDeployDelete(h, "unregistered-site", "20260420-141522-abc1234") + require.Equal(t, http.StatusForbidden, w.Code, w.Body.String()) +} diff --git a/internal/handler/deploy_test.go b/internal/handler/deploy_test.go index 72286ae..4dbdb4d 100644 --- a/internal/handler/deploy_test.go +++ b/internal/handler/deploy_test.go @@ -9,6 +9,7 @@ import ( "strings" "testing" + "github.com/freeCodeCamp/artemis/internal/gc" "github.com/go-chi/chi/v5" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -325,6 +326,43 @@ func TestDeployFinalize_VerifyThenAlias(t *testing.T) { assert.Equal(t, deployID, alias) } +func TestFinalizeMarker(t *testing.T) { + store := newFakeR2() + h, jwt := newTestHandlers(t, &fakeGH{}, standardSites(), store) + + deployID := "20260420-141522-abc1234" + prefix := "www/deploys/" + deployID + "/" + store.objects[prefix+"index.html"] = []byte("

hi

") + + tok, _, err := jwt.Sign("alice", "www", deployID) + require.NoError(t, err) + + body, _ := json.Marshal(DeployFinalizeRequest{Mode: "production", Files: []string{"index.html"}}) + w := withChiRoute(http.MethodPost, "/api/deploy/{deployId}/finalize", + "/api/deploy/"+deployID+"/finalize", + body, + map[string]string{"Authorization": "Bearer " + tok}, + h.RequireDeployJWT(http.HandlerFunc(h.DeployFinalize)).ServeHTTP, + context.Background(), + ) + require.Equal(t, http.StatusOK, w.Code, w.Body.String()) + + store.mu.Lock() + raw, ok := store.objects[prefix+gc.MarkerObjectName] + store.mu.Unlock() + require.True(t, ok, "finalize must write the _artemis_meta.json marker under the deploy prefix") + + var meta struct { + Site string `json:"site"` + DeployID string `json:"deployId"` + Mode string `json:"mode"` + } + require.NoError(t, json.Unmarshal(raw, &meta)) + assert.Equal(t, "www", meta.Site) + assert.Equal(t, deployID, meta.DeployID) + assert.Equal(t, "production", meta.Mode) +} + func TestDeployFinalize_VerifyMissing_DoesNotWriteAlias(t *testing.T) { store := newFakeR2() h, jwt := newTestHandlers(t, &fakeGH{}, standardSites(), store) diff --git a/internal/handler/export_test.go b/internal/handler/export_test.go new file mode 100644 index 0000000..6f1c6d5 --- /dev/null +++ b/internal/handler/export_test.go @@ -0,0 +1,8 @@ +package handler + +import "sync" + +func resetMetricsForTest() { + pkgMetrics = nil + pkgMetricsOnce = sync.Once{} +} diff --git a/internal/handler/handler.go b/internal/handler/handler.go index 89927e7..d7c4aad 100644 --- a/internal/handler/handler.go +++ b/internal/handler/handler.go @@ -57,6 +57,15 @@ type R2Store interface { ListPrefix(ctx context.Context, prefix string) ([]string, error) HasPrefix(ctx context.Context, prefix string) (bool, error) VerifyDeployComplete(ctx context.Context, prefix string, expected []string) error + MovePrefix(ctx context.Context, src, dst string) (int, error) +} + +type TombstoneStore interface { + 
RecordTombstone(ctx context.Context, site, id string, bytes int64) error +} + +type SiteChangeEmitter interface { + EnqueueSiteChanged(ctx context.Context, site string) error } // RegistryHealth is the readiness probe contract for the registry @@ -66,6 +75,10 @@ type RegistryHealth interface { Ping(ctx context.Context) error } +type PGHealth interface { + Ping(ctx context.Context) error +} + // Handlers carries the dependencies needed by every endpoint in this package. type Handlers struct { GH GitHubAuthenticator @@ -73,9 +86,13 @@ type Handlers struct { Sites SitesProvider Registry RegistryWriter Health RegistryHealth + PGHealth PGHealth R2 R2Store AliasProductionFmt string // e.g. "/production" AliasPreviewFmt string // e.g. "/preview" + Tombstones TombstoneStore + TrashPrefixBase string // e.g. "_trash/" + Outbox SiteChangeEmitter // DeployPrefix is the parsed deploy-key template. DeployPrefix DeployPrefixTemplate // UploadMaxBytes caps a single PUT /upload body size. 0 or @@ -108,6 +125,23 @@ type Handlers struct { Metrics *Metrics } +func (h *Handlers) emitSiteChanged(ctx context.Context, site string) { + if h.Outbox == nil { + return + } + ctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second) + defer cancel() + if err := h.Outbox.EnqueueSiteChanged(ctx, site); err != nil { + slog.Error("outbox enqueue site.changed failed", "site", site, "err", err) + sentry.WithScope(func(scope *sentry.Scope) { + scope.SetTag("op", "outbox.enqueue") + scope.SetTag("site", site) + scope.SetFingerprint([]string{"outbox.enqueue"}) + sentry.CaptureException(err) + }) + } +} + // writeJSON marshals v as JSON and writes it with the given status code. 
func writeJSON(w http.ResponseWriter, status int, v any) { w.Header().Set("Content-Type", "application/json") diff --git a/internal/handler/metrics_test.go b/internal/handler/metrics_test.go index 8f43386..2aa6a29 100644 --- a/internal/handler/metrics_test.go +++ b/internal/handler/metrics_test.go @@ -36,9 +36,9 @@ func TestMetrics_RegisterAndExpose(t *testing.T) { func TestWriteUpstreamError_IncrementsUpstreamErrorsCounter(t *testing.T) { reg := prometheus.NewRegistry() m := NewMetrics(reg) - prev := pkgMetrics + resetMetricsForTest() + t.Cleanup(resetMetricsForTest) SetMetrics(m) - t.Cleanup(func() { SetMetrics(prev) }) w := httptest.NewRecorder() r := httptest.NewRequest(http.MethodGet, "/api/whoami", nil) diff --git a/internal/handler/outbox_emit_test.go b/internal/handler/outbox_emit_test.go new file mode 100644 index 0000000..562b905 --- /dev/null +++ b/internal/handler/outbox_emit_test.go @@ -0,0 +1,88 @@ +package handler + +import ( + "context" + "encoding/json" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type fakeOutbox struct { + sites []string +} + +func (f *fakeOutbox) EnqueueSiteChanged(_ context.Context, site string) error { + f.sites = append(f.sites, site) + return nil +} + +type ctxCapturingOutbox struct { + capturedDone bool + called bool +} + +func (f *ctxCapturingOutbox) EnqueueSiteChanged(ctx context.Context, _ string) error { + f.called = true + select { + case <-ctx.Done(): + f.capturedDone = true + default: + } + return nil +} + +func TestEmitSiteChanged_DetachedFromRequestCancellation(t *testing.T) { + h, _ := newTestHandlers(t, &fakeGH{}, standardSites(), newFakeR2()) + ob := &ctxCapturingOutbox{} + h.Outbox = ob + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + h.emitSiteChanged(ctx, "www") + + require.True(t, ob.called, "emitSiteChanged must enqueue even when the request context is canceled") + assert.False(t, ob.capturedDone, "enqueue context 
must be detached from the canceled request context") +} + +func TestFinalize_EmitsSiteChanged(t *testing.T) { + store := newFakeR2() + h, jwt := newTestHandlers(t, &fakeGH{}, standardSites(), store) + ob := &fakeOutbox{} + h.Outbox = ob + + deployID := "20260420-141522-abc1234" + store.objects["www/deploys/"+deployID+"/index.html"] = []byte("hi") + tok, _, err := jwt.Sign("alice", "www", deployID) + require.NoError(t, err) + + body, _ := json.Marshal(DeployFinalizeRequest{Mode: "preview", Files: []string{"index.html"}}) + w := withChiRoute(http.MethodPost, "/api/deploy/{deployId}/finalize", + "/api/deploy/"+deployID+"/finalize", + body, + map[string]string{"Authorization": "Bearer " + tok}, + h.RequireDeployJWT(http.HandlerFunc(h.DeployFinalize)).ServeHTTP, + context.Background(), + ) + require.Equal(t, http.StatusOK, w.Code, w.Body.String()) + assert.Equal(t, []string{"www"}, ob.sites, "finalize emits site.changed for event-driven GC") +} + +func TestPromote_EmitsSiteChanged(t *testing.T) { + store := newFakeR2() + store.aliases["www/preview"] = "20260420-141522-abc1234" + h, _ := newTestHandlers(t, authedGH(), standardSites(), store) + ob := &fakeOutbox{} + h.Outbox = ob + + w := withSiteRoute(http.MethodPost, "/api/site/{site}/promote", + "/api/site/www/promote", nil, + contextWithLogin(context.Background(), "alice", "tok"), + h.SitePromote, + ) + require.Equal(t, http.StatusOK, w.Code, w.Body.String()) + assert.Equal(t, []string{"www"}, ob.sites) +} diff --git a/internal/handler/readyz.go b/internal/handler/readyz.go index 92a145e..dae869d 100644 --- a/internal/handler/readyz.go +++ b/internal/handler/readyz.go @@ -2,6 +2,7 @@ package handler import ( "context" + "log/slog" "net/http" "sync" "time" @@ -20,7 +21,7 @@ const readyZProbeTimeout = 5 * time.Second // probes. 
func (h *Handlers) ReadyZ(w http.ResponseWriter, r *http.Request) { var wg sync.WaitGroup - var valkeyErr, r2Err error + var valkeyErr, r2Err, pgErr error if h.Health != nil { wg.Add(1) @@ -42,6 +43,16 @@ func (h *Handlers) ReadyZ(w http.ResponseWriter, r *http.Request) { }() } + if h.PGHealth != nil { + wg.Add(1) + go func() { + defer wg.Done() + ctx, cancel := context.WithTimeout(r.Context(), readyZProbeTimeout) + defer cancel() + pgErr = h.PGHealth.Ping(ctx) + }() + } + wg.Wait() switch { @@ -49,6 +60,9 @@ func (h *Handlers) ReadyZ(w http.ResponseWriter, r *http.Request) { writeUpstreamError(w, r, http.StatusServiceUnavailable, "valkey_unreachable", "valkey.ping", valkeyErr) case r2Err != nil: writeUpstreamError(w, r, http.StatusServiceUnavailable, "r2_unreachable", "r2.has_prefix", r2Err) + case pgErr != nil: + slog.Error("readyz: postgres degraded", "err", pgErr) + writeJSON(w, http.StatusOK, map[string]bool{"ready": true, "degraded": true}) default: writeJSON(w, http.StatusOK, map[string]bool{"ready": true}) } diff --git a/internal/handler/readyz_test.go b/internal/handler/readyz_test.go index f72e154..48c2ce5 100644 --- a/internal/handler/readyz_test.go +++ b/internal/handler/readyz_test.go @@ -33,6 +33,50 @@ func TestReadyZ_NoAuthRequired_BothUpstreamsReachable_ReturnsOK(t *testing.T) { assert.Equal(t, "application/json", w.Header().Get("Content-Type")) } +func TestReadyzDegraded_PGDown_Returns200Degraded(t *testing.T) { + h := &Handlers{ + Health: &fakeHealth{}, + R2: newFakeR2(), + PGHealth: &fakeHealth{err: errors.New("dial tcp artemis-postgresql:5432: i/o timeout")}, + } + + r := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + h.ReadyZ(w, r) + + require.Equal(t, http.StatusOK, w.Code, "PG down is degraded, not down — serve plane unaffected (R6/R7)") + assert.JSONEq(t, `{"ready":true,"degraded":true}`, w.Body.String()) +} + +func TestReadyzDegraded_PGUp_ReturnsReady(t *testing.T) { + h := &Handlers{ + Health: 
&fakeHealth{}, + R2: newFakeR2(), + PGHealth: &fakeHealth{}, + } + + r := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + h.ReadyZ(w, r) + + require.Equal(t, http.StatusOK, w.Code) + assert.JSONEq(t, `{"ready":true}`, w.Body.String()) +} + +func TestReadyzDegraded_ValkeyDownHardFailsEvenIfPGUp(t *testing.T) { + h := &Handlers{ + Health: &fakeHealth{err: errors.New("valkey down")}, + R2: newFakeR2(), + PGHealth: &fakeHealth{}, + } + + r := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + h.ReadyZ(w, r) + + require.Equal(t, http.StatusServiceUnavailable, w.Code, "Valkey/R2 down = hard down even when PG ok") +} + func TestReadyZ_ValkeyDown_Returns503_ValkeyUnreachable(t *testing.T) { h := &Handlers{ Health: &fakeHealth{err: errors.New("dial tcp valkey:6379: i/o timeout")}, diff --git a/internal/handler/site.go b/internal/handler/site.go index 792ceb3..9904604 100644 --- a/internal/handler/site.go +++ b/internal/handler/site.go @@ -132,6 +132,7 @@ func (h *Handlers) SitePromote(w http.ResponseWriter, r *http.Request) { return } + h.emitSiteChanged(r.Context(), site) slog.Info("site.promote", "site", site, "deployId", deployID, "reqID", RequestIDFromContext(r.Context())) writeJSON(w, http.StatusOK, map[string]any{ "url": h.publicURL(site, "production"), @@ -221,6 +222,7 @@ func (h *Handlers) SiteRollback(w http.ResponseWriter, r *http.Request) { return } + h.emitSiteChanged(r.Context(), site) slog.Info("site.rollback", "site", site, "to", req.To, "reqID", RequestIDFromContext(r.Context())) writeJSON(w, http.StatusOK, map[string]any{ "url": h.publicURL(site, "production"), diff --git a/internal/handler/site_purge_test.go b/internal/handler/site_purge_test.go new file mode 100644 index 0000000..2afbfed --- /dev/null +++ b/internal/handler/site_purge_test.go @@ -0,0 +1,131 @@ +package handler + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "testing" + + 
"github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type flakyMoveR2 struct { + *fakeR2 + failMovesRemaining int +} + +func (f *flakyMoveR2) MovePrefix(ctx context.Context, src, dst string) (int, error) { + if f.failMovesRemaining > 0 { + f.failMovesRemaining-- + return 0, errors.New("r2 move outage") + } + return f.fakeR2.MovePrefix(ctx, src, dst) +} + +func TestSitePurge(t *testing.T) { + store := newFakeR2() + store.objects["example/deploys/20260420-141522-abc1234/index.html"] = []byte("hi") + store.objects["example/deploys/20260101-000000-old0001/index.html"] = []byte("old") + store.aliases["example/production"] = "20260420-141522-abc1234" + store.objects["example/production"] = []byte("20260420-141522-abc1234") + + h, _ := newTestHandlers(t, staffCallerGH(), standardSites(), store) + tomb := &fakeTombstones{} + h.Tombstones = tomb + + regBody, _ := json.Marshal(SiteRegisterRequest{Slug: "example", Teams: []string{"staff"}}) + require.Equal(t, http.StatusCreated, callRegister(h, regBody, "alice", "tok").Code) + + w := withChiRoute(http.MethodDelete, "/api/site/{slug}", + "/api/site/example?purge=true", nil, + map[string]string{}, + h.SiteDelete, + contextWithLogin(context.Background(), "alice", "tok"), + ) + require.Equal(t, http.StatusOK, w.Code, w.Body.String()) + assert.Contains(t, w.Body.String(), "purged") + + store.mu.Lock() + defer store.mu.Unlock() + for k := range store.objects { + assert.Truef(t, hasPrefix(k, "_trash/example/"), "every example/ object cascaded into _trash, found %q live", k) + } + assert.Equal(t, []string{"example/"}, tomb.recorded, "site-level tombstone recorded (empty id = whole-site purge)") +} + +func TestSitePurge_FailedMoveKeepsSiteRetryable(t *testing.T) { + store := &flakyMoveR2{fakeR2: newFakeR2(), failMovesRemaining: 1} + store.objects["example/deploys/20260420-141522-abc1234/index.html"] = []byte("hi") + + h, _ := newTestHandlers(t, staffCallerGH(), standardSites(), store) + tomb := 
&fakeTombstones{} + h.Tombstones = tomb + + regBody, _ := json.Marshal(SiteRegisterRequest{Slug: "example", Teams: []string{"staff"}}) + require.Equal(t, http.StatusCreated, callRegister(h, regBody, "alice", "tok").Code) + + failW := withChiRoute(http.MethodDelete, "/api/site/{slug}", + "/api/site/example?purge=true", nil, + map[string]string{}, + h.SiteDelete, + contextWithLogin(context.Background(), "alice", "tok"), + ) + require.Equal(t, http.StatusBadGateway, failW.Code, failW.Body.String()) + assert.Contains(t, failW.Body.String(), "r2_move_failed") + + listW := callSitesList(h, "alice", "tok") + require.Equal(t, http.StatusOK, listW.Code) + var rows []SiteRow + require.NoError(t, json.Unmarshal(listW.Body.Bytes(), &rows)) + slugs := make([]string, len(rows)) + for i, r := range rows { + slugs[i] = r.Slug + } + assert.Contains(t, slugs, "example", "failed purge must not deregister the site (still retryable)") + assert.Empty(t, tomb.recorded, "no tombstone written when the move failed") + + retryW := withChiRoute(http.MethodDelete, "/api/site/{slug}", + "/api/site/example?purge=true", nil, + map[string]string{}, + h.SiteDelete, + contextWithLogin(context.Background(), "alice", "tok"), + ) + require.Equal(t, http.StatusOK, retryW.Code, retryW.Body.String()) + assert.Contains(t, retryW.Body.String(), "purged") + + store.mu.Lock() + for k := range store.objects { + assert.Truef(t, hasPrefix(k, "_trash/example/"), "retry cascaded every example/ object into _trash, found %q live", k) + } + store.mu.Unlock() + assert.Equal(t, []string{"example/"}, tomb.recorded, "retry records the site-level tombstone") + + gone := callSitesList(h, "alice", "tok") + require.Equal(t, http.StatusOK, gone.Code) + var after []SiteRow + require.NoError(t, json.Unmarshal(gone.Body.Bytes(), &after)) + for _, r := range after { + assert.NotEqual(t, "example", r.Slug, "successful purge deregisters the site") + } +} + +func TestSiteDelete_NoPurge_LeavesBytes(t *testing.T) { + store := 
newFakeR2() + store.objects["example/deploys/20260420-141522-abc1234/index.html"] = []byte("hi") + + h, _ := newTestHandlers(t, staffCallerGH(), standardSites(), store) + h.Tombstones = &fakeTombstones{} + + regBody, _ := json.Marshal(SiteRegisterRequest{Slug: "example", Teams: []string{"staff"}}) + require.Equal(t, http.StatusCreated, callRegister(h, regBody, "alice", "tok").Code) + + w := callDelete(h, "example", "alice", "tok") + require.Equal(t, http.StatusNoContent, w.Code, w.Body.String()) + + store.mu.Lock() + _, stillThere := store.objects["example/deploys/20260420-141522-abc1234/index.html"] + store.mu.Unlock() + assert.True(t, stillThere, "plain deregister (no purge) leaves R2 bytes untouched") +} diff --git a/internal/handler/site_register.go b/internal/handler/site_register.go index f32b59e..783f78d 100644 --- a/internal/handler/site_register.go +++ b/internal/handler/site_register.go @@ -190,17 +190,47 @@ func (h *Handlers) SiteDelete(w http.ResponseWriter, r *http.Request) { return } - if err := h.Registry.Delete(r.Context(), slug); err != nil { - switch { - case errors.Is(err, registry.ErrNotFound): - writeError(w, http.StatusNotFound, "not_found", "site is not registered") - default: - writeUpstreamError(w, r, http.StatusBadGateway, "registry_write_failed", "valkey.delete", err) + if r.URL.Query().Get("purge") != "true" { + if err := h.Registry.Delete(r.Context(), slug); err != nil { + writeRegistryDeleteError(w, r, err) + return } + slog.Info("site.delete", "slug", slug, "reqID", RequestIDFromContext(r.Context())) + w.WriteHeader(http.StatusNoContent) + return + } + + if h.Tombstones == nil { + writeError(w, http.StatusServiceUnavailable, "unavailable", "tombstone store not configured") + return + } + base := h.TrashPrefixBase + if base == "" { + base = "_trash/" + } + moved, err := h.R2.MovePrefix(r.Context(), slug+"/", base+slug+"/") + if err != nil { + writeUpstreamError(w, r, http.StatusBadGateway, "r2_move_failed", "r2.move.site-purge", err) + 
return + } + if err := h.Tombstones.RecordTombstone(r.Context(), slug, "", 0); err != nil { + writeUpstreamError(w, r, http.StatusBadGateway, "tombstone_record_failed", "pg.tombstone.site-purge", err) + return + } + if err := h.Registry.Delete(r.Context(), slug); err != nil { + writeRegistryDeleteError(w, r, err) + return + } + slog.Info("site.purge", "slug", slug, "moved", moved, "reqID", RequestIDFromContext(r.Context())) + writeJSON(w, http.StatusOK, map[string]any{"slug": slug, "status": "purged", "moved": moved}) +} + +func writeRegistryDeleteError(w http.ResponseWriter, r *http.Request, err error) { + if errors.Is(err, registry.ErrNotFound) { + writeError(w, http.StatusNotFound, "not_found", "site is not registered") return } - slog.Info("site.delete", "slug", slug, "reqID", RequestIDFromContext(r.Context())) - w.WriteHeader(http.StatusNoContent) + writeUpstreamError(w, r, http.StatusBadGateway, "registry_write_failed", "valkey.delete", err) } // SitesList implements GET /api/sites — enumerates every registered diff --git a/internal/handler/test_helpers_test.go b/internal/handler/test_helpers_test.go index b41c981..1676fd0 100644 --- a/internal/handler/test_helpers_test.go +++ b/internal/handler/test_helpers_test.go @@ -373,6 +373,23 @@ func (f *fakeR2) HasPrefix(_ context.Context, prefix string) (bool, error) { return false, nil } +func (f *fakeR2) MovePrefix(_ context.Context, src, dst string) (int, error) { + f.mu.Lock() + defer f.mu.Unlock() + if f.listErr != nil { + return 0, f.listErr + } + var moved int + for k, v := range f.objects { + if hasPrefix(k, src) { + f.objects[dst+trimPrefix(k, src)] = v + delete(f.objects, k) + moved++ + } + } + return moved, nil +} + func (f *fakeR2) VerifyDeployComplete(_ context.Context, prefix string, expected []string) error { f.mu.Lock() defer f.mu.Unlock() diff --git a/internal/hatchet/adapter.go b/internal/hatchet/adapter.go new file mode 100644 index 0000000..6d7543b --- /dev/null +++ b/internal/hatchet/adapter.go 
@@ -0,0 +1,147 @@ +package hatchet + +import ( + "context" + "encoding/json" + "fmt" + "net" + "strconv" + + v0Client "github.com/hatchet-dev/hatchet/pkg/client" + "github.com/hatchet-dev/hatchet/pkg/client/types" + hsdk "github.com/hatchet-dev/hatchet/sdks/go" + + "github.com/freeCodeCamp/artemis/internal/worker" +) + +const defaultWorkerName = "artemis" + +type Config struct { + Token string + Addr string + WorkerName string +} + +type Adapter struct { + cfg Config + defs []worker.WorkflowDef + client *hsdk.Client + worker *hsdk.Worker +} + +func New(cfg Config) *Adapter { + return &Adapter{cfg: cfg} +} + +func (a *Adapter) Register(def worker.WorkflowDef) error { + if def.Name == "" { + return fmt.Errorf("hatchet: workflow name required") + } + if def.Handler == nil { + return fmt.Errorf("hatchet: workflow %s has nil handler", def.Name) + } + a.defs = append(a.defs, def) + return nil +} + +func (a *Adapter) Registered() []worker.WorkflowDef { + out := make([]worker.WorkflowDef, len(a.defs)) + copy(out, a.defs) + return out +} + +func (a *Adapter) Start(ctx context.Context) error { + client, err := a.connect() + if err != nil { + return fmt.Errorf("hatchet: connect: %w", err) + } + a.client = client + + workflows := make([]hsdk.WorkflowBase, 0, len(a.defs)) + for _, def := range a.defs { + workflows = append(workflows, a.buildWorkflow(client, def)) + } + + w, err := client.NewWorker(a.workerName(), hsdk.WithWorkflows(workflows...)) + if err != nil { + return fmt.Errorf("hatchet: new worker: %w", err) + } + a.worker = w + return w.StartBlocking(ctx) +} + +func (a *Adapter) Stop(context.Context) error { + return nil +} + +func (a *Adapter) Publish(ctx context.Context, topic string, payload []byte) error { + if a.client == nil { + return fmt.Errorf("hatchet: publish %s before start", topic) + } + var data any + if len(payload) > 0 { + if err := json.Unmarshal(payload, &data); err != nil { + return fmt.Errorf("hatchet: publish %s: decode payload: %w", topic, err) + } 
+ } + if err := a.client.Events().Push(ctx, topic, data); err != nil { + return fmt.Errorf("hatchet: publish %s: %w", topic, err) + } + return nil +} + +func (a *Adapter) buildWorkflow(client *hsdk.Client, def worker.WorkflowDef) *hsdk.Workflow { + var opts []hsdk.WorkflowOption + if def.ConcurrencyKey != "" { + maxRuns := int32(1) + strategy := types.GroupRoundRobin + opts = append(opts, hsdk.WithWorkflowConcurrency(types.Concurrency{ + Expression: "input." + def.ConcurrencyKey, + MaxRuns: &maxRuns, + LimitStrategy: &strategy, + })) + } + if len(def.EventTriggers) > 0 { + opts = append(opts, hsdk.WithWorkflowEvents(def.EventTriggers...)) + } + if len(def.Cron) > 0 { + opts = append(opts, hsdk.WithWorkflowCron(def.Cron...)) + } + wf := client.NewWorkflow(def.Name, opts...) + handler := def.Handler + wf.NewTask(def.Name, func(ctx hsdk.Context, input map[string]any) (any, error) { + return nil, handler(ctx, input) + }) + return wf +} + +func (a *Adapter) connect() (*hsdk.Client, error) { + var opts []v0Client.ClientOpt + if a.cfg.Token != "" { + opts = append(opts, v0Client.WithToken(a.cfg.Token)) + } + if a.cfg.Addr != "" { + host, portStr, err := net.SplitHostPort(a.cfg.Addr) + if err != nil { + return nil, fmt.Errorf("parse addr %q: %w", a.cfg.Addr, err) + } + port, err := strconv.Atoi(portStr) + if err != nil { + return nil, fmt.Errorf("parse addr %q port: %w", a.cfg.Addr, err) + } + opts = append(opts, v0Client.WithHostPort(host, port)) + } + return hsdk.NewClient(opts...) 
+} + +func (a *Adapter) workerName() string { + if a.cfg.WorkerName != "" { + return a.cfg.WorkerName + } + return defaultWorkerName +} + +var ( + _ worker.Engine = (*Adapter)(nil) + _ worker.Publisher = (*Adapter)(nil) +) diff --git a/internal/hatchet/adapter_internal_test.go b/internal/hatchet/adapter_internal_test.go new file mode 100644 index 0000000..e0bc612 --- /dev/null +++ b/internal/hatchet/adapter_internal_test.go @@ -0,0 +1,179 @@ +package hatchet + +import ( + "context" + "encoding/base64" + "encoding/json" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/worker" +) + +func craftJWT(t *testing.T) string { + t.Helper() + header := map[string]any{"alg": "none", "typ": "JWT"} + claims := map[string]any{ + "server_url": "https://hatchet.local", + "grpc_broadcast_address": "localhost:7077", + "exp": float64(time.Now().Add(time.Hour).Unix()), + "sub": "707d0855-80ab-4e1f-a156-f1c4546cbf52", + } + return encodeSegment(t, header) + "." + encodeSegment(t, claims) + "." + "sig" +} + +func encodeSegment(t *testing.T, v map[string]any) string { + t.Helper() + raw, err := json.Marshal(v) + require.NoError(t, err) + return base64.RawURLEncoding.EncodeToString(raw) +} + +func encodeClaims(t *testing.T, claims map[string]any) string { + t.Helper() + header := map[string]any{"alg": "none", "typ": "JWT"} + return encodeSegment(t, header) + "." + encodeSegment(t, claims) + "." 
+ "sig" +} + +func isolateClientEnv(t *testing.T) { + t.Helper() + t.Setenv("HATCHET_CLIENT_TLS_STRATEGY", "none") + t.Setenv("HATCHET_CLIENT_TOKEN", "") + t.Setenv("HATCHET_CLIENT_HOST_PORT", "") + t.Setenv("HATCHET_CLIENT_SERVER_URL", "") + t.Setenv("HATCHET_CLIENT_TENANT_ID", "") + t.Setenv("HATCHET_CLIENT_NAMESPACE", "") +} + +func TestAdapterConnectPanicsOnMalformedToken(t *testing.T) { + isolateClientEnv(t) + + missingServerURL := encodeClaims(t, map[string]any{ + "grpc_broadcast_address": "localhost:7077", + "exp": float64(time.Now().Add(time.Hour).Unix()), + "sub": "707d0855-80ab-4e1f-a156-f1c4546cbf52", + }) + + cases := []struct { + name string + token string + }{ + {"two-segment token", "not.ajwt"}, + {"bad base64 claims segment", "a.b.c"}, + {"missing server_url claim", missingServerURL}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + a := New(Config{Token: tc.token}) + require.Panics(t, func() { + client, err := a.connect() + _ = client + _ = err + }, "connect() must panic (not return error) on a malformed token; Start()'s error-wrap is dead code for this path") + }) + } +} + +func TestAdapterBuildWorkflowConcurrencyExpression(t *testing.T) { + isolateClientEnv(t) + + tok := craftJWT(t) + a := New(Config{Token: tok, Addr: "localhost:7077"}) + c, err := a.connect() + require.NoError(t, err) + + noop := func(context.Context, map[string]any) error { return nil } + wf := a.buildWorkflow(c, worker.WorkflowDef{ + Name: worker.WorkflowGCSite, + ConcurrencyKey: worker.ConcurrencyKeySite, + Handler: noop, + }) + require.Equal(t, worker.WorkflowGCSite, wf.GetName()) + + req, _, _, _ := wf.Dump() + require.Len(t, req.ConcurrencyArr, 1, "exactly one concurrency option from a non-empty ConcurrencyKey") + conc := req.ConcurrencyArr[0] + require.Equal(t, "input.site", conc.GetExpression(), + "per-site fan-out limiter must be input.; a wrong prefix silently breaks per-site isolation") + require.EqualValues(t, 1, conc.GetMaxRuns()) + 
require.Equal(t, "GROUP_ROUND_ROBIN", conc.GetLimitStrategy().String()) +} + +func TestAdapterBuildWorkflowOptionGuards(t *testing.T) { + isolateClientEnv(t) + + tok := craftJWT(t) + a := New(Config{Token: tok, Addr: "localhost:7077"}) + c, err := a.connect() + require.NoError(t, err) + + noop := func(context.Context, map[string]any) error { return nil } + + cases := []struct { + name string + def worker.WorkflowDef + wantCo bool + events []string + crons []string + wantExp string + }{ + { + name: "empty config attaches no concurrency/events/cron", + def: worker.WorkflowDef{ + Name: worker.WorkflowTombstonePurge, + Handler: noop, + }, + wantCo: false, + events: nil, + crons: nil, + }, + { + name: "full config attaches all three", + def: worker.WorkflowDef{ + Name: worker.WorkflowGCSite, + ConcurrencyKey: worker.ConcurrencyKeySite, + EventTriggers: []string{"site.changed"}, + Cron: []string{"0 0 * * *"}, + Handler: noop, + }, + wantCo: true, + events: []string{"site.changed"}, + crons: []string{"0 0 * * *"}, + wantExp: "input.site", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + wf := a.buildWorkflow(c, tc.def) + req, _, _, _ := wf.Dump() + + if tc.wantCo { + require.Len(t, req.ConcurrencyArr, 1) + require.Equal(t, tc.wantExp, req.ConcurrencyArr[0].GetExpression()) + } else { + require.Empty(t, req.ConcurrencyArr, + "empty ConcurrencyKey must not attach a zero-value concurrency option") + } + require.Equal(t, tc.events, req.GetEventTriggers()) + require.Equal(t, tc.crons, req.GetCronTriggers()) + }) + } +} + +func TestAdapterPublishDecodeErrorPath(t *testing.T) { + isolateClientEnv(t) + + tok := craftJWT(t) + a := New(Config{Token: tok, Addr: "localhost:7077"}) + c, err := a.connect() + require.NoError(t, err) + a.client = c + + err = a.Publish(context.Background(), "site.changed", []byte(`{bad json`)) + require.Error(t, err, "malformed payload must surface a decode error, not push garbage to the event bus") + 
require.ErrorContains(t, err, "decode payload") +} diff --git a/internal/hatchet/adapter_test.go b/internal/hatchet/adapter_test.go new file mode 100644 index 0000000..d8672b9 --- /dev/null +++ b/internal/hatchet/adapter_test.go @@ -0,0 +1,39 @@ +package hatchet + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/worker" +) + +func TestAdapterRegister(t *testing.T) { + a := New(Config{Token: "tok", Addr: "hatchet.svc:7077"}) + + noop := func(context.Context, map[string]any) error { return nil } + require.NoError(t, a.Register(worker.WorkflowDef{ + Name: worker.WorkflowGCSite, ConcurrencyKey: worker.ConcurrencyKeySite, Handler: noop, + })) + require.NoError(t, a.Register(worker.WorkflowDef{ + Name: worker.WorkflowTombstonePurge, Handler: noop, + })) + require.Len(t, a.Registered(), 2) + + require.Error(t, a.Register(worker.WorkflowDef{Name: "", Handler: noop}), "empty name rejected") + require.Error(t, a.Register(worker.WorkflowDef{Name: "x"}), "nil handler rejected") + require.Len(t, a.Registered(), 2, "rejected defs must not accumulate") +} + +func TestAdapterPublishBeforeStart(t *testing.T) { + a := New(Config{Token: "tok"}) + err := a.Publish(context.Background(), "site.changed", []byte(`{"site":"www.freecode.camp"}`)) + require.Error(t, err, "publish before Start must fail, not panic on a nil client") +} + +func TestAdapterConnectBadAddr(t *testing.T) { + a := New(Config{Token: "tok", Addr: "no-port"}) + err := a.Start(context.Background()) + require.Error(t, err) +} diff --git a/internal/hatchet/integration_concurrency_test.go b/internal/hatchet/integration_concurrency_test.go new file mode 100644 index 0000000..32cc03e --- /dev/null +++ b/internal/hatchet/integration_concurrency_test.go @@ -0,0 +1,77 @@ +//go:build integration + +package hatchet_test + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + + hatchetadapter 
"github.com/freeCodeCamp/artemis/internal/hatchet" + "github.com/freeCodeCamp/artemis/internal/worker" +) + +func TestR2WorkerRegistersPerSiteConcurrency(t *testing.T) { + requireEngine(t) + + adapter := hatchetadapter.New(hatchetadapter.Config{WorkerName: "artemis-it-" + shortID()}) + for _, def := range deployDefs(newObserver(), nil) { + require.NoError(t, adapter.Register(def)) + } + + regd := adapter.Registered() + require.Len(t, regd, 3) + + want := map[string]bool{ + worker.WorkflowFinalize: false, + worker.WorkflowPromote: false, + worker.WorkflowRollback: false, + } + for _, def := range regd { + _, ok := want[def.Name] + require.True(t, ok, "unexpected workflow %q", def.Name) + require.Equal(t, worker.ConcurrencyKeySite, def.ConcurrencyKey, + "workflow %q must key concurrency on site", def.Name) + require.Contains(t, def.EventTriggers, def.Name) + want[def.Name] = true + } + for name, seen := range want { + require.True(t, seen, "workflow %q not registered", name) + } +} + +func TestR3SameSiteNeverConcurrent(t *testing.T) { + obs := newObserver() + h := startHarness(t, obs, map[string]worker.Handler{ + worker.WorkflowFinalize: instrumented(obs, 1500*time.Millisecond, nil), + }) + + const site = "r3-same-site" + h.fire(t, worker.WorkflowFinalize, site) + h.fire(t, worker.WorkflowFinalize, site) + h.fire(t, worker.WorkflowFinalize, site) + + h.waitStarts(t, site, 3) + + require.LessOrEqual(t, h.observed.peakConcurrency(site), 1, + "two events for the same site must never run concurrently") +} + +func TestR3DistinctSitesRunConcurrent(t *testing.T) { + obs := newObserver() + h := startHarness(t, obs, map[string]worker.Handler{ + worker.WorkflowPromote: instrumented(obs, 1500*time.Millisecond, nil), + }) + + siteA := "r3-distinct-a" + siteB := "r3-distinct-b" + h.fire(t, worker.WorkflowPromote, siteA) + h.fire(t, worker.WorkflowPromote, siteB) + + h.waitStarts(t, siteA, 1) + h.waitStarts(t, siteB, 1) + + require.GreaterOrEqual(t, 
h.observed.peakGlobalConcurrency(), 2, + "distinct sites must overlap in execution, not merely both start eventually") +} diff --git a/internal/hatchet/integration_harness_test.go b/internal/hatchet/integration_harness_test.go new file mode 100644 index 0000000..b421e77 --- /dev/null +++ b/internal/hatchet/integration_harness_test.go @@ -0,0 +1,251 @@ +//go:build integration + +package hatchet_test + +import ( + "context" + "fmt" + "os" + "sync" + "testing" + "time" + + "github.com/google/uuid" + "github.com/stretchr/testify/require" + + "github.com/hatchet-dev/hatchet/pkg/client/rest" + + hsdk "github.com/hatchet-dev/hatchet/sdks/go" + + hatchetadapter "github.com/freeCodeCamp/artemis/internal/hatchet" + "github.com/freeCodeCamp/artemis/internal/worker" +) + +const ( + siteKey = "input.site" + pollInterval = 250 * time.Millisecond + startupTimeout = 30 * time.Second + runReadyTimeout = 90 * time.Second +) + +const skipUsage = ` +real-Hatchet integration suite skipped: %s not set. + +To run against a live engine: + + cd test/integration/hatchet + docker compose -f compose.hatchet.yaml up -d + TOKEN=$(docker compose -f compose.hatchet.yaml exec -T hatchet-lite \ + /hatchet-admin token create --config /config \ + --tenant-id 707d0855-80ab-4e1f-a156-f1c4546cbf52 | tr -d '\r\n') + HATCHET_CLIENT_TOKEN="$TOKEN" \ + HATCHET_CLIENT_HOST_PORT=127.0.0.1:7077 \ + HATCHET_CLIENT_TLS_STRATEGY=none \ + go test -tags=integration -count=1 -timeout=10m ./internal/hatchet/... 
+` + +type harness struct { + pub worker.Publisher + client *hsdk.Client + observed *observer +} + +type observer struct { + mu sync.Mutex + starts map[string]int + active map[string]int + maxCo map[string]int + globalActive int + globalMax int +} + +func newObserver() *observer { + return &observer{ + starts: map[string]int{}, + active: map[string]int{}, + maxCo: map[string]int{}, + } +} + +func (o *observer) enter(site string) { + o.mu.Lock() + defer o.mu.Unlock() + o.starts[site]++ + o.active[site]++ + if o.active[site] > o.maxCo[site] { + o.maxCo[site] = o.active[site] + } + o.globalActive++ + if o.globalActive > o.globalMax { + o.globalMax = o.globalActive + } +} + +func (o *observer) leave(site string) { + o.mu.Lock() + defer o.mu.Unlock() + o.active[site]-- + o.globalActive-- +} + +func (o *observer) peakGlobalConcurrency() int { + o.mu.Lock() + defer o.mu.Unlock() + return o.globalMax +} + +func (o *observer) startsFor(site string) int { + o.mu.Lock() + defer o.mu.Unlock() + return o.starts[site] +} + +func (o *observer) peakConcurrency(site string) int { + o.mu.Lock() + defer o.mu.Unlock() + return o.maxCo[site] +} + +func requireEngine(t *testing.T) { + t.Helper() + if os.Getenv("HATCHET_CLIENT_TOKEN") == "" { + t.Skipf(skipUsage, "HATCHET_CLIENT_TOKEN") + } + if os.Getenv("HATCHET_CLIENT_HOST_PORT") == "" { + t.Skipf(skipUsage, "HATCHET_CLIENT_HOST_PORT") + } +} + +func siteOf(input map[string]any) string { + if v, ok := input[worker.ConcurrencyKeySite].(string); ok { + return v + } + return "" +} + +func startHarness(t *testing.T, obs *observer, handlers map[string]worker.Handler) *harness { + t.Helper() + requireEngine(t) + + adapter := hatchetadapter.New(hatchetadapter.Config{ + WorkerName: "artemis-it-" + shortID(), + }) + + for _, def := range deployDefs(obs, handlers) { + require.NoError(t, adapter.Register(def)) + } + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + errCh := make(chan error, 1) + go func() { errCh 
<- adapter.Start(ctx) }() + + waitPublishable(t, adapter) + + client, err := hsdk.NewClient() + require.NoError(t, err) + + t.Cleanup(func() { + cancel() + select { + case <-errCh: + case <-time.After(10 * time.Second): + } + }) + + return &harness{pub: adapter, client: client, observed: obs} +} + +func deployDefs(obs *observer, handlers map[string]worker.Handler) []worker.WorkflowDef { + names := []string{worker.WorkflowFinalize, worker.WorkflowPromote, worker.WorkflowRollback} + defs := make([]worker.WorkflowDef, 0, len(names)) + for _, name := range names { + h := handlers[name] + if h == nil { + h = instrumented(obs, 0, nil) + } + defs = append(defs, worker.WorkflowDef{ + Name: name, + ConcurrencyKey: worker.ConcurrencyKeySite, + EventTriggers: []string{name}, + Handler: h, + }) + } + return defs +} + +func instrumented(obs *observer, hold time.Duration, fail error) worker.Handler { + return func(ctx context.Context, input map[string]any) error { + site := siteOf(input) + obs.enter(site) + defer obs.leave(site) + if hold > 0 { + select { + case <-ctx.Done(): + case <-time.After(hold): + } + } + return fail + } +} + +func waitPublishable(t *testing.T, pub worker.Publisher) { + t.Helper() + deadline := time.Now().Add(startupTimeout) + for time.Now().Before(deadline) { + err := pub.Publish(context.Background(), "artemis.it.warmup", []byte(`{"site":"__warmup__"}`)) + if err == nil { + return + } + time.Sleep(pollInterval) + } + t.Fatalf("worker did not become publishable within %s", startupTimeout) +} + +func (h *harness) fire(t *testing.T, topic, site string) { + t.Helper() + payload := []byte(fmt.Sprintf(`{"site":%q}`, site)) + require.NoError(t, h.pub.Publish(context.Background(), topic, payload)) +} + +func (h *harness) waitStarts(t *testing.T, site string, want int) { + t.Helper() + deadline := time.Now().Add(runReadyTimeout) + for time.Now().Before(deadline) { + if h.observed.startsFor(site) >= want { + return + } + time.Sleep(pollInterval) + } + 
t.Fatalf("site=%s: got %d starts, want >= %d within %s", + site, h.observed.startsFor(site), want, runReadyTimeout) +} + +func (h *harness) waitRunStatus(t *testing.T, runID string, target rest.V1TaskStatus) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), runReadyTimeout) + defer cancel() + for { + details, err := h.client.Runs().GetDetails(ctx, uuid.MustParse(runID)) + if err == nil { + if details.Status == target { + return + } + for _, tr := range details.TaskRuns { + if tr.Status == target { + return + } + } + } + select { + case <-ctx.Done(): + t.Fatalf("run %s did not reach status %s within %s", runID, target, runReadyTimeout) + case <-time.After(pollInterval): + } + } +} + +func shortID() string { + return uuid.NewString()[:8] +} diff --git a/internal/hatchet/integration_poison_test.go b/internal/hatchet/integration_poison_test.go new file mode 100644 index 0000000..46cd45e --- /dev/null +++ b/internal/hatchet/integration_poison_test.go @@ -0,0 +1,34 @@ +//go:build integration + +package hatchet_test + +import ( + "errors" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/worker" +) + +func TestR4PoisonDeadLettersWithoutBlockingKey(t *testing.T) { + obs := newObserver() + + h := startHarness(t, obs, map[string]worker.Handler{ + worker.WorkflowRollback: instrumented(obs, 0, errors.New("poison: deliberate failure for dead-letter")), + worker.WorkflowFinalize: instrumented(obs, 0, nil), + }) + + const site = "r4-poison-site" + h.fire(t, worker.WorkflowRollback, site) + h.waitStarts(t, site, 1) + + time.Sleep(2 * time.Second) + + h.fire(t, worker.WorkflowFinalize, site) + h.waitStarts(t, site, 2) + + require.GreaterOrEqual(t, obs.startsFor(site), 2, + "healthy workflow on same key never ran: poison left the concurrency key blocked") +} diff --git a/internal/hatchet/integration_relay_test.go b/internal/hatchet/integration_relay_test.go new file mode 100644 index 0000000..815a345 
--- /dev/null +++ b/internal/hatchet/integration_relay_test.go @@ -0,0 +1,120 @@ +//go:build integration + +package hatchet_test + +import ( + "context" + "fmt" + "os" + "os/exec" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/pg" + "github.com/freeCodeCamp/artemis/internal/worker" +) + +type memOutbox struct { + mu sync.Mutex + events []pg.OutboxEvent + published map[int64]bool +} + +func newMemOutbox(events []pg.OutboxEvent) *memOutbox { + return &memOutbox{events: events, published: map[int64]bool{}} +} + +func (m *memOutbox) FetchUnpublished(_ context.Context, limit int) ([]pg.OutboxEvent, error) { + m.mu.Lock() + defer m.mu.Unlock() + var out []pg.OutboxEvent + for _, e := range m.events { + if m.published[e.ID] { + continue + } + if len(out) >= limit { + break + } + out = append(out, e) + } + return out, nil +} + +func (m *memOutbox) MarkPublished(_ context.Context, ids []int64, _ time.Time) error { + m.mu.Lock() + defer m.mu.Unlock() + for _, id := range ids { + m.published[id] = true + } + return nil +} + +func (m *memOutbox) outstanding() int { + m.mu.Lock() + defer m.mu.Unlock() + n := 0 + for _, e := range m.events { + if !m.published[e.ID] { + n++ + } + } + return n +} + +func TestR5OutboxRelayAtLeastOnceAcrossRestart(t *testing.T) { + obs := newObserver() + h := startHarness(t, obs, map[string]worker.Handler{ + worker.WorkflowFinalize: instrumented(obs, 0, nil), + }) + + const n = 6 + sites := make([]string, n) + events := make([]pg.OutboxEvent, n) + for i := 0; i < n; i++ { + sites[i] = fmt.Sprintf("r5-relay-%02d", i) + events[i] = pg.OutboxEvent{ + ID: int64(i + 1), + Topic: worker.WorkflowFinalize, + Payload: []byte(fmt.Sprintf(`{"site":%q}`, sites[i])), + } + } + + src := newMemOutbox(events) + relay := &worker.Relay{Source: src, Publisher: h.pub, Batch: 2} + + half := func() { _, _ = relay.RunOnce(context.Background()) } + half() + + restartEngine(t) + + deadline := 
time.Now().Add(runReadyTimeout) + for src.outstanding() > 0 && time.Now().Before(deadline) { + if _, err := relay.RunOnce(context.Background()); err != nil { + time.Sleep(pollInterval) + } + } + require.Zero(t, src.outstanding(), "relay must drain the outbox after engine recovers") + + for _, site := range sites { + h.waitStarts(t, site, 1) + require.GreaterOrEqual(t, obs.startsFor(site), 1, + "site %s must be delivered at least once across the restart", site) + } +} + +func restartEngine(t *testing.T) { + t.Helper() + composeFile := os.Getenv("HATCHET_COMPOSE_FILE") + if composeFile == "" { + t.Skip("HATCHET_COMPOSE_FILE unset; across-restart invariant not exercised without the compose stack") + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + cmd := exec.CommandContext(ctx, "docker", "compose", "-f", composeFile, "restart", "hatchet-lite") + out, err := cmd.CombinedOutput() + require.NoErrorf(t, err, "restart hatchet-lite: %s", string(out)) + time.Sleep(3 * time.Second) +} diff --git a/internal/observability/capture_test.go b/internal/observability/capture_test.go new file mode 100644 index 0000000..3198538 --- /dev/null +++ b/internal/observability/capture_test.go @@ -0,0 +1,91 @@ +package observability + +import ( + "context" + "testing" + "time" + + "github.com/getsentry/sentry-go" + "github.com/stretchr/testify/require" +) + +func bindRecordingHub(t *testing.T) *recordingTransport { + t.Helper() + rt := &recordingTransport{} + client, err := sentry.NewClient(sentry.ClientOptions{ + Dsn: "https://public@example.test/1", + Transport: rt, + }) + require.NoError(t, err) + + hub := sentry.CurrentHub() + prev := hub.Client() + hub.BindClient(client) + t.Cleanup(func() { hub.BindClient(prev) }) + return rt +} + +func TestCaptureFatal_SetsFatalLevelAndBootTag(t *testing.T) { + rt := bindRecordingHub(t) + + CaptureFatal(errString("boot boom")) + + require.Len(t, rt.events, 1) + require.Equal(t, sentry.LevelFatal, 
rt.events[0].Level) + require.Equal(t, "boot", rt.events[0].Tags["op"]) +} + +type bufferedTransport struct { + pending []*sentry.Event + events []*sentry.Event +} + +func (b *bufferedTransport) Configure(sentry.ClientOptions) {} +func (b *bufferedTransport) SendEvent(e *sentry.Event) { b.pending = append(b.pending, e) } +func (b *bufferedTransport) Flush(time.Duration) bool { + b.events = append(b.events, b.pending...) + b.pending = nil + return true +} +func (b *bufferedTransport) FlushWithContext(context.Context) bool { return b.Flush(0) } +func (b *bufferedTransport) Close() {} + +func TestCaptureFatal_FlushesSynchronously(t *testing.T) { + bt := &bufferedTransport{} + client, err := sentry.NewClient(sentry.ClientOptions{ + Dsn: "https://public@example.test/1", + Transport: bt, + }) + require.NoError(t, err) + hub := sentry.CurrentHub() + prev := hub.Client() + hub.BindClient(client) + t.Cleanup(func() { hub.BindClient(prev) }) + + CaptureFatal(errString("boot boom")) + + require.Empty(t, bt.pending, "CaptureFatal must flush before returning") + require.Len(t, bt.events, 1, "event delivered via flush, not transport goodwill") +} + +func TestCaptureBackground_TagsAndFingerprintGroupOnOp(t *testing.T) { + rt := bindRecordingHub(t) + + CaptureBackground("registry.refresh", errString("x")) + + require.Len(t, rt.events, 1) + require.Equal(t, "registry.refresh", rt.events[0].Tags["op"]) + require.Equal(t, []string{"registry.refresh"}, rt.events[0].Fingerprint) +} + +func TestCaptureBackground_DistinctOpsGroupSeparately(t *testing.T) { + rt := bindRecordingHub(t) + + CaptureBackground("registry.refresh", errString("a")) + CaptureBackground("token.rotate", errString("b")) + sentry.CurrentHub().Flush(time.Second) + + require.Len(t, rt.events, 2) + require.Equal(t, []string{"registry.refresh"}, rt.events[0].Fingerprint) + require.Equal(t, []string{"token.rotate"}, rt.events[1].Fingerprint) +} diff --git a/internal/observability/multihandler_test.go 
b/internal/observability/multihandler_test.go new file mode 100644 index 0000000..d50151d --- /dev/null +++ b/internal/observability/multihandler_test.go @@ -0,0 +1,117 @@ +package observability + +import ( + "context" + "errors" + "log/slog" + "testing" + + "github.com/stretchr/testify/require" +) + +type errHandler struct{ err error } + +func (h errHandler) Enabled(context.Context, slog.Level) bool { return true } +func (h errHandler) Handle(context.Context, slog.Record) error { return h.err } +func (h errHandler) WithAttrs([]slog.Attr) slog.Handler { return h } +func (h errHandler) WithGroup(string) slog.Handler { return h } + +type enabledStub struct{ enabled bool } + +func (h enabledStub) Enabled(context.Context, slog.Level) bool { return h.enabled } +func (h enabledStub) Handle(context.Context, slog.Record) error { return nil } +func (h enabledStub) WithAttrs([]slog.Attr) slog.Handler { return h } +func (h enabledStub) WithGroup(string) slog.Handler { return h } + +type recordingAttrHandler struct { + attrs *[][]slog.Attr + groups *[]string +} + +func (h recordingAttrHandler) Enabled(context.Context, slog.Level) bool { return true } +func (h recordingAttrHandler) Handle(context.Context, slog.Record) error { return nil } +func (h recordingAttrHandler) WithAttrs(a []slog.Attr) slog.Handler { + *h.attrs = append(*h.attrs, a) + return h +} +func (h recordingAttrHandler) WithGroup(name string) slog.Handler { + *h.groups = append(*h.groups, name) + return h +} + +func TestMultiHandler_HandleAggregatesErrorsAndStillFansOut(t *testing.T) { + t.Parallel() + sentinel := errors.New("child handler boom") + var msgs []string + + multi := NewMultiHandler(errHandler{err: sentinel}, recordingHandler{&msgs}) + rec := slog.Record{Message: "still-delivered"} + + err := multi.Handle(context.Background(), rec) + + require.ErrorIs(t, err, sentinel) + require.Equal(t, []string{"still-delivered"}, msgs, "second handler must run despite first erroring") +} + +func 
TestMultiHandler_Enabled(t *testing.T) { + t.Parallel() + tests := []struct { + name string + stubs []slog.Handler + want bool + }{ + { + name: "allEnabled", + stubs: []slog.Handler{enabledStub{true}, enabledStub{true}}, + want: true, + }, + { + name: "oneEnabled", + stubs: []slog.Handler{enabledStub{false}, enabledStub{true}}, + want: true, + }, + { + name: "noneEnabled", + stubs: []slog.Handler{enabledStub{false}, enabledStub{false}}, + want: false, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + m := NewMultiHandler(tc.stubs...) + require.Equal(t, tc.want, m.Enabled(context.Background(), slog.LevelInfo)) + }) + } +} + +func TestMultiHandler_WithAttrsPropagatesToEveryChild(t *testing.T) { + t.Parallel() + var attrs1, attrs2 [][]slog.Attr + var groups1, groups2 []string + rec1 := recordingAttrHandler{attrs: &attrs1, groups: &groups1} + rec2 := recordingAttrHandler{attrs: &attrs2, groups: &groups2} + + want := []slog.Attr{slog.String("site", "x")} + got := NewMultiHandler(rec1, rec2).WithAttrs(want) + + require.IsType(t, multiHandler{}, got) + require.Len(t, got.(multiHandler).handlers, 2, "no child dropped") + require.Equal(t, [][]slog.Attr{want}, attrs1, "child 1 saw the attr") + require.Equal(t, [][]slog.Attr{want}, attrs2, "child 2 saw the attr") +} + +func TestMultiHandler_WithGroupPropagatesToEveryChild(t *testing.T) { + t.Parallel() + var attrs1, attrs2 [][]slog.Attr + var groups1, groups2 []string + rec1 := recordingAttrHandler{attrs: &attrs1, groups: &groups1} + rec2 := recordingAttrHandler{attrs: &attrs2, groups: &groups2} + + got := NewMultiHandler(rec1, rec2).WithGroup("g") + + require.IsType(t, multiHandler{}, got) + require.Len(t, got.(multiHandler).handlers, 2, "no child dropped") + require.Equal(t, []string{"g"}, groups1, "child 1 saw the group") + require.Equal(t, []string{"g"}, groups2, "child 2 saw the group") +} diff --git a/internal/pg/alias.go b/internal/pg/alias.go new file mode 100644 index 
0000000..9f63683 --- /dev/null +++ b/internal/pg/alias.go @@ -0,0 +1,39 @@ +package pg + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/jackc/pgx/v5" +) + +func (r *Repo) SetAliasCAS(ctx context.Context, site, name, expected, next string, at time.Time) (current string, ok bool, err error) { + err = r.WithTx(ctx, func(tx pgx.Tx) error { + var cur string + scanErr := tx.QueryRow(ctx, + `SELECT deploy_id FROM aliases WHERE site = $1 AND name = $2 FOR UPDATE`, site, name).Scan(&cur) + if scanErr != nil && !errors.Is(scanErr, pgx.ErrNoRows) { + return fmt.Errorf("alias cas read %s/%s: %w", site, name, scanErr) + } + current = cur + if cur != expected { + ok = false + return nil + } + if _, err := tx.Exec(ctx, + `INSERT INTO aliases (site, name, deploy_id, updated_at) + VALUES ($1, $2, $3, $4) + ON CONFLICT (site, name) DO UPDATE SET deploy_id = EXCLUDED.deploy_id, updated_at = EXCLUDED.updated_at`, + site, name, next, at); err != nil { + return fmt.Errorf("alias cas write %s/%s: %w", site, name, err) + } + if err := Enqueue(ctx, tx, TopicSiteChanged, map[string]string{"site": site}); err != nil { + return err + } + ok = true + return nil + }) + return current, ok, err +} diff --git a/internal/pg/alias_create_test.go b/internal/pg/alias_create_test.go new file mode 100644 index 0000000..999b7fb --- /dev/null +++ b/internal/pg/alias_create_test.go @@ -0,0 +1,49 @@ +package pg + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSetAliasCAS_CreateFromEmpty(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + t0 := time.Now().UTC() + + cur, ok, err := repo.SetAliasCAS(ctx, "new-site", "production", "", "d1", t0) + require.NoError(t, err) + assert.True(t, ok, "CAS over an absent row with expected=\"\" creates the alias") + assert.Equal(t, "", cur, "current value of a fresh alias is empty") + + targets, _, err := repo.AliasTargets(ctx, "new-site") + 
require.NoError(t, err) + assert.Contains(t, targets, "d1", "the new alias points at the published deploy") + + events, err := repo.FetchUnpublished(ctx, 10) + require.NoError(t, err) + require.Len(t, events, 1, "first-publish enqueues exactly one outbox event") + assert.Equal(t, TopicSiteChanged, events[0].Topic) +} + +func TestSetAliasCAS_AbsentRowNonEmptyExpected(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + t0 := time.Now().UTC() + + cur, ok, err := repo.SetAliasCAS(ctx, "ghost-site", "production", "X", "d1", t0) + require.NoError(t, err) + assert.False(t, ok, "CAS over an absent row with a non-empty expected value is rejected") + assert.Equal(t, "", cur, "actual current value is empty for an absent row") + + targets, _, err := repo.AliasTargets(ctx, "ghost-site") + require.NoError(t, err) + assert.Empty(t, targets, "rejected CAS does not create the alias") + + events, err := repo.FetchUnpublished(ctx, 10) + require.NoError(t, err) + assert.Empty(t, events, "rejected CAS enqueues no outbox event") +} diff --git a/internal/pg/alias_test.go b/internal/pg/alias_test.go new file mode 100644 index 0000000..ea306a0 --- /dev/null +++ b/internal/pg/alias_test.go @@ -0,0 +1,64 @@ +package pg + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestAlias_NoLostUpdate(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + now := time.Now().UTC() + + require.NoError(t, repo.UpsertAlias(ctx, "www", "production", "A", now)) + + var wg sync.WaitGroup + type res struct { + ok bool + current string + err error + } + results := make([]res, 2) + nexts := []string{"B", "C"} + for i := 0; i < 2; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + cur, ok, err := repo.SetAliasCAS(ctx, "www", "production", "A", nexts[i], now.Add(time.Minute)) + results[i] = res{ok, cur, err} + }(i) + } + wg.Wait() + + wins := 0 + for _, r := range results { + 
require.NoError(t, r.err) + if r.ok { + wins++ + } + } + assert.Equal(t, 1, wins, "exactly one concurrent CAS from the same expected value wins (V8 no lost update)") + + targets, _, err := repo.AliasTargets(ctx, "www") + require.NoError(t, err) + assert.Len(t, targets, 1, "alias holds a single, consistent value") + _, hasA := targets["A"] + assert.False(t, hasA, "the stale value was overwritten by the winner") +} + +func TestSetAliasCAS_DriftRejected(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + now := time.Now().UTC() + require.NoError(t, repo.UpsertAlias(ctx, "www", "production", "A", now)) + + cur, ok, err := repo.SetAliasCAS(ctx, "www", "production", "stale-expected", "Z", now) + require.NoError(t, err) + assert.False(t, ok, "CAS with wrong expected value is rejected") + assert.Equal(t, "A", cur, "caller is told the actual current value") +} diff --git a/internal/pg/migrate.go b/internal/pg/migrate.go new file mode 100644 index 0000000..b7bc37a --- /dev/null +++ b/internal/pg/migrate.go @@ -0,0 +1,111 @@ +package pg + +import ( + "context" + "embed" + "fmt" + "io/fs" + "sort" + "strings" + "time" + + "github.com/jackc/pgx/v5/pgxpool" +) + +//go:embed migrations/*.sql +var migrationsFS embed.FS + +const migrateAdvisoryLockKey = 8472013 + +func releaseAdvisoryLock(conn *pgxpool.Conn, key int64) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _ = conn.Exec(ctx, "SELECT pg_advisory_unlock($1)", key) +} + +func Migrate(ctx context.Context, pool *pgxpool.Pool) error { + conn, err := pool.Acquire(ctx) + if err != nil { + return fmt.Errorf("migrate: acquire: %w", err) + } + defer conn.Release() + + if _, err := conn.Exec(ctx, "SELECT pg_advisory_lock($1)", migrateAdvisoryLockKey); err != nil { + return fmt.Errorf("migrate: lock: %w", err) + } + defer releaseAdvisoryLock(conn, migrateAdvisoryLockKey) + + if _, err := conn.Exec(ctx, `CREATE TABLE IF NOT EXISTS schema_migrations ( + version TEXT 
PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT now() + )`); err != nil { + return fmt.Errorf("migrate: ensure ledger: %w", err) + } + + names, err := migrationFiles() + if err != nil { + return err + } + for _, name := range names { + applied, err := migrationApplied(ctx, conn, name) + if err != nil { + return err + } + if applied { + continue + } + body, err := migrationsFS.ReadFile("migrations/" + name) + if err != nil { + return fmt.Errorf("migrate: read %s: %w", name, err) + } + if err := applyMigration(ctx, conn, name, string(body)); err != nil { + return err + } + } + return nil +} + +func migrationApplied(ctx context.Context, conn *pgxpool.Conn, version string) (bool, error) { + var exists bool + err := conn.QueryRow(ctx, + "SELECT EXISTS (SELECT 1 FROM schema_migrations WHERE version = $1)", version).Scan(&exists) + if err != nil { + return false, fmt.Errorf("migrate: check %s: %w", version, err) + } + return exists, nil +} + +func applyMigration(ctx context.Context, conn *pgxpool.Conn, version, body string) error { + tx, err := conn.Begin(ctx) + if err != nil { + return fmt.Errorf("migrate: begin %s: %w", version, err) + } + defer func() { _ = tx.Rollback(ctx) }() + + if _, err := tx.Exec(ctx, body); err != nil { + return fmt.Errorf("migrate: apply %s: %w", version, err) + } + if _, err := tx.Exec(ctx, + "INSERT INTO schema_migrations (version) VALUES ($1)", version); err != nil { + return fmt.Errorf("migrate: record %s: %w", version, err) + } + if err := tx.Commit(ctx); err != nil { + return fmt.Errorf("migrate: commit %s: %w", version, err) + } + return nil +} + +func migrationFiles() ([]string, error) { + entries, err := fs.ReadDir(migrationsFS, "migrations") + if err != nil { + return nil, fmt.Errorf("migrate: list: %w", err) + } + names := make([]string, 0, len(entries)) + for _, e := range entries { + if !e.IsDir() && strings.HasSuffix(e.Name(), ".sql") { + names = append(names, e.Name()) + } + } + sort.Strings(names) + return names, nil +} 
// TestMigrations runs Migrate twice against a throwaway Postgres container and
// checks the resulting schema: expected tables exist, every migration file is
// recorded exactly once, migration 0004 rebuilt the outbox index on id, and
// FetchUnpublished returns events in ascending id order. The test is skipped
// when no container provider is available.
func TestMigrations(t *testing.T) {
	testcontainers.SkipIfProviderIsNotHealthy(t)

	ctx := context.Background()
	container, err := postgres.Run(ctx, "postgres:16-alpine",
		postgres.WithDatabase("artemis_test"),
		postgres.WithUsername("artemis"),
		postgres.WithPassword("artemis"),
		postgres.BasicWaitStrategies(),
	)
	require.NoError(t, err)
	t.Cleanup(func() { _ = container.Terminate(ctx) })

	connStr, err := container.ConnectionString(ctx, "sslmode=disable")
	require.NoError(t, err)

	db, err := New(ctx, Config{DatabaseURL: connStr})
	require.NoError(t, err)
	t.Cleanup(db.Close)

	// Second call exercises the "already applied" path for every migration.
	require.NoError(t, Migrate(ctx, db.Pool))
	require.NoError(t, Migrate(ctx, db.Pool), "re-run must be idempotent")

	for _, table := range []string{"deploys", "aliases", "tombstones", "outbox", "schema_migrations"} {
		var exists bool
		err := db.Pool.QueryRow(ctx,
			"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = $1)",
			table).Scan(&exists)
		require.NoError(t, err)
		require.Truef(t, exists, "table %q must exist after migrate", table)
	}

	// The ledger row count must match the number of embedded migration files.
	names, err := migrationFiles()
	require.NoError(t, err)
	var count int
	require.NoError(t, db.Pool.QueryRow(ctx, "SELECT count(*) FROM schema_migrations").Scan(&count))
	require.Equal(t, len(names), count, "each migration recorded exactly once")

	var applied bool
	require.NoError(t, db.Pool.QueryRow(ctx,
		"SELECT EXISTS (SELECT 1 FROM schema_migrations WHERE version = $1)",
		"0004_outbox_id_index.sql").Scan(&applied))
	require.True(t, applied, "0004 recorded")

	// Inspect the index definition text to confirm which column it covers.
	var indexDef string
	require.NoError(t, db.Pool.QueryRow(ctx,
		"SELECT indexdef FROM pg_indexes WHERE indexname = 'outbox_unpublished_idx'").Scan(&indexDef))
	require.Contains(t, indexDef, "(id)", "0004 rebuilt outbox_unpublished_idx on id to match FetchUnpublished ORDER BY id")
	require.NotContains(t, indexDef, "created_at", "stale created_at index dropped by 0004")

	// Two enqueues in sequence; the fetch must return them in insertion order.
	repo := NewRepo(db)
	require.NoError(t, repo.EnqueueSiteChanged(ctx, "second"))
	require.NoError(t, repo.EnqueueSiteChanged(ctx, "third"))
	events, err := repo.FetchUnpublished(ctx, 10)
	require.NoError(t, err)
	require.Len(t, events, 2, "both enqueued events unpublished")
	require.Less(t, events[0].ID, events[1].ID, "FetchUnpublished returns oldest-first by id")
}
// advisoryLockHeldByPID reports whether pg_locks shows a granted advisory lock
// whose objid equals key and whose owning backend is pid. The query runs on
// pool, which the caller keeps separate from the session under test.
//
// NOTE(review): only objid is compared; classid and objsubid are not checked.
// Presumably that is sufficient because the keys used here fit in 32 bits —
// confirm against the pg_locks documentation before reusing with larger keys.
func advisoryLockHeldByPID(ctx context.Context, pool *pgxpool.Pool, key int64, pid uint32) (bool, error) {
	var held bool
	err := pool.QueryRow(ctx,
		`SELECT EXISTS (SELECT 1 FROM pg_locks WHERE locktype = 'advisory' AND objid = $1 AND granted AND pid = $2)`,
		key, pid).Scan(&held)
	return held, err
}
-- repo_requests stores one row per repository-creation request, keyed by a
-- text id. Every optional text column defaults to the empty string rather
-- than NULL; status has no default and must be supplied on insert.
CREATE TABLE IF NOT EXISTS repo_requests (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    owner TEXT NOT NULL DEFAULT '',
    visibility TEXT NOT NULL DEFAULT 'private',
    description TEXT NOT NULL DEFAULT '',
    template TEXT NOT NULL DEFAULT '',
    status TEXT NOT NULL,
    url TEXT NOT NULL DEFAULT '',
    error TEXT NOT NULL DEFAULT '',
    requested_by TEXT NOT NULL DEFAULT '',
    approver TEXT NOT NULL DEFAULT '',
    reject_reason TEXT NOT NULL DEFAULT '',
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- At most one request per case-insensitive name may be in a pending, approved
-- or active status at a time; rows in any other status are not constrained.
CREATE UNIQUE INDEX IF NOT EXISTS repo_requests_name_claim
    ON repo_requests (lower(name))
    WHERE status IN ('pending', 'approved', 'active');

-- Supports ordering by creation time with id as the tie-breaker.
CREATE INDEX IF NOT EXISTS repo_requests_created_idx ON repo_requests (created_at, id);
outbox_unpublished_idx; + +CREATE INDEX IF NOT EXISTS outbox_unpublished_idx ON outbox (id) WHERE published_at IS NULL; diff --git a/internal/pg/outbox.go b/internal/pg/outbox.go new file mode 100644 index 0000000..932905d --- /dev/null +++ b/internal/pg/outbox.go @@ -0,0 +1,72 @@ +package pg + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/jackc/pgx/v5" +) + +const TopicSiteChanged = "site.changed" + +type OutboxEvent struct { + ID int64 + Topic string + Payload []byte +} + +func (r *Repo) WithTx(ctx context.Context, fn func(tx pgx.Tx) error) error { + return pgx.BeginFunc(ctx, r.pool, fn) +} + +func Enqueue(ctx context.Context, tx pgx.Tx, topic string, payload any) error { + b, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("pg outbox marshal %s: %w", topic, err) + } + if _, err := tx.Exec(ctx, `INSERT INTO outbox (topic, payload) VALUES ($1, $2)`, topic, b); err != nil { + return fmt.Errorf("pg outbox enqueue %s: %w", topic, err) + } + return nil +} + +func (r *Repo) EnqueueSiteChanged(ctx context.Context, site string) error { + return r.WithTx(ctx, func(tx pgx.Tx) error { + return Enqueue(ctx, tx, TopicSiteChanged, map[string]string{"site": site}) + }) +} + +func (r *Repo) FetchUnpublished(ctx context.Context, limit int) ([]OutboxEvent, error) { + rows, err := r.pool.Query(ctx, + `SELECT id, topic, payload FROM outbox + WHERE published_at IS NULL + ORDER BY id + LIMIT $1`, limit) + if err != nil { + return nil, fmt.Errorf("pg outbox fetch: %w", err) + } + defer rows.Close() + + var out []OutboxEvent + for rows.Next() { + var e OutboxEvent + if err := rows.Scan(&e.ID, &e.Topic, &e.Payload); err != nil { + return nil, fmt.Errorf("pg outbox scan: %w", err) + } + out = append(out, e) + } + return out, rows.Err() +} + +func (r *Repo) MarkPublished(ctx context.Context, ids []int64, at time.Time) error { + if len(ids) == 0 { + return nil + } + if _, err := r.pool.Exec(ctx, + `UPDATE outbox SET published_at = $1 WHERE id = 
// TestMarkPublished_EmptyBatchMarksNothing checks that MarkPublished treats a
// nil or empty id slice as a no-op, both on an empty outbox and when an
// unpublished event exists, and that a non-empty batch does mark the event.
func TestMarkPublished_EmptyBatchMarksNothing(t *testing.T) {
	repo := newTestRepo(t)
	ctx := context.Background()
	now := time.Now()

	// Empty outbox: neither form of "no ids" may return an error.
	require.NoError(t, repo.MarkPublished(ctx, nil, now), "nil id slice is a guarded no-op")
	require.NoError(t, repo.MarkPublished(ctx, []int64{}, now), "empty id slice is a guarded no-op")

	require.NoError(t, repo.EnqueueSiteChanged(ctx, "www"))
	events, err := repo.FetchUnpublished(ctx, 10)
	require.NoError(t, err)
	require.Len(t, events, 1)

	// With a real unpublished row present, the empty batch must leave it alone.
	require.NoError(t, repo.MarkPublished(ctx, nil, now),
		"a nil batch must not touch existing unpublished rows")
	still, err := repo.FetchUnpublished(ctx, 10)
	require.NoError(t, err)
	require.Len(t, still, 1, "the empty-batch no-op left the real event unpublished")

	// Control case: marking the actual id removes it from the unpublished set.
	ids := []int64{events[0].ID}
	require.NoError(t, repo.MarkPublished(ctx, ids, now))
	after, err := repo.FetchUnpublished(ctx, 10)
	require.NoError(t, err)
	assert.Empty(t, after, "a non-empty batch marks the event published")
}
repo.WithTx(ctx, func(tx pgx.Tx) error { + if _, err := tx.Exec(ctx, `INSERT INTO deploys (site, id, mtime) VALUES ('www', 'd1', now())`); err != nil { + return err + } + return Enqueue(ctx, tx, TopicSiteChanged, map[string]string{"site": "www"}) + })) + + boom := errors.New("boom") + err := repo.WithTx(ctx, func(tx pgx.Tx) error { + if _, err := tx.Exec(ctx, `INSERT INTO deploys (site, id, mtime) VALUES ('www', 'd2', now())`); err != nil { + return err + } + if err := Enqueue(ctx, tx, TopicSiteChanged, map[string]string{"site": "rolled-back"}); err != nil { + return err + } + return boom + }) + require.ErrorIs(t, err, boom) + + deploys, err := repo.DeploysForSite(ctx, "www") + require.NoError(t, err) + ids := map[string]bool{} + for _, d := range deploys { + ids[d.ID] = true + } + assert.True(t, ids["d1"], "committed metadata present") + assert.False(t, ids["d2"], "rolled-back metadata absent (dual-write closed)") + + events, err := repo.FetchUnpublished(ctx, 10) + require.NoError(t, err) + require.Len(t, events, 1, "only the committed tx produced an outbox row") + assert.Equal(t, TopicSiteChanged, events[0].Topic) + var p map[string]string + require.NoError(t, json.Unmarshal(events[0].Payload, &p)) + assert.Equal(t, "www", p["site"]) + + require.NoError(t, repo.MarkPublished(ctx, []int64{events[0].ID}, time.Now())) + again, err := repo.FetchUnpublished(ctx, 10) + require.NoError(t, err) + assert.Empty(t, again, "published events are not re-fetched") +} + +func TestOutbox_EnqueueSiteChanged(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + require.NoError(t, repo.EnqueueSiteChanged(ctx, "learn")) + + events, err := repo.FetchUnpublished(ctx, 10) + require.NoError(t, err) + require.Len(t, events, 1) + var p map[string]string + require.NoError(t, json.Unmarshal(events[0].Payload, &p)) + assert.Equal(t, "learn", p["site"]) +} diff --git a/internal/pg/pg.go b/internal/pg/pg.go new file mode 100644 index 0000000..cd31392 --- /dev/null +++ 
b/internal/pg/pg.go @@ -0,0 +1,39 @@ +package pg + +import ( + "context" + "fmt" + + "github.com/jackc/pgx/v5/pgxpool" +) + +type Config struct { + DatabaseURL string +} + +type DB struct { + Pool *pgxpool.Pool +} + +func New(ctx context.Context, cfg Config) (*DB, error) { + if cfg.DatabaseURL == "" { + return nil, fmt.Errorf("pg: empty DatabaseURL") + } + pool, err := pgxpool.New(ctx, cfg.DatabaseURL) + if err != nil { + return nil, fmt.Errorf("pg: connect: %w", err) + } + if err := pool.Ping(ctx); err != nil { + pool.Close() + return nil, fmt.Errorf("pg: ping: %w", err) + } + return &DB{Pool: pool}, nil +} + +func (db *DB) Ping(ctx context.Context) error { + return db.Pool.Ping(ctx) +} + +func (db *DB) Close() { + db.Pool.Close() +} diff --git a/internal/pg/registry.go b/internal/pg/registry.go new file mode 100644 index 0000000..a831c90 --- /dev/null +++ b/internal/pg/registry.go @@ -0,0 +1,153 @@ +package pg + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/freeCodeCamp/artemis/internal/registry" +) + +type RegistryStore struct { + pool *pgxpool.Pool + now func() time.Time + onChange func(slug string) +} + +func NewRegistryStore(db *DB) *RegistryStore { + return &RegistryStore{pool: db.Pool, now: time.Now} +} + +func (s *RegistryStore) WithClock(now func() time.Time) *RegistryStore { + s.now = now + return s +} + +func (s *RegistryStore) WithOnChange(fn func(slug string)) *RegistryStore { + s.onChange = fn + return s +} + +func (s *RegistryStore) changed(slug string) { + if s.onChange != nil { + s.onChange(slug) + } +} + +func (s *RegistryStore) Register(ctx context.Context, slug string, teams []string, createdBy string) (registry.Site, error) { + now := s.now().UTC() + teams = append([]string(nil), teams...) 
+ tag, err := s.pool.Exec(ctx, + `INSERT INTO sites (slug, teams, created_at, updated_at, created_by) + VALUES ($1, $2, $3, $3, $4) + ON CONFLICT (slug) DO NOTHING`, + slug, teams, now, createdBy) + if err != nil { + return registry.Site{}, fmt.Errorf("pg registry register %s: %w", slug, err) + } + if tag.RowsAffected() == 0 { + return registry.Site{}, registry.ErrAlreadyExists + } + s.changed(slug) + return registry.Site{Slug: slug, Teams: teams, CreatedAt: now, UpdatedAt: now, CreatedBy: createdBy}, nil +} + +func (s *RegistryStore) UpdateTeams(ctx context.Context, slug string, teams []string) (registry.Site, error) { + now := s.now().UTC() + teams = append([]string(nil), teams...) + var site registry.Site + err := s.pool.QueryRow(ctx, + `UPDATE sites SET teams = $2, updated_at = $3 WHERE slug = $1 + RETURNING slug, teams, created_at, updated_at, created_by`, + slug, teams, now).Scan(&site.Slug, &site.Teams, &site.CreatedAt, &site.UpdatedAt, &site.CreatedBy) + if errors.Is(err, pgx.ErrNoRows) { + return registry.Site{}, registry.ErrNotFound + } + if err != nil { + return registry.Site{}, fmt.Errorf("pg registry update %s: %w", slug, err) + } + s.changed(slug) + return site, nil +} + +func (s *RegistryStore) Delete(ctx context.Context, slug string) error { + tag, err := s.pool.Exec(ctx, `DELETE FROM sites WHERE slug = $1`, slug) + if err != nil { + return fmt.Errorf("pg registry delete %s: %w", slug, err) + } + if tag.RowsAffected() == 0 { + return registry.ErrNotFound + } + s.changed(slug) + return nil +} + +type SitesSource interface { + Sites(ctx context.Context) ([]registry.Site, error) +} + +const importAdvisoryLockKey = 8472014 + +func (s *RegistryStore) Import(ctx context.Context, src SitesSource) (int, error) { + conn, err := s.pool.Acquire(ctx) + if err != nil { + return 0, fmt.Errorf("pg registry import: acquire: %w", err) + } + defer conn.Release() + + if _, err := conn.Exec(ctx, "SELECT pg_advisory_lock($1)", importAdvisoryLockKey); err != nil { + 
return 0, fmt.Errorf("pg registry import: lock: %w", err) + } + defer releaseAdvisoryLock(conn, importAdvisoryLockKey) + + var count int + if err := conn.QueryRow(ctx, "SELECT count(*) FROM sites").Scan(&count); err != nil { + return 0, fmt.Errorf("pg registry import: count: %w", err) + } + if count > 0 { + return 0, nil + } + + sites, err := src.Sites(ctx) + if err != nil { + return 0, fmt.Errorf("pg registry import: source sites: %w", err) + } + + imported := 0 + for _, site := range sites { + teams := append([]string(nil), site.Teams...) + tag, err := conn.Exec(ctx, + `INSERT INTO sites (slug, teams, created_at, updated_at, created_by) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (slug) DO NOTHING`, + site.Slug, teams, site.CreatedAt, site.UpdatedAt, site.CreatedBy) + if err != nil { + return imported, fmt.Errorf("pg registry import %s: %w", site.Slug, err) + } + imported += int(tag.RowsAffected()) + } + return imported, nil +} + +func (s *RegistryStore) Sites(ctx context.Context) ([]registry.Site, error) { + rows, err := s.pool.Query(ctx, + `SELECT slug, teams, created_at, updated_at, created_by FROM sites ORDER BY slug`) + if err != nil { + return nil, fmt.Errorf("pg registry list: %w", err) + } + defer rows.Close() + + var out []registry.Site + for rows.Next() { + var site registry.Site + if err := rows.Scan(&site.Slug, &site.Teams, &site.CreatedAt, &site.UpdatedAt, &site.CreatedBy); err != nil { + return nil, fmt.Errorf("pg registry scan: %w", err) + } + out = append(out, site) + } + return out, rows.Err() +} diff --git a/internal/pg/registry_import_test.go b/internal/pg/registry_import_test.go new file mode 100644 index 0000000..909a7fb --- /dev/null +++ b/internal/pg/registry_import_test.go @@ -0,0 +1,98 @@ +package pg + +import ( + "context" + "testing" + "time" + + "github.com/alicebob/miniredis/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/registry/valkey" +) + +func 
seededValkey(t *testing.T) *valkey.Store { + t.Helper() + mr := miniredis.RunT(t) + ctx := context.Background() + store, err := valkey.New(ctx, valkey.Config{Addr: mr.Addr()}) + require.NoError(t, err) + t.Cleanup(func() { _ = store.Close() }) + + _, err = store.Register(ctx, "www", []string{"team-eng", "team-platform"}, "alice") + require.NoError(t, err) + _, err = store.Register(ctx, "learn", []string{"team-eng"}, "carol") + require.NoError(t, err) + return store +} + +func TestRegistryImportOnBoot(t *testing.T) { + ctx := context.Background() + pgStore := newTestRegistry(t) + src := seededValkey(t) + + n, err := pgStore.Import(ctx, src) + require.NoError(t, err) + assert.Equal(t, 2, n, "imports every seeded site") + + sites, err := pgStore.Sites(ctx) + require.NoError(t, err) + require.Len(t, sites, 2) + assert.Equal(t, "learn", sites[0].Slug) + assert.Equal(t, []string{"team-eng"}, sites[0].Teams) + assert.Equal(t, "www", sites[1].Slug) + assert.ElementsMatch(t, []string{"team-eng", "team-platform"}, sites[1].Teams) + + n2, err := pgStore.Import(ctx, src) + require.NoError(t, err) + assert.Equal(t, 0, n2, "second boot is a no-op") + + after, err := pgStore.Sites(ctx) + require.NoError(t, err) + require.Len(t, after, 2, "no duplicate rows on re-run") +} + +func TestRegistryImportOnBoot_NoClobberWhenPGNonEmpty(t *testing.T) { + ctx := context.Background() + pgStore := newTestRegistry(t) + + _, err := pgStore.Register(ctx, "www", []string{"newer-team"}, "operator") + require.NoError(t, err) + + src := seededValkey(t) + n, err := pgStore.Import(ctx, src) + require.NoError(t, err) + assert.Equal(t, 0, n, "PG-non-empty boot does not import") + + sites, err := pgStore.Sites(ctx) + require.NoError(t, err) + require.Len(t, sites, 1, "Valkey rows do not clobber existing PG data") + assert.Equal(t, "www", sites[0].Slug) + assert.Equal(t, []string{"newer-team"}, sites[0].Teams) +} + +func TestRegistryImportOnBoot_PreservesTimestamps(t *testing.T) { + ctx := 
// TestRegistryPG walks the RegistryStore through register, duplicate register,
// update, update of a missing slug, list and delete, and finally checks that
// the on-change callback fired once for each successful mutation, in order.
func TestRegistryPG(t *testing.T) {
	ctx := context.Background()
	// changed records every slug passed to the on-change hook.
	var changed []string
	store := newTestRegistry(t).WithOnChange(func(slug string) { changed = append(changed, slug) })

	site, err := store.Register(ctx, "www", []string{"team-eng", "team-platform"}, "alice")
	require.NoError(t, err)
	assert.Equal(t, "www", site.Slug)
	assert.ElementsMatch(t, []string{"team-eng", "team-platform"}, site.Teams)

	// A second register of the same slug must fail and must not notify.
	_, err = store.Register(ctx, "www", []string{"x"}, "bob")
	assert.ErrorIs(t, err, registry.ErrAlreadyExists, "duplicate slug rejected")

	updated, err := store.UpdateTeams(ctx, "www", []string{"team-platform"})
	require.NoError(t, err)
	assert.Equal(t, []string{"team-platform"}, updated.Teams)
	assert.Equal(t, "alice", updated.CreatedBy, "created_by preserved across update")
	// UpdatedAt may equal the original timestamp but must never move backwards.
	assert.True(t, !updated.UpdatedAt.Before(site.UpdatedAt))

	_, err = store.UpdateTeams(ctx, "absent", []string{"x"})
	assert.ErrorIs(t, err, registry.ErrNotFound)

	_, err = store.Register(ctx, "learn", []string{"team-eng"}, "carol")
	require.NoError(t, err)
	sites, err := store.Sites(ctx)
	require.NoError(t, err)
	require.Len(t, sites, 2)
	assert.Equal(t, "learn", sites[0].Slug, "sorted by slug ascending")
	assert.Equal(t, "www", sites[1].Slug)

	require.NoError(t, store.Delete(ctx, "www"))
	assert.ErrorIs(t, store.Delete(ctx, "www"), registry.ErrNotFound, "double delete -> not found")

	// Only the four successful mutations above should have notified.
	assert.Equal(t, []string{"www", "www", "learn", "www"}, changed,
		"registry.changed fires on register/update/register/delete for Valkey cache invalidation")
}
+ INSERT INTO aliases (site, name, deploy_id, updated_at) + VALUES ($1, $2, $3, $4) + ON CONFLICT (site, name) DO UPDATE SET + deploy_id = EXCLUDED.deploy_id, updated_at = EXCLUDED.updated_at`, + site, name, deployID, updatedAt) + if err != nil { + return fmt.Errorf("pg upsert alias %s/%s: %w", site, name, err) + } + return nil +} + +func (r *Repo) DeploysForSite(ctx context.Context, site string) ([]gc.Deploy, error) { + rows, err := r.pool.Query(ctx, + `SELECT id, mtime, bytes, has_marker FROM deploys WHERE site = $1 AND state = 'active'`, site) + if err != nil { + return nil, fmt.Errorf("pg deploys %s: %w", site, err) + } + defer rows.Close() + + var out []gc.Deploy + for rows.Next() { + var d gc.Deploy + if err := rows.Scan(&d.ID, &d.Mtime, &d.Bytes, &d.HasMarker); err != nil { + return nil, fmt.Errorf("pg scan deploy %s: %w", site, err) + } + out = append(out, d) + } + return out, rows.Err() +} + +func (r *Repo) AliasTargets(ctx context.Context, site string) (map[string]struct{}, time.Time, error) { + rows, err := r.pool.Query(ctx, + `SELECT deploy_id, updated_at FROM aliases WHERE site = $1`, site) + if err != nil { + return nil, time.Time{}, fmt.Errorf("pg aliases %s: %w", site, err) + } + defer rows.Close() + + targets := map[string]struct{}{} + var last time.Time + for rows.Next() { + var id string + var updated time.Time + if err := rows.Scan(&id, &updated); err != nil { + return nil, time.Time{}, fmt.Errorf("pg scan alias %s: %w", site, err) + } + targets[id] = struct{}{} + if updated.After(last) { + last = updated + } + } + return targets, last, rows.Err() +} + +func (r *Repo) Tombstone(ctx context.Context, site string, d gc.Deploy) error { + return pgx.BeginFunc(ctx, r.pool, func(tx pgx.Tx) error { + if _, err := tx.Exec(ctx, + `INSERT INTO tombstones (site, id, bytes) VALUES ($1, $2, $3) + ON CONFLICT (site, id) DO NOTHING`, site, d.ID, d.Bytes); err != nil { + return fmt.Errorf("pg tombstone insert %s/%s: %w", site, d.ID, err) + } + if _, err := 
tx.Exec(ctx, + `DELETE FROM deploys WHERE site = $1 AND id = $2`, site, d.ID); err != nil { + return fmt.Errorf("pg tombstone delete deploy %s/%s: %w", site, d.ID, err) + } + return nil + }) +} + +func (r *Repo) RecordTombstone(ctx context.Context, site, id string, bytes int64) error { + return pgx.BeginFunc(ctx, r.pool, func(tx pgx.Tx) error { + if _, err := tx.Exec(ctx, + `INSERT INTO tombstones (site, id, bytes) VALUES ($1, $2, $3) + ON CONFLICT (site, id) DO NOTHING`, site, id, bytes); err != nil { + return fmt.Errorf("pg record tombstone %s/%s: %w", site, id, err) + } + if _, err := tx.Exec(ctx, + `DELETE FROM deploys WHERE site = $1 AND id = $2`, site, id); err != nil { + return fmt.Errorf("pg record tombstone delete deploy %s/%s: %w", site, id, err) + } + return nil + }) +} + +func (r *Repo) ExpiredTombstones(ctx context.Context, before time.Time) ([]gc.Tombstone, error) { + rows, err := r.pool.Query(ctx, + `SELECT site, id, trashed_at, bytes FROM tombstones WHERE trashed_at < $1 ORDER BY site, id`, before) + if err != nil { + return nil, fmt.Errorf("pg expired tombstones: %w", err) + } + defer rows.Close() + + var out []gc.Tombstone + for rows.Next() { + var t gc.Tombstone + if err := rows.Scan(&t.Site, &t.ID, &t.TrashedAt, &t.Bytes); err != nil { + return nil, fmt.Errorf("pg scan tombstone: %w", err) + } + out = append(out, t) + } + return out, rows.Err() +} + +func (r *Repo) PruneDeploy(ctx context.Context, site, id string) error { + if _, err := r.pool.Exec(ctx, `DELETE FROM deploys WHERE site = $1 AND id = $2`, site, id); err != nil { + return fmt.Errorf("pg prune deploy %s/%s: %w", site, id, err) + } + return nil +} + +func (r *Repo) ClearTombstone(ctx context.Context, site, id string) error { + if _, err := r.pool.Exec(ctx, + `DELETE FROM tombstones WHERE site = $1 AND id = $2`, site, id); err != nil { + return fmt.Errorf("pg clear tombstone %s/%s: %w", site, id, err) + } + return nil +} diff --git a/internal/pg/repo_test.go b/internal/pg/repo_test.go 
new file mode 100644 index 0000000..b2fd3a8 --- /dev/null +++ b/internal/pg/repo_test.go @@ -0,0 +1,90 @@ +package pg + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/modules/postgres" + + "github.com/freeCodeCamp/artemis/internal/gc" +) + +func newTestRepo(t *testing.T) *Repo { + t.Helper() + testcontainers.SkipIfProviderIsNotHealthy(t) + + ctx := context.Background() + container, err := postgres.Run(ctx, "postgres:16-alpine", + postgres.WithDatabase("artemis_test"), + postgres.WithUsername("artemis"), + postgres.WithPassword("artemis"), + postgres.BasicWaitStrategies(), + ) + require.NoError(t, err) + t.Cleanup(func() { _ = container.Terminate(ctx) }) + + connStr, err := container.ConnectionString(ctx, "sslmode=disable") + require.NoError(t, err) + db, err := New(ctx, Config{DatabaseURL: connStr}) + require.NoError(t, err) + t.Cleanup(db.Close) + require.NoError(t, Migrate(ctx, db.Pool)) + return NewRepo(db) +} + +func TestRepo_DeployAliasRoundtrip(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + now := time.Now().UTC().Truncate(time.Second) + + require.NoError(t, repo.UpsertDeploy(ctx, "www", "d1", now.Add(-time.Hour), 100, true, "active")) + require.NoError(t, repo.UpsertDeploy(ctx, "www", "d2", now.Add(-2*time.Hour), 200, false, "active")) + require.NoError(t, repo.UpsertAlias(ctx, "www", "production", "d1", now)) + + deploys, err := repo.DeploysForSite(ctx, "www") + require.NoError(t, err) + assert.Len(t, deploys, 2) + + targets, last, err := repo.AliasTargets(ctx, "www") + require.NoError(t, err) + assert.Contains(t, targets, "d1") + assert.WithinDuration(t, now, last, time.Second) + + require.NoError(t, repo.UpsertDeploy(ctx, "www", "d1", now, 150, true, "active"), + "upsert is idempotent on (site,id)") + deploys, err = repo.DeploysForSite(ctx, "www") + 
require.NoError(t, err) + assert.Len(t, deploys, 2, "re-upsert updates in place, no duplicate row") +} + +func TestRepo_TombstoneLifecycle(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + now := time.Now().UTC() + + require.NoError(t, repo.UpsertDeploy(ctx, "www", "d-old", now.Add(-30*24*time.Hour), 100, true, "active")) + require.NoError(t, repo.Tombstone(ctx, "www", gc.Deploy{ID: "d-old", Bytes: 100})) + + deploys, err := repo.DeploysForSite(ctx, "www") + require.NoError(t, err) + assert.Empty(t, deploys, "tombstoned deploy removed from active set") + + expired, err := repo.ExpiredTombstones(ctx, now.Add(time.Hour)) + require.NoError(t, err) + require.Len(t, expired, 1) + assert.Equal(t, "d-old", expired[0].ID) + assert.EqualValues(t, 100, expired[0].Bytes) + + none, err := repo.ExpiredTombstones(ctx, now.Add(-time.Hour)) + require.NoError(t, err) + assert.Empty(t, none, "tombstone trashed_at in the future of the cutoff is not yet expired") + + require.NoError(t, repo.ClearTombstone(ctx, "www", "d-old")) + expired, err = repo.ExpiredTombstones(ctx, now.Add(time.Hour)) + require.NoError(t, err) + assert.Empty(t, expired, "cleared tombstone gone") +} diff --git a/internal/pg/repo_tombstone_test.go b/internal/pg/repo_tombstone_test.go new file mode 100644 index 0000000..79be35e --- /dev/null +++ b/internal/pg/repo_tombstone_test.go @@ -0,0 +1,71 @@ +package pg + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRepo_RecordTombstone(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + t0 := time.Now().UTC().Truncate(time.Second) + + require.NoError(t, repo.UpsertDeploy(ctx, "www", "d1", t0, 100, true, "active")) + require.NoError(t, repo.RecordTombstone(ctx, "www", "d1", 100)) + + deploys, err := repo.DeploysForSite(ctx, "www") + require.NoError(t, err) + assert.Empty(t, deploys, "recorded tombstone removes the deploy from the active 
set") + + expired, err := repo.ExpiredTombstones(ctx, t0.Add(time.Hour)) + require.NoError(t, err) + require.Len(t, expired, 1) + assert.Equal(t, "d1", expired[0].ID) + assert.EqualValues(t, 100, expired[0].Bytes) +} + +func TestRepo_RecordTombstone_Idempotent(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + t0 := time.Now().UTC().Truncate(time.Second) + + require.NoError(t, repo.UpsertDeploy(ctx, "www", "d1", t0, 100, true, "active")) + require.NoError(t, repo.RecordTombstone(ctx, "www", "d1", 100)) + require.NoError(t, repo.RecordTombstone(ctx, "www", "d1", 100), + "second RecordTombstone for same id is a no-op (ON CONFLICT DO NOTHING)") + + expired, err := repo.ExpiredTombstones(ctx, t0.Add(time.Hour)) + require.NoError(t, err) + require.Len(t, expired, 1, "still exactly one tombstone after the repeat") +} + +func TestRepo_PruneDeploy(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + t0 := time.Now().UTC().Truncate(time.Second) + + require.NoError(t, repo.UpsertDeploy(ctx, "www", "d1", t0, 1, false, "active")) + require.NoError(t, repo.UpsertDeploy(ctx, "learn", "d1", t0, 1, false, "active")) + + require.NoError(t, repo.PruneDeploy(ctx, "www", "d1")) + + w, err := repo.DeploysForSite(ctx, "www") + require.NoError(t, err) + assert.Empty(t, w, "the named site's deploy is pruned") + + l, err := repo.DeploysForSite(ctx, "learn") + require.NoError(t, err) + assert.Len(t, l, 1, "a same-id deploy on a different site is left intact") +} + +func TestRepo_PruneDeploy_MissingRow(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + + require.NoError(t, repo.PruneDeploy(ctx, "www", "absent"), + "pruning a non-existent deploy row is idempotent (no error)") +} diff --git a/internal/pg/repoqueue.go b/internal/pg/repoqueue.go new file mode 100644 index 0000000..fb26990 --- /dev/null +++ b/internal/pg/repoqueue.go @@ -0,0 +1,181 @@ +package pg + +import ( + "context" + "crypto/rand" + "encoding/hex" + "errors" + 
// defaultRepoRequestID returns "req_" followed by the lowercase hex encoding
// of 10 bytes read from crypto/rand (20 hex characters). It panics if the
// random source reports an error.
func defaultRepoRequestID() string {
	entropy := make([]byte, 10)
	_, err := rand.Read(entropy)
	if err != nil {
		panic(err)
	}
	return "req_" + hex.EncodeToString(entropy)
}
reporequest.Request{}, reporequest.ErrAlreadyExists + } + return reporequest.Request{}, fmt.Errorf("pg repoqueue create: %w", err) + } + return req, nil +} + +func (q *RepoQueue) Get(ctx context.Context, id string) (reporequest.Request, error) { + r, err := scanRequest(q.pool.QueryRow(ctx, `SELECT `+repoRequestCols+` FROM repo_requests WHERE id = $1`, id)) + if errors.Is(err, pgx.ErrNoRows) { + return reporequest.Request{}, reporequest.ErrNotFound + } + if err != nil { + return reporequest.Request{}, fmt.Errorf("pg repoqueue get %s: %w", id, err) + } + return r, nil +} + +func (q *RepoQueue) List(ctx context.Context) ([]reporequest.Request, error) { + rows, err := q.pool.Query(ctx, `SELECT `+repoRequestCols+` FROM repo_requests ORDER BY created_at, id`) + if err != nil { + return nil, fmt.Errorf("pg repoqueue list: %w", err) + } + defer rows.Close() + + var out []reporequest.Request + for rows.Next() { + r, err := scanRequest(rows) + if err != nil { + return nil, fmt.Errorf("pg repoqueue scan: %w", err) + } + out = append(out, r) + } + return out, rows.Err() +} + +func (q *RepoQueue) Delete(ctx context.Context, id string) error { + tag, err := q.pool.Exec(ctx, `DELETE FROM repo_requests WHERE id = $1`, id) + if err != nil { + return fmt.Errorf("pg repoqueue delete %s: %w", id, err) + } + if tag.RowsAffected() == 0 { + return reporequest.ErrNotFound + } + return nil +} + +func (q *RepoQueue) Approve(ctx context.Context, id, approver string) (reporequest.Request, error) { + return q.transition(ctx, id, reporequest.StatusPending, reporequest.ErrNotPending, func(r *reporequest.Request) { + r.Status = reporequest.StatusApproved + r.Approver = approver + }) +} + +func (q *RepoQueue) Reject(ctx context.Context, id, approver, reason string) (reporequest.Request, error) { + return q.transition(ctx, id, reporequest.StatusPending, reporequest.ErrNotPending, func(r *reporequest.Request) { + r.Status = reporequest.StatusRejected + r.Approver = approver + r.RejectReason = reason 
+ }) +} + +func (q *RepoQueue) MarkActive(ctx context.Context, id, url string) (reporequest.Request, error) { + return q.transition(ctx, id, reporequest.StatusApproved, reporequest.ErrNotPending, func(r *reporequest.Request) { + r.Status = reporequest.StatusActive + r.URL = url + }) +} + +func (q *RepoQueue) MarkFailed(ctx context.Context, id, errMsg string) (reporequest.Request, error) { + return q.transition(ctx, id, reporequest.StatusApproved, reporequest.ErrNotPending, func(r *reporequest.Request) { + r.Status = reporequest.StatusFailed + r.Error = errMsg + }) +} + +func (q *RepoQueue) MarkStale(ctx context.Context, id, reason string) (reporequest.Request, error) { + return q.transition(ctx, id, reporequest.StatusActive, reporequest.ErrNotActive, func(r *reporequest.Request) { + r.Status = reporequest.StatusFailed + r.Error = reason + }) +} + +func (q *RepoQueue) transition(ctx context.Context, id string, want reporequest.Status, mismatch error, apply func(*reporequest.Request)) (reporequest.Request, error) { + var out reporequest.Request + err := pgx.BeginFunc(ctx, q.pool, func(tx pgx.Tx) error { + cur, err := scanRequest(tx.QueryRow(ctx, + `SELECT `+repoRequestCols+` FROM repo_requests WHERE id = $1 FOR UPDATE`, id)) + if errors.Is(err, pgx.ErrNoRows) { + return reporequest.ErrNotFound + } + if err != nil { + return err + } + if cur.Status != want { + return mismatch + } + apply(&cur) + cur.UpdatedAt = q.now().UTC() + _, err = tx.Exec(ctx, + `UPDATE repo_requests SET status=$2, url=$3, error=$4, approver=$5, reject_reason=$6, updated_at=$7 WHERE id=$1`, + id, cur.Status, cur.URL, cur.Error, cur.Approver, cur.RejectReason, cur.UpdatedAt) + if err != nil { + return err + } + out = cur + return nil + }) + if err != nil { + if errors.Is(err, reporequest.ErrNotFound) || errors.Is(err, mismatch) { + return reporequest.Request{}, err + } + return reporequest.Request{}, fmt.Errorf("pg repoqueue transition %s: %w", id, err) + } + return out, nil +} diff --git 
a/internal/pg/repoqueue_test.go b/internal/pg/repoqueue_test.go new file mode 100644 index 0000000..93b5446 --- /dev/null +++ b/internal/pg/repoqueue_test.go @@ -0,0 +1,103 @@ +package pg + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/reporequest" +) + +func newTestRepoQueue(t *testing.T) *RepoQueue { + t.Helper() + repo := newTestRepo(t) + return NewRepoQueue(&DB{Pool: repo.pool}) +} + +func req(name string) reporequest.Request { + return reporequest.Request{Name: name, Owner: "freeCodeCamp-Universe", Visibility: reporequest.VisibilityPublic, RequestedBy: "alice"} +} + +func TestRepoQueuePG(t *testing.T) { + ctx := context.Background() + q := newTestRepoQueue(t) + + created, err := q.Create(ctx, req("my-app")) + require.NoError(t, err) + assert.Equal(t, reporequest.StatusPending, created.Status) + assert.NotEmpty(t, created.ID) + + _, err = q.Create(ctx, req("My-App")) + assert.ErrorIs(t, err, reporequest.ErrAlreadyExists, "name claim is case-insensitive while pending") + + got, err := q.Get(ctx, created.ID) + require.NoError(t, err) + assert.Equal(t, "my-app", got.Name) + _, err = q.Get(ctx, "req_absent") + assert.ErrorIs(t, err, reporequest.ErrNotFound) + + approved, err := q.Approve(ctx, created.ID, "admin") + require.NoError(t, err) + assert.Equal(t, reporequest.StatusApproved, approved.Status) + assert.Equal(t, "admin", approved.Approver) + + _, err = q.Approve(ctx, created.ID, "admin2") + assert.ErrorIs(t, err, reporequest.ErrNotPending, "double-approve blocked by CAS guard") + + active, err := q.MarkActive(ctx, created.ID, "https://github.com/o/my-app") + require.NoError(t, err) + assert.Equal(t, reporequest.StatusActive, active.Status) + assert.Equal(t, "https://github.com/o/my-app", active.URL) + + _, err = q.Create(ctx, req("my-app")) + assert.ErrorIs(t, err, reporequest.ErrAlreadyExists, "active repo still holds the name claim") + 
+ stale, err := q.MarkStale(ctx, created.ID, "repo deleted upstream") + require.NoError(t, err) + assert.Equal(t, reporequest.StatusFailed, stale.Status) + + reused, err := q.Create(ctx, req("my-app")) + require.NoError(t, err, "stale/failed releases the name claim -> name reusable") + assert.Equal(t, reporequest.StatusPending, reused.Status) +} + +func TestRepoQueuePG_RejectReleasesName(t *testing.T) { + ctx := context.Background() + q := newTestRepoQueue(t) + + r, err := q.Create(ctx, req("widget")) + require.NoError(t, err) + _, err = q.Reject(ctx, r.ID, "admin", "policy") + require.NoError(t, err) + + _, err = q.Create(ctx, req("widget")) + require.NoError(t, err, "rejected request releases the name claim") + + _, err = q.MarkActive(ctx, r.ID, "x") + assert.ErrorIs(t, err, reporequest.ErrNotPending, "cannot activate a rejected request") +} + +func TestRepoQueuePG_ListOrdered(t *testing.T) { + ctx := context.Background() + base := time.Date(2026, 6, 1, 0, 0, 0, 0, time.UTC) + tick := 0 + q := newTestRepoQueue(t).WithClock(func() time.Time { + tick++ + return base.Add(time.Duration(tick) * time.Minute) + }) + for _, n := range []string{"a", "b", "c"} { + _, err := q.Create(ctx, req(n)) + require.NoError(t, err) + } + list, err := q.List(ctx) + require.NoError(t, err) + require.Len(t, list, 3) + names := []string{list[0].Name, list[1].Name, list[2].Name} + assert.Equal(t, []string{"a", "b", "c"}, names, "ordered by created_at then id") + + require.NoError(t, q.Delete(ctx, list[0].ID)) + assert.ErrorIs(t, q.Delete(ctx, list[0].ID), reporequest.ErrNotFound) +} diff --git a/internal/pg/repoqueue_transition_test.go b/internal/pg/repoqueue_transition_test.go new file mode 100644 index 0000000..0c302cc --- /dev/null +++ b/internal/pg/repoqueue_transition_test.go @@ -0,0 +1,108 @@ +package pg + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/reporequest" +) + 
+func TestRepoQueue_MarkFailed(t *testing.T) { + ctx := context.Background() + q := newTestRepoQueue(t) + + r, err := q.Create(ctx, req("x")) + require.NoError(t, err) + _, err = q.Approve(ctx, r.ID, "admin") + require.NoError(t, err) + + f, err := q.MarkFailed(ctx, r.ID, "build broke") + require.NoError(t, err) + assert.Equal(t, reporequest.StatusFailed, f.Status) + assert.Equal(t, "build broke", f.Error) + + got, err := q.Get(ctx, r.ID) + require.NoError(t, err) + assert.Equal(t, reporequest.StatusFailed, got.Status) + assert.Equal(t, "build broke", got.Error) +} + +func TestRepoQueue_MarkFailed_NotApproved(t *testing.T) { + ctx := context.Background() + q := newTestRepoQueue(t) + + r, err := q.Create(ctx, req("y")) + require.NoError(t, err) + + _, err = q.MarkFailed(ctx, r.ID, "ignored") + assert.ErrorIs(t, err, reporequest.ErrNotPending, + "MarkFailed on a pending (un-approved) request is blocked by the CAS guard") + + got, err := q.Get(ctx, r.ID) + require.NoError(t, err) + assert.Equal(t, reporequest.StatusPending, got.Status, "rejected transition leaves status unchanged") + assert.Empty(t, got.Error) +} + +func TestRepoQueue_TransitionMismatchGuards(t *testing.T) { + ctx := context.Background() + + tests := []struct { + name string + setup func(t *testing.T, q *RepoQueue, id string) + verb func(q *RepoQueue, id string) (reporequest.Request, error) + want reporequest.Status + wantErr error + }{ + { + name: "MarkActive on pending (not approved)", + setup: func(t *testing.T, q *RepoQueue, id string) {}, + verb: func(q *RepoQueue, id string) (reporequest.Request, error) { return q.MarkActive(ctx, id, "https://x") }, + want: reporequest.StatusPending, + wantErr: reporequest.ErrNotPending, + }, + { + name: "MarkActive on failed", + setup: func(t *testing.T, q *RepoQueue, id string) { + t.Helper() + _, err := q.Approve(ctx, id, "admin") + require.NoError(t, err) + _, err = q.MarkFailed(ctx, id, "broke") + require.NoError(t, err) + }, + verb: func(q *RepoQueue, id 
string) (reporequest.Request, error) { return q.MarkActive(ctx, id, "https://x") }, + want: reporequest.StatusFailed, + wantErr: reporequest.ErrNotPending, + }, + { + name: "MarkStale on approved (not active)", + setup: func(t *testing.T, q *RepoQueue, id string) { + t.Helper() + _, err := q.Approve(ctx, id, "admin") + require.NoError(t, err) + }, + verb: func(q *RepoQueue, id string) (reporequest.Request, error) { return q.MarkStale(ctx, id, "reason") }, + want: reporequest.StatusApproved, + wantErr: reporequest.ErrNotActive, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + q := newTestRepoQueue(t) + r, err := q.Create(ctx, req("app")) + require.NoError(t, err) + tc.setup(t, q, r.ID) + + _, err = tc.verb(q, r.ID) + assert.ErrorIs(t, err, tc.wantErr) + + got, err := q.Get(ctx, r.ID) + require.NoError(t, err) + assert.Equal(t, tc.want, got.Status, "rejected transition leaves status unchanged") + }) + } +} diff --git a/internal/pg/saga.go b/internal/pg/saga.go new file mode 100644 index 0000000..dbcebc4 --- /dev/null +++ b/internal/pg/saga.go @@ -0,0 +1,30 @@ +package pg + +import ( + "context" + "fmt" + "time" + + "github.com/jackc/pgx/v5" +) + +func (r *Repo) FinalizeAtomic(ctx context.Context, site, deployID, mode string, mtime time.Time, bytes int64) error { + return r.WithTx(ctx, func(tx pgx.Tx) error { + if _, err := tx.Exec(ctx, + `INSERT INTO deploys (site, id, mtime, bytes, has_marker, state) + VALUES ($1, $2, $3, $4, true, 'active') + ON CONFLICT (site, id) DO UPDATE SET + mtime = EXCLUDED.mtime, bytes = EXCLUDED.bytes, has_marker = true, state = 'active'`, + site, deployID, mtime, bytes); err != nil { + return fmt.Errorf("finalize deploy %s/%s: %w", site, deployID, err) + } + if _, err := tx.Exec(ctx, + `INSERT INTO aliases (site, name, deploy_id, updated_at) + VALUES ($1, $2, $3, $4) + ON CONFLICT (site, name) DO UPDATE SET deploy_id = EXCLUDED.deploy_id, updated_at = EXCLUDED.updated_at`, + site, mode, deployID, mtime); err 
!= nil { + return fmt.Errorf("finalize alias %s/%s: %w", site, mode, err) + } + return Enqueue(ctx, tx, TopicSiteChanged, map[string]string{"site": site}) + }) +} diff --git a/internal/pg/saga_test.go b/internal/pg/saga_test.go new file mode 100644 index 0000000..f023b47 --- /dev/null +++ b/internal/pg/saga_test.go @@ -0,0 +1,39 @@ +package pg + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDeploySaga(t *testing.T) { + repo := newTestRepo(t) + ctx := context.Background() + mtime := time.Now().UTC().Truncate(time.Second) + + require.NoError(t, repo.FinalizeAtomic(ctx, "www", "20260420-141522-abc1234", "production", mtime, 4096)) + + deploys, err := repo.DeploysForSite(ctx, "www") + require.NoError(t, err) + require.Len(t, deploys, 1) + assert.True(t, deploys[0].HasMarker, "finalize marks deploy completed") + assert.EqualValues(t, 4096, deploys[0].Bytes) + + targets, _, err := repo.AliasTargets(ctx, "www") + require.NoError(t, err) + assert.Contains(t, targets, "20260420-141522-abc1234", "production alias points at the finalized deploy") + + events, err := repo.FetchUnpublished(ctx, 10) + require.NoError(t, err) + require.Len(t, events, 1, "exactly one site.changed emitted in the same tx") + assert.Equal(t, TopicSiteChanged, events[0].Topic) + + require.NoError(t, repo.FinalizeAtomic(ctx, "www", "20260420-141522-abc1234", "production", mtime, 4096), + "re-finalize is idempotent on (site,id) and (site,name)") + deploys, err = repo.DeploysForSite(ctx, "www") + require.NoError(t, err) + assert.Len(t, deploys, 1, "no duplicate deploy row") +} diff --git a/internal/r2/r2.go b/internal/r2/r2.go index e3e46ab..388e365 100644 --- a/internal/r2/r2.go +++ b/internal/r2/r2.go @@ -15,6 +15,7 @@ import ( "errors" "fmt" "io" + "net/url" "strings" "time" @@ -22,6 +23,7 @@ import ( awsconfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/credentials" 
// encodeCopySource builds the "bucket/key" value used as a CopyObject copy
// source. The key is split on "/" and each segment is passed through
// url.PathEscape, so the separators themselves stay literal; the bucket name
// is emitted unchanged.
func encodeCopySource(bucket, key string) string {
	var sb strings.Builder
	sb.WriteString(bucket)
	for _, segment := range strings.Split(key, "/") {
		sb.WriteByte('/')
		sb.WriteString(url.PathEscape(segment))
	}
	return sb.String()
}
awsv2.Bool(true)}, + }) + if err != nil { + return 0, fmt.Errorf("r2 deleteobjects: %w", err) + } + if len(out.Errors) > 0 { + key, msg := "", "" + if out.Errors[0].Key != nil { + key = *out.Errors[0].Key + } + if out.Errors[0].Message != nil { + msg = *out.Errors[0].Message + } + return len(ids) - len(out.Errors), fmt.Errorf("r2 deleteobjects: %d of %d failed (first %s: %s)", len(out.Errors), len(ids), key, msg) + } + return len(ids), nil +} + +func (c *Client) MovePrefix(ctx context.Context, srcPrefix, dstPrefix string) (int, error) { + var moved int + var token *string + for { + page, err := c.s3.ListObjectsV2(ctx, &s3.ListObjectsV2Input{ + Bucket: awsv2.String(c.bucket), + Prefix: awsv2.String(srcPrefix), + ContinuationToken: token, + }) + if err != nil { + return moved, fmt.Errorf("r2 moveprefix list %s: %w", srcPrefix, err) + } + for _, obj := range page.Contents { + if obj.Key == nil { + continue + } + key := *obj.Key + dstKey := dstPrefix + strings.TrimPrefix(key, srcPrefix) + _, err := c.s3.CopyObject(ctx, &s3.CopyObjectInput{ + Bucket: awsv2.String(c.bucket), + Key: awsv2.String(dstKey), + CopySource: awsv2.String(encodeCopySource(c.bucket, key)), + }) + if err != nil { + return moved, fmt.Errorf("r2 moveprefix copy %s->%s: %w", key, dstKey, err) + } + if err := c.DeleteObject(ctx, key); err != nil { + return moved, fmt.Errorf("r2 moveprefix delete %s: %w", key, err) + } + moved++ + } + if page.IsTruncated == nil || !*page.IsTruncated { + break + } + token = page.NextContinuationToken + } + return moved, nil +} + +func (c *Client) ListSites(ctx context.Context) ([]string, error) { + var sites []string + var token *string + for { + page, err := c.s3.ListObjectsV2(ctx, &s3.ListObjectsV2Input{ + Bucket: awsv2.String(c.bucket), + Delimiter: awsv2.String("/"), + ContinuationToken: token, + }) + if err != nil { + return nil, fmt.Errorf("r2 listsites: %w", err) + } + for _, cp := range page.CommonPrefixes { + if cp.Prefix == nil { + continue + } + site := 
strings.TrimSuffix(*cp.Prefix, "/") + if site == "" || strings.HasPrefix(site, "_") { + continue + } + sites = append(sites, site) + } + if page.IsTruncated == nil || !*page.IsTruncated { + break + } + token = page.NextContinuationToken + } + return sites, nil +} + // VerifyError is returned when VerifyDeployComplete finds expected files // missing from the deploy prefix. The Missing field lists the files that // did not surface in the listing. diff --git a/internal/r2/r2_test.go b/internal/r2/r2_test.go index 0f080cb..e3e03b0 100644 --- a/internal/r2/r2_test.go +++ b/internal/r2/r2_test.go @@ -11,6 +11,7 @@ import ( "net/http/httptest" "net/url" "path" + "sort" "strconv" "strings" "sync" @@ -42,6 +43,18 @@ type fakeS3 struct { // lastPutTransferEncoding captures the Transfer-Encoding header. // Aws-sdk-go-v2 sends "chunked" when ContentLength is unknown. lastPutTransferEncoding string + + pageSize int + deleteObjectsCalls int + lastDeleteBatch int + + failList bool + failDeleteObjects bool + deleteFailKeys map[string]struct{} + failDeleteKeys map[string]struct{} + failCopyKeys map[string]struct{} + failGetKeys map[string]struct{} + truncateGetKeys map[string]struct{} } func newFakeS3(t *testing.T, bucket string) *fakeS3 { @@ -73,12 +86,20 @@ func (f *fakeS3) handle(w http.ResponseWriter, r *http.Request) { f.listV2(w, r) return } + if r.URL.Query().Has("delete") && r.Method == http.MethodPost { + f.deleteObjects(w, r) + return + } http.Error(w, "unsupported bucket op", http.StatusBadRequest) return } key := parts[1] switch r.Method { case http.MethodPut: + if src := r.Header.Get("X-Amz-Copy-Source"); src != "" { + f.copyObject(w, key, src) + return + } body, _ := io.ReadAll(r.Body) f.mu.Lock() f.objects[f.bucket+"/"+key] = body @@ -92,17 +113,44 @@ func (f *fakeS3) handle(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) case http.MethodGet: f.mu.Lock() + _, failGet := f.failGetKeys[key] + _, truncGet := f.truncateGetKeys[key] body, ok := 
// writeS3Error replies with the given HTTP status and an XML error document
// of the form
//
//	<Error><Code>code</Code><Message>message</Message></Error>
//
// The Content-Type header is set to application/xml. code and message are
// XML-escaped by the encoder, so the body stays well-formed even when they
// contain characters such as '<' or '&'. The encode error is deliberately
// ignored: the status line has already been sent and this helper has no way
// to report a failed write to its caller.
func writeS3Error(w http.ResponseWriter, status int, code, message string) {
	type errorBody struct {
		XMLName xml.Name `xml:"Error"`
		Code    string   `xml:"Code"`
		Message string   `xml:"Message"`
	}

	w.Header().Set("Content-Type", "application/xml")
	w.WriteHeader(status)
	_ = xml.NewEncoder(w).Encode(errorBody{Code: code, Message: message})
}
+ func (f *fakeS3) listV2(w http.ResponseWriter, r *http.Request) { - prefix := r.URL.Query().Get("prefix") - maxKeys := r.URL.Query().Get("max-keys") + f.mu.Lock() + failList := f.failList + f.mu.Unlock() + if failList { + writeS3Error(w, http.StatusInternalServerError, "InternalError", "list failed") + return + } + q := r.URL.Query() + prefix := q.Get("prefix") + delimiter := q.Get("delimiter") + maxKeys := q.Get("max-keys") + contToken := q.Get("continuation-token") + f.mu.Lock() f.lastListMaxKeys = maxKeys - var contents []listContent - for k, v := range f.objects { + var keys []string + for k := range f.objects { if !strings.HasPrefix(k, f.bucket+"/") { continue } @@ -136,18 +208,158 @@ func (f *fakeS3) listV2(w http.ResponseWriter, r *http.Request) { if prefix != "" && !strings.HasPrefix(key, prefix) { continue } - contents = append(contents, listContent{Key: key, Size: len(v)}) + keys = append(keys, key) + } + sizes := make(map[string]int, len(keys)) + for _, key := range keys { + sizes[key] = len(f.objects[f.bucket+"/"+key]) } f.mu.Unlock() + sort.Strings(keys) + + var contents []listContent + commonSet := map[string]struct{}{} + var common []string + for _, key := range keys { + if delimiter != "" { + rest := strings.TrimPrefix(key, prefix) + if i := strings.Index(rest, delimiter); i >= 0 { + cp := prefix + rest[:i+len(delimiter)] + if _, ok := commonSet[cp]; !ok { + commonSet[cp] = struct{}{} + common = append(common, cp) + } + continue + } + } + contents = append(contents, listContent{Key: key, Size: sizes[key]}) + } + sort.Strings(common) - // Honor max-keys param (string-typed in S3 wire format). 
+ pageSize := f.pageSize if maxKeys != "" { - if n, err := strconv.Atoi(maxKeys); err == nil && n >= 0 && n < len(contents) { - contents = contents[:n] + if n, err := strconv.Atoi(maxKeys); err == nil && n >= 0 { + pageSize = n + } + } + start := 0 + if contToken != "" { + start = sort.Search(len(contents), func(i int) bool { return contents[i].Key > contToken }) + } + end := len(contents) + if pageSize > 0 && start+pageSize < end { + end = start + pageSize + } + truncated := end < len(contents) + page := contents[start:end] + + res := listResult{ + Name: f.bucket, + Prefix: prefix, + KeyCount: len(page), + IsTruncated: truncated, + Contents: page, + } + if truncated && len(page) > 0 { + res.NextContinuationToken = page[len(page)-1].Key + } + for _, cp := range common { + res.CommonPrefixes = append(res.CommonPrefixes, commonPrefix{Prefix: cp}) + } + w.Header().Set("Content-Type", "application/xml") + _ = xml.NewEncoder(w).Encode(res) +} + +func (f *fakeS3) copyObject(w http.ResponseWriter, destKey, copySource string) { + f.mu.Lock() + _, failCopy := f.failCopyKeys[destKey] + f.mu.Unlock() + if failCopy { + writeS3Error(w, http.StatusInternalServerError, "InternalError", "copy failed") + return + } + for i := 0; i < len(copySource); i++ { + if b := copySource[i]; b == ' ' || b > 0x7F { + http.Error(w, "InvalidArgument: x-amz-copy-source must be URL-encoded", http.StatusBadRequest) + return + } + } + src, err := url.PathUnescape(copySource) + if err != nil { + http.Error(w, "InvalidArgument: bad copy-source escaping", http.StatusBadRequest) + return + } + src = strings.TrimPrefix(src, "/") + srcKey := strings.TrimPrefix(src, f.bucket+"/") + + f.mu.Lock() + body, ok := f.objects[f.bucket+"/"+srcKey] + if ok { + buf := make([]byte, len(body)) + copy(buf, body) + f.objects[f.bucket+"/"+destKey] = buf + } + f.mu.Unlock() + if !ok { + http.Error(w, "NoSuchKey", http.StatusNotFound) + return + } + w.Header().Set("Content-Type", "application/xml") + _, _ = 
io.WriteString(w, `"deadbeef"`) +} + +func (f *fakeS3) deleteObjects(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + var req struct { + XMLName xml.Name `xml:"Delete"` + Objects []struct { + Key string `xml:"Key"` + } `xml:"Object"` + Quiet bool `xml:"Quiet"` + } + _ = xml.Unmarshal(body, &req) + + f.mu.Lock() + f.deleteObjectsCalls++ + f.lastDeleteBatch = len(req.Objects) + if f.failDeleteObjects { + f.mu.Unlock() + writeS3Error(w, http.StatusInternalServerError, "InternalError", "deleteobjects failed") + return + } + var deleted []string + var failed []string + for _, o := range req.Objects { + if _, bad := f.deleteFailKeys[o.Key]; bad { + failed = append(failed, o.Key) + continue } + delete(f.objects, f.bucket+"/"+o.Key) + deleted = append(deleted, o.Key) } + f.mu.Unlock() - res := listResult{Name: f.bucket, Prefix: prefix, KeyCount: len(contents), Contents: contents} + type deletedEntry struct { + Key string `xml:"Key"` + } + type errorEntry struct { + Key string `xml:"Key"` + Code string `xml:"Code"` + Message string `xml:"Message"` + } + var res struct { + XMLName xml.Name `xml:"DeleteResult"` + Deleted []deletedEntry `xml:"Deleted"` + Errors []errorEntry `xml:"Error"` + } + if !req.Quiet { + for _, k := range deleted { + res.Deleted = append(res.Deleted, deletedEntry{Key: k}) + } + } + for _, k := range failed { + res.Errors = append(res.Errors, errorEntry{Key: k, Code: "AccessDenied", Message: "AccessDenied"}) + } w.Header().Set("Content-Type", "application/xml") _ = xml.NewEncoder(w).Encode(res) } @@ -337,6 +549,152 @@ func TestHasPrefix_RequestsMaxKeysOne(t *testing.T) { "HasPrefix must send max-keys=1 to bound R2 cost") } +func TestDeleteObject_Idempotent(t *testing.T) { + fake := newFakeS3(t, "b") + c := newClient(t, fake) + require.NoError(t, c.PutObject(context.Background(), "www/x", bytes.NewReader([]byte("y")), "text/plain", 1)) + + require.NoError(t, c.DeleteObject(context.Background(), "www/x")) + fake.mu.Lock() + _, 
present := fake.objects["b/www/x"] + fake.mu.Unlock() + assert.False(t, present, "object should be gone after delete") + + require.NoError(t, c.DeleteObject(context.Background(), "www/x"), "re-delete is a no-op") + require.NoError(t, c.DeleteObject(context.Background(), "never-existed"), "delete of missing key is a no-op") +} + +func TestDeletePrefix(t *testing.T) { + fake := newFakeS3(t, "b") + c := newClient(t, fake) + for _, k := range []string{ + "www/deploys/d1/index.html", + "www/deploys/d1/assets/app.js", + "www/deploys/d1/style.css", + "www/deploys/d2/index.html", + } { + require.NoError(t, c.PutObject(context.Background(), k, bytes.NewReader([]byte("z")), "text/plain", 1)) + } + + n, err := c.DeletePrefix(context.Background(), "www/deploys/d1/") + require.NoError(t, err) + assert.Equal(t, 3, n) + + gone, err := c.HasPrefix(context.Background(), "www/deploys/d1/") + require.NoError(t, err) + assert.False(t, gone, "d1 prefix must be empty after DeletePrefix") + kept, err := c.HasPrefix(context.Background(), "www/deploys/d2/") + require.NoError(t, err) + assert.True(t, kept, "sibling prefix d2 must be untouched") +} + +func TestDeletePrefix_Paginates(t *testing.T) { + fake := newFakeS3(t, "b") + fake.pageSize = 2 + c := newClient(t, fake) + for i := 0; i < 5; i++ { + require.NoError(t, c.PutObject(context.Background(), + fmtKey("s/deploys/d/f%02d.html", i), bytes.NewReader([]byte("z")), "text/plain", 1)) + } + + n, err := c.DeletePrefix(context.Background(), "s/deploys/d/") + require.NoError(t, err) + assert.Equal(t, 5, n) + + gone, err := c.HasPrefix(context.Background(), "s/deploys/d/") + require.NoError(t, err) + assert.False(t, gone) + + fake.mu.Lock() + calls := fake.deleteObjectsCalls + fake.mu.Unlock() + assert.GreaterOrEqual(t, calls, 3, "5 objects at pageSize 2 must span >=3 delete batches") +} + +func TestDeletePrefix_EmptyNoop(t *testing.T) { + fake := newFakeS3(t, "b") + c := newClient(t, fake) + + n, err := c.DeletePrefix(context.Background(), 
"absent/") + require.NoError(t, err) + assert.Equal(t, 0, n) + + fake.mu.Lock() + calls := fake.deleteObjectsCalls + fake.mu.Unlock() + assert.Equal(t, 0, calls, "no objects under prefix -> no delete batch issued") +} + +func TestMovePrefix(t *testing.T) { + fake := newFakeS3(t, "b") + c := newClient(t, fake) + for k, v := range map[string]string{ + "www/deploys/d1/index.html": "home", + "www/deploys/d1/assets/app.js": "js", + "www/deploys/d2/index.html": "other", + } { + require.NoError(t, c.PutObject(context.Background(), k, bytes.NewReader([]byte(v)), "text/plain", int64(len(v)))) + } + + n, err := c.MovePrefix(context.Background(), "www/deploys/d1/", "_trash/www/d1/") + require.NoError(t, err) + assert.Equal(t, 2, n) + + src, err := c.HasPrefix(context.Background(), "www/deploys/d1/") + require.NoError(t, err) + assert.False(t, src, "source prefix must be empty after move") + + got, err := c.GetAlias(context.Background(), "_trash/www/d1/index.html") + require.NoError(t, err) + assert.Equal(t, "home", got, "bytes preserved at destination key") + + kept, err := c.HasPrefix(context.Background(), "www/deploys/d2/") + require.NoError(t, err) + assert.True(t, kept, "sibling deploy untouched") +} + +func TestMovePrefix_EncodesCopySource(t *testing.T) { + fake := newFakeS3(t, "b") + c := newClient(t, fake) + key := "www/deploys/d1/café menu.html" + require.NoError(t, c.PutObject(context.Background(), key, bytes.NewReader([]byte("body")), "text/html", 4)) + + n, err := c.MovePrefix(context.Background(), "www/deploys/d1/", "_trash/www/d1/") + require.NoError(t, err, "tombstone-move must handle keys with spaces / non-ASCII (URL-encoded copy-source)") + assert.Equal(t, 1, n) + + got, err := c.GetAlias(context.Background(), "_trash/www/d1/café menu.html") + require.NoError(t, err) + assert.Equal(t, "body", got, "object preserved at destination under its original (decoded) key") +} + +func TestMovePrefix_EmptyNoop(t *testing.T) { + fake := newFakeS3(t, "b") + c := 
newClient(t, fake) + n, err := c.MovePrefix(context.Background(), "absent/", "_trash/absent/") + require.NoError(t, err) + assert.Equal(t, 0, n) +} + +func TestListSites(t *testing.T) { + fake := newFakeS3(t, "b") + c := newClient(t, fake) + for _, k := range []string{ + "www/deploys/d1/index.html", + "www/production", + "learn/deploys/d2/x", + "_trash/www/d9/old.html", + "_artemis_meta.json", + } { + require.NoError(t, c.PutObject(context.Background(), k, bytes.NewReader([]byte("x")), "text/plain", 1)) + } + + sites, err := c.ListSites(context.Background()) + require.NoError(t, err) + assert.ElementsMatch(t, []string{"www", "learn"}, sites, + "top-level prefixes only; _* (e.g. _trash) excluded, bare objects ignored") +} + func TestVerifyDeployComplete_PassFail(t *testing.T) { fake := newFakeS3(t, "b") c := newClient(t, fake) @@ -358,3 +716,160 @@ func TestVerifyDeployComplete_PassFail(t *testing.T) { assert.True(t, errors.As(err, &verr)) assert.Contains(t, verr.Missing, "missing.html") } + +func TestDeletePrefix_PartialFailureReportsAccurateCount(t *testing.T) { + fake := newFakeS3(t, "b") + fake.deleteFailKeys = map[string]struct{}{"www/deploys/d1/style.css": {}} + c := newClient(t, fake) + for _, k := range []string{ + "www/deploys/d1/index.html", + "www/deploys/d1/app.js", + "www/deploys/d1/style.css", + } { + require.NoError(t, c.PutObject(context.Background(), k, bytes.NewReader([]byte("z")), "text/plain", 1)) + } + + n, err := c.DeletePrefix(context.Background(), "www/deploys/d1/") + require.Error(t, err, "per-key DeleteObjects errors must surface, not be swallowed") + assert.Equal(t, 2, n, "count must exclude the failed key (3 requested, 1 failed)") + assert.Contains(t, err.Error(), "1 of 3 failed", + "error must report how many of how many failed for GC/tombstone accounting") + assert.Contains(t, err.Error(), "www/deploys/d1/style.css", + "error must name the failing key") + + fake.mu.Lock() + _, stillThere := fake.objects["b/www/deploys/d1/style.css"] + 
fake.mu.Unlock() + assert.True(t, stillThere, "the failed key must remain present (it was not actually deleted)") +} + +func TestDeletePrefix_DeleteObjectsTransportError(t *testing.T) { + fake := newFakeS3(t, "b") + fake.failDeleteObjects = true + c := newClient(t, fake) + for _, k := range []string{ + "www/deploys/d1/index.html", + "www/deploys/d1/app.js", + } { + require.NoError(t, c.PutObject(context.Background(), k, bytes.NewReader([]byte("z")), "text/plain", 1)) + } + + n, err := c.DeletePrefix(context.Background(), "www/deploys/d1/") + require.Error(t, err, "a 5xx from DeleteObjects must surface, not be swallowed as success") + assert.Equal(t, 0, n, "no objects counted deleted when the batch transport-errors") + assert.Contains(t, err.Error(), "r2 deleteobjects", + "error must be wrapped with the deleteobjects context") + + fake.mu.Lock() + _, a := fake.objects["b/www/deploys/d1/index.html"] + _, b := fake.objects["b/www/deploys/d1/app.js"] + fake.mu.Unlock() + assert.True(t, a && b, "objects must remain since the delete batch failed") +} + +func TestMovePrefix_CopySucceedsThenDeleteFails_AbortsWithPartialProgress(t *testing.T) { + fake := newFakeS3(t, "b") + fake.failDeleteKeys = map[string]struct{}{"www/deploys/d1/b.html": {}} + c := newClient(t, fake) + for _, k := range []string{ + "www/deploys/d1/a.html", + "www/deploys/d1/b.html", + } { + require.NoError(t, c.PutObject(context.Background(), k, bytes.NewReader([]byte("v")), "text/plain", 1)) + } + + n, err := c.MovePrefix(context.Background(), "www/deploys/d1/", "_trash/www/d1/") + require.Error(t, err, "post-copy DeleteObject failure must abort the move") + assert.Contains(t, err.Error(), "moveprefix delete", + "error must be wrapped with the moveprefix delete context") + assert.Equal(t, 1, n, "only the cleanly moved object counts; the failed one aborts the loop") + + dst, derr := c.GetAlias(context.Background(), "_trash/www/d1/b.html") + require.NoError(t, derr) + assert.Equal(t, "v", dst, + "copy 
already landed at dst for the delete-failed key: a live duplicate now exists at both src and dst") + + fake.mu.Lock() + _, srcStill := fake.objects["b/www/deploys/d1/b.html"] + fake.mu.Unlock() + assert.True(t, srcStill, "src copy of the delete-failed key is still present, confirming the double-serve risk") +} + +func TestMovePrefix_CopyError_AbortsWithoutDeletingSource(t *testing.T) { + fake := newFakeS3(t, "b") + fake.failCopyKeys = map[string]struct{}{"_trash/www/d1/a.html": {}} + c := newClient(t, fake) + require.NoError(t, c.PutObject(context.Background(), + "www/deploys/d1/a.html", bytes.NewReader([]byte("v")), "text/plain", 1)) + + n, err := c.MovePrefix(context.Background(), "www/deploys/d1/", "_trash/www/d1/") + require.Error(t, err, "a CopyObject failure must abort before deleting the source") + assert.Contains(t, err.Error(), "moveprefix copy", + "error must be wrapped with the moveprefix copy context") + assert.Equal(t, 0, n, "nothing moved when the only copy fails") + + src, serr := c.GetAlias(context.Background(), "www/deploys/d1/a.html") + require.NoError(t, serr) + assert.Equal(t, "v", src, + "source bytes must NOT be deleted when the copy never succeeded") +} + +func TestGetAlias_NonNotFoundErrorIsWrappedNotMappedToNotFound(t *testing.T) { + t.Run("transient 5xx is not absence", func(t *testing.T) { + fake := newFakeS3(t, "b") + fake.failGetKeys = map[string]struct{}{"www/production": {}} + c := newClient(t, fake) + require.NoError(t, c.PutObject(context.Background(), + "www/production", bytes.NewReader([]byte("deploys/d1")), "text/plain", 10)) + + _, err := c.GetAlias(context.Background(), "www/production") + require.Error(t, err) + assert.False(t, IsNotFound(err), + "a 503 must NOT be misclassified as alias-absent or callers reset deploy pointers on a transient outage") + assert.Contains(t, err.Error(), "r2 get", + "non-NoSuchKey/NotFound API errors must be wrapped with the get context") + }) + + t.Run("body read error is wrapped", func(t 
*testing.T) { + fake := newFakeS3(t, "b") + fake.truncateGetKeys = map[string]struct{}{"www/production": {}} + c := newClient(t, fake) + require.NoError(t, c.PutObject(context.Background(), + "www/production", bytes.NewReader([]byte("deploys/d1")), "text/plain", 10)) + + _, err := c.GetAlias(context.Background(), "www/production") + require.Error(t, err) + assert.False(t, IsNotFound(err), "a truncated body is an error, not absence") + assert.Contains(t, err.Error(), "r2 read", + "a ReadAll failure on the alias body must be wrapped with the read context") + }) +} + +func TestListPrefix_PaginatesAcrossContinuationToken(t *testing.T) { + fake := newFakeS3(t, "b") + fake.pageSize = 2 + c := newClient(t, fake) + want := make([]string, 0, 5) + for i := 0; i < 5; i++ { + k := fmtKey("www/deploys/d1/f%02d.html", i) + want = append(want, k) + require.NoError(t, c.PutObject(context.Background(), k, bytes.NewReader([]byte("x")), "text/plain", 1)) + } + + keys, err := c.ListPrefix(context.Background(), "www/deploys/d1/") + require.NoError(t, err) + assert.ElementsMatch(t, want, keys, + "the continuation-token loop must return every key; a broken loop would truncate and falsely report files missing") +} + +func TestListPrefix_ListErrorIsWrapped(t *testing.T) { + fake := newFakeS3(t, "b") + fake.failList = true + c := newClient(t, fake) + + keys, err := c.ListPrefix(context.Background(), "www/deploys/d1/") + require.Error(t, err, "a list 5xx must surface, not be swallowed into an empty result") + assert.Nil(t, keys) + assert.Contains(t, err.Error(), "r2 list", + "the list error must be wrapped with the list context") +} diff --git a/internal/registry/valkey/cutover_test.go b/internal/registry/valkey/cutover_test.go new file mode 100644 index 0000000..cf78969 --- /dev/null +++ b/internal/registry/valkey/cutover_test.go @@ -0,0 +1,66 @@ +package valkey_test + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" + + 
"github.com/freeCodeCamp/artemis/internal/registry" + "github.com/freeCodeCamp/artemis/internal/registry/valkey" +) + +type stubSource struct { + mu sync.Mutex + bySite map[string][]string +} + +func newStubSource() *stubSource { + return &stubSource{bySite: map[string][]string{}} +} + +func (s *stubSource) set(slug string, teams []string) { + s.mu.Lock() + defer s.mu.Unlock() + s.bySite[slug] = append([]string(nil), teams...) +} + +func (s *stubSource) Sites(_ context.Context) ([]registry.Site, error) { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]registry.Site, 0, len(s.bySite)) + for slug, teams := range s.bySite { + out = append(out, registry.Site{Slug: slug, Teams: append([]string(nil), teams...)}) + } + return out, nil +} + +func TestRegistryCutover(t *testing.T) { + t.Parallel() + + pubsub, _, _ := newStore(t) + source := newStubSource() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + source.set("preexisting", []string{"staff"}) + + reader, err := valkey.NewReaderFromSource(ctx, source, pubsub, valkey.DefaultRefreshFallback) + require.NoError(t, err) + + snap := reader.Snapshot() + require.Equal(t, []string{"preexisting"}, snap.Sites(), + "initial read served from PG source via cache-front") + require.Equal(t, []string{"staff"}, snap.TeamsForSite("preexisting")) + + onChange := valkey.PublishOnChange(ctx, pubsub) + source.set("blog", []string{"news-editors"}) + onChange("blog") + + eventually(t, 2*time.Second, "OnChange publish propagates a PG write into the cache-front", func() bool { + s := reader.Snapshot() + return len(s.Sites()) == 2 && s.TeamsForSite("blog") != nil + }) +} diff --git a/internal/registry/valkey/reader.go b/internal/registry/valkey/reader.go index cf63061..7fac56c 100644 --- a/internal/registry/valkey/reader.go +++ b/internal/registry/valkey/reader.go @@ -20,15 +20,22 @@ type onRefreshErrorFn = func(error) // DefaultRefreshFallback is the cap on how long the in-memory cache // can stay stale without 
// SitesSource is the read side of the registry's source of truth: anything
// that can list the registered sites. Reader.Refresh calls Sites and swaps
// its whole cached snapshot for the result, so an implementation is expected
// to return every site rather than a delta.
type SitesSource interface {
	// Sites returns the current registry contents, or an error if the
	// backing store could not be read.
	Sites(ctx context.Context) ([]registry.Site, error)
}
It launches a +// background goroutine that refreshes the cache on every event; the // goroutine exits when ctx is canceled. Pass DefaultRefreshFallback // for ttl unless tests need a tighter window. -func NewReader(ctx context.Context, store *Store, ttl time.Duration) (*Reader, error) { - r := &Reader{store: store} +func NewReaderFromSource(ctx context.Context, source SitesSource, pubsub *Store, ttl time.Duration) (*Reader, error) { + r := &Reader{source: source, pubsub: pubsub} if err := r.Refresh(ctx); err != nil { return nil, fmt.Errorf("registry: initial refresh: %w", err) } - events, err := store.Subscribe(ctx) + events, err := pubsub.Subscribe(ctx) if err != nil { return nil, fmt.Errorf("registry: subscribe: %w", err) } @@ -113,11 +128,11 @@ func (r *Reader) Snapshot() registry.Snapshot { return r.snapshot } -// Refresh re-reads the registry from Valkey, replacing the cached -// snapshot atomically. Exposed as a public method so tests (and the -// import binary) can drive refreshes deterministically. +// Refresh re-reads the registry from the source-of-truth, replacing +// the cached snapshot atomically. Exposed as a public method so tests +// (and the import binary) can drive refreshes deterministically. 
func (r *Reader) Refresh(ctx context.Context) error { - sites, err := r.store.Sites(ctx) + sites, err := r.source.Sites(ctx) if err != nil { return err } diff --git a/internal/registry/valkey/store.go b/internal/registry/valkey/store.go index 6ed013c..04d5b19 100644 --- a/internal/registry/valkey/store.go +++ b/internal/registry/valkey/store.go @@ -17,6 +17,7 @@ import ( "encoding/json" "errors" "fmt" + "log/slog" "sort" "time" @@ -150,6 +151,18 @@ func (s *Store) Subscribe(ctx context.Context) (<-chan string, error) { return out, nil } +func (s *Store) Publish(ctx context.Context, slug string) error { + return s.client.Publish(ctx, ChannelRegistryChanged, slug).Err() +} + +func PublishOnChange(ctx context.Context, store *Store) func(slug string) { + return func(slug string) { + if err := store.Publish(ctx, slug); err != nil { + slog.Warn("valkey registry publish failed", "slug", slug, "err", err) + } + } +} + // siteKey returns the hash key for a given slug. Defined in one place // so the wire format (`site:`) cannot drift between methods. 
func siteKey(slug string) string { diff --git a/internal/reporequest/valkey/store_test.go b/internal/reporequest/valkey/store_test.go index eec5c41..6e0035f 100644 --- a/internal/reporequest/valkey/store_test.go +++ b/internal/reporequest/valkey/store_test.go @@ -281,6 +281,123 @@ func TestStore_RejectFreesNameCaseInsensitively(t *testing.T) { require.NoError(t, err) } +func TestStore_MarkActiveRequiresApproved(t *testing.T) { + tests := []struct { + name string + toStatus func(t *testing.T, s *valkey.Store, ctx context.Context, id string) + }{ + { + name: "pending", + toStatus: func(t *testing.T, s *valkey.Store, ctx context.Context, id string) {}, + }, + { + name: "active", + toStatus: func(t *testing.T, s *valkey.Store, ctx context.Context, id string) { + _, err := s.Approve(ctx, id, "admin") + require.NoError(t, err) + _, err = s.MarkActive(ctx, id, "https://github.com/freeCodeCamp-Universe/x") + require.NoError(t, err) + }, + }, + { + name: "rejected", + toStatus: func(t *testing.T, s *valkey.Store, ctx context.Context, id string) { + _, err := s.Reject(ctx, id, "admin", "no") + require.NoError(t, err) + }, + }, + { + name: "failed", + toStatus: func(t *testing.T, s *valkey.Store, ctx context.Context, id string) { + _, err := s.Approve(ctx, id, "admin") + require.NoError(t, err) + _, err = s.MarkFailed(ctx, id, "boom") + require.NoError(t, err) + }, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + s := newStore(t) + ctx := context.Background() + r, err := s.Create(ctx, sampleReq("x")) + require.NoError(t, err) + tc.toStatus(t, s, ctx, r.ID) + + _, err = s.MarkActive(ctx, r.ID, "https://github.com/freeCodeCamp-Universe/x") + assert.ErrorIs(t, err, reporequest.ErrNotPending, + "only an approved request may go active; a %s row must be guarded", tc.name) + }) + } +} + +func TestStore_MarkFailedRequiresApproved(t *testing.T) { + tests := []struct { + name string + toStatus func(t *testing.T, s *valkey.Store, ctx context.Context, id 
string) + }{ + { + name: "pending", + toStatus: func(t *testing.T, s *valkey.Store, ctx context.Context, id string) {}, + }, + { + name: "active", + toStatus: func(t *testing.T, s *valkey.Store, ctx context.Context, id string) { + _, err := s.Approve(ctx, id, "admin") + require.NoError(t, err) + _, err = s.MarkActive(ctx, id, "https://github.com/freeCodeCamp-Universe/x") + require.NoError(t, err) + }, + }, + { + name: "rejected", + toStatus: func(t *testing.T, s *valkey.Store, ctx context.Context, id string) { + _, err := s.Reject(ctx, id, "admin", "no") + require.NoError(t, err) + }, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + s := newStore(t) + ctx := context.Background() + r, err := s.Create(ctx, sampleReq("x")) + require.NoError(t, err) + tc.toStatus(t, s, ctx, r.ID) + + _, err = s.MarkFailed(ctx, r.ID, "boom") + assert.ErrorIs(t, err, reporequest.ErrNotPending, + "only an approved request may be marked failed; a %s row must be guarded", tc.name) + }) + } +} + +func TestStore_DeleteFailedRowKeepsReclaimedName(t *testing.T) { + s := newStore(t) + ctx := context.Background() + + a, err := s.Create(ctx, sampleReq("x")) + require.NoError(t, err) + _, err = s.Approve(ctx, a.ID, "adm") + require.NoError(t, err) + _, err = s.MarkFailed(ctx, a.ID, "boom") + require.NoError(t, err) + + b, err := s.Create(ctx, sampleReq("x")) + require.NoError(t, err) + require.NotEqual(t, a.ID, b.ID) + + require.NoError(t, s.Delete(ctx, a.ID)) + + _, err = s.Create(ctx, sampleReq("x")) + assert.ErrorIs(t, err, reporequest.ErrAlreadyExists, + "deleting a failed row must not release a name a newer pending row reclaimed") + + got, err := s.Get(ctx, b.ID) + require.NoError(t, err) + assert.Equal(t, "x", got.Name) +} + func TestNewWithClient_NilClient(t *testing.T) { _, err := valkey.NewWithClient(nil) require.Error(t, err) diff --git a/internal/server/server.go b/internal/server/server.go index ae5fa1f..19457dd 100644 --- a/internal/server/server.go +++ 
b/internal/server/server.go @@ -73,6 +73,7 @@ func New(h *handler.Handlers, metricsGatherer prometheus.Gatherer) http.Handler r.Patch("/site/{slug}", h.SiteUpdate) r.Delete("/site/{slug}", h.SiteDelete) r.Get("/site/{site}/deploys", h.SiteDeploys) + r.Delete("/site/{site}/deploys/{deployId}", h.SiteDeployDelete) r.Get("/site/{site}/alias/{mode}", h.AliasGet) r.Post("/site/{site}/promote", h.SitePromote) r.Post("/site/{site}/rollback", h.SiteRollback) diff --git a/internal/teamcache/teamcache.go b/internal/teamcache/teamcache.go new file mode 100644 index 0000000..6407b0b --- /dev/null +++ b/internal/teamcache/teamcache.go @@ -0,0 +1,69 @@ +package teamcache + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "time" + + "github.com/redis/go-redis/v9" +) + +const keyPrefix = "ghteams:" + +type Cache struct { + client *redis.Client + ttl time.Duration +} + +func New(client *redis.Client, ttl time.Duration) *Cache { + return &Cache{client: client, ttl: ttl} +} + +func key(login string) string { return keyPrefix + login } + +func (c *Cache) Get(ctx context.Context, login string) ([]string, bool, error) { + raw, err := c.client.Get(ctx, key(login)).Result() + if errors.Is(err, redis.Nil) { + return nil, false, nil + } + if err != nil { + return nil, false, fmt.Errorf("teamcache get %s: %w", login, err) + } + var teams []string + if err := json.Unmarshal([]byte(raw), &teams); err != nil { + return nil, false, fmt.Errorf("teamcache decode %s: %w", login, err) + } + return teams, true, nil +} + +func (c *Cache) Set(ctx context.Context, login string, teams []string) error { + if teams == nil { + teams = []string{} + } + b, err := json.Marshal(teams) + if err != nil { + return fmt.Errorf("teamcache encode %s: %w", login, err) + } + if err := c.client.Set(ctx, key(login), b, c.ttl).Err(); err != nil { + return fmt.Errorf("teamcache set %s: %w", login, err) + } + return nil +} + +func (c *Cache) GetOrFetch(ctx context.Context, login string, fetch func(ctx 
context.Context) ([]string, error)) ([]string, error) { + if teams, hit, err := c.Get(ctx, login); err != nil { + return nil, err + } else if hit { + return teams, nil + } + teams, err := fetch(ctx) + if err != nil { + return nil, err + } + if err := c.Set(ctx, login, teams); err != nil { + return nil, err + } + return teams, nil +} diff --git a/internal/teamcache/teamcache_errors_test.go b/internal/teamcache/teamcache_errors_test.go new file mode 100644 index 0000000..fece72a --- /dev/null +++ b/internal/teamcache/teamcache_errors_test.go @@ -0,0 +1,25 @@ +package teamcache + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestTeamCache_GetOrFetch_AbortsOnGetError(t *testing.T) { + ctx := context.Background() + c, mr := newTestCache(t, time.Minute) + mr.SetError("valkey down") + + calls := 0 + _, err := c.GetOrFetch(ctx, "b", func(context.Context) ([]string, error) { + calls++ + return nil, nil + }) + require.Error(t, err) + assert.Equal(t, 0, calls, "fetch must not run when the cache Get itself errors") + assert.ErrorContains(t, err, "teamcache get b") +} diff --git a/internal/teamcache/teamcache_test.go b/internal/teamcache/teamcache_test.go new file mode 100644 index 0000000..d24e5be --- /dev/null +++ b/internal/teamcache/teamcache_test.go @@ -0,0 +1,164 @@ +package teamcache + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/alicebob/miniredis/v2" + "github.com/alicebob/miniredis/v2/server" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func failCommands(t *testing.T, mr *miniredis.Miniredis, msg string, cmds ...string) { + t.Helper() + fail := make(map[string]struct{}, len(cmds)) + for _, c := range cmds { + fail[c] = struct{}{} + } + mr.Server().SetPreHook(func(c *server.Peer, cmd string, args ...string) bool { + if _, ok := fail[cmd]; ok { + c.WriteError(msg) + return true + } + 
return false + }) + t.Cleanup(func() { mr.Server().SetPreHook(nil) }) +} + +func newTestCache(t *testing.T, ttl time.Duration) (*Cache, *miniredis.Miniredis) { + t.Helper() + mr, err := miniredis.Run() + require.NoError(t, err) + t.Cleanup(mr.Close) + client := redis.NewClient(&redis.Options{Addr: mr.Addr()}) + t.Cleanup(func() { _ = client.Close() }) + return New(client, ttl), mr +} + +func TestTeamCache(t *testing.T) { + ctx := context.Background() + c, _ := newTestCache(t, 5*time.Minute) + + _, hit, err := c.Get(ctx, "alice") + require.NoError(t, err) + assert.False(t, hit, "cold cache misses") + + require.NoError(t, c.Set(ctx, "alice", []string{"staff", "team-eng"})) + teams, hit, err := c.Get(ctx, "alice") + require.NoError(t, err) + assert.True(t, hit) + assert.Equal(t, []string{"staff", "team-eng"}, teams) +} + +func TestTeamCache_GetOrFetch_FetchesOnceThenCaches(t *testing.T) { + ctx := context.Background() + c, _ := newTestCache(t, 5*time.Minute) + + calls := 0 + fetch := func(context.Context) ([]string, error) { + calls++ + return []string{"staff"}, nil + } + + teams, err := c.GetOrFetch(ctx, "bob", fetch) + require.NoError(t, err) + assert.Equal(t, []string{"staff"}, teams) + assert.Equal(t, 1, calls, "miss triggers exactly one upstream fetch") + + teams, err = c.GetOrFetch(ctx, "bob", fetch) + require.NoError(t, err) + assert.Equal(t, []string{"staff"}, teams) + assert.Equal(t, 1, calls, "second call served from Valkey cache; GitHub App quota protected") +} + +func TestTeamCache_CachesEmptyMembership(t *testing.T) { + ctx := context.Background() + c, _ := newTestCache(t, 5*time.Minute) + + calls := 0 + fetch := func(context.Context) ([]string, error) { + calls++ + return nil, nil + } + _, err := c.GetOrFetch(ctx, "outsider", fetch) + require.NoError(t, err) + + teams, hit, err := c.Get(ctx, "outsider") + require.NoError(t, err) + assert.True(t, hit, "an empty team list is cached, not treated as a miss") + assert.Empty(t, teams) + + _, err = 
c.GetOrFetch(ctx, "outsider", fetch) + require.NoError(t, err) + assert.Equal(t, 1, calls, "non-member result is cached too — no re-fetch storm") +} + +func TestTeamCache_TTLExpiry(t *testing.T) { + ctx := context.Background() + c, mr := newTestCache(t, time.Minute) + + require.NoError(t, c.Set(ctx, "alice", []string{"staff"})) + mr.FastForward(2 * time.Minute) + + _, hit, err := c.Get(ctx, "alice") + require.NoError(t, err) + assert.False(t, hit, "entry expires after TTL -> miss") +} + +func TestTeamCache_FetchErrorNotCached(t *testing.T) { + ctx := context.Background() + c, _ := newTestCache(t, time.Minute) + + _, err := c.GetOrFetch(ctx, "carol", func(context.Context) ([]string, error) { + return nil, errors.New("github 503") + }) + require.Error(t, err) + + _, hit, err := c.Get(ctx, "carol") + require.NoError(t, err) + assert.False(t, hit, "a failed upstream fetch is never cached") +} + +func TestTeamCache_Get_MalformedJSONIsAnError(t *testing.T) { + ctx := context.Background() + c, mr := newTestCache(t, time.Minute) + require.NoError(t, mr.Set("ghteams:eve", "not-json")) + + teams, hit, err := c.Get(ctx, "eve") + require.Error(t, err) + assert.False(t, hit, "a poisoned cache value must not read as a hit") + assert.Nil(t, teams, "a decode failure must not leak a partial team list") + assert.ErrorContains(t, err, "teamcache decode") +} + +func TestTeamCache_Get_RedisErrorPropagates(t *testing.T) { + ctx := context.Background() + c, mr := newTestCache(t, time.Minute) + failCommands(t, mr, "LOADING Redis is loading the dataset in memory", "GET") + + teams, hit, err := c.Get(ctx, "alice") + require.Error(t, err) + assert.False(t, hit, "a backend error must not read as a hit") + assert.Nil(t, teams) + assert.ErrorContains(t, err, "teamcache get") +} + +func TestTeamCache_GetOrFetch_SetFailurePropagates(t *testing.T) { + ctx := context.Background() + c, mr := newTestCache(t, time.Minute) + failCommands(t, mr, "READONLY You can't write against a read only replica.", 
"SET") + + calls := 0 + teams, err := c.GetOrFetch(ctx, "bob", func(context.Context) ([]string, error) { + calls++ + return []string{"staff"}, nil + }) + require.Error(t, err, "an unpersisted fetch must surface the write error, not pose as cached") + assert.Nil(t, teams) + assert.Equal(t, 1, calls, "the miss path runs fetch before the failing Set") + assert.ErrorContains(t, err, "teamcache set") +} diff --git a/internal/worker/debounce.go b/internal/worker/debounce.go new file mode 100644 index 0000000..e465c0b --- /dev/null +++ b/internal/worker/debounce.go @@ -0,0 +1,62 @@ +package worker + +import ( + "sync" + "time" +) + +type Debouncer struct { + Window time.Duration + Trigger func(site string) + + mu sync.Mutex + gen uint64 + timers map[string]debounceEntry + stopped bool +} + +type debounceEntry struct { + timer *time.Timer + gen uint64 +} + +func NewDebouncer(window time.Duration, trigger func(site string)) *Debouncer { + return &Debouncer{Window: window, Trigger: trigger, timers: map[string]debounceEntry{}} +} + +func (d *Debouncer) Notify(site string) { + d.mu.Lock() + defer d.mu.Unlock() + if d.stopped { + return + } + if e, ok := d.timers[site]; ok { + e.timer.Stop() + } + d.gen++ + gen := d.gen + timer := time.AfterFunc(d.Window, func() { d.fire(site, gen) }) + d.timers[site] = debounceEntry{timer: timer, gen: gen} +} + +func (d *Debouncer) fire(site string, gen uint64) { + d.mu.Lock() + e, ok := d.timers[site] + if d.stopped || !ok || e.gen != gen { + d.mu.Unlock() + return + } + delete(d.timers, site) + d.mu.Unlock() + d.Trigger(site) +} + +func (d *Debouncer) Stop() { + d.mu.Lock() + defer d.mu.Unlock() + d.stopped = true + for _, e := range d.timers { + e.timer.Stop() + } + d.timers = map[string]debounceEntry{} +} diff --git a/internal/worker/debounce_test.go b/internal/worker/debounce_test.go new file mode 100644 index 0000000..a97e0ea --- /dev/null +++ b/internal/worker/debounce_test.go @@ -0,0 +1,103 @@ +package worker + +import ( + "sync" + 
"testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDebounce(t *testing.T) { + var mu sync.Mutex + fired := map[string]int{} + d := NewDebouncer(30*time.Millisecond, func(site string) { + mu.Lock() + fired[site]++ + mu.Unlock() + }) + t.Cleanup(d.Stop) + + for i := 0; i < 5; i++ { + d.Notify("www") + } + d.Notify("learn") + + require.Eventually(t, func() bool { + mu.Lock() + defer mu.Unlock() + return fired["www"] == 1 && fired["learn"] == 1 + }, time.Second, 5*time.Millisecond, "a burst per site coalesces into exactly one trigger") + + mu.Lock() + assert.Equal(t, 1, fired["www"], "5 rapid site.changed events -> 1 gc-site trigger") + mu.Unlock() +} + +func TestDebounce_LaterChangeRetriggers(t *testing.T) { + var mu sync.Mutex + var count int + d := NewDebouncer(20*time.Millisecond, func(string) { + mu.Lock() + count++ + mu.Unlock() + }) + t.Cleanup(d.Stop) + + d.Notify("www") + require.Eventually(t, func() bool { mu.Lock(); defer mu.Unlock(); return count == 1 }, time.Second, 5*time.Millisecond) + + d.Notify("www") + require.Eventually(t, func() bool { mu.Lock(); defer mu.Unlock(); return count == 2 }, time.Second, 5*time.Millisecond, + "a change after processing triggers GC again (no lost updates; per-site order preserved by engine key, E2)") +} + +func TestDebounce_StaleCallbackDoesNotDropNewerTimer(t *testing.T) { + var mu sync.Mutex + var count int + d := NewDebouncer(time.Hour, func(string) { + mu.Lock() + count++ + mu.Unlock() + }) + t.Cleanup(d.Stop) + + d.Notify("www") + d.mu.Lock() + stale := d.timers["www"] + d.mu.Unlock() + + d.Notify("www") + d.mu.Lock() + fresh := d.timers["www"] + d.mu.Unlock() + require.NotEqual(t, stale.gen, fresh.gen, "second Notify installs a distinct timer") + + d.fire("www", stale.gen) + + mu.Lock() + assert.Equal(t, 0, count, "stale in-flight callback must not Trigger") + mu.Unlock() + + d.mu.Lock() + got := d.timers["www"] + d.mu.Unlock() + assert.Equal(t, 
fresh.gen, got.gen, "stale callback must not delete the newer timer entry") +} + +func TestDebounce_StopHaltsPendingTriggers(t *testing.T) { + var mu sync.Mutex + var count int + d := NewDebouncer(50*time.Millisecond, func(string) { + mu.Lock() + count++ + mu.Unlock() + }) + d.Notify("www") + d.Stop() + time.Sleep(80 * time.Millisecond) + mu.Lock() + assert.Equal(t, 0, count, "Stop cancels pending triggers") + mu.Unlock() +} diff --git a/internal/worker/deployflows.go b/internal/worker/deployflows.go new file mode 100644 index 0000000..8cf1b43 --- /dev/null +++ b/internal/worker/deployflows.go @@ -0,0 +1,21 @@ +package worker + +const ( + WorkflowFinalize = "finalize" + WorkflowPromote = "promote" + WorkflowRollback = "rollback" +) + +func RegisterDeployWorkflows(rt *Runtime, finalize, promote, rollback Handler) error { + defs := []WorkflowDef{ + {Name: WorkflowFinalize, ConcurrencyKey: ConcurrencyKeySite, Handler: finalize}, + {Name: WorkflowPromote, ConcurrencyKey: ConcurrencyKeySite, Handler: promote}, + {Name: WorkflowRollback, ConcurrencyKey: ConcurrencyKeySite, Handler: rollback}, + } + for _, d := range defs { + if err := rt.Register(d); err != nil { + return err + } + } + return nil +} diff --git a/internal/worker/deployflows_test.go b/internal/worker/deployflows_test.go new file mode 100644 index 0000000..49ffd9a --- /dev/null +++ b/internal/worker/deployflows_test.go @@ -0,0 +1,32 @@ +package worker + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDeployWorkflowsRegisterWithSiteKey(t *testing.T) { + eng := &fakeEngine{} + rt := NewRuntime(eng) + + require.NoError(t, RegisterDeployWorkflows(rt, noop, noop, noop)) + + byName := map[string]WorkflowDef{} + for _, d := range eng.registered { + byName[d.Name] = d + } + require.Len(t, eng.registered, 3) + for _, name := range []string{WorkflowFinalize, WorkflowPromote, WorkflowRollback} { + assert.Equal(t, ConcurrencyKeySite, 
byName[name].ConcurrencyKey, + "%s must serialize per-site via concurrency key (V8 single-writer-per-site)", name) + } +} + +func TestRegisterDeployWorkflows_PropagatesError(t *testing.T) { + rt := NewRuntime(&fakeEngine{}) + require.NoError(t, RegisterDeployWorkflows(rt, noop, noop, noop)) + err := RegisterDeployWorkflows(rt, noop, noop, noop) + require.Error(t, err, "re-registering the same workflow names is rejected") +} diff --git a/internal/worker/metrics.go b/internal/worker/metrics.go new file mode 100644 index 0000000..e9f2198 --- /dev/null +++ b/internal/worker/metrics.go @@ -0,0 +1,92 @@ +package worker + +import "github.com/prometheus/client_golang/prometheus" + +type Metrics struct { + QueueDepth *prometheus.GaugeVec + DLQDepth prometheus.Gauge + WorkflowRuns *prometheus.CounterVec + WorkflowFailures *prometheus.CounterVec + DeadLettered *prometheus.CounterVec + RelayPublished prometheus.Counter + RelayFailures prometheus.Counter +} + +func NewMetrics(reg prometheus.Registerer) *Metrics { + m := &Metrics{ + QueueDepth: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "artemis_worker_queue_depth", + Help: "Pending tasks per workflow queue (sampled from the engine).", + }, []string{"workflow"}), + DLQDepth: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "artemis_worker_dlq_depth", + Help: "Number of dead-lettered workflow runs awaiting operator attention.", + }), + WorkflowRuns: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "artemis_worker_workflow_runs_total", + Help: "Workflow runs, labelled by workflow and outcome.", + }, []string{"workflow", "outcome"}), + WorkflowFailures: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "artemis_worker_workflow_failures_total", + Help: "Workflow run failures, labelled by workflow.", + }, []string{"workflow"}), + DeadLettered: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "artemis_worker_dead_lettered_total", + Help: "Workflow runs that exhausted retries and dead-lettered, 
labelled by workflow.", + }, []string{"workflow"}), + RelayPublished: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "artemis_relay_published_total", + Help: "Outbox rows published to the engine by the relay loop (at-least-once).", + }), + RelayFailures: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "artemis_relay_failures_total", + Help: "Relay RunOnce passes that returned an error before draining the batch.", + }), + } + reg.MustRegister(m.QueueDepth, m.DLQDepth, m.WorkflowRuns, m.WorkflowFailures, m.DeadLettered, m.RelayPublished, m.RelayFailures) + return m +} + +func (m *Metrics) ObserveRun(workflow, outcome string) { + if m == nil { + return + } + m.WorkflowRuns.WithLabelValues(workflow, outcome).Inc() + if outcome == "failed" { + m.WorkflowFailures.WithLabelValues(workflow).Inc() + } +} + +func (m *Metrics) ObserveDeadLetter(workflow string) { + if m == nil { + return + } + m.DeadLettered.WithLabelValues(workflow).Inc() + m.DLQDepth.Inc() +} + +func (m *Metrics) SetQueueDepth(workflow string, depth float64) { + if m == nil { + return + } + m.QueueDepth.WithLabelValues(workflow).Set(depth) +} + +func (m *Metrics) SetDLQDepth(depth float64) { + if m == nil { + return + } + m.DLQDepth.Set(depth) +} + +func (m *Metrics) ObserveRelay(published int, err error) { + if m == nil { + return + } + if published > 0 { + m.RelayPublished.Add(float64(published)) + } + if err != nil { + m.RelayFailures.Inc() + } +} diff --git a/internal/worker/metrics_test.go b/internal/worker/metrics_test.go new file mode 100644 index 0000000..f67264d --- /dev/null +++ b/internal/worker/metrics_test.go @@ -0,0 +1,50 @@ +package worker + +import ( + "errors" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" +) + +var errFail = errors.New("relay boom") + +func TestObs(t *testing.T) { + reg := prometheus.NewRegistry() + m := NewMetrics(reg) + + 
m.ObserveRun("gc-site", "ok") + m.ObserveRun("gc-site", "failed") + assert.EqualValues(t, 1, testutil.ToFloat64(m.WorkflowRuns.WithLabelValues("gc-site", "ok"))) + assert.EqualValues(t, 1, testutil.ToFloat64(m.WorkflowFailures.WithLabelValues("gc-site")), + "a failed outcome also bumps the failure counter") + + m.ObserveDeadLetter("gc-site") + assert.EqualValues(t, 1, testutil.ToFloat64(m.DeadLettered.WithLabelValues("gc-site"))) + assert.EqualValues(t, 1, testutil.ToFloat64(m.DLQDepth), "dead-letter raises DLQ depth") + + m.SetQueueDepth("gc-site", 42) + assert.EqualValues(t, 42, testutil.ToFloat64(m.QueueDepth.WithLabelValues("gc-site"))) + + m.SetDLQDepth(0) + assert.EqualValues(t, 0, testutil.ToFloat64(m.DLQDepth), "operator drained the DLQ") + + m.ObserveRelay(7, nil) + assert.EqualValues(t, 7, testutil.ToFloat64(m.RelayPublished)) + assert.EqualValues(t, 0, testutil.ToFloat64(m.RelayFailures)) + + m.ObserveRelay(3, errFail) + assert.EqualValues(t, 10, testutil.ToFloat64(m.RelayPublished), "partial drain still counts what published") + assert.EqualValues(t, 1, testutil.ToFloat64(m.RelayFailures)) +} + +func TestObs_NilSafe(t *testing.T) { + var m *Metrics + m.ObserveRun("x", "ok") + m.ObserveDeadLetter("x") + m.SetQueueDepth("x", 1) + m.SetDLQDepth(1) + m.ObserveRelay(1, nil) +} diff --git a/internal/worker/relay.go b/internal/worker/relay.go new file mode 100644 index 0000000..506a08a --- /dev/null +++ b/internal/worker/relay.go @@ -0,0 +1,64 @@ +package worker + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/freeCodeCamp/artemis/internal/pg" +) + +type OutboxSource interface { + FetchUnpublished(ctx context.Context, limit int) ([]pg.OutboxEvent, error) + MarkPublished(ctx context.Context, ids []int64, at time.Time) error +} + +type Publisher interface { + Publish(ctx context.Context, topic string, payload []byte) error +} + +type Relay struct { + Source OutboxSource + Publisher Publisher + Batch int + Now func() time.Time +} + +func (r 
*Relay) RunOnce(ctx context.Context) (int, error) { + batch := r.Batch + if batch <= 0 { + batch = 100 + } + events, err := r.Source.FetchUnpublished(ctx, batch) + if err != nil { + return 0, fmt.Errorf("relay: fetch: %w", err) + } + + var done []int64 + for _, e := range events { + if err := r.Publisher.Publish(ctx, e.Topic, e.Payload); err != nil { + pubErr := fmt.Errorf("relay: publish id=%d topic=%s: %w", e.ID, e.Topic, err) + return len(done), errors.Join(pubErr, r.mark(ctx, done)) + } + done = append(done, e.ID) + } + if err := r.mark(ctx, done); err != nil { + return len(done), err + } + return len(done), nil +} + +func (r *Relay) mark(ctx context.Context, ids []int64) error { + if len(ids) == 0 { + return nil + } + now := time.Now + if r.Now != nil { + now = r.Now + } + if err := r.Source.MarkPublished(ctx, ids, now()); err != nil { + return fmt.Errorf("relay: mark published: %w", err) + } + return nil +} diff --git a/internal/worker/relay_test.go b/internal/worker/relay_test.go new file mode 100644 index 0000000..8d968c4 --- /dev/null +++ b/internal/worker/relay_test.go @@ -0,0 +1,118 @@ +package worker + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/freeCodeCamp/artemis/internal/pg" +) + +type fakeSource struct { + events []pg.OutboxEvent + published map[int64]bool +} + +func newFakeSource(topics ...string) *fakeSource { + s := &fakeSource{published: map[int64]bool{}} + for i, tp := range topics { + s.events = append(s.events, pg.OutboxEvent{ID: int64(i + 1), Topic: tp, Payload: []byte(`{}`)}) + } + return s +} + +func (s *fakeSource) FetchUnpublished(_ context.Context, limit int) ([]pg.OutboxEvent, error) { + var out []pg.OutboxEvent + for _, e := range s.events { + if !s.published[e.ID] { + out = append(out, e) + } + if len(out) >= limit { + break + } + } + return out, nil +} + +func (s *fakeSource) MarkPublished(_ context.Context, ids []int64, _ 
time.Time) error { + for _, id := range ids { + s.published[id] = true + } + return nil +} + +type fakePublisher struct { + got []string + failOn int + calls int + failErr error +} + +func (p *fakePublisher) Publish(_ context.Context, topic string, _ []byte) error { + p.calls++ + if p.failOn != 0 && p.calls == p.failOn { + return p.failErr + } + p.got = append(p.got, topic) + return nil +} + +func TestOutboxRelay(t *testing.T) { + src := newFakeSource("site.changed", "site.changed", "site.changed") + pub := &fakePublisher{} + relay := &Relay{Source: src, Publisher: pub, Now: func() time.Time { return time.Unix(0, 0) }} + + n, err := relay.RunOnce(context.Background()) + require.NoError(t, err) + assert.Equal(t, 3, n) + assert.Len(t, pub.got, 3, "all events published") + assert.True(t, src.published[1] && src.published[2] && src.published[3], "all marked published") + + n, err = relay.RunOnce(context.Background()) + require.NoError(t, err) + assert.Equal(t, 0, n, "second pass finds nothing unpublished") +} + +func TestOutboxRelay_StopsAtFailurePreservingOrder(t *testing.T) { + src := newFakeSource("a", "b", "c") + pub := &fakePublisher{failOn: 2, failErr: errors.New("engine down")} + relay := &Relay{Source: src, Publisher: pub, Now: func() time.Time { return time.Unix(0, 0) }} + + n, err := relay.RunOnce(context.Background()) + require.Error(t, err) + assert.Equal(t, 1, n, "only the first event published before the failure") + assert.True(t, src.published[1], "succeeded event marked") + assert.False(t, src.published[2], "failed event NOT marked -> retried next tick") + assert.False(t, src.published[3], "later events not published out of order") +} + +func TestOutboxRelay_AtLeastOnceOnMarkFailure(t *testing.T) { + src := &markFailSource{fakeSource: newFakeSource("a"), failMark: true} + pub := &fakePublisher{} + relay := &Relay{Source: src, Publisher: pub} + + _, err := relay.RunOnce(context.Background()) + require.Error(t, err, "mark failure surfaces") + 
assert.Len(t, pub.got, 1, "event was published") + + src.failMark = false + _, err = relay.RunOnce(context.Background()) + require.NoError(t, err) + assert.Len(t, pub.got, 2, "unmarked event re-published (at-least-once; consumer must be idempotent, E1)") +} + +type markFailSource struct { + *fakeSource + failMark bool +} + +func (s *markFailSource) MarkPublished(ctx context.Context, ids []int64, at time.Time) error { + if s.failMark { + return errors.New("db down") + } + return s.fakeSource.MarkPublished(ctx, ids, at) +} diff --git a/internal/worker/runtime.go b/internal/worker/runtime.go new file mode 100644 index 0000000..59d9e4a --- /dev/null +++ b/internal/worker/runtime.go @@ -0,0 +1,71 @@ +package worker + +import ( + "context" + "errors" + "fmt" +) + +const ConcurrencyKeySite = "site" + +const ( + WorkflowGCSite = "gc-site" + WorkflowManualDelete = "manual-delete" + WorkflowSitePurge = "site-purge" + WorkflowTombstonePurge = "tombstone-purge" + WorkflowReconcile = "reconcile" +) + +type Handler func(ctx context.Context, input map[string]any) error + +type WorkflowDef struct { + Name string + ConcurrencyKey string + EventTriggers []string + Cron []string + Handler Handler +} + +type Engine interface { + Register(def WorkflowDef) error + Start(ctx context.Context) error + Stop(ctx context.Context) error +} + +type Runtime struct { + engine Engine + defs []WorkflowDef +} + +func NewRuntime(engine Engine) *Runtime { + return &Runtime{engine: engine} +} + +func (rt *Runtime) Register(def WorkflowDef) error { + if def.Name == "" { + return errors.New("worker: workflow name required") + } + if def.Handler == nil { + return fmt.Errorf("worker: workflow %s has nil handler", def.Name) + } + for _, existing := range rt.defs { + if existing.Name == def.Name { + return fmt.Errorf("worker: workflow %s already registered", def.Name) + } + } + if err := rt.engine.Register(def); err != nil { + return fmt.Errorf("worker: register %s: %w", def.Name, err) + } + rt.defs = 
append(rt.defs, def) + return nil +} + +func (rt *Runtime) Registered() []WorkflowDef { + out := make([]WorkflowDef, len(rt.defs)) + copy(out, rt.defs) + return out +} + +func (rt *Runtime) Start(ctx context.Context) error { return rt.engine.Start(ctx) } + +func (rt *Runtime) Stop(ctx context.Context) error { return rt.engine.Stop(ctx) } diff --git a/internal/worker/runtime_test.go b/internal/worker/runtime_test.go new file mode 100644 index 0000000..dc89853 --- /dev/null +++ b/internal/worker/runtime_test.go @@ -0,0 +1,74 @@ +package worker + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type fakeEngine struct { + registered []WorkflowDef + started bool + stopped bool + regErr error +} + +func (f *fakeEngine) Register(def WorkflowDef) error { + if f.regErr != nil { + return f.regErr + } + f.registered = append(f.registered, def) + return nil +} + +func (f *fakeEngine) Start(context.Context) error { f.started = true; return nil } +func (f *fakeEngine) Stop(context.Context) error { f.stopped = true; return nil } + +func noop(context.Context, map[string]any) error { return nil } + +func TestWorkerBoot(t *testing.T) { + eng := &fakeEngine{} + rt := NewRuntime(eng) + + perSite := []string{WorkflowGCSite, WorkflowManualDelete, WorkflowSitePurge} + for _, name := range perSite { + require.NoError(t, rt.Register(WorkflowDef{Name: name, ConcurrencyKey: ConcurrencyKeySite, Handler: noop})) + } + require.NoError(t, rt.Register(WorkflowDef{Name: WorkflowTombstonePurge, Handler: noop})) + + require.NoError(t, rt.Start(context.Background())) + assert.True(t, eng.started, "Start delegates to the engine") + + byName := map[string]WorkflowDef{} + for _, d := range eng.registered { + byName[d.Name] = d + } + require.Len(t, eng.registered, 4) + for _, name := range perSite { + assert.Equal(t, ConcurrencyKeySite, byName[name].ConcurrencyKey, + "%s must register with concurrency key=site (V7)", name) 
+ } + assert.Empty(t, byName[WorkflowTombstonePurge].ConcurrencyKey, + "tombstone-purge is a cross-site sweep, not per-site keyed") + + require.NoError(t, rt.Stop(context.Background())) + assert.True(t, eng.stopped) +} + +func TestRuntime_RejectsDuplicateAndNil(t *testing.T) { + rt := NewRuntime(&fakeEngine{}) + require.NoError(t, rt.Register(WorkflowDef{Name: "x", Handler: noop})) + require.Error(t, rt.Register(WorkflowDef{Name: "x", Handler: noop}), "duplicate name rejected") + require.Error(t, rt.Register(WorkflowDef{Name: "y"}), "nil handler rejected") + require.Error(t, rt.Register(WorkflowDef{Handler: noop}), "empty name rejected") +} + +func TestRuntime_PropagatesRegisterError(t *testing.T) { + rt := NewRuntime(&fakeEngine{regErr: errors.New("boom")}) + err := rt.Register(WorkflowDef{Name: "x", Handler: noop}) + require.Error(t, err) + assert.Contains(t, err.Error(), "register x") +} diff --git a/justfile b/justfile index 596839e..a90ad7b 100644 --- a/justfile +++ b/justfile @@ -64,6 +64,22 @@ integration-help: @echo " SITE=test ROOT_DOMAIN=freecode.camp \\" @echo " just integration" +# Real-Hatchet suite: spins up hatchet-lite via compose, mints a token, runs R2/R3/R4/R5 +hatchet-integration: + #!/usr/bin/env bash + set -euo pipefail + cd test/integration/hatchet + compose="docker compose -f compose.hatchet.yaml" + tenant="707d0855-80ab-4e1f-a156-f1c4546cbf52" + $compose up -d --wait + trap "$compose down -v" EXIT + token=$($compose exec -T hatchet-lite /hatchet-admin token create --config /config --tenant-id "$tenant" | tr -d '\r\n') + HATCHET_CLIENT_TOKEN="$token" \ + HATCHET_CLIENT_HOST_PORT="${HATCHET_CLIENT_HOST_PORT:-127.0.0.1:7077}" \ + HATCHET_CLIENT_TLS_STRATEGY=none \ + HATCHET_COMPOSE_FILE="$PWD/compose.hatchet.yaml" \ + {{go}} test -tags=integration -count=1 -timeout=10m ../../../internal/hatchet/... 
+ # go vet (CI also runs golangci-lint) lint: {{go}} vet {{pkg}} @@ -92,6 +108,14 @@ compose-logs: smoke: ./scripts/smoke.sh +# Full-stack E2E: boots artemis + pg + valkey + minio + hatchet, runs the e2e suite +e2e-local: + ./scripts/e2e-local.sh + +# Scalability load harness: ephemeral pg + registry/outbox/gc throughput (R14) +loadgen: + ./scripts/loadgen.sh + # docker build — multi-stage distroless image: docker build \ diff --git a/scripts/e2e-local.sh b/scripts/e2e-local.sh new file mode 100755 index 0000000..6e0c2ae --- /dev/null +++ b/scripts/e2e-local.sh @@ -0,0 +1,109 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "$0")/.." + +GO="${GO:-go}" +COMPOSE_FILE="test/e2e/compose.e2e.yaml" +COMPOSE="docker compose -f ${COMPOSE_FILE}" +TENANT="${HATCHET_TENANT_ID:-707d0855-80ab-4e1f-a156-f1c4546cbf52}" +KEEP="${KEEP_STACK:-0}" + +ARTEMIS_HOST_PORT="${ARTEMIS_HOST_PORT:-8080}" +PG_HOST_PORT="${PG_HOST_PORT:-55432}" +MINIO_HOST_PORT="${MINIO_HOST_PORT:-59000}" +HATCHET_GRPC_HOST_PORT="${HATCHET_GRPC_HOST_PORT:-7077}" +HATCHET_DASHBOARD_HOST_PORT="${HATCHET_DASHBOARD_HOST_PORT:-8888}" +export ARTEMIS_HOST_PORT PG_HOST_PORT MINIO_HOST_PORT +export HATCHET_GRPC_HOST_PORT HATCHET_DASHBOARD_HOST_PORT + +ARTEMIS_URL="http://localhost:${ARTEMIS_HOST_PORT}" + +TMP="$(mktemp -d)" +cleanup() { + rm -rf "$TMP" + if [[ "$KEEP" != 1 ]]; then + ${COMPOSE} down -v >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +echo "==> minting ephemeral Apollo-11-style App keypair" +openssl genrsa -out "$TMP/app.key" 2048 2>/dev/null +openssl rsa -in "$TMP/app.key" -pubout -out "$TMP/app.pub" 2>/dev/null +GH_APP_PRIVATE_KEY="$(cat "$TMP/app.key")" +FAKE_GH_APP_PUBLIC_KEY="$(cat "$TMP/app.pub")" +export GH_APP_PRIVATE_KEY FAKE_GH_APP_PUBLIC_KEY + +echo "==> minting self-signed TLS cert for the minio R2 stub" +CERTS_DIR="$TMP/certs" +mkdir -p "$CERTS_DIR" +openssl req -x509 -newkey rsa:2048 -nodes -days 1 \ + -keyout "$CERTS_DIR/private.key" -out "$CERTS_DIR/public.crt" \ + -subj 
"/CN=minio" \ + -addext "subjectAltName=DNS:minio,DNS:localhost,IP:127.0.0.1" 2>/dev/null +chmod 0644 "$CERTS_DIR/private.key" "$CERTS_DIR/public.crt" +export E2E_CERTS_DIR="$CERTS_DIR" + +echo "==> bringing up hatchet engine (postgres + hatchet-lite)" +${COMPOSE} up -d --wait hatchet-postgres hatchet-lite + +echo "==> minting hatchet client token" +HATCHET_CLIENT_TOKEN="$(${COMPOSE} exec -T hatchet-lite \ + /hatchet-admin token create --config /config --tenant-id "$TENANT" | tr -d '\r\n')" +export HATCHET_CLIENT_TOKEN +if [[ -z "$HATCHET_CLIENT_TOKEN" ]]; then + echo "FATAL: hatchet token mint returned empty" >&2 + exit 2 +fi + +echo "==> bringing up full stack (postgres + valkey + minio + fakegithub + artemis)" +${COMPOSE} up -d --build --wait \ + postgres valkey minio minio-setup fakegithub artemis + +echo "==> waiting for artemis /readyz at ${ARTEMIS_URL}" +ok=0 +for _ in $(seq 1 60); do + if curl -fsS "${ARTEMIS_URL}/readyz" >/dev/null 2>&1; then + ok=1 + break + fi + sleep 2 +done +if [[ "$ok" != 1 ]]; then + echo "FATAL: artemis /readyz never green" >&2 + ${COMPOSE} logs artemis 2>/dev/null | tail -60 + exit 1 +fi +echo " readyz green" + +echo "==> waiting for fakegithub-backed auth surface (whoami)" +GH_TOKEN_PROBE="${E2E_GH_TOKEN:-smoke-token-local}" +ok=0 +for _ in $(seq 1 30); do + code="$(curl -s -o /dev/null -w '%{http_code}' \ + -H "Authorization: Bearer ${GH_TOKEN_PROBE}" "${ARTEMIS_URL}/api/whoami")" + if [[ "$code" == "200" ]]; then + ok=1 + break + fi + sleep 2 +done +if [[ "$ok" != 1 ]]; then + echo "FATAL: /api/whoami never returned 200 (fakegithub not reachable from artemis)" >&2 + ${COMPOSE} logs artemis fakegithub 2>/dev/null | tail -60 + exit 1 +fi +echo " auth surface green" + +echo "==> running e2e suite (-tags=e2e) against ${ARTEMIS_URL}" +ARTEMIS_URL="${ARTEMIS_URL}" \ + AWS_CA_BUNDLE="${CERTS_DIR}/public.crt" \ + E2E_R2_CA_FILE="${CERTS_DIR}/public.crt" \ + 
E2E_PG_DSN="postgres://artemis:artemis@localhost:${PG_HOST_PORT}/artemis?sslmode=disable" \ + E2E_R2_ENDPOINT="https://localhost:${MINIO_HOST_PORT}" \ + E2E_R2_ACCESS_KEY_ID="minioadmin" \ + E2E_R2_SECRET_ACCESS_KEY="minioadmin" \ + E2E_R2_BUCKET="universe-static-apps-01" \ + E2E_GH_TOKEN="${E2E_GH_TOKEN:-smoke-token-local}" \ + "${GO}" test -tags=e2e -count=1 -timeout=10m -v ./test/e2e/... diff --git a/scripts/loadgen.sh b/scripts/loadgen.sh new file mode 100755 index 0000000..2c1cf59 --- /dev/null +++ b/scripts/loadgen.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "$0")/.." + +GO="${GO:-go}" +SITES="${SITES:-500}" +DEPLOYS_PER_SITE="${DEPLOYS_PER_SITE:-40}" +CONCURRENCY="${CONCURRENCY:-16}" +PG_HOST_PORT="${PG_HOST_PORT:-55433}" +KEEP="${KEEP_STACK:-0}" +CONTAINER="artemis-loadgen-pg" + +cleanup() { + if [[ "$KEEP" != 1 ]]; then + docker rm -f "$CONTAINER" >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +docker rm -f "$CONTAINER" >/dev/null 2>&1 || true +echo >&2 "==> starting ephemeral postgres on :${PG_HOST_PORT}" +docker run -d --name "$CONTAINER" \ + -e POSTGRES_USER=artemis -e POSTGRES_PASSWORD=artemis -e POSTGRES_DB=artemis \ + -p "${PG_HOST_PORT}:5432" postgres:17-alpine \ + -c max_connections=200 -c shared_buffers=256MB >/dev/null + +echo >&2 "==> waiting for postgres" +ok=0 +for _ in $(seq 1 30); do + if docker exec "$CONTAINER" pg_isready -U artemis -d artemis >/dev/null 2>&1; then + ok=1 + break + fi + sleep 1 +done +if [[ "$ok" != 1 ]]; then + echo "FATAL: postgres never became ready on :${PG_HOST_PORT}" >&2 + docker logs "$CONTAINER" 2>/dev/null | tail -40 + exit 1 +fi + +DSN="${LOADGEN_DATABASE_URL:-postgres://artemis:artemis@localhost:${PG_HOST_PORT}/artemis?sslmode=disable}" +echo >&2 "==> running load harness sites=${SITES} deploys-per-site=${DEPLOYS_PER_SITE} concurrency=${CONCURRENCY}" +LOADGEN_DATABASE_URL="$DSN" "$GO" run -tags=load ./cmd/loadgen \ + -sites "$SITES" -deploys-per-site "$DEPLOYS_PER_SITE" 
-concurrency "$CONCURRENCY" diff --git a/test/e2e/catalog_test.go b/test/e2e/catalog_test.go new file mode 100644 index 0000000..fb51a37 --- /dev/null +++ b/test/e2e/catalog_test.go @@ -0,0 +1,451 @@ +//go:build e2e + +package e2e_test + +import ( + "context" + "fmt" + "net/http" + "strings" + "testing" + "time" +) + +func TestHealthZ(t *testing.T) { + e := requireStack(t) + var resp struct { + OK bool `json:"ok"` + } + mustStatus(t, e.call(t, http.MethodGet, "/healthz", "", nil, &resp), http.StatusOK, "healthz") + if !resp.OK { + t.Fatalf("healthz ok=false") + } +} + +func TestReadyZ_FullyHealthy(t *testing.T) { + e := requireStack(t) + var resp struct { + Ready bool `json:"ready"` + Degraded bool `json:"degraded"` + } + mustStatus(t, e.call(t, http.MethodGet, "/readyz", "", nil, &resp), http.StatusOK, "readyz") + if !resp.Ready { + t.Fatalf("readyz ready=false") + } + if resp.Degraded { + t.Fatalf("readyz degraded=true with PG up; want non-degraded (valkey+r2+pg all reachable)") + } +} + +func TestMetrics(t *testing.T) { + e := requireStack(t) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + status, body, err := e.raw(ctx, http.MethodGet, "/metrics", "", nil) + if err != nil { + t.Fatalf("metrics: %v", err) + } + mustStatus(t, status, http.StatusOK, "metrics") + if !strings.Contains(string(body), "artemis_") { + t.Fatalf("metrics body missing artemis_ prefix: %s", truncate(body, 200)) + } + if !strings.Contains(string(body), "go_") { + t.Fatalf("metrics body missing go_ prefix: %s", truncate(body, 200)) + } +} + +func TestWhoAmI(t *testing.T) { + e := requireStack(t) + var resp struct { + Login string `json:"login"` + AuthorizedSites []string `json:"authorizedSites"` + } + mustStatus(t, e.call(t, http.MethodGet, "/api/whoami", e.GHToken, nil, &resp), http.StatusOK, "whoami") + if resp.Login != "smoke-bot" { + t.Fatalf("whoami login=%q want smoke-bot", resp.Login) + } +} + +func TestAuthRejections(t *testing.T) { + e := 
requireStack(t) + + t.Run("missing_token", func(t *testing.T) { + mustStatus(t, e.call(t, http.MethodGet, "/api/whoami", "", nil, nil), http.StatusUnauthorized, "missing token") + }) + + t.Run("missing_deploy_jwt", func(t *testing.T) { + mustStatus(t, e.call(t, http.MethodPost, "/api/deploy/20260101-000000-abc1234/finalize", "", nil, nil), + http.StatusUnauthorized, "missing deploy jwt") + }) + + t.Run("unknown_site", func(t *testing.T) { + body := map[string]any{"site": "neverregistered", "sha": "deadbeef"} + mustStatus(t, e.call(t, http.MethodPost, "/api/deploy/init", e.GHToken, body, nil), + http.StatusForbidden, "unknown site") + }) + + t.Run("missing_site_field", func(t *testing.T) { + body := map[string]any{"sha": "abc"} + mustStatus(t, e.call(t, http.MethodPost, "/api/deploy/init", e.GHToken, body, nil), + http.StatusBadRequest, "missing site") + }) +} + +func TestRegistryCRUD(t *testing.T) { + e := requireStack(t) + slug := uniqueSlug("reg") + + var created struct { + Slug string `json:"slug"` + Teams []string `json:"teams"` + } + mustStatus(t, e.call(t, http.MethodPost, "/api/site/register", e.GHToken, + map[string]any{"slug": slug, "teams": []string{"staff"}}, &created), http.StatusCreated, "register") + if created.Slug != slug { + t.Fatalf("register slug=%q want %q", created.Slug, slug) + } + + t.Run("duplicate_conflict", func(t *testing.T) { + mustStatus(t, e.call(t, http.MethodPost, "/api/site/register", e.GHToken, + map[string]any{"slug": slug, "teams": []string{"staff"}}, nil), http.StatusConflict, "register dup") + }) + + var list []struct { + Slug string `json:"slug"` + } + mustStatus(t, e.call(t, http.MethodGet, "/api/sites", e.GHToken, nil, &list), http.StatusOK, "sites list") + if !containsSlug(list, slug) { + t.Fatalf("sites list missing %q", slug) + } + + var patched struct { + Teams []string `json:"teams"` + } + mustStatus(t, e.call(t, http.MethodPatch, "/api/site/"+slug, e.GHToken, + map[string]any{"teams": []string{"staff", 
"apollo-11-approvers"}}, &patched), http.StatusOK, "patch") + if len(patched.Teams) != 2 { + t.Fatalf("patch teams=%v want 2", patched.Teams) + } + + mustStatus(t, e.call(t, http.MethodGet, "/api/site/"+slug+"/alias/production", e.GHToken, nil, nil), + http.StatusNotFound, "alias before finalize") + + mustStatus(t, e.call(t, http.MethodDelete, "/api/site/"+slug, e.GHToken, nil, nil), http.StatusNoContent, "delete") + + t.Run("delete_missing", func(t *testing.T) { + mustStatus(t, e.call(t, http.MethodDelete, "/api/site/"+slug, e.GHToken, nil, nil), + http.StatusNotFound, "delete missing") + }) +} + +func TestDeployFlow_DeepAssert(t *testing.T) { + e := requireStack(t) + r2c := e.r2Client(t) + pool := e.pgPool(t) + ctx := context.Background() + + slug := uniqueSlug("dep") + registerSite(t, e, slug) + t.Cleanup(func() { _ = e.call(t, http.MethodDelete, "/api/site/"+slug, e.GHToken, nil, nil) }) + + var initResp struct { + DeployID string `json:"deployId"` + JWT string `json:"jwt"` + } + mustStatus(t, e.call(t, http.MethodPost, "/api/deploy/init", e.GHToken, + map[string]any{"site": slug, "sha": deploySHA(), "files": []string{"index.html"}}, &initResp), + http.StatusOK, "init") + if initResp.DeployID == "" || initResp.JWT == "" { + t.Fatalf("init empty deployId/jwt: %+v", initResp) + } + + html := []byte("e2e\n") + var upResp struct { + Received string `json:"received"` + Key string `json:"key"` + } + mustStatus(t, e.upload(t, initResp.DeployID, initResp.JWT, "index.html", "text/html", html, &upResp), + http.StatusOK, "upload") + if upResp.Received != "index.html" { + t.Fatalf("upload received=%q", upResp.Received) + } + + if !hasPrefix(t, r2c, upResp.Key) { + t.Fatalf("R2 object %q absent after upload", upResp.Key) + } + + var finResp struct { + DeployID string `json:"deployId"` + Mode string `json:"mode"` + } + mustStatus(t, e.call(t, http.MethodPost, fmt.Sprintf("/api/deploy/%s/finalize", initResp.DeployID), + initResp.JWT, map[string]any{"mode": "preview", "files": 
[]string{"index.html"}}, &finResp), + http.StatusOK, "finalize") + if finResp.Mode != "preview" { + t.Fatalf("finalize mode=%q", finResp.Mode) + } + + previewAlias, err := r2c.GetAlias(ctx, slug+"/preview") + if err != nil { + t.Fatalf("R2 preview alias get: %v", err) + } + if strings.TrimSpace(previewAlias) != initResp.DeployID { + t.Fatalf("R2 preview alias=%q want %q", previewAlias, initResp.DeployID) + } + + waitOutbox(t, pool, slug) + + var promoteResp struct { + DeployID string `json:"deployId"` + } + mustStatus(t, e.call(t, http.MethodPost, "/api/site/"+slug+"/promote", e.GHToken, + map[string]any{"deployId": initResp.DeployID}, &promoteResp), http.StatusOK, "promote") + if promoteResp.DeployID != initResp.DeployID { + t.Fatalf("promote deployId=%q want %q", promoteResp.DeployID, initResp.DeployID) + } + + prodAlias, err := r2c.GetAlias(ctx, slug+"/production") + if err != nil { + t.Fatalf("R2 prod alias get: %v", err) + } + if strings.TrimSpace(prodAlias) != initResp.DeployID { + t.Fatalf("R2 prod alias=%q want %q", prodAlias, initResp.DeployID) + } + + var aliasResp struct { + DeployID string `json:"deployId"` + } + mustStatus(t, e.call(t, http.MethodGet, "/api/site/"+slug+"/alias/production", e.GHToken, nil, &aliasResp), + http.StatusOK, "alias get") + if aliasResp.DeployID != initResp.DeployID { + t.Fatalf("alias get deployId=%q want %q", aliasResp.DeployID, initResp.DeployID) + } + + var deploys []struct { + DeployID string `json:"deployId"` + } + mustStatus(t, e.call(t, http.MethodGet, "/api/site/"+slug+"/deploys", e.GHToken, nil, &deploys), + http.StatusOK, "deploys list") + if len(deploys) == 0 || deploys[0].DeployID != initResp.DeployID { + t.Fatalf("deploys list missing %q: %+v", initResp.DeployID, deploys) + } +} + +func TestSiteRollback_DeepAssert(t *testing.T) { + e := requireStack(t) + r2c := e.r2Client(t) + ctx := context.Background() + + slug := uniqueSlug("rbk") + registerSite(t, e, slug) + t.Cleanup(func() { _ = e.call(t, http.MethodDelete, 
"/api/site/"+slug, e.GHToken, nil, nil) }) + + prior := mintDeploy(t, e, slug, "preview") + current := mintDeploy(t, e, slug, "preview") + if prior == current { + t.Fatalf("deploy ids collided: %q", prior) + } + + var promoteResp struct { + DeployID string `json:"deployId"` + } + mustStatus(t, e.call(t, http.MethodPost, "/api/site/"+slug+"/promote", e.GHToken, + map[string]any{"deployId": current}, &promoteResp), http.StatusOK, "promote") + if promoteResp.DeployID != current { + t.Fatalf("promote deployId=%q want %q", promoteResp.DeployID, current) + } + + prodAlias, err := r2c.GetAlias(ctx, slug+"/production") + if err != nil { + t.Fatalf("R2 prod alias get after promote: %v", err) + } + if strings.TrimSpace(prodAlias) != current { + t.Fatalf("R2 prod alias=%q want %q after promote", prodAlias, current) + } + + var rollbackResp struct { + DeployID string `json:"deployId"` + } + mustStatus(t, e.call(t, http.MethodPost, "/api/site/"+slug+"/rollback", e.GHToken, + map[string]any{"to": prior}, &rollbackResp), http.StatusOK, "rollback") + if rollbackResp.DeployID != prior { + t.Fatalf("rollback deployId=%q want %q", rollbackResp.DeployID, prior) + } + + rolledAlias, err := r2c.GetAlias(ctx, slug+"/production") + if err != nil { + t.Fatalf("R2 prod alias get after rollback: %v", err) + } + if strings.TrimSpace(rolledAlias) != prior { + t.Fatalf("R2 prod alias=%q want %q after rollback", rolledAlias, prior) + } + + var aliasResp struct { + DeployID string `json:"deployId"` + } + mustStatus(t, e.call(t, http.MethodGet, "/api/site/"+slug+"/alias/production", e.GHToken, nil, &aliasResp), + http.StatusOK, "alias get") + if aliasResp.DeployID != prior { + t.Fatalf("alias get deployId=%q want %q after rollback", aliasResp.DeployID, prior) + } +} + +func TestManualDelete_Tombstone(t *testing.T) { + e := requireStack(t) + r2c := e.r2Client(t) + pool := e.pgPool(t) + ctx := context.Background() + + slug := uniqueSlug("del") + registerSite(t, e, slug) + t.Cleanup(func() { _ = 
e.call(t, http.MethodDelete, "/api/site/"+slug, e.GHToken, nil, nil) }) + + deployID := mintDeploy(t, e, slug, "preview") + + deployPrefix := slug + "/deploys/" + deployID + "/" + if !hasPrefix(t, r2c, deployPrefix) { + t.Fatalf("R2 prefix %q absent before delete", deployPrefix) + } + + t.Run("aliased_conflict", func(t *testing.T) { + mustStatus(t, e.call(t, http.MethodDelete, + fmt.Sprintf("/api/site/%s/deploys/%s", slug, deployID), e.GHToken, nil, nil), + http.StatusConflict, "delete aliased") + }) + + mintDeploy(t, e, slug, "preview") + + var delResp struct { + Status string `json:"status"` + } + mustStatus(t, e.call(t, http.MethodDelete, + fmt.Sprintf("/api/site/%s/deploys/%s", slug, deployID), e.GHToken, nil, &delResp), + http.StatusOK, "delete deploy") + if delResp.Status != "tombstoned" { + t.Fatalf("delete status=%q want tombstoned", delResp.Status) + } + + if hasPrefix(t, r2c, deployPrefix) { + t.Fatalf("R2 prefix %q still present after tombstone", deployPrefix) + } + if !hasPrefix(t, r2c, "_trash/"+slug+"/"+deployID+"/") { + t.Fatalf("R2 trash prefix absent after tombstone") + } + + var n int + if err := pool.QueryRow(ctx, + `SELECT count(*) FROM tombstones WHERE site=$1 AND id=$2`, slug, deployID).Scan(&n); err != nil { + t.Fatalf("pg tombstone query: %v", err) + } + if n != 1 { + t.Fatalf("pg tombstone rows=%d want 1", n) + } +} + +func TestSitePurge_Tombstone(t *testing.T) { + e := requireStack(t) + r2c := e.r2Client(t) + pool := e.pgPool(t) + ctx := context.Background() + + slug := uniqueSlug("pur") + registerSite(t, e, slug) + mintDeploy(t, e, slug, "preview") + + var purgeResp struct { + Status string `json:"status"` + } + mustStatus(t, e.call(t, http.MethodDelete, "/api/site/"+slug+"?purge=true", e.GHToken, nil, &purgeResp), + http.StatusOK, "purge") + if purgeResp.Status != "purged" { + t.Fatalf("purge status=%q want purged", purgeResp.Status) + } + + if hasPrefix(t, r2c, slug+"/") { + t.Fatalf("R2 site prefix %q/ still present after purge", slug) 
+ } + if !hasPrefix(t, r2c, "_trash/"+slug+"/") { + t.Fatalf("R2 trash prefix absent after purge") + } + + var n int + if err := pool.QueryRow(ctx, + `SELECT count(*) FROM tombstones WHERE site=$1`, slug).Scan(&n); err != nil { + t.Fatalf("pg tombstone query: %v", err) + } + if n == 0 { + t.Fatalf("pg tombstone rows=0 after purge; want >=1") + } +} + +func TestRepoQueue(t *testing.T) { + e := requireStack(t) + name := uniqueSlug("repo") + + var tmpl struct { + Templates []string `json:"templates"` + } + mustStatus(t, e.call(t, http.MethodGet, "/api/repo/templates", e.GHToken, nil, &tmpl), http.StatusOK, "templates") + if !containsString(tmpl.Templates, "universe-static-template") { + t.Fatalf("templates missing universe-static-template: %v", tmpl.Templates) + } + + var created struct { + ID string `json:"id"` + Status string `json:"status"` + } + mustStatus(t, e.call(t, http.MethodPost, "/api/repo", e.GHToken, + map[string]any{"name": name, "visibility": "public"}, &created), http.StatusCreated, "repo create") + if created.Status != "pending" || created.ID == "" { + t.Fatalf("repo create status=%q id=%q", created.Status, created.ID) + } + + var got struct { + ID string `json:"id"` + Status string `json:"status"` + } + mustStatus(t, e.call(t, http.MethodGet, "/api/repo/"+created.ID, e.GHToken, nil, &got), http.StatusOK, "repo get") + if got.ID != created.ID { + t.Fatalf("repo get id=%q want %q", got.ID, created.ID) + } + + var pending []struct { + ID string `json:"id"` + } + mustStatus(t, e.call(t, http.MethodGet, "/api/repos?status=pending", e.GHToken, nil, &pending), + http.StatusOK, "repos list") + if !containsID(pending, created.ID) { + t.Fatalf("repos pending list missing %q", created.ID) + } + + var approve struct { + Outcome string `json:"outcome"` + Request struct { + Status string `json:"status"` + } `json:"request"` + } + mustStatus(t, e.call(t, http.MethodPost, "/api/repo/"+created.ID+"/approve", e.GHToken, nil, &approve), + http.StatusOK, "repo approve") 
# Application database for the artemis service. The artemis container connects
# to it as artemis:artemis@postgres:5432/artemis (see its DATABASE_URL).
postgres:
  image: postgres:17-alpine
  environment:
    POSTGRES_USER: artemis
    POSTGRES_PASSWORD: artemis
    POSTGRES_DB: artemis
  ports:
    # Published on the host (default 55432, override with PG_HOST_PORT);
    # presumably so the host-side test process can query it directly. TODO confirm.
    - "${PG_HOST_PORT:-55432}:5432"
  healthcheck:
    # artemis waits on this check via `condition: service_healthy`.
    test: ["CMD-SHELL", "pg_isready -U artemis -d artemis"]
    interval: 2s
    timeout: 3s
    retries: 30
# Stand-in for the GitHub API, built from Dockerfile.fakegithub at the repo root.
fakegithub:
  build:
    context: ../..
    dockerfile: Dockerfile.fakegithub
  # Joins the artemis container's network namespace, so the loopback address in
  # FAKE_GH_ADDR below is the same 127.0.0.1:9001 that artemis targets through
  # its GH_API_BASE setting.
  network_mode: "service:artemis"
  depends_on:
    # Only "started" is required here; presumably because the artemis container
    # must exist before its network namespace can be shared. TODO confirm.
    artemis:
      condition: service_started
  environment:
    FAKE_GH_ADDR: "127.0.0.1:9001"
    FAKE_GH_ORG: "freeCodeCamp-Universe"
    # The e2e whoami test expects exactly this login.
    FAKE_GH_USER: "smoke-bot"
    # Includes the team artemis is configured with as REPO_APPROVE_AUTHZ_TEAM.
    FAKE_GH_TEAMS: "staff,apollo-11-approvers"
    FAKE_GH_APP_ID: "123"
    # Supplied by the caller's environment; empty when unset.
    FAKE_GH_APP_PUBLIC_KEY: "${FAKE_GH_APP_PUBLIC_KEY:-}"
    # The e2e repo-queue test expects this template name in /api/repo/templates.
    FAKE_GH_TEMPLATES: "universe-static-template"
// env bundles the connection settings of the stack under test. TestMain fills
// one shared instance from environment variables; tests obtain it through
// requireStack.
type env struct {
	// ArtemisURL is ARTEMIS_URL with trailing slashes trimmed. When it is
	// empty, requireStack skips the calling test.
	ArtemisURL string
	// GHToken is E2E_GH_TOKEN, sent as the bearer token on authenticated API calls.
	GHToken string
	// PGDSN is E2E_PG_DSN. When it is empty, pgPool skips the calling test.
	PGDSN string
	// R2Endpoint is E2E_R2_ENDPOINT. When it is empty, r2Client skips the calling test.
	R2Endpoint string
	// R2Key is E2E_R2_ACCESS_KEY_ID.
	R2Key string
	// R2Secret is E2E_R2_SECRET_ACCESS_KEY.
	R2Secret string
	// R2Bucket is E2E_R2_BUCKET.
	R2Bucket string
	// HTTP is the client used for every request to ArtemisURL. It is built by
	// newHTTPClient from the CA bundle path in E2E_R2_CA_FILE.
	HTTP *http.Client
}
// newHTTPClient returns the HTTP client used by the e2e harness. Every client
// has a 30s overall timeout.
//
// When caFile is empty, unreadable, or contains no PEM certificates, the
// client uses the default transport; the latter two cases are logged as
// warnings rather than treated as fatal. Otherwise the client trusts exactly
// the certificates in caFile and requires TLS 1.2 or newer.
func newHTTPClient(caFile string) *http.Client {
	c := &http.Client{Timeout: 30 * time.Second}
	if caFile == "" {
		return c
	}
	caPEM, err := os.ReadFile(caFile)
	if err != nil {
		log.Printf("[setup] WARN: read E2E_R2_CA_FILE %q: %v", caFile, err)
		return c
	}
	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(caPEM) {
		log.Printf("[setup] WARN: no certs parsed from %q", caFile)
		return c
	}

	// Clone the default transport instead of starting from a zero-value one, so
	// the custom-CA client keeps the same proxy, dial and handshake settings as
	// the no-CA client above and differs only in its TLS trust configuration.
	tr := &http.Transport{}
	if def, ok := http.DefaultTransport.(*http.Transport); ok {
		tr = def.Clone()
	}
	tr.TLSClientConfig = &tls.Config{RootCAs: pool, MinVersion: tls.VersionTLS12}
	c.Transport = tr
	return c
}
e.R2Key, + SecretAccessKey: e.R2Secret, + Bucket: e.R2Bucket, + Region: "auto", + }) + if err != nil { + t.Fatalf("r2 client: %v", err) + } + return cli +} + +func (e env) pgPool(t *testing.T) *pgxpool.Pool { + t.Helper() + if e.PGDSN == "" { + t.Skip("E2E_PG_DSN unset; run via: just e2e-local") + } + pool, err := pgxpool.New(context.Background(), e.PGDSN) + if err != nil { + t.Fatalf("pg pool: %v", err) + } + t.Cleanup(pool.Close) + return pool +} + +func (e env) raw(ctx context.Context, method, path, bearer string, body []byte) (int, []byte, error) { + var rdr io.Reader + if body != nil { + rdr = bytes.NewReader(body) + } + req, err := http.NewRequestWithContext(ctx, method, e.ArtemisURL+path, rdr) + if err != nil { + return 0, nil, err + } + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + req.Header.Set("Accept", "application/json") + if bearer != "" { + req.Header.Set("Authorization", "Bearer "+bearer) + } + resp, err := e.HTTP.Do(req) + if err != nil { + return 0, nil, err + } + defer resp.Body.Close() + raw, _ := io.ReadAll(resp.Body) + return resp.StatusCode, raw, nil +} + +func (e env) call(t *testing.T, method, path, bearer string, reqBody, respBody any) int { + t.Helper() + var body []byte + if reqBody != nil { + var err error + body, err = json.Marshal(reqBody) + if err != nil { + t.Fatalf("marshal req: %v", err) + } + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + status, raw, err := e.raw(ctx, method, path, bearer, body) + if err != nil { + t.Fatalf("%s %s: %v", method, path, err) + } + if respBody != nil && len(raw) > 0 { + if err := json.Unmarshal(raw, respBody); err != nil { + t.Fatalf("%s %s: decode resp (status=%d): %v body=%s", method, path, status, err, truncate(raw, 300)) + } + } + return status +} + +func (e env) upload(t *testing.T, deployID, jwt, relPath, contentType string, body []byte, respBody any) int { + t.Helper() + url := 
// truncate shortens b for inclusion in a failure message. If b is at most n
// bytes long it is returned unchanged (same slice). Otherwise a new slice is
// returned holding at most the first n bytes of b followed by "...".
//
// The cut is moved back by up to three bytes when it would otherwise land in
// the middle of a multi-byte UTF-8 sequence, so the result stays printable
// with %s. A negative n is treated as 0. b is never modified.
func truncate(b []byte, n int) []byte {
	if n < 0 {
		n = 0
	}
	if len(b) <= n {
		return b
	}
	// b[n] is the first byte that would be dropped. If it is a UTF-8
	// continuation byte (10xxxxxx), the cut splits a character: back up to that
	// character's lead byte. Sequences are at most 4 bytes long, so 3 steps
	// always suffice for valid text and bound the walk on binary input.
	for back := 0; back < 3 && n > 0 && b[n]&0xC0 == 0x80; back++ {
		n--
	}
	out := make([]byte, 0, n+3)
	out = append(out, b[:n]...)
	return append(out, "..."...)
}
"registerSite "+slug) + waitSiteVisible(t, e, slug) +} + +func waitSiteVisible(t *testing.T, e env, slug string) { + t.Helper() + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + var resp struct { + AuthorizedSites []string `json:"authorizedSites"` + } + if e.call(t, http.MethodGet, "/api/whoami", e.GHToken, nil, &resp) == http.StatusOK { + if containsString(resp.AuthorizedSites, slug) { + return + } + } + time.Sleep(250 * time.Millisecond) + } + t.Fatalf("site %q not visible in whoami authorizedSites within 10s (registry cache propagation)", slug) +} + +func deploySHA() string { + return fmt.Sprintf("%07d", slugSeq.Add(1)%10_000_000) +} + +func mintDeploy(t *testing.T, e env, slug, mode string) string { + t.Helper() + var initResp struct { + DeployID string `json:"deployId"` + JWT string `json:"jwt"` + } + mustStatus(t, e.call(t, http.MethodPost, "/api/deploy/init", e.GHToken, + map[string]any{"site": slug, "sha": deploySHA(), "files": []string{"index.html"}}, &initResp), + http.StatusOK, "mintDeploy init") + mustStatus(t, e.upload(t, initResp.DeployID, initResp.JWT, "index.html", "text/html", + []byte("e2e"), nil), http.StatusOK, "mintDeploy upload") + mustStatus(t, e.call(t, http.MethodPost, fmt.Sprintf("/api/deploy/%s/finalize", initResp.DeployID), + initResp.JWT, map[string]any{"mode": mode, "files": []string{"index.html"}}, nil), + http.StatusOK, "mintDeploy finalize") + return initResp.DeployID +} + +func hasPrefix(t *testing.T, c *r2.Client, prefix string) bool { + t.Helper() + has, err := c.HasPrefix(context.Background(), prefix) + if err != nil { + t.Fatalf("R2 HasPrefix %q: %v", prefix, err) + } + return has +} + +func waitOutbox(t *testing.T, pool *pgxpool.Pool, site string) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + var n int + err := pool.QueryRow(ctx, + `SELECT count(*) 
FROM outbox WHERE topic='site.changed' AND payload->>'site'=$1`, site).Scan(&n) + if err == nil && n >= 1 { + return + } + time.Sleep(250 * time.Millisecond) + } + t.Fatalf("pg outbox: no site.changed row for site=%q within 10s", site) +} + +func containsSlug(rows []struct { + Slug string `json:"slug"` +}, slug string) bool { + for _, r := range rows { + if r.Slug == slug { + return true + } + } + return false +} + +func containsID(rows []struct { + ID string `json:"id"` +}, id string) bool { + for _, r := range rows { + if r.ID == id { + return true + } + } + return false +} + +func containsString(xs []string, want string) bool { + for _, x := range xs { + if x == want { + return true + } + } + return false +} diff --git a/test/integration/hatchet/compose.hatchet.yaml b/test/integration/hatchet/compose.hatchet.yaml new file mode 100644 index 0000000..73f270a --- /dev/null +++ b/test/integration/hatchet/compose.hatchet.yaml @@ -0,0 +1,49 @@ +name: artemis-hatchet-it + +services: + postgres: + image: postgres:17-alpine + environment: + POSTGRES_USER: hatchet + POSTGRES_PASSWORD: hatchet + POSTGRES_DB: hatchet + healthcheck: + test: ["CMD-SHELL", "pg_isready -U hatchet -d hatchet"] + interval: 2s + timeout: 3s + retries: 30 + + hatchet-lite: + image: ghcr.io/hatchet-dev/hatchet/hatchet-lite:v0.88.1 + depends_on: + postgres: + condition: service_healthy + ports: + - "${HATCHET_GRPC_HOST_PORT:-7077}:7077" + - "${HATCHET_DASHBOARD_HOST_PORT:-8888}:8888" + environment: + DATABASE_URL: "postgresql://hatchet:hatchet@postgres:5432/hatchet?sslmode=disable" + SERVER_AUTH_COOKIE_DOMAIN: localhost + SERVER_AUTH_COOKIE_INSECURE: "t" + SERVER_AUTH_SET_EMAIL_VERIFIED: "t" + SERVER_GRPC_BIND_ADDRESS: "0.0.0.0" + SERVER_GRPC_INSECURE: "t" + SERVER_GRPC_BROADCAST_ADDRESS: "127.0.0.1:7077" + SERVER_GRPC_PORT: "7077" + SERVER_URL: "http://localhost:8888" + SERVER_DEFAULT_ENGINE_VERSION: "V1" + SERVER_INTERNAL_CLIENT_INTERNAL_GRPC_BROADCAST_ADDRESS: "127.0.0.1:7077" + healthcheck: + 
# Named volume mounted at /config in the hatchet-lite service
# (`hatchet_lite_config:/config`). No driver or options are set.
volumes:
  hatchet_lite_config: