Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
78 commits
Select commit Hold shift + click to select a range
154674c
docs(design): add durable-execution model (0001)
raisedadead Jun 2, 2026
4a9b89f
docs(design): lift 0001 to ADR-020 scope
raisedadead Jun 2, 2026
fd1f4bf
feat(pg): add Postgres layer with embedded schema migrations
raisedadead Jun 2, 2026
53e14fe
feat(r2): add DeleteObject + paginated batch DeletePrefix
raisedadead Jun 2, 2026
b1f248d
feat(r2): add MovePrefix (copy+delete) for tombstone moves
raisedadead Jun 2, 2026
1698048
feat(r2): add ListSites (top-level delimiter, _* excluded)
raisedadead Jun 2, 2026
7be6dc6
feat(gc): add pure retain predicate (alias/keepN/grace/retention/serv…
raisedadead Jun 2, 2026
c211aeb
feat(gc): add site GC planner with blast-cap abort
raisedadead Jun 2, 2026
b9cd06f
feat(deploy): write _artemis_meta.json marker on finalize
raisedadead Jun 2, 2026
ade56a4
feat(worker): add engine-agnostic durable workflow runtime (concurren…
raisedadead Jun 2, 2026
52138cc
feat(gc): add gc-site workflow (retain, TOCTOU re-check, tombstone-mo…
raisedadead Jun 2, 2026
e3d19e1
feat(handler): add manual deploy-delete endpoint (409 if aliased, els…
raisedadead Jun 2, 2026
ef3bc4b
feat(handler): add site-purge (?purge=true cascade tombstone)
raisedadead Jun 2, 2026
882ae27
feat(gc): add tombstone-purge workflow (2-phase reclaim past recovery…
raisedadead Jun 2, 2026
8286666
feat(pg): add deploy/alias/tombstone repo + one-time R2->PG backfill
raisedadead Jun 2, 2026
b6c1a08
feat(config): add DATABASE_URL, HATCHET_*, CLEANUP_* with grace>=ttl …
raisedadead Jun 2, 2026
61315ac
feat(gc): add prometheus metrics + slog reporting for GC workflows
raisedadead Jun 2, 2026
7988ba0
feat(pg): add transactional outbox + emit site.changed on finalize/pr…
raisedadead Jun 2, 2026
e67ee1c
feat(worker): add outbox relay to publisher (at-least-once, order-pre…
raisedadead Jun 2, 2026
060a98d
feat(worker): add per-site debouncer for site.changed gc-site triggers
raisedadead Jun 2, 2026
6807518
feat(pg): add Postgres-backed site registry store (Valkey cache-front…
raisedadead Jun 2, 2026
c6a3827
feat(pg): add Postgres-backed repo-request queue (partial-index name …
raisedadead Jun 2, 2026
d8a1653
feat(pg): add atomic finalize saga (deploy+alias+outbox in one tx)
raisedadead Jun 2, 2026
701ac31
feat(worker): register finalize/promote/rollback as durable workflows…
raisedadead Jun 2, 2026
dc392ff
feat(pg): add alias CAS for last-writer-safe promote/rollback (no los…
raisedadead Jun 2, 2026
f4f9786
feat(gc): add reconcile-slice drift audit (orphan tombstone, reindex,…
raisedadead Jun 2, 2026
26032dc
feat(worker): add queue/DLQ/workflow metrics + reconcile drift counters
raisedadead Jun 2, 2026
a166a35
feat(teamcache): add Valkey-backed shared GitHub team-membership cache
raisedadead Jun 2, 2026
2792bb7
fix(worker): surface mark-published error on relay publish failure (e…
raisedadead Jun 2, 2026
935df49
fix(gc): never tombstone an alias-pinned deploy in reconcile (V1)
raisedadead Jun 2, 2026
2dff82e
fix(r2): URL-encode MovePrefix copy-source for space/non-ASCII keys (V5)
raisedadead Jun 2, 2026
10d0074
fix(backfill): honor configurable ALIAS_*_KEY_FORMAT instead of hardc…
raisedadead Jun 2, 2026
81e3ccd
fix(backfill): revert alias-key templating; read R2-dir-relative <dir…
raisedadead Jun 2, 2026
55a535e
feat(boot): open pg pool + run migrations gated on DATABASE_URL
raisedadead Jun 3, 2026
8bc0170
fix(config): validate GH_API_BASE; reject cleartext-remote + malforme…
raisedadead Jun 3, 2026
130638c
fix(gc): re-read aliases before reconcile tombstone to close TOCTOU (V1)
raisedadead Jun 3, 2026
ae9ebf8
fix(auth): surface io.ReadAll + parse errors on GitHub OK path
raisedadead Jun 3, 2026
9714509
feat(hatchet): adapter implementing worker.Engine + worker.Publisher
raisedadead Jun 3, 2026
ded5b80
feat(boot): wire gc closures + policy + pg.Repo stores (prod R2 layout)
raisedadead Jun 3, 2026
d7f68b4
feat(worker): add event/cron triggers to WorkflowDef + Hatchet adapter
raisedadead Jun 3, 2026
c8fd6bf
feat(boot): register + start gc workflows on the Hatchet worker
raisedadead Jun 3, 2026
0dcaaa5
feat(boot): inject pg.Repo as handler Outbox + Tombstones (TrashPrefi…
raisedadead Jun 3, 2026
7d95639
feat(boot): run outbox-relay ticker loop draining to the Hatchet adapter
raisedadead Jun 3, 2026
bd002f2
feat(observability): capture outbox-enqueue failures to Sentry
raisedadead Jun 3, 2026
d075130
feat(readyz): PG-degraded probe semantics (R6)
raisedadead Jun 3, 2026
a185b51
feat(backfill): one-shot BACKFILL_ON_BOOT R2-to-PG index runner
raisedadead Jun 3, 2026
a40efe5
feat(registry): cut registry SoT to pg.RegistryStore + valkey cache-f…
raisedadead Jun 4, 2026
e5fd3cb
refactor: back repo-queue with pg.RepoQueue at boot
raisedadead Jun 4, 2026
6355ea7
feat(auth): consult durable teamcache before GitHub team probe
raisedadead Jun 4, 2026
cd3b012
test(hatchet): gated real-engine integration suite for R2/R3/R4/R5
raisedadead Jun 4, 2026
0ac1577
test(e2e): local full-stack harness via compose with e2e-local target
raisedadead Jun 4, 2026
c3830a9
feat(metrics): expose worker run + relay counters at /metrics
raisedadead Jun 4, 2026
734b534
test(load): gated pg throughput harness for registry/outbox/gc paths
raisedadead Jun 4, 2026
db482ef
docs(design): scalability capacity envelope from load harness (R14)
raisedadead Jun 4, 2026
f4c05a5
fix(compose): boot smoke stack past R11 via loopback fakegithub + pg
raisedadead Jun 4, 2026
b54f2cf
test(e2e): cover POST /api/site/{site}/rollback
raisedadead Jun 4, 2026
11b9be8
feat(registry): one-shot Valkey-to-PG import on boot when empty
raisedadead Jun 4, 2026
4587b39
fix(worker): guard debounce callback against stale timer race
raisedadead Jun 4, 2026
f9f0d10
fix(pg): rebuild outbox_unpublished_idx on id to match fetch order
raisedadead Jun 4, 2026
2170962
fix(pg): return DB-read value as current from SetAliasCAS
raisedadead Jun 4, 2026
fd23e57
fix(pg): panic on crypto/rand failure in repo request id gen
raisedadead Jun 4, 2026
26a3225
test(e2e): fail metrics check if either prefix is missing
raisedadead Jun 4, 2026
b94f581
fix(handler): purge R2 before registry delete
raisedadead Jun 4, 2026
069ab55
fix(handler): detach emit from request ctx
raisedadead Jun 4, 2026
71489c8
fix(pg): unlock advisory locks on fresh ctx
raisedadead Jun 4, 2026
662b9c2
test(hatchet): fix poison test data race
raisedadead Jun 4, 2026
bf6c73d
docs: label bare code fences in design docs
raisedadead Jun 4, 2026
535c721
fix(scripts): fail fast on pg readiness timeout
raisedadead Jun 4, 2026
531f491
fix(auth): tolerate durable team cache write fail
raisedadead Jun 4, 2026
a5db15b
fix(pg): count only inserted rows in import
raisedadead Jun 4, 2026
c06a5c9
test(hatchet): assert distinct-site overlap
raisedadead Jun 4, 2026
e963f80
test(hatchet): bound engine restart with timeout
raisedadead Jun 4, 2026
2e8556f
test(e2e): set TLS min version in harness
raisedadead Jun 4, 2026
70120ff
fix(worker): close debounce timer capture race
raisedadead Jun 4, 2026
939559a
fix(config): reject whitespace-only authz team
raisedadead Jun 4, 2026
7108494
test(handler): isolate package metrics global
raisedadead Jun 4, 2026
468c88e
test: close audited coverage gaps
raisedadead Jun 4, 2026
4433050
test(hatchet): skip restart test without compose
raisedadead Jun 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Binaries
/bin/
/artemis
/loadgen
*.exe
*.test
*.out
Expand Down
122 changes: 122 additions & 0 deletions cmd/artemis/gcwire.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package main

import (
"fmt"
"strings"
"time"

"github.com/freeCodeCamp/artemis/internal/backfill"
"github.com/freeCodeCamp/artemis/internal/config"
"github.com/freeCodeCamp/artemis/internal/gc"
"github.com/freeCodeCamp/artemis/internal/handler"
"github.com/freeCodeCamp/artemis/internal/pg"
"github.com/freeCodeCamp/artemis/internal/r2"
"github.com/freeCodeCamp/artemis/internal/registry/valkey"
)

// Compile-time interface checks: each line assigns a typed nil pointer of a
// concrete type to the interface it is expected to satisfy, so a method-set
// mismatch fails the build instead of surfacing where the value is wired in.
var (
_ handler.SiteChangeEmitter = (*pg.Repo)(nil)
_ handler.TombstoneStore = (*pg.Repo)(nil)
_ handler.RepoStore = (*pg.RepoQueue)(nil)
_ backfill.Lister = (*r2.Client)(nil)
_ backfill.Indexer = (*pg.Repo)(nil)
_ pg.SitesSource = (*valkey.Store)(nil)
)

// openRepoQueue builds the repo-request store on top of the given database
// handle. A nil pgDB is refused with an error and a nil store; otherwise the
// store is whatever pg.NewRepoQueue returns for that handle.
func openRepoQueue(pgDB *pg.DB) (handler.RepoStore, error) {
	if pgDB != nil {
		return pg.NewRepoQueue(pgDB), nil
	}
	return nil, fmt.Errorf("repo-creation feature requires DATABASE_URL")
}

// deployIDToken is the placeholder newGCLayout requires inside the deploy
// prefix format; the format is split around its first occurrence.
const deployIDToken = "<ts>-<sha>"

// gcLayout bundles the key-building closures produced by newGCLayout.
type gcLayout struct {
// sitePrefix returns site + "/" + the format's sub-path (the text between
// the first '/' and the deploy-id token).
sitePrefix func(site string) string
// deployPrefix returns the prefix of a single deploy; the result always
// ends in "/".
deployPrefix func(site, id string) string
// trashPrefix returns trashBase + site + "/" + id + "/".
trashPrefix func(site, id string) string
}

// newGCLayout derives the key-building closures from a deploy prefix format
// and a trash base.
//
// format is split around the first deployIDToken. The text before the token
// must contain a '/': everything after that first '/' becomes the per-site
// sub-path, while the text ahead of it (the site segment of the template) is
// discarded because callers supply the site name themselves. The text after
// the token is appended to every deploy prefix. An empty trashBase falls back
// to "_trash/".
//
// An error is returned when the token is absent or when no '/' precedes it.
func newGCLayout(format, trashBase string) (gcLayout, error) {
	before, after, hasToken := strings.Cut(format, deployIDToken)
	if !hasToken {
		return gcLayout{}, fmt.Errorf("DEPLOY_PREFIX_FORMAT %q must contain %s", format, deployIDToken)
	}
	_, subPath, hasSlash := strings.Cut(before, "/")
	if !hasSlash {
		return gcLayout{}, fmt.Errorf("DEPLOY_PREFIX_FORMAT %q must contain '/' after the site segment", format)
	}
	if trashBase == "" {
		trashBase = "_trash/"
	}

	siteKey := func(site string) string {
		return site + "/" + subPath
	}
	deployKey := func(site, id string) string {
		key := siteKey(site) + id + after
		// Deploy prefixes are always directory-like, whatever the format's tail.
		if strings.HasSuffix(key, "/") {
			return key
		}
		return key + "/"
	}
	trashKey := func(site, id string) string {
		return trashBase + site + "/" + id + "/"
	}

	return gcLayout{
		sitePrefix:   siteKey,
		deployPrefix: deployKey,
		trashPrefix:  trashKey,
	}, nil
}

// gcPolicy translates the cleanup configuration into a gc.Policy. RecentKeep,
// Grace and ServeCacheTTL are copied unchanged; Retention is RetentionDays
// expressed as a duration of whole 24-hour days.
func gcPolicy(c config.CleanupConfig) gc.Policy {
	const day = 24 * time.Hour

	var policy gc.Policy
	policy.RecentKeep = c.RecentKeep
	policy.Grace = c.Grace
	policy.Retention = time.Duration(c.RetentionDays) * day
	policy.ServeCacheTTL = c.ServeCacheTTL
	return policy
}

// gcWiring is the value built by newGCWiring: the repo it was given plus the
// three GC components constructed around it.
type gcWiring struct {
// Repo is the *pg.Repo passed to newGCWiring, stored unchanged.
Repo *pg.Repo
// SiteGC is invoked per site by the gc-site workflow handler.
SiteGC *gc.SiteGC
// Reconciler is invoked per site by the reconcile workflow handler.
Reconciler *gc.Reconciler
// Purge is invoked by the tombstone-purge workflow handler.
Purge *gc.TombstonePurge
}

// newGCWiring assembles the GC components from configuration and shared
// clients.
//
// The key layout is derived from cfg.DeployPrefixFormat; a format that
// newGCLayout rejects makes this function return (nil, err). repo is used as
// the Store of every component, r2c as the Mover/Lister/Deleter, and metrics
// is handed to each component as-is (it may be nil, as the tests pass nil).
//
// The trash base is resolved once and shared by the layout and by the purge
// component. Previously the layout fell back to "_trash/" for an empty
// cfg.Cleanup.TrashPrefix while Purge.TrashBase received the raw empty
// string, so tombstones were written under one base and purge was configured
// with another.
func newGCWiring(cfg *config.Config, repo *pg.Repo, r2c *r2.Client, metrics *gc.Metrics) (*gcWiring, error) {
	// Must stay equal to the empty-trashBase default inside newGCLayout.
	trashBase := cfg.Cleanup.TrashPrefix
	if trashBase == "" {
		trashBase = "_trash/"
	}

	layout, err := newGCLayout(cfg.DeployPrefixFormat, trashBase)
	if err != nil {
		return nil, err
	}

	return &gcWiring{
		Repo: repo,
		SiteGC: &gc.SiteGC{
			Store:        repo,
			Mover:        r2c,
			Policy:       gcPolicy(cfg.Cleanup),
			BlastCap:     cfg.Cleanup.BlastCap,
			DeployPrefix: layout.deployPrefix,
			TrashPrefix:  layout.trashPrefix,
			Now:          time.Now,
			Metrics:      metrics,
		},
		Reconciler: &gc.Reconciler{
			Lister:       r2c,
			Store:        repo,
			Mover:        r2c,
			Grace:        cfg.Cleanup.Grace,
			SitePrefix:   layout.sitePrefix,
			DeployPrefix: layout.deployPrefix,
			TrashPrefix:  layout.trashPrefix,
			Now:          time.Now,
			Metrics:      metrics,
		},
		Purge: &gc.TombstonePurge{
			Store:     repo,
			Deleter:   r2c,
			Recovery:  time.Duration(cfg.Cleanup.RecoveryDays) * 24 * time.Hour,
			TrashBase: trashBase,
			Now:       time.Now,
			Metrics:   metrics,
		},
	}, nil
}
130 changes: 130 additions & 0 deletions cmd/artemis/gcwire_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package main

import (
"testing"
"time"

"github.com/freeCodeCamp/artemis/internal/config"
"github.com/freeCodeCamp/artemis/internal/pg"
"github.com/freeCodeCamp/artemis/internal/r2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestOpenRepoQueue_RequiresDatabase checks that a nil database handle is
// refused: an error comes back and no store is returned.
func TestOpenRepoQueue_RequiresDatabase(t *testing.T) {
	store, openErr := openRepoQueue(nil)

	require.Error(t, openErr, "repo feature without a database must be rejected at boot")
	require.Nil(t, store)
}

// TestOpenRepoQueue_IsPostgresBacked checks that a non-nil database handle
// yields a store whose concrete type is *pg.RepoQueue.
func TestOpenRepoQueue_IsPostgresBacked(t *testing.T) {
	store, openErr := openRepoQueue(&pg.DB{})
	require.NoError(t, openErr)

	backedByPG := false
	switch store.(type) {
	case *pg.RepoQueue:
		backedByPG = true
	}
	assert.True(t, backedByPG, "repo queue must be backed by pg.RepoQueue")
}

// TestBootWiringProdLayout runs newGCLayout against the dev and prod prefix
// formats and checks all three generated prefixes for a sample site and
// deploy id.
func TestBootWiringProdLayout(t *testing.T) {
	type layoutCase struct {
		name       string
		format     string
		trashBase  string
		site       string
		id         string
		wantSite   string
		wantDeploy string
		wantTrash  string
	}

	table := []layoutCase{
		{
			name:       "default-dev-layout",
			format:     "<site>/deploys/<ts>-<sha>/",
			trashBase:  "_trash/",
			site:       "www",
			id:         "20260101-000000-abc1234",
			wantSite:   "www/deploys/",
			wantDeploy: "www/deploys/20260101-000000-abc1234/",
			wantTrash:  "_trash/www/20260101-000000-abc1234/",
		},
		{
			name:       "prod-dirname-layout",
			format:     "<site>.freecode.camp/deploys/<ts>-<sha>/",
			trashBase:  "_trash/",
			site:       "www.freecode.camp",
			id:         "20260101-000000-abc1234",
			wantSite:   "www.freecode.camp/deploys/",
			wantDeploy: "www.freecode.camp/deploys/20260101-000000-abc1234/",
			wantTrash:  "_trash/www.freecode.camp/20260101-000000-abc1234/",
		},
	}

	for i := range table {
		c := table[i]
		t.Run(c.name, func(t *testing.T) {
			layout, layoutErr := newGCLayout(c.format, c.trashBase)
			require.NoError(t, layoutErr)

			gotSite := layout.sitePrefix(c.site)
			assert.Equal(t, c.wantSite, gotSite, "sitePrefix")

			gotDeploy := layout.deployPrefix(c.site, c.id)
			assert.Equal(t, c.wantDeploy, gotDeploy, "deployPrefix")

			gotTrash := layout.trashPrefix(c.site, c.id)
			assert.Equal(t, c.wantTrash, gotTrash, "trashPrefix")
		})
	}
}

// TestBootWiring_LayoutRejectsBadFormat covers both rejection paths of
// newGCLayout: a format without the deploy-id token, and a format whose text
// before the token contains no '/' (so no site segment can be separated from
// the sub-path). The second path was previously untested.
func TestBootWiring_LayoutRejectsBadFormat(t *testing.T) {
	cases := []struct {
		name   string
		format string
		why    string
	}{
		{
			name:   "missing-deploy-id-token",
			format: "<site>/deploys/",
			why:    "format without the deploy-id token must be rejected",
		},
		{
			name:   "no-slash-before-token",
			format: "<site><ts>-<sha>/",
			why:    "format without '/' between the site segment and the deploy-id token must be rejected",
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := newGCLayout(tc.format, "_trash/")
			require.Error(t, err, tc.why)
		})
	}
}

// TestNewGCWiring_PlumbsBlastCapAndPrefixes checks that newGCWiring carries
// the configured blast cap, retention, recovery window and trash base into the
// components, and that the prefix closures it installs produce the expected
// keys.
func TestNewGCWiring_PlumbsBlastCapAndPrefixes(t *testing.T) {
	const day = 24 * time.Hour

	cleanup := config.CleanupConfig{
		BlastCap:      5,
		RetentionDays: 7,
		RecoveryDays:  3,
		TrashPrefix:   "_trash/",
	}
	cfg := &config.Config{
		DeployPrefixFormat: "<site>/deploys/<ts>-<sha>/",
		Cleanup:            cleanup,
	}
	repo, bucket := &pg.Repo{}, &r2.Client{}

	wiring, wireErr := newGCWiring(cfg, repo, bucket, nil)
	require.NoError(t, wireErr)
	require.NotNil(t, wiring)

	// Scalar settings.
	assert.Same(t, repo, wiring.Repo, "repo must be plumbed through")
	assert.Equal(t, 5, wiring.SiteGC.BlastCap, "BlastCap=0 would disable the mass-delete safety cap")
	assert.Equal(t, 7*day, wiring.SiteGC.Policy.Retention, "policy retention must derive from RetentionDays")
	assert.Equal(t, "_trash/", wiring.Purge.TrashBase, "purge must scan the configured trash base")
	assert.Equal(t, 3*day, wiring.Purge.Recovery, "recovery window must derive from RecoveryDays")

	// Closures must be installed before they can be called below.
	require.NotNil(t, wiring.SiteGC.DeployPrefix)
	require.NotNil(t, wiring.SiteGC.TrashPrefix)
	require.NotNil(t, wiring.Reconciler.SitePrefix)
	require.NotNil(t, wiring.Reconciler.DeployPrefix)

	gotDeploy := wiring.SiteGC.DeployPrefix("www", "id")
	assert.Equal(t, "www/deploys/id/", gotDeploy,
		"a wrong deploy-prefix closure would mass-move the wrong R2 prefix")

	gotTrash := wiring.SiteGC.TrashPrefix("www", "id")
	assert.Equal(t, "_trash/www/id/", gotTrash)

	gotSite := wiring.Reconciler.SitePrefix("www")
	assert.Equal(t, "www/deploys/", gotSite)
}

// TestNewGCWiring_RejectsBadFormat checks that a deploy prefix format lacking
// the deploy-id token makes newGCWiring fail and return no wiring.
func TestNewGCWiring_RejectsBadFormat(t *testing.T) {
	badCfg := &config.Config{
		DeployPrefixFormat: "<site>/deploys/",
		Cleanup: config.CleanupConfig{
			BlastCap:    5,
			TrashPrefix: "_trash/",
		},
	}

	wiring, wireErr := newGCWiring(badCfg, &pg.Repo{}, &r2.Client{}, nil)

	require.Error(t, wireErr, "a format missing the deploy-id token must fail boot wiring, not produce a degenerate prefix fn")
	require.Nil(t, wiring)
}

func TestGCPolicyFromConfig(t *testing.T) {
p := gcPolicy(config.CleanupConfig{
RecentKeep: 3,
Grace: time.Hour,
RetentionDays: 7,
ServeCacheTTL: 15 * time.Second,
})
assert.Equal(t, 3, p.RecentKeep)
assert.Equal(t, time.Hour, p.Grace)
assert.Equal(t, 7*24*time.Hour, p.Retention)
assert.Equal(t, 15*time.Second, p.ServeCacheTTL)
}
113 changes: 113 additions & 0 deletions cmd/artemis/gcworkflows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package main

import (
"context"
"errors"
"log/slog"
"time"

"github.com/freeCodeCamp/artemis/internal/observability"
"github.com/freeCodeCamp/artemis/internal/pg"
"github.com/freeCodeCamp/artemis/internal/worker"
)

const (
// topicSiteReconcile is the event topic listed as the trigger of the
// reconcile workflow in gcWorkflowDefs.
topicSiteReconcile = "site.reconcile"
// cronTombstonePurge is the cron expression of the tombstone-purge workflow:
// minute 0 of hour 3, every day.
// NOTE(review): the timezone is whatever the workflow engine applies — confirm.
cronTombstonePurge = "0 3 * * *"
// relayInterval is a five-second period; presumably the tick interval handed
// to runRelayLoop — the call site is not visible here, verify against it.
relayInterval = 5 * time.Second
)

// runRelayLoop calls relay.RunOnce on every tick of interval until ctx is
// done, then returns.
//
// The result of each pass is fed to metrics.ObserveRelay. A failing pass is
// logged at error level and sent to observability.CaptureBackground, after
// which the loop keeps ticking.
//
// Shutdown is not reported as a failure: no pass is started once ctx is done,
// and an error returned by a pass that was interrupted by cancellation is
// logged at debug level only before the loop exits. Previously such errors
// were captured as relay failures on every shutdown.
//
// interval must be positive; time.NewTicker panics otherwise.
func runRelayLoop(ctx context.Context, relay *worker.Relay, interval time.Duration, metrics *worker.Metrics) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		// select chooses at random when both channels are ready, so a tick can
		// win over an already-cancelled context; do not start a doomed pass.
		if ctx.Err() != nil {
			return
		}

		n, err := relay.RunOnce(ctx)
		metrics.ObserveRelay(n, err)
		if err == nil {
			continue
		}

		// The pass was cut short by cancellation: expected during shutdown.
		if ctx.Err() != nil {
			slog.Debug("relay.run interrupted by shutdown", "err", err)
			return
		}

		slog.Error("relay.run", "err", err)
		observability.CaptureBackground("relay.run", err)
	}
}

// observeWorkflow wraps fn so that every invocation reports its outcome to
// metrics.ObserveRun under name: "failed" when fn returns a non-nil error,
// "ok" otherwise. The error from fn is passed through unchanged.
func observeWorkflow(metrics *worker.Metrics, name string, fn worker.Handler) worker.Handler {
	return func(ctx context.Context, input map[string]any) error {
		runErr := fn(ctx, input)
		if runErr != nil {
			metrics.ObserveRun(name, "failed")
			return runErr
		}
		metrics.ObserveRun(name, "ok")
		return nil
	}
}

// gcWorkflowDefs returns the three GC workflow definitions, each with its
// handler wrapped by observeWorkflow:
//
//   - gc-site: keyed by site, triggered by pg.TopicSiteChanged, runs
//     gcw.SiteGC.Run for the site named in the input.
//   - tombstone-purge: triggered by the cronTombstonePurge schedule, runs
//     gcw.Purge.Run and ignores its input.
//   - reconcile: keyed by site, triggered by topicSiteReconcile, runs
//     gcw.Reconciler.ReconcileSite for the site named in the input.
//
// dryRun is forwarded to SiteGC.Run and Purge.Run. A failure of the underlying
// run is sent to observability.CaptureBackground before being returned; an
// input without a usable site is returned as an error without being captured.
func gcWorkflowDefs(gcw *gcWiring, dryRun bool, metrics *worker.Metrics) []worker.WorkflowDef {
	// captured reports a non-nil err under label and hands it back unchanged.
	captured := func(label string, err error) error {
		if err != nil {
			observability.CaptureBackground(label, err)
		}
		return err
	}

	gcSite := func(ctx context.Context, input map[string]any) error {
		site, err := siteFromInput(input)
		if err != nil {
			return err
		}
		_, err = gcw.SiteGC.Run(ctx, site, dryRun)
		return captured("gc.site.run", err)
	}

	purge := func(ctx context.Context, _ map[string]any) error {
		_, err := gcw.Purge.Run(ctx, dryRun)
		return captured("tombstone.purge", err)
	}

	reconcile := func(ctx context.Context, input map[string]any) error {
		site, err := siteFromInput(input)
		if err != nil {
			return err
		}
		_, err = gcw.Reconciler.ReconcileSite(ctx, site)
		return captured("reconcile.run", err)
	}

	return []worker.WorkflowDef{
		{
			Name:           worker.WorkflowGCSite,
			ConcurrencyKey: worker.ConcurrencyKeySite,
			EventTriggers:  []string{pg.TopicSiteChanged},
			Handler:        observeWorkflow(metrics, worker.WorkflowGCSite, gcSite),
		},
		{
			Name:    worker.WorkflowTombstonePurge,
			Cron:    []string{cronTombstonePurge},
			Handler: observeWorkflow(metrics, worker.WorkflowTombstonePurge, purge),
		},
		{
			Name:           worker.WorkflowReconcile,
			ConcurrencyKey: worker.ConcurrencyKeySite,
			EventTriggers:  []string{topicSiteReconcile},
			Handler:        observeWorkflow(metrics, worker.WorkflowReconcile, reconcile),
		},
	}
}

// siteFromInput extracts the "site" entry from a workflow input map. It
// returns an error when the key is absent, holds a non-string value, or holds
// the empty string; a nil map is treated like an empty one.
func siteFromInput(input map[string]any) (string, error) {
	if site, isString := input["site"].(string); isString && site != "" {
		return site, nil
	}
	return "", errors.New("workflow input missing site")
}

// registerGCWorkflows registers every definition produced by gcWorkflowDefs on
// rt, in order. It stops at the first registration error and returns it;
// definitions registered before the failure are left in place.
func registerGCWorkflows(rt *worker.Runtime, gcw *gcWiring, dryRun bool, metrics *worker.Metrics) error {
	defs := gcWorkflowDefs(gcw, dryRun, metrics)
	for i := range defs {
		if regErr := rt.Register(defs[i]); regErr != nil {
			return regErr
		}
	}
	return nil
}
Loading
Loading