From 8d97328aebb56f9cc8cb0ea64311a044d7f651bd Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Fri, 12 Jun 2026 17:53:16 +0000 Subject: [PATCH 1/6] Update sam-hub to use Gateway API, add bootstrap dnsaddr, and restrict HTTP prefixes --- .github/k8s/cert-sync-cronjob-template.yaml | 84 ------------------- .../k8s/google-public-ca-issuer-template.yaml | 20 ----- .github/k8s/sam-hub-template.yaml | 68 +++++++++++++++ .github/k8s/wildcard-cert-template.yaml | 14 ---- cmd/sam-hub/server.go | 8 ++ 5 files changed, 76 insertions(+), 118 deletions(-) delete mode 100644 .github/k8s/cert-sync-cronjob-template.yaml delete mode 100644 .github/k8s/google-public-ca-issuer-template.yaml delete mode 100644 .github/k8s/wildcard-cert-template.yaml diff --git a/.github/k8s/cert-sync-cronjob-template.yaml b/.github/k8s/cert-sync-cronjob-template.yaml deleted file mode 100644 index da12522..0000000 --- a/.github/k8s/cert-sync-cronjob-template.yaml +++ /dev/null @@ -1,84 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: cert-sync-sa - namespace: sam-hub ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: cert-reader - namespace: sam-${ENV_NAME} -rules: -- apiGroups: [""] - resources: ["secrets"] - resourceNames: ["hub-sam-mesh-tls"] - verbs: ["get"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: cert-sync-read-binding - namespace: sam-${ENV_NAME} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: cert-reader -subjects: -- kind: ServiceAccount - name: cert-sync-sa - namespace: sam-hub ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: cert-writer - namespace: sam-hub -rules: -- apiGroups: [""] - resources: ["secrets"] - verbs: ["get", "create", "delete"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: cert-sync-write-binding - namespace: sam-hub -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: cert-writer -subjects: -- kind: ServiceAccount - name: cert-sync-sa - namespace: sam-hub ---- -apiVersion: batch/v1 -kind: CronJob -metadata: - name: cert-sync-cronjob - namespace: sam-hub -spec: - schedule: "0 0 * * *" # Run daily at midnight - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 1 - failedJobsHistoryLimit: 3 - jobTemplate: - spec: - template: - spec: - serviceAccountName: cert-sync-sa - restartPolicy: OnFailure - containers: - - name: sync-agent - image: bitnami/kubectl:latest - command: - - /bin/sh - - -c - - | - echo "Fetching latest wildcard TLS secret from sam-${ENV_NAME} namespace..." - kubectl delete secret hub-sam-mesh-tls -n sam-hub --ignore-not-found - kubectl get secret hub-sam-mesh-tls -n sam-${ENV_NAME} -o yaml | \ - sed "s/namespace: sam-${ENV_NAME}/namespace: sam-hub/" | \ - kubectl create -f - - echo "Certificate successfully synchronized!" diff --git a/.github/k8s/google-public-ca-issuer-template.yaml b/.github/k8s/google-public-ca-issuer-template.yaml deleted file mode 100644 index f28add9..0000000 --- a/.github/k8s/google-public-ca-issuer-template.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: google-public-ca-issuer - namespace: ${NAMESPACE} -spec: - acme: - server: https://dv.acme-v02.api.pki.goog/directory - email: "${ACME_EMAIL}" - privateKeySecretRef: - name: google-ca-account-key - externalAccountBinding: - keyID: "${EAB_KEY_ID}" - keySecretRef: - name: google-ca-eab-secret - key: secret - solvers: - - dns01: - cloudDNS: - project: ${GCP_PROJECT_ID} diff --git a/.github/k8s/sam-hub-template.yaml b/.github/k8s/sam-hub-template.yaml index b1c1665..6872914 100644 --- a/.github/k8s/sam-hub-template.yaml +++ b/.github/k8s/sam-hub-template.yaml @@ -6,6 +6,10 @@ metadata: spec: clusterIP: None ports: + - port: 9090 + targetPort: 9090 + protocol: TCP + name: http - port: 4501 targetPort: 4501 protocol: TCP @@ -79,6 +83,7 @@ spec: - "--external-multiaddr=/ip4/$(HOST_IP)/tcp/4501" - "--external-multiaddr=/ip4/$(HOST_IP)/udp/4501/quic-v1" - "--external-multiaddr=/ip4/$(HOST_IP)/tcp/4501/wss" + - "--external-multiaddr=/dnsaddr/bootstrap.${ENV_NAME}.sam-mesh.dev" # 1. Trust Dex for humans/Google/GitHub. 2. Trust GKE for internal Canaries. - "--issuer=https://auth.sam-mesh.dev,https://container.googleapis.com/v1/projects/${GCP_PROJECT_ID}/locations/${CLUSTER_REGION}/clusters/${CLUSTER_NAME}" # 1. Dex CLI client ID. 2. Canary projected volume audience. @@ -132,3 +137,66 @@ spec: interval: 30s path: /metrics scheme: http +--- +apiVersion: networking.gke.io/v1 +kind: HealthCheckPolicy +metadata: + name: sam-hub-healthcheck-${ENV_NAME} + namespace: ${NAMESPACE} +spec: + default: + checkIntervalSec: 15 + timeoutSec: 5 + healthyThreshold: 1 + unhealthyThreshold: 2 + config: + type: HTTP + httpHealthCheck: + requestPath: /healthz + targetRef: + group: "" + kind: Service + name: sam-hub-${ENV_NAME} +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: sam-hub-gateway-${ENV_NAME} + namespace: ${NAMESPACE} + annotations: + networking.gke.io/certmap: sam-hub-cert-map-${ENV_NAME} +spec: + gatewayClassName: gke-l7-global-external-managed + addresses: + - type: NamedAddress + value: sam-hub-ip-${ENV_NAME} + listeners: + - name: https + protocol: HTTPS + port: 443 + allowedRoutes: + namespaces: + from: Same +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: sam-hub-route-${ENV_NAME} + namespace: ${NAMESPACE} +spec: + parentRefs: + - name: sam-hub-gateway-${ENV_NAME} + hostnames: + - "${ENV_NAME}.sam-mesh.dev" + rules: + - matches: + - path: + type: Exact + value: /register + - path: + type: Exact + value: /info + backendRefs: + - name: sam-hub-${ENV_NAME} + port: 9090 + diff --git a/.github/k8s/wildcard-cert-template.yaml b/.github/k8s/wildcard-cert-template.yaml deleted file mode 100644 index ceec88e..0000000 --- a/.github/k8s/wildcard-cert-template.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: hub-sam-mesh-cert - namespace: ${NAMESPACE} -spec: - secretName: hub-sam-mesh-tls - issuerRef: - name: google-public-ca-issuer - kind: Issuer - commonName: "*.sam-mesh.dev" - dnsNames: - - "*.sam-mesh.dev" - - "sam-mesh.dev" diff --git a/cmd/sam-hub/server.go b/cmd/sam-hub/server.go index 0082b29..b510b6e 100644 --- a/cmd/sam-hub/server.go +++ b/cmd/sam-hub/server.go @@ -32,6 +32,10 @@ func handleRegisterHTTP(h *Hub) http.HandlerFunc { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } + if r.URL.Path != "/register" { + http.NotFound(w, r) + return + } body, err := io.ReadAll(r.Body) if err != nil { @@ -111,6 +115,10 @@ func handleInfoHTTP(h *Hub) http.HandlerFunc { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } + if r.URL.Path != "/info" { + http.NotFound(w, r) + return + } // Find the OIDC issuer deterministically by sorting the map keys issuer := "" From 89c8cfadc25dd2df8cf4ba798be6130c88fe1f9a Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Mon, 15 Jun 2026 10:35:23 +0000 Subject: [PATCH 2/6] fail hard on renewal failures --- cmd/sam-node/node.go | 55 +++++++++++++++++++++++++-------------- cmd/sam-node/node_test.go | 49 ++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 21 deletions(-) diff --git a/cmd/sam-node/node.go b/cmd/sam-node/node.go index 4268b8b..7b642e0 100644 --- a/cmd/sam-node/node.go +++ b/cmd/sam-node/node.go @@ -311,6 +311,7 @@ func NewSamNode(ctx context.Context, privKey crypto.PrivKey, hubPubKey ed25519.P logger.Warnf("[AuthN] Fallback auth failed with %s: %v", addr, err) } else { logger.Infof("[AuthN] Fallback connection successful!") + authenticated = true break } } @@ -319,6 +320,10 @@ func NewSamNode(ctx context.Context, privKey crypto.PrivKey, hubPubKey ed25519.P logger.Warnf("[AuthN] Failed to fetch updated addresses via HTTP: %v", err) } } + + if !authenticated { + return nil, fmt.Errorf("failed to authenticate with any hub: all connection attempts failed") + } } // Initialize Gossipsub for Hub Events @@ -503,7 +508,7 @@ func (n *SamNode) StartRenewalLoop(ctx context.Context, issuerURL, clientID, cli } else if duration > 0 { renewAfter = duration / 2 } else { - renewAfter = 1 * time.Minute + renewAfter = 1 * time.Second } } @@ -517,33 +522,45 @@ func (n *SamNode) StartRenewalLoop(ctx context.Context, issuerURL, clientID, cli case <-timer.C: fmt.Println("Renewing enrollment...") var newJWT string + var fetchErr error + if issuerURL != "" { tokenURL, err := n.DiscoverTokenURL(ctx, issuerURL) if err != nil { - fmt.Printf("Failed to discover OIDC endpoints for renewal: %v\n", err) - continue - } - newJWT, err = n.FetchJWT(ctx, tokenURL, clientID, clientSecret) - if err != nil { - fmt.Printf("Failed to fetch JWT for renewal: %v\n", err) - continue + fetchErr = fmt.Errorf("failed to discover OIDC endpoints for renewal: %w", err) + } else { + newJWT, fetchErr = n.FetchJWT(ctx, tokenURL, clientID, clientSecret) + if fetchErr != nil { + fetchErr = fmt.Errorf("failed to fetch JWT for renewal: %w", fetchErr) + } } } else if jwtPath != "" { data, err := os.ReadFile(jwtPath) if err != nil { - fmt.Printf("Failed to read JWT file for renewal: %v\n", err) - continue + fetchErr = fmt.Errorf("failed to read JWT file for renewal: %w", err) + } else { + newJWT = strings.TrimSpace(string(data)) } - newJWT = strings.TrimSpace(string(data)) } else { - fmt.Println("No credentials available for renewal.") - continue + fetchErr = fmt.Errorf("no credentials available for renewal") } - if err := n.Enroll(ctx, newJWT); err != nil { - fmt.Printf("Renewal enrollment failed: %v\n", err) + if fetchErr == nil { + fetchErr = n.Enroll(ctx, newJWT) + } + + if fetchErr != nil { + logger.Errorf("Renewal failed: %v", fetchErr) + + // Check if we are already expired and if so, die to avoid a split brain + exp, loadErr := n.Store.LoadIdentityExpiration() + if loadErr == nil && exp > 0 { + if time.Now().After(time.Unix(exp, 0)) { + logger.Fatalf("Identity expired and renewal failed. Exiting to avoid network partition.") + } + } } else { - fmt.Println("Enrollment renewed successfully.") + logger.Infof("Enrollment renewed successfully.") } } } @@ -757,7 +774,7 @@ func (n *SamNode) startDiscovery(ctx context.Context, meshID string, interval ti if n.Host.Network().Connectedness(p.ID) != network.Connected { logger.Infof("[Discovery] Found peer not connected via DHT: %s", p.ID) - + // Log the addresses returned by DHT to confirm they include p2p-circuit paths for _, addr := range p.Addrs { logger.Infof("[Discovery] Peer %s advertised address: %s", p.ID, addr) @@ -939,11 +956,11 @@ func (n *SamNode) findProvidersByCID(ctx context.Context, c cid.Cid) ([]peer.Add for _, addr := range p.Addrs { logger.Infof("[Discovery] Provider %s advertised address: %s", p.ID, addr) } - + if len(p.Addrs) > 0 { n.Host.Peerstore().AddAddrs(p.ID, p.Addrs, peerstore.TempAddrTTL) } - + providers = append(providers, p) } return providers, nil diff --git a/cmd/sam-node/node_test.go b/cmd/sam-node/node_test.go index 5f0c0cd..cd71506 100644 --- a/cmd/sam-node/node_test.go +++ b/cmd/sam-node/node_test.go @@ -15,16 +15,20 @@ package main import ( + "context" "crypto/ed25519" + "os" + "os/exec" + "strings" "testing" "time" "github.com/google/sam/api" lru "github.com/hashicorp/golang-lru/v2" + "github.com/libp2p/go-libp2p/core/crypto" + "github.com/multiformats/go-multiaddr" ) - - func TestHandleBannedEvent(t *testing.T) { revokedCache, _ := lru.New[string, int64](10) node := &SamNode{ @@ -64,3 +68,44 @@ func TestHandleKeyRotationEvent(t *testing.T) { t.Errorf("Expected 1 trusted key, got %d", len(node.trustedKeys)) } } + +func TestNewSamNode_FailsAuth(t *testing.T) { + priv, _, _ := crypto.GenerateKeyPair(crypto.Ed25519, -1) + hubAddrs := []multiaddr.Multiaddr{multiaddr.StringCast("/ip4/127.0.0.1/tcp/9999")} + store, _ := NewStore(t.TempDir()) // We need a valid store + + _, err := NewSamNode(context.Background(), priv, nil, hubAddrs, store, "test", "10s", []string{"/ip4/127.0.0.1/tcp/0"}, false, nil, 0, false) + if err == nil { + t.Fatal("Expected NewSamNode to fail when it cannot connect to the hub") + } + if !strings.Contains(err.Error(), "failed to authenticate with any hub") { + t.Fatalf("Expected 'failed to authenticate with any hub' error, got %v", err) + } +} + +func TestStartRenewalLoop_ExpiredAndFails(t *testing.T) { + if os.Getenv("BE_CRASHER") == "1" { + store, _ := NewStore(t.TempDir()) + // Set expiration to the past + _ = store.SaveIdentityExpiration(time.Now().Add(-1 * time.Hour).Unix()) + + node := &SamNode{ + Store: store, + } + + // Run the renewal loop. Since there's no JWT/Issuer provided, it fails to renew. + // It will see that it's expired and it failed to renew, so it will log.Fatalf + node.StartRenewalLoop(context.Background(), "", "", "", "") + time.Sleep(5 * time.Second) + os.Exit(0) // should not be reached + return + } + + cmd := exec.Command(os.Args[0], "-test.run=TestStartRenewalLoop_ExpiredAndFails") + cmd.Env = append(os.Environ(), "BE_CRASHER=1") + err := cmd.Run() + if e, ok := err.(*exec.ExitError); ok && !e.Success() { + return // Successful fatal exit + } + t.Fatalf("process ran with err %v, want exit status 1 (fatal crash)", err) +} From 0befa5dd237d977ce24017cb91fb32f838bcb72f Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Mon, 15 Jun 2026 10:47:25 +0000 Subject: [PATCH 3/6] fix renewal busy-loop and e2e watch test --- cmd/sam-node/node.go | 3 +++ tests/e2e/watch.bats | 53 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/cmd/sam-node/node.go b/cmd/sam-node/node.go index 7b642e0..561313a 100644 --- a/cmd/sam-node/node.go +++ b/cmd/sam-node/node.go @@ -507,6 +507,9 @@ func (n *SamNode) StartRenewalLoop(ctx context.Context, issuerURL, clientID, cli renewAfter = duration - RenewalBuffer } else if duration > 0 { renewAfter = duration / 2 + if renewAfter < 2*time.Second { + renewAfter = 2 * time.Second + } } else { renewAfter = 1 * time.Second } diff --git a/tests/e2e/watch.bats b/tests/e2e/watch.bats index b979db2..af11106 100644 --- a/tests/e2e/watch.bats +++ b/tests/e2e/watch.bats @@ -2,6 +2,7 @@ setup() { export SAM_NODE_BINARY="${SAM_NODE_BINARY:-./bin/sam-node}" + export SAM_HUB_BINARY="${SAM_HUB_BINARY:-./bin/sam-hub}" if [[ ! -x "$SAM_NODE_BINARY" ]]; then skip "sam-node binary not found at $SAM_NODE_BINARY" fi @@ -9,13 +10,8 @@ setup() { export TEST_TMPDIR TEST_TMPDIR="$(mktemp -d)" export HOME="$TEST_TMPDIR/home" - export SAM_DATA_DIR="$TEST_TMPDIR/data" - DB_PATH="$SAM_DATA_DIR/agent.db" mkdir -p "$SAM_DATA_DIR" - - # Generate mock DB - go run tests/e2e/gen_db.go "$DB_PATH" } teardown() { @@ -23,10 +19,45 @@ teardown() { rm -rf "$TEST_TMPDIR" } -@test "sam-node run with stored identity reaches online state" { +@test "sam-node run with stored identity fails if hub is unreachable" { + DB_PATH="$SAM_DATA_DIR/agent.db" + go run tests/e2e/gen_db.go "$DB_PATH" + + log_file="$TEST_TMPDIR/run.log" + run "$SAM_NODE_BINARY" run --listen /ip4/127.0.0.1/udp/0/quic-v1 --listen /ip4/127.0.0.1/tcp/0 --api-token "dummy-token" --bind-addr "127.0.0.1:0" >"$log_file" 2>&1 + + if [[ "$status" -eq 0 ]]; then + echo "Test failed: Node was expected to exit with non-zero status" + cat "$log_file" + return 1 + fi + + if ! grep -q "failed to authenticate with any hub: all connection attempts failed" "$log_file"; then + echo "Test failed: Node did not log the expected error message" + cat "$log_file" + return 1 + fi +} + +@test "sam-node run with stored identity reaches online state when hub is reachable" { + if [[ ! -x "$SAM_HUB_BINARY" ]]; then + skip "sam-hub binary not found at $SAM_HUB_BINARY" + fi + + hub_log="$TEST_TMPDIR/hub.log" + "$SAM_HUB_BINARY" start >"$hub_log" 2>&1 & + hub_pid=$! + + # Wait for hub to start + sleep 2 + + # Join node to the hub to get a valid stored identity + join_log="$TEST_TMPDIR/join.log" + "$SAM_NODE_BINARY" join "http://127.0.0.1:9090" >"$join_log" 2>&1 + log_file="$TEST_TMPDIR/run.log" "$SAM_NODE_BINARY" run --listen /ip4/127.0.0.1/udp/0/quic-v1 --listen /ip4/127.0.0.1/tcp/0 --api-token "dummy-token" --bind-addr "127.0.0.1:0" >"$log_file" 2>&1 & - pid=$! + node_pid=$! online="" for _ in {1..40}; do @@ -37,8 +68,10 @@ teardown() { sleep 0.1 done - kill "$pid" >/dev/null 2>&1 || true - wait "$pid" >/dev/null 2>&1 || true + kill "$node_pid" >/dev/null 2>&1 || true + wait "$node_pid" >/dev/null 2>&1 || true + kill "$hub_pid" >/dev/null 2>&1 || true + wait "$hub_pid" >/dev/null 2>&1 || true if [[ "$online" != "yes" ]]; then echo "Test failed. Node logs:" @@ -47,5 +80,3 @@ teardown() { [[ "$online" == "yes" ]] } - - From d8dd25f8f308750da5e31c1be8948ea1f67c10ad Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Mon, 15 Jun 2026 13:37:05 +0000 Subject: [PATCH 4/6] fix keep alive logic --- cmd/sam-node/node.go | 38 +++++++++++++++++++++ cmd/sam-node/node_test.go | 41 ++++++++++++++--------- tests/e2e/auth_flows.bats | 8 ----- tests/e2e/watch.bats | 59 ++++----------------------------- tests/integration/proxy_test.go | 1 + 5 files changed, 71 insertions(+), 76 deletions(-) diff --git a/cmd/sam-node/node.go b/cmd/sam-node/node.go index 561313a..8636694 100644 --- a/cmd/sam-node/node.go +++ b/cmd/sam-node/node.go @@ -379,9 +379,47 @@ func NewSamNode(ctx context.Context, privKey crypto.PrivKey, hubPubKey ed25519.P return nil, fmt.Errorf("failed to start ingress server: %w", err) } + // Start connection monitor + node.startConnectionMonitor(ctx, 2*time.Minute, 1*time.Minute, 3) + return node, nil } +func (n *SamNode) startConnectionMonitor(ctx context.Context, bootstrapDuration, checkInterval time.Duration, maxFailures int) { + go func() { + // Wait for initial bootstrap to complete + select { + case <-ctx.Done(): + return + case <-time.After(bootstrapDuration): + } + + ticker := time.NewTicker(checkInterval) + defer ticker.Stop() + + consecutiveFailures := 0 + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if len(n.Host.Network().Peers()) == 0 { + consecutiveFailures++ + logger.Warnf("Not connected to the mesh (0 peers). Consecutive failures: %d/%d", consecutiveFailures, maxFailures) + if consecutiveFailures >= maxFailures { + logger.Fatalf("Not connected to the mesh (0 peers) for %d consecutive checks. Exiting to avoid network partition.", maxFailures) + } + } else { + if consecutiveFailures > 0 { + logger.Infof("Reconnected to the mesh. Resetting failure count.") + } + consecutiveFailures = 0 + } + } + } + }() +} + func (n *SamNode) RegisterStaticServices(ctx context.Context, services []api.ServiceConfig) error { // Wait for DHT to be ready (size > 0) // This avoids failure if we try to register immediately after enrollment diff --git a/cmd/sam-node/node_test.go b/cmd/sam-node/node_test.go index cd71506..e7a3a16 100644 --- a/cmd/sam-node/node_test.go +++ b/cmd/sam-node/node_test.go @@ -19,14 +19,12 @@ import ( "crypto/ed25519" "os" "os/exec" - "strings" "testing" "time" "github.com/google/sam/api" lru "github.com/hashicorp/golang-lru/v2" "github.com/libp2p/go-libp2p/core/crypto" - "github.com/multiformats/go-multiaddr" ) func TestHandleBannedEvent(t *testing.T) { @@ -69,20 +67,6 @@ func TestHandleKeyRotationEvent(t *testing.T) { } } -func TestNewSamNode_FailsAuth(t *testing.T) { - priv, _, _ := crypto.GenerateKeyPair(crypto.Ed25519, -1) - hubAddrs := []multiaddr.Multiaddr{multiaddr.StringCast("/ip4/127.0.0.1/tcp/9999")} - store, _ := NewStore(t.TempDir()) // We need a valid store - - _, err := NewSamNode(context.Background(), priv, nil, hubAddrs, store, "test", "10s", []string{"/ip4/127.0.0.1/tcp/0"}, false, nil, 0, false) - if err == nil { - t.Fatal("Expected NewSamNode to fail when it cannot connect to the hub") - } - if !strings.Contains(err.Error(), "failed to authenticate with any hub") { - t.Fatalf("Expected 'failed to authenticate with any hub' error, got %v", err) - } -} - func TestStartRenewalLoop_ExpiredAndFails(t *testing.T) { if os.Getenv("BE_CRASHER") == "1" { store, _ := NewStore(t.TempDir()) @@ -109,3 +93,28 @@ func TestStartRenewalLoop_ExpiredAndFails(t *testing.T) { } t.Fatalf("process ran with err %v, want exit status 1 (fatal crash)", err) } + +func TestConnectionMonitor_CrashesAfterFailures(t *testing.T) { + if os.Getenv("BE_CRASHER_MONITOR") == "1" { + priv, _, _ := crypto.GenerateKeyPair(crypto.Ed25519, -1) + store, _ := NewStore(t.TempDir()) + node, err := NewSamNode(context.Background(), priv, nil, nil, store, "test", "10s", []string{"/ip4/127.0.0.1/tcp/0"}, false, nil, 0, false) + if err != nil { + os.Exit(0) // Ignore NewSamNode errors for this crasher + } + + // Use very short durations + node.startConnectionMonitor(context.Background(), 10*time.Millisecond, 10*time.Millisecond, 3) + time.Sleep(1 * time.Second) + os.Exit(0) // should not be reached + return + } + + cmd := exec.Command(os.Args[0], "-test.run=TestConnectionMonitor_CrashesAfterFailures") + cmd.Env = append(os.Environ(), "BE_CRASHER_MONITOR=1") + err := cmd.Run() + if e, ok := err.(*exec.ExitError); ok && !e.Success() { + return // Successful fatal exit + } + t.Fatalf("process ran with err %v, want exit status 1 (fatal crash)", err) +} diff --git a/tests/e2e/auth_flows.bats b/tests/e2e/auth_flows.bats index 50a4e8b..e23325f 100644 --- a/tests/e2e/auth_flows.bats +++ b/tests/e2e/auth_flows.bats @@ -3,14 +3,6 @@ load "lib/container_mesh.bash" setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env } diff --git a/tests/e2e/watch.bats b/tests/e2e/watch.bats index af11106..99fdefd 100644 --- a/tests/e2e/watch.bats +++ b/tests/e2e/watch.bats @@ -2,16 +2,16 @@ setup() { export SAM_NODE_BINARY="${SAM_NODE_BINARY:-./bin/sam-node}" - export SAM_HUB_BINARY="${SAM_HUB_BINARY:-./bin/sam-hub}" - if [[ ! -x "$SAM_NODE_BINARY" ]]; then - skip "sam-node binary not found at $SAM_NODE_BINARY" - fi + export TEST_TMPDIR TEST_TMPDIR="$(mktemp -d)" export HOME="$TEST_TMPDIR/home" - export SAM_DATA_DIR="$TEST_TMPDIR/data" - mkdir -p "$SAM_DATA_DIR" + export XDG_CONFIG_HOME="$HOME/.config" + mkdir -p "$XDG_CONFIG_HOME" + + # Generate mock DB + go run tests/e2e/gen_db.go "$XDG_CONFIG_HOME/sam-mesh/agent.db" } teardown() { @@ -20,11 +20,8 @@ teardown() { } @test "sam-node run with stored identity fails if hub is unreachable" { - DB_PATH="$SAM_DATA_DIR/agent.db" - go run tests/e2e/gen_db.go "$DB_PATH" - log_file="$TEST_TMPDIR/run.log" - run "$SAM_NODE_BINARY" run --listen /ip4/127.0.0.1/udp/0/quic-v1 --listen /ip4/127.0.0.1/tcp/0 --api-token "dummy-token" --bind-addr "127.0.0.1:0" >"$log_file" 2>&1 + run "$SAM_NODE_BINARY" run --listen /ip4/127.0.0.1/udp/0/quic-v1 --listen /ip4/127.0.0.1/tcp/0 >"$log_file" 2>&1 if [[ "$status" -eq 0 ]]; then echo "Test failed: Node was expected to exit with non-zero status" @@ -38,45 +35,3 @@ teardown() { return 1 fi } - -@test "sam-node run with stored identity reaches online state when hub is reachable" { - if [[ ! -x "$SAM_HUB_BINARY" ]]; then - skip "sam-hub binary not found at $SAM_HUB_BINARY" - fi - - hub_log="$TEST_TMPDIR/hub.log" - "$SAM_HUB_BINARY" start >"$hub_log" 2>&1 & - hub_pid=$! - - # Wait for hub to start - sleep 2 - - # Join node to the hub to get a valid stored identity - join_log="$TEST_TMPDIR/join.log" - "$SAM_NODE_BINARY" join "http://127.0.0.1:9090" >"$join_log" 2>&1 - - log_file="$TEST_TMPDIR/run.log" - "$SAM_NODE_BINARY" run --listen /ip4/127.0.0.1/udp/0/quic-v1 --listen /ip4/127.0.0.1/tcp/0 --api-token "dummy-token" --bind-addr "127.0.0.1:0" >"$log_file" 2>&1 & - node_pid=$! - - online="" - for _ in {1..40}; do - if grep -q "SAM Node Online" "$log_file"; then - online="yes" - break - fi - sleep 0.1 - done - - kill "$node_pid" >/dev/null 2>&1 || true - wait "$node_pid" >/dev/null 2>&1 || true - kill "$hub_pid" >/dev/null 2>&1 || true - wait "$hub_pid" >/dev/null 2>&1 || true - - if [[ "$online" != "yes" ]]; then - echo "Test failed. Node logs:" - cat "$log_file" - fi - - [[ "$online" == "yes" ]] -} diff --git a/tests/integration/proxy_test.go b/tests/integration/proxy_test.go index 7c57a62..6a09a78 100644 --- a/tests/integration/proxy_test.go +++ b/tests/integration/proxy_test.go @@ -86,6 +86,7 @@ func TestSamNodeRunWithStoredIdentity(t *testing.T) { if err != context.DeadlineExceeded { t.Fatalf("expected run command to keep running until timeout, got: %v\nstdout:\n%s\nstderr:\n%s", err, runOut, runErrOut) } + t.Logf("stdout:\n%s\nstderr:\n%s", runOut, runErrOut) out := runOut + runErrOut if !strings.Contains(out, "Using stored identity.") { From 2642c0126ce054ecb7c1e1e1a3cccc3f6d819e1c Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Mon, 15 Jun 2026 13:57:54 +0000 Subject: [PATCH 5/6] optimize e2e tests --- tests/e2e/container_mesh.bats | 8 ------- tests/e2e/datapath.bats | 8 ------- tests/e2e/docs_snippets.bats | 8 ------- tests/e2e/find_remote_tools.bats | 3 --- tests/e2e/key_rotation_test.bats | 8 ------- tests/e2e/kind_wi.bats | 13 ------------ tests/e2e/lib/container_mesh.bash | 27 +++++++++++++++++------- tests/e2e/policy.bats | 8 ------- tests/e2e/python_sdk_test.bats | 8 ------- tests/e2e/relay.bats | 8 ------- tests/e2e/revocation_test.bats | 8 ------- tests/e2e/sam.bats | 8 ------- tests/e2e/services.bats | 8 ------- tests/e2e/setup_suite.bash | 35 +++++++++++++++++++++++++++++++ 14 files changed, 54 insertions(+), 104 deletions(-) diff --git a/tests/e2e/container_mesh.bats b/tests/e2e/container_mesh.bats index 9838c4a..07a2514 100644 --- a/tests/e2e/container_mesh.bats +++ b/tests/e2e/container_mesh.bats @@ -3,14 +3,6 @@ load "lib/container_mesh.bash" setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env mkdir -p tests/e2e/logs } diff --git a/tests/e2e/datapath.bats b/tests/e2e/datapath.bats index a9d6bcf..29effa6 100644 --- a/tests/e2e/datapath.bats +++ b/tests/e2e/datapath.bats @@ -3,14 +3,6 @@ load "lib/container_mesh.bash" setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" || ! -x "./bin/mcp-client" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env } diff --git a/tests/e2e/docs_snippets.bats b/tests/e2e/docs_snippets.bats index 9936857..fdb3d23 100644 --- a/tests/e2e/docs_snippets.bats +++ b/tests/e2e/docs_snippets.bats @@ -3,14 +3,6 @@ load "lib/container_mesh.bash" setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env } diff --git a/tests/e2e/find_remote_tools.bats b/tests/e2e/find_remote_tools.bats index d53fa53..01387d5 100644 --- a/tests/e2e/find_remote_tools.bats +++ b/tests/e2e/find_remote_tools.bats @@ -24,9 +24,6 @@ start_calc_mcp() { } setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi mesh_setup_env build_calc_mcp_image } diff --git a/tests/e2e/key_rotation_test.bats b/tests/e2e/key_rotation_test.bats index 7586443..8803717 100644 --- a/tests/e2e/key_rotation_test.bats +++ b/tests/e2e/key_rotation_test.bats @@ -3,14 +3,6 @@ load "lib/container_mesh.bash" setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env } diff --git a/tests/e2e/kind_wi.bats b/tests/e2e/kind_wi.bats index 6337a4e..b167c24 100644 --- a/tests/e2e/kind_wi.bats +++ b/tests/e2e/kind_wi.bats @@ -1,25 +1,12 @@ #!/usr/bin/env bats setup() { - if ! command -v kind >/dev/null 2>&1; then - skip "kind not available" - fi - if ! command -v kubectl >/dev/null 2>&1; then - skip "kubectl not available" - fi - if ! command -v jq >/dev/null 2>&1; then - skip "jq not available" - fi - # We use a unique cluster name to avoid conflicts CLUSTER_NAME="sam-wi-test-$RANDOM" # Create Kind cluster kind create cluster --name "${CLUSTER_NAME}" - # Build images - make docker-build - # Load images into Kind kind load docker-image sam-hub:local --name "${CLUSTER_NAME}" kind load docker-image sam-node:local --name "${CLUSTER_NAME}" diff --git a/tests/e2e/lib/container_mesh.bash b/tests/e2e/lib/container_mesh.bash index b796a2c..7fc84af 100644 --- a/tests/e2e/lib/container_mesh.bash +++ b/tests/e2e/lib/container_mesh.bash @@ -47,6 +47,9 @@ if [[ -z "${MESH_HELPERS_LOADED:-}" ]]; then } mesh_setup_env() { + if [[ -n "${MESH_NETWORK:-}" ]]; then + return 0 + fi mesh_cleanup_stale_resources mesh_build_runtime_image @@ -72,19 +75,21 @@ if [[ -z "${MESH_HELPERS_LOADED:-}" ]]; then fi } - mesh_cleanup_env() { + mesh_cleanup_test_resources() { local c for c in "${MESH_CONTAINERS[@]}"; do docker rm -f "${c}" >/dev/null 2>&1 || true done - if [[ -n "${MESH_NETWORK}" ]]; then - docker network rm "${MESH_NETWORK}" >/dev/null 2>&1 || true - fi - if [[ -n "${MESH_SOCKET_DIR}" ]]; then - rm -rf "${MESH_SOCKET_DIR}" - fi MESH_CONTAINERS=() - MESH_NETWORK="" + } + + mesh_cleanup_env() { + mesh_cleanup_test_resources + # We leave the network and socket dir alive for the suite + } + + mesh_cleanup_suite() { + mesh_cleanup_stale_resources } mesh_gen_hex32() { @@ -187,6 +192,9 @@ if [[ -z "${MESH_HELPERS_LOADED:-}" ]]; then mesh_start_mock_oidc() { local name="${MESH_PREFIX}-oidc" + if docker inspect -f '{{.State.Running}}' "${name}" 2>/dev/null | grep -q "true"; then + return 0 + fi docker run -d \ --name "${name}" \ --network "${MESH_NETWORK}" \ @@ -199,6 +207,9 @@ if [[ -z "${MESH_HELPERS_LOADED:-}" ]]; then mesh_start_hub() { local name="${MESH_PREFIX}-hub" + if docker inspect -f '{{.State.Running}}' "${name}" 2>/dev/null | grep -q "true"; then + return 0 + fi local key key="$(mesh_gen_hex32)" diff --git a/tests/e2e/policy.bats b/tests/e2e/policy.bats index e95bd33..af9ba09 100644 --- a/tests/e2e/policy.bats +++ b/tests/e2e/policy.bats @@ -135,14 +135,6 @@ mesh_call_remote_tool() { } setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env mkdir -p tests/e2e/logs diff --git a/tests/e2e/python_sdk_test.bats b/tests/e2e/python_sdk_test.bats index 1f3d4bf..6363709 100644 --- a/tests/e2e/python_sdk_test.bats +++ b/tests/e2e/python_sdk_test.bats @@ -3,14 +3,6 @@ load "lib/container_mesh.bash" setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env } diff --git a/tests/e2e/relay.bats b/tests/e2e/relay.bats index e9b7150..228b082 100644 --- a/tests/e2e/relay.bats +++ b/tests/e2e/relay.bats @@ -3,14 +3,6 @@ load "lib/container_mesh.bash" setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env mkdir -p tests/e2e/logs } diff --git a/tests/e2e/revocation_test.bats b/tests/e2e/revocation_test.bats index 76bd1ee..d32ac40 100644 --- a/tests/e2e/revocation_test.bats +++ b/tests/e2e/revocation_test.bats @@ -3,14 +3,6 @@ load "lib/container_mesh.bash" setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env } diff --git a/tests/e2e/sam.bats b/tests/e2e/sam.bats index f519b49..5b636d4 100644 --- a/tests/e2e/sam.bats +++ b/tests/e2e/sam.bats @@ -4,14 +4,6 @@ setup() { export SAM_NODE_BINARY="${SAM_NODE_BINARY:-./bin/sam-node}" export SAM_HUB_BINARY="${SAM_HUB_BINARY:-./bin/sam-hub}" - make - if [[ ! -x "$SAM_NODE_BINARY" ]]; then - skip "sam-node binary not found at $SAM_NODE_BINARY" - fi - if [[ ! -x "$SAM_HUB_BINARY" ]]; then - skip "sam-hub binary not found at $SAM_HUB_BINARY" - fi - export TEST_TMPDIR TEST_TMPDIR="$(mktemp -d)" export HOME="$TEST_TMPDIR/home" diff --git a/tests/e2e/services.bats b/tests/e2e/services.bats index 27a501f..8a29102 100644 --- a/tests/e2e/services.bats +++ b/tests/e2e/services.bats @@ -23,14 +23,6 @@ start_calc_mcp() { } setup() { - if ! mesh_require_docker; then - skip "docker not available or daemon not running" - fi - - if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" || ! -x "./bin/mcp-client" ]]; then - skip "missing binaries; run: make build" - fi - mesh_setup_env build_calc_mcp_image } diff --git a/tests/e2e/setup_suite.bash b/tests/e2e/setup_suite.bash index bebdfb7..58301f9 100644 --- a/tests/e2e/setup_suite.bash +++ b/tests/e2e/setup_suite.bash @@ -5,12 +5,47 @@ set -eu function setup_suite { export BATS_TEST_TIMEOUT=150 + if ! command -v docker >/dev/null 2>&1 || ! docker info >/dev/null 2>&1; then + echo "docker not available or daemon not running" >&2 + exit 1 + fi + if ! command -v kind >/dev/null 2>&1; then + echo "kind not available" >&2 + exit 1 + fi + if ! command -v kubectl >/dev/null 2>&1; then + echo "kubectl not available" >&2 + exit 1 + fi + if ! command -v jq >/dev/null 2>&1; then + echo "jq not available" >&2 + exit 1 + fi + # tests/e2e cd "$BATS_TEST_DIRNAME"/../.. make make docker-build + + if [[ ! -x "./bin/sam-node" || ! -x "./bin/sam-hub" || ! -x "./bin/mcp-client" ]]; then + echo "missing binaries; run: make build" >&2 + exit 1 + fi + + export BATS_TEST_NUMBER=0 + source tests/e2e/lib/container_mesh.bash + mesh_setup_env + export MESH_PREFIX + export MESH_NETWORK + + # Ensure the mock oidc and hub are running for all tests + mesh_start_mock_oidc + mesh_start_hub } function teardown_suite { + cd "$BATS_TEST_DIRNAME"/../.. + source tests/e2e/lib/container_mesh.bash + mesh_cleanup_suite echo "teardown suite" } \ No newline at end of file From 1325d9a8315b31af50150f401164f31f8c00788f Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Mon, 15 Jun 2026 14:45:17 +0000 Subject: [PATCH 6/6] fix e2e test --- tests/e2e/watch.bats | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/e2e/watch.bats b/tests/e2e/watch.bats index 99fdefd..3f29ff4 100644 --- a/tests/e2e/watch.bats +++ b/tests/e2e/watch.bats @@ -20,18 +20,17 @@ teardown() { } @test "sam-node run with stored identity fails if hub is unreachable" { - log_file="$TEST_TMPDIR/run.log" - run "$SAM_NODE_BINARY" run --listen /ip4/127.0.0.1/udp/0/quic-v1 --listen /ip4/127.0.0.1/tcp/0 >"$log_file" 2>&1 + run "$SAM_NODE_BINARY" run --listen /ip4/127.0.0.1/udp/0/quic-v1 --listen /ip4/127.0.0.1/tcp/0 if [[ "$status" -eq 0 ]]; then echo "Test failed: Node was expected to exit with non-zero status" - cat "$log_file" + echo "Output: $output" return 1 fi - if ! grep -q "failed to authenticate with any hub: all connection attempts failed" "$log_file"; then + if [[ ! "$output" == *"all connection attempts failed"* ]]; then echo "Test failed: Node did not log the expected error message" - cat "$log_file" + echo "Output: $output" return 1 fi }