From 49d10efea830f55f6486f4cc32b1c5a81ea998b0 Mon Sep 17 00:00:00 2001
From: Steven Normore <snormore@gmail.com>
Date: Sat, 28 Mar 2026 15:49:58 -0400
Subject: [PATCH] client: support incremental multicast group subscription

Remove the guard in the CLI that blocked `doublezero connect multicast`
when a multicast service was already running. The onchain subscribe
instruction, CLI's find_or_create_user_and_subscribe, and daemon's
InfraEqual + UpdateGroups path already support incremental group
additions without tearing down the tunnel.

Also improve e2e test ergonomics:
- Add Make targets: test-debug, test-nobuild, test-keep, test-cleanup
- Rename DZ_E2E_DEBUG env var to DEBUG for consistency with shreds repo
- Update dev/e2e-test.sh and dev/e2e-until-fail.sh to use Make
- Update CLAUDE.md and DEVELOPMENT.md with new Make target usage
---
 CHANGELOG.md                             |  1 +
 CLAUDE.md                                | 17 ++++++--
 DEVELOPMENT.md                           | 32 ++++++++++------
 Makefile                                 | 41 ++++++++++++++++++--
 client/doublezero/src/command/connect.rs | 17 --------
 dev/e2e-test.sh                          |  6 +--
 dev/e2e-until-fail.sh                    |  2 +-
 e2e/Makefile                             | 49 +++++++++++++++++-------
 e2e/main_test.go                         | 28 +++++++++++++-
 e2e/multicast_test.go                    | 18 +++++++--
 e2e/qa_multicast_test.go                 | 29 +++++++-------
 11 files changed, 165 insertions(+), 75 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a8f6592bf9..999ff521f4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file.
 ### Changes
 
 - CLI
+  - Allow incremental multicast group addition without disconnecting
   - Reset SIGPIPE to SIG_DFL at the start of main() in all 3 CLI binaries (doublezero, doublezero-geolocation, doublezero-admin) so the process exits silently like standard CLI tools
 - SDK
   - Add Go SDK for shred subscription program with read-only account deserialization (epoch state, seat assignments, pricing, settlement, validator client rewards), PDA derivation helpers, RPC fetchers, compatibility tests, and a fetch example CLI
diff --git a/CLAUDE.md b/CLAUDE.md
index cb70c4a153..0c7a7e6107 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -257,11 +257,20 @@ Only use `dev/dzctl destroy -y` when you need a completely clean slate (e.g., le
 
 ```bash
 # Run a specific test (preferred)
-go test -tags e2e -run TestE2E_Multicast_Publisher -v -count=1 ./e2e/...
+make e2e-test RUN=TestE2E_Multicast_Publisher
 
-# Run all tests (requires high-memory machine)
-dev/e2e-test.sh
+# Run with debug logging
+make e2e-test-debug RUN=TestE2E_Multicast_Publisher
+
+# Skip docker image rebuild
+make e2e-test-nobuild RUN=TestE2E_Multicast_Publisher
 
 # Keep containers after test completion/failure for debugging
-TESTCONTAINERS_RYUK_DISABLED=true go test -tags e2e -run TestE2E_Multicast_Publisher -v -count=1 ./e2e/...
+make e2e-test-keep RUN=TestE2E_Multicast_Publisher
+
+# Run all tests (requires high-memory machine)
+make e2e-test
+
+# Clean up leftover containers
+make e2e-test-cleanup
 ```
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 9a85a00a0e..22dd19abd4 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -55,23 +55,31 @@ The required image (`ghcr.io/malbeclabs/ceos:4.33.1F`) will be pulled automatica
 End-to-end tests exercise the full DoubleZero stack — smartcontracts, controller, activator, client, and device agents — all running in isolated Docker containers.
 
 ```bash
-# Run a specific E2E test directly
-cd e2e/
-go test -tags e2e -v -run TestE2E_MultiClient
+# Run a specific test
+make e2e-test RUN=TestE2E_MultiClient
 
-# Or use the helper script
-dev/e2e-test.sh TestE2E_MultiClient
+# Run with debug logging
+make e2e-test-debug RUN=TestE2E_MultiClient
+
+# Skip docker image rebuild (faster iteration)
+make e2e-test-nobuild RUN=TestE2E_MultiClient
+
+# Keep containers after test for debugging
+make e2e-test-keep RUN=TestE2E_MultiClient
+
+# Both: skip rebuild + keep containers
+make e2e-test-keep-nobuild RUN=TestE2E_MultiClient
+
+# Clean up leftover containers from previous runs
+make e2e-test-cleanup
+
+# Run all tests (requires high-memory machine)
+make e2e-test
 ```
 
 > ⚠️ Note:
 >
->
-> E2E tests are resource-intensive. It’s recommended to run them individually or with low parallelism:
->
-> ```bash
-> go test -tags e2e -v -parallel=1 -timeout=20m
-> ```
->
+> E2E tests are resource-intensive. It’s recommended to run them individually.
 > Running all tests together may require at least 64 GB of memory available to Docker.
 >
 
diff --git a/Makefile b/Makefile
index b4599a0196..011fa07be9 100644
--- a/Makefile
+++ b/Makefile
@@ -198,15 +198,48 @@ generate-fixtures:
 
 # -----------------------------------------------------------------------------
 # E2E targets
+#
+# Usage:
+#   make e2e-test                           # run all tests
+#   make e2e-test RUN=TestE2E_Multicast     # run a specific test
+#   make e2e-test-debug RUN=TestE2E_Multicast # with debug logging
+#   make e2e-test-nobuild                   # skip docker image build
+#   make e2e-test-keep                      # keep containers after test
+#   make e2e-test-keep-nobuild              # both
+#   make e2e-test-cleanup                   # remove leftover containers
 # -----------------------------------------------------------------------------
-.PHONY: e2e-test
-e2e-test:
-	cd e2e && $(MAKE) test
-
 .PHONY: e2e-build
 e2e-build:
 	cd e2e && $(MAKE) build
 
+.PHONY: e2e-build-debug
+e2e-build-debug:
+	cd e2e && $(MAKE) build-debug
+
+.PHONY: e2e-test
+e2e-test:
+	cd e2e && $(MAKE) test $(if $(RUN),RUN=$(RUN))
+
+.PHONY: e2e-test-debug
+e2e-test-debug:
+	cd e2e && $(MAKE) test-debug $(if $(RUN),RUN=$(RUN))
+
+.PHONY: e2e-test-nobuild
+e2e-test-nobuild:
+	cd e2e && $(MAKE) test-nobuild $(if $(RUN),RUN=$(RUN))
+
+.PHONY: e2e-test-keep
+e2e-test-keep:
+	cd e2e && $(MAKE) test-keep $(if $(RUN),RUN=$(RUN))
+
+.PHONY: e2e-test-keep-nobuild
+e2e-test-keep-nobuild:
+	cd e2e && $(MAKE) test-keep-nobuild $(if $(RUN),RUN=$(RUN))
+
+.PHONY: e2e-test-cleanup
+e2e-test-cleanup:
+	cd e2e && $(MAKE) test-cleanup
+
 # -----------------------------------------------------------------------------
 # Build programs for specific environments
 # -----------------------------------------------------------------------------
diff --git a/client/doublezero/src/command/connect.rs b/client/doublezero/src/command/connect.rs
index b866a49435..cb5ea306bf 100644
--- a/client/doublezero/src/command/connect.rs
+++ b/client/doublezero/src/command/connect.rs
@@ -204,23 +204,6 @@ impl ProvisioningCliCommand {
         client_ip: Ipv4Addr,
         spinner: &ProgressBar,
     ) -> eyre::Result<()> {
-        // Check if the daemon already has a multicast service running. The daemon
-        // does not support updating an existing service — both publisher and subscriber
-        // roles must be specified in a single connect command.
-        if let Ok(statuses) = controller.status().await {
-            if statuses.iter().any(|s| {
-                s.user_type
-                    .as_ref()
-                    .is_some_and(|t| t.eq_ignore_ascii_case("multicast"))
-            }) {
-                eyre::bail!(
-                    "A multicast service is already running. Disconnect first with \
-                     `doublezero disconnect multicast`, then reconnect with all desired \
-                     groups in a single command (e.g. --publish and --subscribe)."
-                );
-            }
-        }
-
         let mcast_groups = client.list_multicastgroup(ListMulticastGroupCommand)?;
 
         // Resolve pub group codes to pubkeys
diff --git a/dev/e2e-test.sh b/dev/e2e-test.sh
index 4f01f2885b..8de15f6e90 100755
--- a/dev/e2e-test.sh
+++ b/dev/e2e-test.sh
@@ -6,10 +6,8 @@ workspace_dir=$(dirname "${script_dir}")
 
 test=${1:-}
 
-cd "${workspace_dir}/e2e"
-
 if [ -n "${test}" ]; then
-    go test -v -tags e2e -run="${test}" -timeout 20m
+    make -C "${workspace_dir}/e2e" test RUN="${test}"
 else
-    make test verbose
+    make -C "${workspace_dir}/e2e" test
 fi
diff --git a/dev/e2e-until-fail.sh b/dev/e2e-until-fail.sh
index f674404d81..f1e5b095ef 100755
--- a/dev/e2e-until-fail.sh
+++ b/dev/e2e-until-fail.sh
@@ -57,7 +57,7 @@ while :; do
   fi
 
   set +e
-  "${workspace_dir}/dev/e2e-test.sh" "${target_test}"
+  make -C "${workspace_dir}/e2e" test-nobuild $(if [ -n "$target_test" ]; then echo "RUN=$target_test"; fi)
   ret_val=$?
   set -e
 
diff --git a/e2e/Makefile b/e2e/Makefile
index 783e23baf9..fa4b759562 100644
--- a/e2e/Makefile
+++ b/e2e/Makefile
@@ -6,8 +6,19 @@ GIT_SHA:=`git rev-parse --short HEAD`
 # Enabled by default on mac, but not always on linux.
 export DOCKER_BUILDKIT=1
 
+.PHONY: build build-debug test test-debug test-nobuild test-keep test-keep-nobuild test-cleanup
+
+# -----------------------------------------------------------------------------
+# Build
 # -----------------------------------------------------------------------------
-# Run the e2e tests.
+build:
+	go run ./cmd/dzctl/main.go build
+
+build-debug:
+	go run ./cmd/dzctl/main.go build -v
+
+# -----------------------------------------------------------------------------
+# Test
 #
 # This will build the docker images first, so it's not necessary to run `build`
 # before running `test`.
@@ -15,25 +26,35 @@ export DOCKER_BUILDKIT=1
 # We configure -timeout=20m for the case where the user is running the tests
 # sequentially. This should be more than enough time for the tests to run in
 # that case, and leave room in case more tests are added in the future.
+#
+# Usage:
+#   make test                           # run all tests
+#   make test RUN=TestE2E_Multicast     # run a specific test
+#   make test-debug RUN=TestE2E_Multicast # with debug logging
+#   make test-nobuild                   # skip docker image build
+#   make test-keep                      # keep containers after test
+#   make test-keep-nobuild              # both
 # -----------------------------------------------------------------------------
-.PHONY: test
 test:
-	$(if $(findstring nobuild,$(MAKECMDGOALS)),DZ_E2E_NO_BUILD=1) go test -tags=e2e -timeout=20m $(if $(parallel),-parallel=$(parallel)) $(if $(run),-run=$(run)) $(if $(findstring verbose,$(MAKECMDGOALS)),-v)
+	go test -tags=e2e -timeout=20m -v -count=1 $(if $(RUN),-run $(RUN)) .
 
-# Dummy target to suppress errors when using 'nobuild' as a flag in `make test nobuild
-.PHONY: nobuild
-nobuild:
-	@:
+test-debug:
+	DEBUG=1 go test -tags=e2e -timeout=20m -v -count=1 $(if $(RUN),-run $(RUN)) .
 
-# Dummy target to suppress errors when using 'verbose' as a flag in `make test verbose`
-.PHONY: verbose
-verbose:
-	@:
+test-nobuild:
+	DZ_E2E_NO_BUILD=1 go test -tags=e2e -timeout=20m -v -count=1 $(if $(RUN),-run $(RUN)) .
 
-.PHONY: build
-build:
-	go run ./cmd/dzctl/main.go build
+test-keep:
+	TESTCONTAINERS_RYUK_DISABLED=true go test -tags=e2e -timeout=20m -v -count=1 $(if $(RUN),-run $(RUN)) .
+
+test-keep-nobuild:
+	TESTCONTAINERS_RYUK_DISABLED=true DZ_E2E_NO_BUILD=1 go test -tags=e2e -timeout=20m -v -count=1 $(if $(RUN),-run $(RUN)) .
 
+test-cleanup:
+	@echo "Removing containers with label dz.malbeclabs.com..."
+	@docker rm -f $$(docker ps -aq --filter label=dz.malbeclabs.com) 2>/dev/null || true
+	@echo "Removing networks with label dz.malbeclabs.com..."
+	@docker network rm $$(docker network ls -q --filter label=dz.malbeclabs.com) 2>/dev/null || true
 
 # -----------------------------------------------------------------------------
 # Solana image build and push.
diff --git a/e2e/main_test.go b/e2e/main_test.go
index accb906926..b32e3b4c75 100644
--- a/e2e/main_test.go
+++ b/e2e/main_test.go
@@ -76,7 +76,7 @@ func TestMain(m *testing.M) {
 	if vFlag := flag.Lookup("test.v"); vFlag != nil && vFlag.Value.String() == "true" {
 		verbose = true
 	}
-	if os.Getenv("DZ_E2E_DEBUG") != "" {
+	if os.Getenv("DEBUG") != "" {
 		debug = true
 	}
 
@@ -534,6 +534,30 @@ func (dn *TestDevnet) ConnectMulticastSubscriberSkipAccessPass(t *testing.T, cli
 	dn.log.Debug("--> Multicast subscriber connected")
 }
 
+// AddMulticastPublisherGroupSkipAccessPass incrementally adds publish groups to
+// an existing multicast service without disconnecting.
+func (dn *TestDevnet) AddMulticastPublisherGroupSkipAccessPass(t *testing.T, client *devnet.Client, multicastGroupCodes ...string) {
+	dn.log.Debug("==> Adding multicast publisher groups incrementally", "clientIP", client.CYOANetworkIP, "groups", multicastGroupCodes)
+
+	groupArgs := strings.Join(multicastGroupCodes, " ")
+	_, err := client.Exec(t.Context(), []string{"bash", "-c", "doublezero connect multicast --publish " + groupArgs})
+	require.NoError(t, err)
+
+	dn.log.Debug("--> Multicast publisher groups added incrementally")
+}
+
+// AddMulticastSubscriberGroupSkipAccessPass incrementally adds subscribe groups to
+// an existing multicast service without disconnecting.
+func (dn *TestDevnet) AddMulticastSubscriberGroupSkipAccessPass(t *testing.T, client *devnet.Client, multicastGroupCodes ...string) {
+	dn.log.Debug("==> Adding multicast subscriber groups incrementally", "clientIP", client.CYOANetworkIP, "groups", multicastGroupCodes)
+
+	groupArgs := strings.Join(multicastGroupCodes, " ")
+	_, err := client.Exec(t.Context(), []string{"bash", "-c", "doublezero connect multicast --subscribe " + groupArgs})
+	require.NoError(t, err)
+
+	dn.log.Debug("--> Multicast subscriber groups added incrementally")
+}
+
 func (dn *TestDevnet) DisconnectMulticastSubscriber(t *testing.T, client *devnet.Client) {
 	dn.log.Debug("==> Disconnecting multicast subscriber", "clientIP", client.CYOANetworkIP)
 
@@ -765,7 +789,7 @@ func (dn *TestDevnet) BuildAgentConfigData(t *testing.T, deviceCode string, extr
 
 // newTestLoggerForTest creates a logger for individual test runs.
 // Logs are written to t.Log() so they only appear on test failure (unless -v is passed).
-// With DZ_E2E_DEBUG=1, shows DEBUG level logs; otherwise shows INFO level.
+// With DEBUG=1, shows DEBUG level logs; otherwise shows INFO level.
 func newTestLoggerForTest(t *testing.T) *slog.Logger {
 	w := &testWriter{t: t}
 	logLevel := slog.LevelInfo
diff --git a/e2e/multicast_test.go b/e2e/multicast_test.go
index aadb0b8d86..d54d226a2e 100644
--- a/e2e/multicast_test.go
+++ b/e2e/multicast_test.go
@@ -192,13 +192,23 @@ func TestE2E_Multicast(t *testing.T) {
 	createMulticastGroupForBothClients(t, tdn, publisherClient, subscriberClient, "mg02")
 
 	if !t.Run("connect", func(t *testing.T) {
-		// Connect publisher.
-		tdn.ConnectMulticastPublisherSkipAccessPass(t, publisherClient, "mg01", "mg02")
+		// Connect publisher to first group only.
+		tdn.ConnectMulticastPublisherSkipAccessPass(t, publisherClient, "mg01")
 		err = publisherClient.WaitForTunnelUp(t.Context(), 90*time.Second)
 		require.NoError(t, err)
 
-		// Connect subscriber.
-		tdn.ConnectMulticastSubscriberSkipAccessPass(t, subscriberClient, "mg01", "mg02")
+		// Incrementally add second publish group without disconnecting.
+		tdn.AddMulticastPublisherGroupSkipAccessPass(t, publisherClient, "mg02")
+		err = publisherClient.WaitForTunnelUp(t.Context(), 90*time.Second)
+		require.NoError(t, err)
+
+		// Connect subscriber to first group only.
+		tdn.ConnectMulticastSubscriberSkipAccessPass(t, subscriberClient, "mg01")
+		err = subscriberClient.WaitForTunnelUp(t.Context(), 90*time.Second)
+		require.NoError(t, err)
+
+		// Incrementally add second subscribe group without disconnecting.
+		tdn.AddMulticastSubscriberGroupSkipAccessPass(t, subscriberClient, "mg02")
 		err = subscriberClient.WaitForTunnelUp(t.Context(), 90*time.Second)
 		require.NoError(t, err)
 
diff --git a/e2e/qa_multicast_test.go b/e2e/qa_multicast_test.go
index 1618e813e0..f2f654cb40 100644
--- a/e2e/qa_multicast_test.go
+++ b/e2e/qa_multicast_test.go
@@ -347,17 +347,17 @@ func TestQA_MulticastPublisherMultipleGroups(t *testing.T) {
 	require.Greater(t, reportB.PacketCount, uint64(0), "subscriberB received no packets from group B")
 	log.Info("Received multicast packets", "subscriber", subscriberB.Host, "group", groupB.Code, "packetCount", reportB.PacketCount)
 
-	// --- Phase 2: Dynamic subscription ---
-	// SubA disconnects and reconnects with both groups A+B — verify identity preserved and receives from both.
-	log.Debug("Phase 2: dynamic subscription")
+	// --- Phase 2: Incremental subscription ---
+	// SubA adds group B without disconnecting — verify tunnel stays up, identity preserved, and receives from both.
+	log.Debug("Phase 2: incremental subscription (no disconnect)")
 
 	statusBefore, err := subscriberA.GetUserStatus(ctx)
 	require.NoError(t, err, "failed to get subscriberA status before adding group B")
 	log.Debug("SubscriberA status before", "status", statusBefore)
 
-	log.Debug("SubscriberA reconnecting with both groups", "codes", []string{groupA.Code, groupB.Code})
-	err = subscriberA.ConnectUserMulticast_Subscriber_Wait(ctx, groupA.Code, groupB.Code)
-	require.NoError(t, err, "failed to reconnect subscriberA with both groups")
+	log.Debug("SubscriberA adding group B incrementally (no disconnect)", "code", groupB.Code)
+	err = subscriberA.ConnectUserMulticast_Subscriber_AddTunnel(ctx, groupB.Code)
+	require.NoError(t, err, "failed to incrementally add group B to subscriberA")
 
 	err = subscriberA.WaitForStatusUp(ctx)
 	require.NoError(t, err, "failed to wait for subscriberA status up after adding group B")
@@ -389,15 +389,18 @@ func TestQA_MulticastPublisherMultipleGroups(t *testing.T) {
 	require.NotNil(t, reportB, "no report for group B")
 	require.Greater(t, reportB.PacketCount, uint64(0), "no packets from group B")
 
-	log.Debug("Phase 2 passed: dynamic subscription verified",
+	log.Debug("Phase 2 passed: incremental subscription verified",
 		"groupA_packets", reportA.PacketCount, "groupB_packets", reportB.PacketCount)
 
-	// --- Phase 3: Simultaneous pub+sub ---
-	// SubA reconnects as both publisher and subscriber on group A, sends to itself.
-	log.Debug("Phase 3: simultaneous pub+sub")
+	// --- Phase 3: Incremental publish after subscribe (cross-role) ---
+	// SubA adds a publisher role on group A without disconnecting. This triggers a
+	// full reprovision in the daemon (publisher role transition) but should still
+	// converge to a working pub+sub state.
+	log.Debug("Phase 3: incremental publish after subscribe (cross-role)")
 
-	err = subscriberA.ConnectUserMulticast_PubAndSub_Wait(ctx, []string{groupA.Code}, []string{groupA.Code})
-	require.NoError(t, err, "failed to connect subscriberA as pub+sub")
+	log.Debug("SubscriberA adding publisher role on group A incrementally", "code", groupA.Code)
+	err = subscriberA.ConnectUserMulticast_Publisher_AddTunnel(ctx, groupA.Code)
+	require.NoError(t, err, "failed to incrementally add publisher role to subscriberA")
 
 	err = subscriberA.WaitForStatusUp(ctx)
 	require.NoError(t, err, "failed to wait for subscriberA status up as pub+sub")
@@ -419,5 +422,5 @@ func TestQA_MulticastPublisherMultipleGroups(t *testing.T) {
 	reportPubSub, err := subscriberA.WaitForMulticastReport(ctx, groupA)
 	require.NoError(t, err, "failed to get report for group A as pub+sub")
 	require.Greater(t, reportPubSub.PacketCount, uint64(0), "pub+sub client received no packets")
-	log.Debug("Phase 3 passed: pub+sub verified", "group", groupA.Code, "packetCount", reportPubSub.PacketCount)
+	log.Debug("Phase 3 passed: incremental publish after subscribe verified", "group", groupA.Code, "packetCount", reportPubSub.PacketCount)
 }