diff --git a/changes/40322-fix-ddm-pending-issues b/changes/40322-fix-ddm-pending-issues new file mode 100644 index 0000000000..fa5a6e6a1a --- /dev/null +++ b/changes/40322-fix-ddm-pending-issues @@ -0,0 +1,3 @@ +- Fixed an issue where the DDM reconciler would not self-heal for stuck remove/pending profiles due to resend with update. +- Fixed an issue where a host DDM cleanup function was not executed for stale remove/pending profiles that weren't reported by the device. +- Fixed an issue where batch processing many DDM profile changes would result in stuck remove/pending profiles. \ No newline at end of file diff --git a/server/datastore/mysql/apple_mdm.go b/server/datastore/mysql/apple_mdm.go index 8b654574d6..501f0f1c14 100644 --- a/server/datastore/mysql/apple_mdm.go +++ b/server/datastore/mysql/apple_mdm.go @@ -5725,7 +5725,14 @@ func (ds *Datastore) MDMAppleBatchSetHostDeclarationState(ctx context.Context) ( err := ds.withRetryTxx(ctx, func(tx sqlx.ExtContext) error { var err error uuids, _, err = mdmAppleBatchSetHostDeclarationStateDB(ctx, tx, batchSize, &fleet.MDMDeliveryPending) - return err + if err != nil { + return err + } + + // Safety net: always clean up orphaned remove/pending rows, even when + // there are no changed declarations. This handles stuck rows from + // previous runs that can't self-heal via device status reports. + return cleanUpOrphanedPendingRemoves(ctx, tx) }) return uuids, ctxerr.Wrap(ctx, err, "upserting host declaration state") @@ -5878,6 +5885,42 @@ func mdmAppleBatchSetPendingHostDeclarationsDB( return updatedDB, ctxerr.Wrap(ctx, err, "inserting changed host declaration state") } +// cleanUpOrphanedPendingRemoves deletes remove/pending rows from +// host_mdm_apple_declarations where a matching install row already exists with +// the same host, token, and identifier in a verified or verifying state. This +// means the declaration content is already on the device — the remove is stale. +func cleanUpOrphanedPendingRemoves(ctx context.Context, tx sqlx.ExtContext) error { + var found bool + err := sqlx.GetContext(ctx, tx, &found, ` + SELECT EXISTS ( + SELECT 1 FROM host_mdm_apple_declarations r + INNER JOIN host_mdm_apple_declarations i + ON r.host_uuid = i.host_uuid + AND r.token = i.token + AND r.declaration_identifier = i.declaration_identifier + WHERE r.operation_type = 'remove' AND r.status = 'pending' + AND i.operation_type = 'install' + AND i.status IN ('verified', 'verifying') + LIMIT 1)`) + if err != nil { + return ctxerr.Wrap(ctx, err, "checking for orphaned remove/pending rows") + } + if !found { + return nil + } + + _, err = tx.ExecContext(ctx, ` + DELETE r FROM host_mdm_apple_declarations r + INNER JOIN host_mdm_apple_declarations i + ON r.host_uuid = i.host_uuid + AND r.token = i.token + AND r.declaration_identifier = i.declaration_identifier + WHERE r.operation_type = 'remove' AND r.status = 'pending' + AND i.operation_type = 'install' + AND i.status IN ('verified', 'verifying')`) + return ctxerr.Wrap(ctx, err, "deleting orphaned remove/pending rows") +} + func cleanUpDuplicateRemoveInstall(ctx context.Context, tx sqlx.ExtContext, profilesToInsert map[string]*fleet.MDMAppleHostDeclaration) error { // If we are inserting a declaration that has a matching pending "remove" declaration (same hash), // we will mark the insert as verified. Why? Because there is nothing for the host to do if the same @@ -5966,20 +6009,7 @@ func mdmAppleGetHostsWithChangedDeclarationsDB(ctx context.Context, tx sqlx.ExtC entitiesToRemoveQuery, entitiesToRemoveArgs := generateEntitiesToRemoveQuery("declaration") stmt := fmt.Sprintf(` ( - SELECT - ds.host_uuid, - 'install' as operation_type, - ds.token, - ds.secrets_updated_at, - ds.declaration_uuid, - ds.declaration_identifier, - ds.declaration_name - FROM - %s - ) - UNION ALL - ( - SELECT + SELECT hmae.host_uuid, 'remove' as operation_type, hmae.token, @@ -5990,13 +6020,29 @@ func mdmAppleGetHostsWithChangedDeclarationsDB(ctx context.Context, tx sqlx.ExtC FROM %s ) + UNION ALL + ( + SELECT + ds.host_uuid, + 'install' as operation_type, + ds.token, + ds.secrets_updated_at, + ds.declaration_uuid, + ds.declaration_identifier, + ds.declaration_name + FROM + %s + ) `, - entitiesToInstallQuery, + // We specifically want Removals first, since we later batch process + // and rely on removals being updated so the matching install entry + // can find it and clean it up avoiding leaving stale pending removals in the database. entitiesToRemoveQuery, + entitiesToInstallQuery, ) var decls []*fleet.MDMAppleHostDeclaration - args := slices.Concat(entitiesToInstallArgs, entitiesToRemoveArgs) + args := slices.Concat(entitiesToRemoveArgs, entitiesToInstallArgs) if err := sqlx.SelectContext(ctx, tx, &decls, stmt, args...); err != nil { return nil, ctxerr.Wrap(ctx, err, "running sql statement") } @@ -6042,7 +6088,7 @@ ON DUPLICATE KEY UPDATE for _, c := range current { // Skip updates for 'remove' operations because it is possible that IT admin removed a profile and then re-added it. // Pending removes are cleaned up after we update status of installs. - if u, ok := updatesByToken[c.Token]; ok && u.OperationType != fleet.MDMOperationTypeRemove { + if u, ok := updatesByToken[c.Token]; ok && c.OperationType != fleet.MDMOperationTypeRemove { insertVals.WriteString("(?, ?, ?, ?, ?, ?, ?, UNHEX(?), ?),") args = append(args, hostUUID, c.DeclarationUUID, u.Status, u.OperationType, u.Detail, c.Identifier, c.Name, c.Token, c.SecretsUpdatedAt) diff --git a/server/datastore/mysql/apple_mdm_ddm_test.go b/server/datastore/mysql/apple_mdm_ddm_test.go index 0a412635fc..b128e4d6cc 100644 --- a/server/datastore/mysql/apple_mdm_ddm_test.go +++ b/server/datastore/mysql/apple_mdm_ddm_test.go @@ -21,6 +21,9 @@ func TestMDMDDMApple(t *testing.T) { fn func(t *testing.T, ds *Datastore) }{ {"TestMDMAppleBatchSetHostDeclarationState", testMDMAppleBatchSetHostDeclarationState}, + {"StoreDDMStatusReportSkipsRemoveRows", testStoreDDMStatusReportSkipsRemoveRows}, + {"CleanUpDuplicateRemoveInstallAcrossBatches", testCleanUpDuplicateRemoveInstallAcrossBatches}, + {"CleanUpOrphanedPendingRemoves", testCleanUpOrphanedPendingRemoves}, } for _, c := range cases { @@ -276,3 +279,326 @@ func testMDMAppleBatchSetHostDeclarationState(t *testing.T, ds *Datastore) { } }) } + +func testStoreDDMStatusReportSkipsRemoveRows(t *testing.T, ds *Datastore) { + ctx := t.Context() + + // Create a test host + host, err := ds.NewHost(ctx, &fleet.Host{ + Hostname: "test-host-ddm-status", + UUID: "test-host-uuid-ddm-status", + HardwareSerial: "ABC123-DDM-STATUS", + PrimaryIP: "192.168.1.50", + PrimaryMac: "00:00:00:00:00:50", + OsqueryHostID: ptr.String("test-host-uuid-ddm-status"), + NodeKey: ptr.String("test-host-uuid-ddm-status"), + DetailUpdatedAt: time.Now(), + Platform: "darwin", + }) + require.NoError(t, err) + + setupMDMDeviceAndEnrollment(t, ds, ctx, host.UUID, host.HardwareSerial) + + // Insert two rows into host_mdm_apple_declarations: + // Row A: a remove-operation row (simulating a declaration pending removal) + // Row B: a normal install-operation row + insertHostDeclaration(t, ds, ctx, host.UUID, "decl-remove", "shared-token", "pending", "remove", "com.example.remove") + insertHostDeclaration(t, ds, ctx, host.UUID, "decl-install", "install-token", "pending", "install", "com.example.install") + + // Query back the HEX tokens from the DB (MDMAppleStoreDDMStatusReport reads tokens as HEX) + type tokenRow struct { + DeclarationUUID string `db:"declaration_uuid"` + Token string `db:"token"` + } + var tokens []tokenRow + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + return sqlx.SelectContext(ctx, q, &tokens, ` + SELECT declaration_uuid, HEX(token) as token + FROM host_mdm_apple_declarations + WHERE host_uuid = ? + ORDER BY declaration_uuid`, host.UUID) + }) + require.Len(t, tokens, 2) + + tokenByDecl := make(map[string]string, 2) + for _, tok := range tokens { + tokenByDecl[tok.DeclarationUUID] = tok.Token + } + tokenA := tokenByDecl["decl-remove"] + tokenB := tokenByDecl["decl-install"] + require.NotEmpty(t, tokenA) + require.NotEmpty(t, tokenB) + + // Build the updates slice as if the device is reporting status. + // The device always reports operation_type='install' for all declarations. + updates := []*fleet.MDMAppleHostDeclaration{ + { + Token: tokenA, + Status: new(fleet.MDMDeliveryStatus), + OperationType: fleet.MDMOperationTypeInstall, + }, + { + Token: tokenB, + Status: new(fleet.MDMDeliveryStatus), + OperationType: fleet.MDMOperationTypeInstall, + }, + } + *updates[0].Status = fleet.MDMDeliveryVerified + *updates[1].Status = fleet.MDMDeliveryVerified + + // Call the method under test + err = ds.MDMAppleStoreDDMStatusReport(ctx, host.UUID, updates) + require.NoError(t, err) + + // Assert the end state. + type resultRow struct { + DeclarationUUID string `db:"declaration_uuid"` + Token string `db:"token"` + Status string `db:"status"` + OperationType string `db:"operation_type"` + } + var remaining []resultRow + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + return sqlx.SelectContext(ctx, q, &remaining, ` + SELECT declaration_uuid, HEX(token) as token, status, operation_type + FROM host_mdm_apple_declarations + WHERE host_uuid = ? + ORDER BY declaration_uuid`, host.UUID) + }) + + // With the fix: only the install row should remain (remove row was skipped then deleted) + // With the bug: both rows remain, and the remove row was flipped to install/verified + require.Len(t, remaining, 1, "expected remove row to not survive as install/verified — this is the token collision bug") + assert.Equal(t, "decl-install", remaining[0].DeclarationUUID) + assert.Equal(t, "install", remaining[0].OperationType) + assert.Equal(t, "verified", remaining[0].Status) +} + +func testCleanUpDuplicateRemoveInstallAcrossBatches(t *testing.T, ds *Datastore) { + ctx := t.Context() + + // D1 simulates the "old" declaration that was already installed on hosts. + // D2 simulates the "new" declaration (same content/identifier, different UUID — e.g. after a name change caused delete+reinsert). + // They share the same identifier, so D1 must be deleted before D2 is created (unique constraint on team_id+identifier). + declJSON := []byte(`{"Type":"com.apple.configuration.test","Identifier":"com.example.cleanup"}`) + + d1, err := ds.NewMDMAppleDeclaration(ctx, &fleet.MDMAppleDeclaration{ + DeclarationUUID: "decl-old", + Name: "Old Declaration", + Identifier: "com.example.cleanup", + RawJSON: declJSON, + }) + require.NoError(t, err) + + // Query the token for D1 from mdm_apple_declarations (generated column) + var d1Token string + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + return sqlx.GetContext(ctx, q, &d1Token, + "SELECT HEX(token) FROM mdm_apple_declarations WHERE declaration_uuid = ?", d1.DeclarationUUID) + }) + + // Create 3 hosts, all enrolled + hosts := make([]*fleet.Host, 3) + for i := range 3 { + hostUUID := fmt.Sprintf("cleanup-host-%d", i) + hosts[i], err = ds.NewHost(ctx, &fleet.Host{ + Hostname: fmt.Sprintf("cleanup-host-%d", i), + UUID: hostUUID, + HardwareSerial: fmt.Sprintf("CLEANUP-%d", i), + PrimaryIP: fmt.Sprintf("192.168.10.%d", i+1), + PrimaryMac: fmt.Sprintf("00:00:00:00:10:%02d", i+1), + OsqueryHostID: ptr.String(hostUUID), + NodeKey: ptr.String(hostUUID), + DetailUpdatedAt: time.Now(), + Platform: "darwin", + }) + require.NoError(t, err) + setupMDMDeviceAndEnrollment(t, ds, ctx, hostUUID, hosts[i].HardwareSerial) + } + + // For each host, insert a host_declaration row for D1 with status=verified, operation_type=install. + // This simulates D1 being currently installed on each host. + // NOTE: We use UNHEX(?) with the hex token directly (not insertHostDeclaration which does + // UNHEX(MD5(?))) because we need the token to match the generated column in mdm_apple_declarations exactly. + for _, h := range hosts { + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + _, err := q.ExecContext(ctx, ` + INSERT INTO host_mdm_apple_declarations + (host_uuid, declaration_uuid, status, operation_type, token, declaration_identifier) + VALUES (?, ?, 'verified', 'install', UNHEX(?), ?)`, + h.UUID, d1.DeclarationUUID, d1Token, d1.Identifier) + return err + }) + } + + // Delete D1 from mdm_apple_declarations (simulating IT admin removing the old declaration). + // The host_mdm_apple_declarations rows for D1 remain — the reconciler will mark them for removal. + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + _, err := q.ExecContext(ctx, "DELETE FROM mdm_apple_declarations WHERE declaration_uuid = ?", d1.DeclarationUUID) + return err + }) + + // Now create D2 with the same identifier (possible since D1 was just deleted) and same raw_json → same token. + d2, err := ds.NewMDMAppleDeclaration(ctx, &fleet.MDMAppleDeclaration{ + DeclarationUUID: "decl-new", + Name: "New Declaration", + Identifier: "com.example.cleanup", + RawJSON: declJSON, + }) + require.NoError(t, err) + + var d2Token string + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + return sqlx.GetContext(ctx, q, &d2Token, + "SELECT HEX(token) FROM mdm_apple_declarations WHERE declaration_uuid = ?", d2.DeclarationUUID) + }) + require.Equal(t, d1Token, d2Token, "declarations with same raw_json should have the same token") + + // Force a small batch size so installs and removes end up in different batches. + // The UNION ALL query returns removes first, then installs. With 3 hosts: + // - 3 remove rows (D1 for each host) + 3 install rows (D2 for each host) = 6 rows + // - batch size 2 → batch 1: 2 removes, batch 2: 1 remove + 1 install, batch 3: 2 installs + // When batch 1 runs cleanUpDuplicateRemoveInstall, the matching installs haven't been upserted to + // status='pending' yet — they still have status='verified' — so the cleanup finds no match. + ds.testUpsertMDMDesiredProfilesBatchSize = 2 + t.Cleanup(func() { ds.testUpsertMDMDesiredProfilesBatchSize = 0 }) + + // Run the reconciler + _, err = ds.MDMAppleBatchSetHostDeclarationState(ctx) + require.NoError(t, err) + + // Assert: for each host, the cleanup should have: + // 1. Deleted the D1 remove row (same token as D2 install — duplicate remove/install) + // 2. Marked D2 install as verified with resync=1 + // With the bug: the D1 remove row survives because cleanup ran per-batch and missed + // cross-batch matches. Both D1 (remove/pending) and D2 (install/pending) exist. + type declRow struct { + DeclarationUUID string `db:"declaration_uuid"` + OperationType string `db:"operation_type"` + Status string `db:"status"` + Resync bool `db:"resync"` + Token *string `db:"token"` + } + for _, h := range hosts { + var rows []declRow + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + return sqlx.SelectContext(ctx, q, &rows, ` + SELECT declaration_uuid, operation_type, COALESCE(status, '') as status, resync, HEX(token) as token + FROM host_mdm_apple_declarations + WHERE host_uuid = ? + ORDER BY declaration_uuid`, h.UUID) + }) + + // Build a map by declaration UUID for easier assertions + rowByDecl := make(map[string]declRow, len(rows)) + for _, r := range rows { + rowByDecl[r.DeclarationUUID] = r + } + + // D1 remove row should NOT exist (cleaned up because same token as D2 install) + _, d1Exists := rowByDecl[d1.DeclarationUUID] + assert.False(t, d1Exists, "host %s: D1 remove row should have been cleaned up (same token as D2 install)", h.UUID) + + // D2 install row should exist as verified with resync=1 + d2Row, d2Exists := rowByDecl[d2.DeclarationUUID] + if assert.True(t, d2Exists, "host %s: D2 install row should exist", h.UUID) { + assert.Equal(t, "install", d2Row.OperationType, "host %s: D2 should be install", h.UUID) + assert.Equal(t, "verified", d2Row.Status, "host %s: D2 should be marked verified by cleanup", h.UUID) + assert.True(t, d2Row.Resync, "host %s: D2 should have resync=1", h.UUID) + } + } +} + +func testCleanUpOrphanedPendingRemoves(t *testing.T, ds *Datastore) { + ctx := t.Context() + + // This test simulates the "already stuck" scenario: a host has an orphaned + // remove/pending row and a matching install/verified row with the same token + // and identifier. No new declarations are changed, so the reconciler's + // changedDeclarations is empty. The cleanUpOrphanedPendingRemoves safety net + // in MDMAppleBatchSetHostDeclarationState should still clean it up. + + host, err := ds.NewHost(ctx, &fleet.Host{ + Hostname: "test-host-orphan", + UUID: "test-host-uuid-orphan", + HardwareSerial: "ORPHAN-001", + PrimaryIP: "192.168.20.1", + PrimaryMac: "00:00:00:00:20:01", + OsqueryHostID: ptr.String("test-host-uuid-orphan"), + NodeKey: ptr.String("test-host-uuid-orphan"), + DetailUpdatedAt: time.Now(), + Platform: "darwin", + }) + require.NoError(t, err) + setupMDMDeviceAndEnrollment(t, ds, ctx, host.UUID, host.HardwareSerial) + + // Create a declaration so we have a valid token to work with. + decl, err := ds.NewMDMAppleDeclaration(ctx, &fleet.MDMAppleDeclaration{ + DeclarationUUID: "decl-orphan-test", + Name: "Orphan Test Declaration", + Identifier: "com.example.orphan", + RawJSON: []byte(`{"Type":"com.apple.configuration.test","Identifier":"com.example.orphan"}`), + }) + require.NoError(t, err) + + // Get the token from mdm_apple_declarations + var hexToken string + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + return sqlx.GetContext(ctx, q, &hexToken, + "SELECT HEX(token) FROM mdm_apple_declarations WHERE declaration_uuid = ?", decl.DeclarationUUID) + }) + + // Simulate the stuck state: insert both an install/verified row (new UUID) + // and a remove/pending row (old UUID) with the same token and identifier. + // Use UNHEX(?) directly to get the exact same binary token. + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + _, err := q.ExecContext(ctx, ` + INSERT INTO host_mdm_apple_declarations + (host_uuid, declaration_uuid, status, operation_type, token, declaration_identifier) + VALUES (?, ?, 'verified', 'install', UNHEX(?), ?)`, + host.UUID, decl.DeclarationUUID, hexToken, decl.Identifier) + return err + }) + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + _, err := q.ExecContext(ctx, ` + INSERT INTO host_mdm_apple_declarations + (host_uuid, declaration_uuid, status, operation_type, token, declaration_identifier) + VALUES (?, ?, 'pending', 'remove', UNHEX(?), ?)`, + host.UUID, "decl-orphan-old-uuid", hexToken, decl.Identifier) + return err + }) + + // Verify both rows exist before the reconciler runs. + var countBefore int + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + return sqlx.GetContext(ctx, q, &countBefore, ` + SELECT COUNT(*) FROM host_mdm_apple_declarations WHERE host_uuid = ?`, host.UUID) + }) + require.Equal(t, 2, countBefore) + + // Run the reconciler. There are no changed declarations (the install row + // matches the desired state, and the remove row is excluded by the + // reconciler's query filter). The safety net should still clean up. + _, err = ds.MDMAppleBatchSetHostDeclarationState(ctx) + require.NoError(t, err) + + // Assert: the orphaned remove/pending row should be deleted. + type resultRow struct { + DeclarationUUID string `db:"declaration_uuid"` + OperationType string `db:"operation_type"` + Status string `db:"status"` + } + var remaining []resultRow + ExecAdhocSQL(t, ds, func(q sqlx.ExtContext) error { + return sqlx.SelectContext(ctx, q, &remaining, ` + SELECT declaration_uuid, operation_type, status + FROM host_mdm_apple_declarations + WHERE host_uuid = ? + ORDER BY declaration_uuid`, host.UUID) + }) + + require.Len(t, remaining, 1, "orphaned remove/pending row should have been cleaned up") + assert.Equal(t, decl.DeclarationUUID, remaining[0].DeclarationUUID) + assert.Equal(t, "install", remaining[0].OperationType) + assert.Equal(t, "verified", remaining[0].Status) +}