From e476b7adf973d645d226a0f9a1b999c4bb5db691 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 5 Jun 2026 20:52:02 +0000 Subject: [PATCH 1/3] Initial plan From d372bca6930dab3c1bd75a581dbf35014f141800 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 5 Jun 2026 21:05:49 +0000 Subject: [PATCH 2/3] Retry transient ARM DeploymentNotFound (404) on deployment create/poll Co-authored-by: JeffreyCA <9157833+JeffreyCA@users.noreply.github.com> --- cli/azd/CHANGELOG.md | 2 + cli/azd/pkg/azapi/standard_deployments.go | 52 ++++++++ .../standard_deployments_coverage3_test.go | 118 ++++++++++++++++++ 3 files changed, 172 insertions(+) diff --git a/cli/azd/CHANGELOG.md b/cli/azd/CHANGELOG.md index 41b1a7dced7..ed1be56d007 100644 --- a/cli/azd/CHANGELOG.md +++ b/cli/azd/CHANGELOG.md @@ -8,6 +8,8 @@ ### Bugs Fixed +- [[#8064]](https://github.com/Azure/azure-dev/issues/8064) Fix intermittent `DeploymentNotFound` failures during `azd provision`/`azd up` and `azd down` for subscription-scoped Bicep deployments (commonly seen with `infra.layers`) by retrying the transient HTTP 404 that ARM can briefly return for a deployment that was just submitted. 
+ ### Other Changes ## 1.25.4 (2026-05-29) diff --git a/cli/azd/pkg/azapi/standard_deployments.go b/cli/azd/pkg/azapi/standard_deployments.go index 4f2ce0d1a2c..18962b94540 100644 --- a/cli/azd/pkg/azapi/standard_deployments.go +++ b/cli/azd/pkg/azapi/standard_deployments.go @@ -9,12 +9,14 @@ import ( "errors" "fmt" "maps" + "net/http" "net/url" "slices" "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" @@ -48,6 +50,48 @@ const ( slowPollFrequency = 15 * time.Second ) +// Retry tuning for transient HTTP 404 (DeploymentNotFound) responses returned while +// polling an ARM deployment. ARM can briefly report DeploymentNotFound for a +// deployment that was just submitted (read-after-write inconsistency, most often observed on +// subscription-scoped deployments) even though the deployment was accepted and ultimately +// succeeds. These are declared as vars (not consts) so tests can override them. +var ( + // deploymentRetryMaxRetries is the number of additional attempts made when ARM returns a + // retryable status code (including a transient 404) for a deployment poll request. + deploymentRetryMaxRetries int32 = 5 + + // deploymentRetryDelay is the initial backoff between retries of a deployment poll + // request. The SDK applies exponential backoff up to deploymentRetryMaxDelay. + deploymentRetryDelay = 3 * time.Second + + // deploymentRetryMaxDelay caps the exponential backoff between deployment poll retries. + deploymentRetryMaxDelay = 15 * time.Second +) + +// withDeploymentRetry returns a context configured to retry transient HTTP 404 +// (DeploymentNotFound) responses in addition to the default ARM-retryable status codes.
ARM +// occasionally returns a transient 404 for a deployment immediately after it is submitted +// (read-after-write inconsistency, most often on subscription-scoped deployments) even though +// the deployment was accepted and ultimately succeeds. Retrying the create/poll request lets +// azd's view converge with ARM instead of failing with DeploymentNotFound. This mirrors the +// existing handling for transient 404s in pkg/azsdk/zip_deploy_client.go. +func withDeploymentRetry(ctx context.Context) context.Context { + return policy.WithRetryOptions(ctx, policy.RetryOptions{ + MaxRetries: deploymentRetryMaxRetries, + RetryDelay: deploymentRetryDelay, + MaxRetryDelay: deploymentRetryMaxDelay, + StatusCodes: []int{ + http.StatusRequestTimeout, // 408 + http.StatusTooManyRequests, // 429 + http.StatusInternalServerError, // 500 + http.StatusBadGateway, // 502 + http.StatusServiceUnavailable, // 503 + http.StatusGatewayTimeout, // 504 + http.StatusNotFound, // 404 (transient DeploymentNotFound) + }, + }) +} + type StandardDeployments struct { credentialProvider account.SubscriptionCredentialProvider armClientOptions *arm.ClientOptions @@ -228,6 +272,10 @@ func (ds *StandardDeployments) DeployToSubscription( ctx, span := tracing.Start(ctx, events.ArmDeploySubscriptionEvent) defer func() { span.EndWithStatus(err) }() + // Retry transient DeploymentNotFound (404) responses that ARM can return immediately after a + // subscription-scoped deployment is submitted (read-after-write inconsistency). 
+ ctx = withDeploymentRetry(ctx) + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return nil, fmt.Errorf("creating deployments client: %w", err) @@ -270,6 +318,10 @@ func (ds *StandardDeployments) DeployToResourceGroup( ctx, span := tracing.Start(ctx, events.ArmDeployResourceGroupEvent) defer func() { span.EndWithStatus(err) }() + // Retry transient DeploymentNotFound (404) responses that ARM can return immediately after a + // deployment is submitted (read-after-write inconsistency). + ctx = withDeploymentRetry(ctx) + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return nil, fmt.Errorf("creating deployments client: %w", err) diff --git a/cli/azd/pkg/azapi/standard_deployments_coverage3_test.go b/cli/azd/pkg/azapi/standard_deployments_coverage3_test.go index 8fd3b4035e0..000f6b86894 100644 --- a/cli/azd/pkg/azapi/standard_deployments_coverage3_test.go +++ b/cli/azd/pkg/azapi/standard_deployments_coverage3_test.go @@ -7,6 +7,7 @@ import ( "encoding/json" "net/http" "strings" + "sync/atomic" "testing" "time" @@ -326,6 +327,123 @@ func Test_StdDeployments_DeployToResourceGroup_Coverage3(t *testing.T) { assert.Equal(t, "rg-deploy", d.Name) } +// shortenDeploymentRetry makes the deployment 404 retry backoff effectively instant for tests +// and restores the production values when the test completes. +func shortenDeploymentRetry(t *testing.T) { + t.Helper() + origDelay, origMaxDelay := deploymentRetryDelay, deploymentRetryMaxDelay + deploymentRetryDelay = time.Millisecond + deploymentRetryMaxDelay = time.Millisecond + t.Cleanup(func() { + deploymentRetryDelay = origDelay + deploymentRetryMaxDelay = origMaxDelay + }) +} + +// Test_StdDeployments_DeployToSubscription_Transient404_Coverage3 verifies that a transient +// HTTP 404 (DeploymentNotFound) returned while polling a just-submitted subscription-scoped +// deployment is retried rather than surfaced as an error. 
This reproduces the read-after-write +// inconsistency reported in issue #8064 (infra.layers, subscription scope). +func Test_StdDeployments_DeployToSubscription_Transient404_Coverage3(t *testing.T) { + shortenDeploymentRetry(t) + + mockCtx := mocks.NewMockContext(t.Context()) + sd := newStdDeployments(mockCtx) + + // PUT accepts the deployment and reports a non-terminal state so the SDK begins polling. + mockCtx.HttpClient.When(func(req *http.Request) bool { + return req.Method == http.MethodPut && strings.Contains(req.URL.Path, "/deployments/sub-deploy") + }).RespondFn(func(req *http.Request) (*http.Response, error) { + dep := makeDeploymentExtended("sub-deploy", armresources.ProvisioningStateRunning) + return mocks.CreateHttpResponseWithBody(req, http.StatusCreated, dep) + }) + + // The first poll returns a transient 404; subsequent polls return the succeeded deployment. + var polls int32 + mockCtx.HttpClient.When(func(req *http.Request) bool { + return req.Method == http.MethodGet && strings.Contains(req.URL.Path, "/deployments/sub-deploy") + }).RespondFn(func(req *http.Request) (*http.Response, error) { + if atomic.AddInt32(&polls, 1) == 1 { + return mocks.CreateEmptyHttpResponse(req, http.StatusNotFound) + } + dep := makeDeploymentExtended("sub-deploy", armresources.ProvisioningStateSucceeded) + return mocks.CreateHttpResponseWithBody(req, http.StatusOK, dep) + }) + + template := azure.RawArmTemplate(json.RawMessage(`{"$schema":"test"}`)) + d, err := sd.DeployToSubscription( + *mockCtx.Context, "SUB", "eastus", "sub-deploy", template, azure.ArmParameters{}, nil, nil) + require.NoError(t, err) + assert.Equal(t, "sub-deploy", d.Name) + assert.Equal(t, DeploymentProvisioningStateSucceeded, d.ProvisioningState) + assert.GreaterOrEqual(t, atomic.LoadInt32(&polls), int32(2), "expected the transient 404 poll to be retried") +} + +// Test_StdDeployments_DeployToResourceGroup_Transient404_Coverage3 verifies the same transient +// 404 retry behavior for 
resource-group-scoped deployments. +func Test_StdDeployments_DeployToResourceGroup_Transient404_Coverage3(t *testing.T) { + shortenDeploymentRetry(t) + + mockCtx := mocks.NewMockContext(t.Context()) + sd := newStdDeployments(mockCtx) + + mockCtx.HttpClient.When(func(req *http.Request) bool { + return req.Method == http.MethodPut && strings.Contains(req.URL.Path, "/deployments/rg-deploy") + }).RespondFn(func(req *http.Request) (*http.Response, error) { + dep := makeDeploymentExtended("rg-deploy", armresources.ProvisioningStateRunning) + return mocks.CreateHttpResponseWithBody(req, http.StatusCreated, dep) + }) + + var polls int32 + mockCtx.HttpClient.When(func(req *http.Request) bool { + return req.Method == http.MethodGet && strings.Contains(req.URL.Path, "/deployments/rg-deploy") + }).RespondFn(func(req *http.Request) (*http.Response, error) { + if atomic.AddInt32(&polls, 1) == 1 { + return mocks.CreateEmptyHttpResponse(req, http.StatusNotFound) + } + dep := makeDeploymentExtended("rg-deploy", armresources.ProvisioningStateSucceeded) + return mocks.CreateHttpResponseWithBody(req, http.StatusOK, dep) + }) + + template := azure.RawArmTemplate(json.RawMessage(`{"$schema":"test"}`)) + d, err := sd.DeployToResourceGroup( + *mockCtx.Context, "SUB", "RG1", "rg-deploy", template, azure.ArmParameters{}, nil, nil) + require.NoError(t, err) + assert.Equal(t, "rg-deploy", d.Name) + assert.GreaterOrEqual(t, atomic.LoadInt32(&polls), int32(2), "expected the transient 404 poll to be retried") +} + +// Test_StdDeployments_DeployToSubscription_Persistent404_Coverage3 verifies that a deployment +// whose poll keeps returning 404 still fails after the retries are exhausted, so genuine +// DeploymentNotFound errors are not masked. 
+func Test_StdDeployments_DeployToSubscription_Persistent404_Coverage3(t *testing.T) { + shortenDeploymentRetry(t) + origRetries := deploymentRetryMaxRetries + deploymentRetryMaxRetries = 2 + t.Cleanup(func() { deploymentRetryMaxRetries = origRetries }) + + mockCtx := mocks.NewMockContext(t.Context()) + sd := newStdDeployments(mockCtx) + + mockCtx.HttpClient.When(func(req *http.Request) bool { + return req.Method == http.MethodPut && strings.Contains(req.URL.Path, "/deployments/sub-deploy") + }).RespondFn(func(req *http.Request) (*http.Response, error) { + dep := makeDeploymentExtended("sub-deploy", armresources.ProvisioningStateRunning) + return mocks.CreateHttpResponseWithBody(req, http.StatusCreated, dep) + }) + + mockCtx.HttpClient.When(func(req *http.Request) bool { + return req.Method == http.MethodGet && strings.Contains(req.URL.Path, "/deployments/sub-deploy") + }).RespondFn(func(req *http.Request) (*http.Response, error) { + return mocks.CreateEmptyHttpResponse(req, http.StatusNotFound) + }) + + template := azure.RawArmTemplate(json.RawMessage(`{"$schema":"test"}`)) + _, err := sd.DeployToSubscription( + *mockCtx.Context, "SUB", "eastus", "sub-deploy", template, azure.ArmParameters{}, nil, nil) + require.Error(t, err) +} + func Test_StdDeployments_WhatIfDeployToSubscription_Coverage3(t *testing.T) { mockCtx := mocks.NewMockContext(t.Context()) sd := newStdDeployments(mockCtx) From 6757ec2530af397ce24c96c4813dce51c1c8a607 Mon Sep 17 00:00:00 2001 From: Jeffrey Chen Date: Mon, 8 Jun 2026 23:01:43 +0000 Subject: [PATCH 3/3] Address feedback Co-authored-by: Copilot --- cli/azd/pkg/azapi/standard_deployments.go | 32 +++++++++++-------- .../standard_deployments_coverage3_test.go | 24 ++++++++++++++ 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/cli/azd/pkg/azapi/standard_deployments.go b/cli/azd/pkg/azapi/standard_deployments.go index 18962b94540..851e6ca1638 100644 --- a/cli/azd/pkg/azapi/standard_deployments.go +++ 
b/cli/azd/pkg/azapi/standard_deployments.go @@ -70,11 +70,13 @@ var ( // withDeploymentRetry returns a context configured to retry transient HTTP 404 // (DeploymentNotFound) responses in addition to the default ARM-retryable status codes. ARM -// occasionally returns a transient 404 for a deployment immediately after it is submitted +// occasionally returns a transient 404 while polling a deployment that was just submitted // (read-after-write inconsistency, most often on subscription-scoped deployments) even though -// the deployment was accepted and ultimately succeeds. Retrying the create/poll request lets -// azd's view converge with ARM instead of failing with DeploymentNotFound. This mirrors the -// existing handling for transient 404s in pkg/azsdk/zip_deploy_client.go. +// the deployment was accepted and ultimately succeeds. This context is applied only to the LRO +// poller (not the initial submit), so a genuine submit-time 404 (e.g. missing resource group) +// still fails fast while the poller converges with ARM instead of failing with +// DeploymentNotFound. This mirrors the existing handling for transient 404s in +// pkg/azsdk/zip_deploy_client.go. func withDeploymentRetry(ctx context.Context) context.Context { return policy.WithRetryOptions(ctx, policy.RetryOptions{ MaxRetries: deploymentRetryMaxRetries, @@ -272,10 +274,6 @@ func (ds *StandardDeployments) DeployToSubscription( ctx, span := tracing.Start(ctx, events.ArmDeploySubscriptionEvent) defer func() { span.EndWithStatus(err) }() - // Retry transient DeploymentNotFound (404) responses that ARM can return immediately after a - // subscription-scoped deployment is submitted (read-after-write inconsistency). 
- ctx = withDeploymentRetry(ctx) - deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return nil, fmt.Errorf("creating deployments client: %w", err) @@ -296,8 +294,13 @@ func (ds *StandardDeployments) DeployToSubscription( return nil, fmt.Errorf("starting deployment to subscription: %w", err) } + // Retry transient DeploymentNotFound (404) responses that ARM can return while polling a + // subscription-scoped deployment that was just submitted (read-after-write inconsistency). + // Scoped to the poller only so a genuine submit-time 404 still fails fast. + pollCtx := withDeploymentRetry(ctx) + // wait for deployment creation - deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + deployResult, err := createFromTemplateOperation.PollUntilDone(pollCtx, &runtime.PollUntilDoneOptions{ Frequency: deployPollFrequency, }) if err != nil { @@ -318,10 +321,6 @@ func (ds *StandardDeployments) DeployToResourceGroup( ctx, span := tracing.Start(ctx, events.ArmDeployResourceGroupEvent) defer func() { span.EndWithStatus(err) }() - // Retry transient DeploymentNotFound (404) responses that ARM can return immediately after a - // deployment is submitted (read-after-write inconsistency). - ctx = withDeploymentRetry(ctx) - deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return nil, fmt.Errorf("creating deployments client: %w", err) @@ -341,8 +340,13 @@ func (ds *StandardDeployments) DeployToResourceGroup( return nil, fmt.Errorf("starting deployment to resource group: %w", err) } + // Retry transient DeploymentNotFound (404) responses that ARM can return while polling a + // deployment that was just submitted (read-after-write inconsistency). Scoped to the poller + // only so a genuine submit-time 404 (e.g. missing resource group) still fails fast. 
+ pollCtx := withDeploymentRetry(ctx) + // wait for deployment creation - deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + deployResult, err := createFromTemplateOperation.PollUntilDone(pollCtx, &runtime.PollUntilDoneOptions{ Frequency: deployPollFrequency, }) if err != nil { diff --git a/cli/azd/pkg/azapi/standard_deployments_coverage3_test.go b/cli/azd/pkg/azapi/standard_deployments_coverage3_test.go index 000f6b86894..9dfc019d398 100644 --- a/cli/azd/pkg/azapi/standard_deployments_coverage3_test.go +++ b/cli/azd/pkg/azapi/standard_deployments_coverage3_test.go @@ -444,6 +444,30 @@ func Test_StdDeployments_DeployToSubscription_Persistent404_Coverage3(t *testing require.Error(t, err) } +// Test_StdDeployments_DeployToResourceGroup_SubmitTime404_Coverage3 verifies that a 404 returned +// by the initial submit (PUT) is not retried, so a genuine submit-time DeploymentNotFound (e.g. a +// missing resource group) fails fast instead of being delayed by the poll-only 404 retry. 
+func Test_StdDeployments_DeployToResourceGroup_SubmitTime404_Coverage3(t *testing.T) { + shortenDeploymentRetry(t) + + mockCtx := mocks.NewMockContext(t.Context()) + sd := newStdDeployments(mockCtx) + + var puts int32 + mockCtx.HttpClient.When(func(req *http.Request) bool { + return req.Method == http.MethodPut && strings.Contains(req.URL.Path, "/deployments/rg-deploy") + }).RespondFn(func(req *http.Request) (*http.Response, error) { + atomic.AddInt32(&puts, 1) + return mocks.CreateEmptyHttpResponse(req, http.StatusNotFound) + }) + + template := azure.RawArmTemplate(json.RawMessage(`{"$schema":"test"}`)) + _, err := sd.DeployToResourceGroup( + *mockCtx.Context, "SUB", "RG1", "rg-deploy", template, azure.ArmParameters{}, nil, nil) + require.Error(t, err) + assert.Equal(t, int32(1), atomic.LoadInt32(&puts), "submit-time 404 must not be retried") +} + func Test_StdDeployments_WhatIfDeployToSubscription_Coverage3(t *testing.T) { mockCtx := mocks.NewMockContext(t.Context()) sd := newStdDeployments(mockCtx)