Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cli/azd/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

### Bugs Fixed

- [[#8064]](https://github.com/Azure/azure-dev/issues/8064) Fix intermittent `DeploymentNotFound` failures during `azd provision`/`azd up` and `azd down` for subscription-scoped Bicep deployments (commonly seen with `infra.layers`) by retrying the transient HTTP 404 that ARM can briefly return for a deployment that was just submitted.

### Other Changes

## 1.25.4 (2026-05-29)
Expand Down
60 changes: 58 additions & 2 deletions cli/azd/pkg/azapi/standard_deployments.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ import (
"errors"
"fmt"
"maps"
"net/http"
"net/url"
"slices"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources"
Expand Down Expand Up @@ -48,6 +50,50 @@ const (
slowPollFrequency = 15 * time.Second
)

// Backoff settings consumed by withDeploymentRetry when it builds the retry options for
// polling an ARM deployment. ARM can briefly answer DeploymentNotFound (HTTP 404) for a
// deployment that was just submitted (read-after-write inconsistency, most often observed on
// subscription-scoped deployments) even though the deployment was accepted and ultimately
// succeeds. They are package-level vars rather than consts so tests can shrink the delays.
var (
	// deploymentRetryDelay is the initial wait between retries of a deployment poll request.
	// The SDK grows it exponentially, bounded by deploymentRetryMaxDelay.
	deploymentRetryDelay = 3 * time.Second

	// deploymentRetryMaxDelay is the upper bound on the exponential backoff between retries
	// of a deployment poll request.
	deploymentRetryMaxDelay = 15 * time.Second

	// deploymentRetryMaxRetries is how many additional attempts are made after ARM returns a
	// retryable status code (including a transient 404) for a deployment poll request.
	deploymentRetryMaxRetries = int32(5)
)

// withDeploymentRetry derives a context from ctx whose retry options additionally treat
// HTTP 404 as retryable, on top of the 408/429/5xx codes listed below.
//
// Why: ARM occasionally returns a transient 404 (DeploymentNotFound) while polling a
// deployment that was just submitted (read-after-write inconsistency, most often on
// subscription-scoped deployments) even though the deployment was accepted and ultimately
// succeeds.
//
// The callers pass the returned context to the LRO poller only, not to the initial submit,
// so a genuine submit-time 404 (e.g. missing resource group) still fails fast while the
// poller is given time to converge with ARM instead of failing with DeploymentNotFound.
// See also pkg/azsdk/zip_deploy_client.go, which is noted as handling transient 404s in a
// similar way.
//
// The retry count and backoff come from deploymentRetryMaxRetries, deploymentRetryDelay and
// deploymentRetryMaxDelay at the time of the call.
func withDeploymentRetry(ctx context.Context) context.Context {
	// Supplying StatusCodes replaces the SDK's list, so the usual retryable codes are
	// spelled out explicitly and 404 is appended last.
	retryableStatusCodes := []int{
		http.StatusRequestTimeout,      // 408
		http.StatusTooManyRequests,     // 429
		http.StatusInternalServerError, // 500
		http.StatusBadGateway,          // 502
		http.StatusServiceUnavailable,  // 503
		http.StatusGatewayTimeout,      // 504
		http.StatusNotFound,            // 404 (transient DeploymentNotFound)
	}

	retryOptions := policy.RetryOptions{
		MaxRetries:    deploymentRetryMaxRetries,
		RetryDelay:    deploymentRetryDelay,
		MaxRetryDelay: deploymentRetryMaxDelay,
		StatusCodes:   retryableStatusCodes,
	}

	return policy.WithRetryOptions(ctx, retryOptions)
}

type StandardDeployments struct {
credentialProvider account.SubscriptionCredentialProvider
armClientOptions *arm.ClientOptions
Expand Down Expand Up @@ -248,8 +294,13 @@ func (ds *StandardDeployments) DeployToSubscription(
return nil, fmt.Errorf("starting deployment to subscription: %w", err)
}

// Retry transient DeploymentNotFound (404) responses that ARM can return while polling a
// subscription-scoped deployment that was just submitted (read-after-write inconsistency).
// Scoped to the poller only so a genuine submit-time 404 still fails fast.
pollCtx := withDeploymentRetry(ctx)

// wait for deployment creation
deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{
deployResult, err := createFromTemplateOperation.PollUntilDone(pollCtx, &runtime.PollUntilDoneOptions{
Frequency: deployPollFrequency,
})
if err != nil {
Expand Down Expand Up @@ -289,8 +340,13 @@ func (ds *StandardDeployments) DeployToResourceGroup(
return nil, fmt.Errorf("starting deployment to resource group: %w", err)
}

// Retry transient DeploymentNotFound (404) responses that ARM can return while polling a
// deployment that was just submitted (read-after-write inconsistency). Scoped to the poller
// only so a genuine submit-time 404 (e.g. missing resource group) still fails fast.
pollCtx := withDeploymentRetry(ctx)

// wait for deployment creation
deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{
deployResult, err := createFromTemplateOperation.PollUntilDone(pollCtx, &runtime.PollUntilDoneOptions{
Frequency: deployPollFrequency,
})
if err != nil {
Expand Down
142 changes: 142 additions & 0 deletions cli/azd/pkg/azapi/standard_deployments_coverage3_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"encoding/json"
"net/http"
"strings"
"sync/atomic"
"testing"
"time"

Expand Down Expand Up @@ -326,6 +327,147 @@ func Test_StdDeployments_DeployToResourceGroup_Coverage3(t *testing.T) {
assert.Equal(t, "rg-deploy", d.Name)
}

// shortenDeploymentRetry drops both deploymentRetryDelay and deploymentRetryMaxDelay to one
// millisecond for the duration of the calling test, so the deployment 404 retry backoff is
// effectively instant. The previous values are put back via t.Cleanup when the test ends.
// deploymentRetryMaxRetries is left untouched.
func shortenDeploymentRetry(t *testing.T) {
	t.Helper()

	// Capture the current values and schedule their restoration before overriding them.
	prevDelay, prevMaxDelay := deploymentRetryDelay, deploymentRetryMaxDelay
	t.Cleanup(func() {
		deploymentRetryDelay, deploymentRetryMaxDelay = prevDelay, prevMaxDelay
	})

	deploymentRetryDelay, deploymentRetryMaxDelay = time.Millisecond, time.Millisecond
}

// Test_StdDeployments_DeployToSubscription_Transient404_Coverage3 checks that
// DeployToSubscription succeeds when the first poll of a just-submitted subscription-scoped
// deployment answers HTTP 404 (DeploymentNotFound) and later polls report success. The 404
// must be retried instead of being returned to the caller. This is the read-after-write
// inconsistency reported in issue #8064 (infra.layers, subscription scope).
func Test_StdDeployments_DeployToSubscription_Transient404_Coverage3(t *testing.T) {
	shortenDeploymentRetry(t)

	const deploymentPath = "/deployments/sub-deploy"

	mockCtx := mocks.NewMockContext(t.Context())
	sd := newStdDeployments(mockCtx)

	// Submit (PUT): the deployment is accepted in the Running state, which is non-terminal,
	// so the SDK goes on to poll.
	isSubmit := func(req *http.Request) bool {
		return req.Method == http.MethodPut && strings.Contains(req.URL.Path, deploymentPath)
	}
	mockCtx.HttpClient.When(isSubmit).RespondFn(func(req *http.Request) (*http.Response, error) {
		running := makeDeploymentExtended("sub-deploy", armresources.ProvisioningStateRunning)
		return mocks.CreateHttpResponseWithBody(req, http.StatusCreated, running)
	})

	// Poll (GET): only the very first request gets the transient 404; every later one sees
	// the succeeded deployment.
	var polls atomic.Int32
	isPoll := func(req *http.Request) bool {
		return req.Method == http.MethodGet && strings.Contains(req.URL.Path, deploymentPath)
	}
	mockCtx.HttpClient.When(isPoll).RespondFn(func(req *http.Request) (*http.Response, error) {
		if polls.Add(1) > 1 {
			succeeded := makeDeploymentExtended("sub-deploy", armresources.ProvisioningStateSucceeded)
			return mocks.CreateHttpResponseWithBody(req, http.StatusOK, succeeded)
		}
		return mocks.CreateEmptyHttpResponse(req, http.StatusNotFound)
	})

	d, err := sd.DeployToSubscription(
		*mockCtx.Context,
		"SUB",
		"eastus",
		"sub-deploy",
		azure.RawArmTemplate(json.RawMessage(`{"$schema":"test"}`)),
		azure.ArmParameters{},
		nil,
		nil,
	)
	require.NoError(t, err)

	assert.Equal(t, "sub-deploy", d.Name)
	assert.Equal(t, DeploymentProvisioningStateSucceeded, d.ProvisioningState)
	assert.GreaterOrEqual(t, polls.Load(), int32(2), "expected the transient 404 poll to be retried")
}

// Test_StdDeployments_DeployToResourceGroup_Transient404_Coverage3 is the
// resource-group-scoped counterpart of the subscription transient-404 test: the first poll
// answers HTTP 404, later polls report success, and DeployToResourceGroup must retry and
// return without error.
func Test_StdDeployments_DeployToResourceGroup_Transient404_Coverage3(t *testing.T) {
	shortenDeploymentRetry(t)

	const deploymentPath = "/deployments/rg-deploy"

	mockCtx := mocks.NewMockContext(t.Context())
	sd := newStdDeployments(mockCtx)

	// Submit (PUT): accepted in the non-terminal Running state.
	isSubmit := func(req *http.Request) bool {
		return req.Method == http.MethodPut && strings.Contains(req.URL.Path, deploymentPath)
	}
	mockCtx.HttpClient.When(isSubmit).RespondFn(func(req *http.Request) (*http.Response, error) {
		running := makeDeploymentExtended("rg-deploy", armresources.ProvisioningStateRunning)
		return mocks.CreateHttpResponseWithBody(req, http.StatusCreated, running)
	})

	// Poll (GET): first request is a 404, all following requests return the succeeded
	// deployment.
	var polls atomic.Int32
	isPoll := func(req *http.Request) bool {
		return req.Method == http.MethodGet && strings.Contains(req.URL.Path, deploymentPath)
	}
	mockCtx.HttpClient.When(isPoll).RespondFn(func(req *http.Request) (*http.Response, error) {
		if polls.Add(1) > 1 {
			succeeded := makeDeploymentExtended("rg-deploy", armresources.ProvisioningStateSucceeded)
			return mocks.CreateHttpResponseWithBody(req, http.StatusOK, succeeded)
		}
		return mocks.CreateEmptyHttpResponse(req, http.StatusNotFound)
	})

	d, err := sd.DeployToResourceGroup(
		*mockCtx.Context,
		"SUB",
		"RG1",
		"rg-deploy",
		azure.RawArmTemplate(json.RawMessage(`{"$schema":"test"}`)),
		azure.ArmParameters{},
		nil,
		nil,
	)
	require.NoError(t, err)

	assert.Equal(t, "rg-deploy", d.Name)
	assert.GreaterOrEqual(t, polls.Load(), int32(2), "expected the transient 404 poll to be retried")
}

// Test_StdDeployments_DeployToSubscription_Persistent404_Coverage3 checks that
// DeployToSubscription still returns an error when every poll answers HTTP 404, i.e. the
// poll-time 404 retry gives up once its budget is spent and does not hide a genuine
// DeploymentNotFound.
func Test_StdDeployments_DeployToSubscription_Persistent404_Coverage3(t *testing.T) {
	shortenDeploymentRetry(t)

	// Use a small retry budget for this test and restore the previous value afterwards.
	prevRetries := deploymentRetryMaxRetries
	t.Cleanup(func() { deploymentRetryMaxRetries = prevRetries })
	deploymentRetryMaxRetries = 2

	const deploymentPath = "/deployments/sub-deploy"

	mockCtx := mocks.NewMockContext(t.Context())
	sd := newStdDeployments(mockCtx)

	// Submit (PUT): accepted in the non-terminal Running state.
	isSubmit := func(req *http.Request) bool {
		return req.Method == http.MethodPut && strings.Contains(req.URL.Path, deploymentPath)
	}
	mockCtx.HttpClient.When(isSubmit).RespondFn(func(req *http.Request) (*http.Response, error) {
		running := makeDeploymentExtended("sub-deploy", armresources.ProvisioningStateRunning)
		return mocks.CreateHttpResponseWithBody(req, http.StatusCreated, running)
	})

	// Poll (GET): the deployment is never found.
	isPoll := func(req *http.Request) bool {
		return req.Method == http.MethodGet && strings.Contains(req.URL.Path, deploymentPath)
	}
	mockCtx.HttpClient.When(isPoll).RespondFn(func(req *http.Request) (*http.Response, error) {
		return mocks.CreateEmptyHttpResponse(req, http.StatusNotFound)
	})

	_, err := sd.DeployToSubscription(
		*mockCtx.Context,
		"SUB",
		"eastus",
		"sub-deploy",
		azure.RawArmTemplate(json.RawMessage(`{"$schema":"test"}`)),
		azure.ArmParameters{},
		nil,
		nil,
	)
	require.Error(t, err)
}

// Test_StdDeployments_DeployToResourceGroup_SubmitTime404_Coverage3 checks that a 404 on the
// initial submit (PUT) is surfaced after exactly one attempt. The 404 retry is meant for
// polling only, so a genuine submit-time failure (e.g. a missing resource group) must fail
// fast rather than be retried.
func Test_StdDeployments_DeployToResourceGroup_SubmitTime404_Coverage3(t *testing.T) {
	shortenDeploymentRetry(t)

	mockCtx := mocks.NewMockContext(t.Context())
	sd := newStdDeployments(mockCtx)

	// Submit (PUT): always rejected with 404; the counter records how often it was attempted.
	var puts atomic.Int32
	isSubmit := func(req *http.Request) bool {
		return req.Method == http.MethodPut && strings.Contains(req.URL.Path, "/deployments/rg-deploy")
	}
	mockCtx.HttpClient.When(isSubmit).RespondFn(func(req *http.Request) (*http.Response, error) {
		puts.Add(1)
		return mocks.CreateEmptyHttpResponse(req, http.StatusNotFound)
	})

	_, err := sd.DeployToResourceGroup(
		*mockCtx.Context,
		"SUB",
		"RG1",
		"rg-deploy",
		azure.RawArmTemplate(json.RawMessage(`{"$schema":"test"}`)),
		azure.ArmParameters{},
		nil,
		nil,
	)
	require.Error(t, err)
	assert.Equal(t, int32(1), puts.Load(), "submit-time 404 must not be retried")
}

func Test_StdDeployments_WhatIfDeployToSubscription_Coverage3(t *testing.T) {
mockCtx := mocks.NewMockContext(t.Context())
sd := newStdDeployments(mockCtx)
Expand Down
Loading