diff --git a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py index d8ceb7de..db4ac36f 100644 --- a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py +++ b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py @@ -390,12 +390,30 @@ def list(region: Optional[str] = None, stack_status_filter: Optional[List[str]] """ cf = create_boto3_client('cloudformation', region_name=region) + # All valid stack statuses except DELETE_COMPLETE, used to avoid paginating + # through tens of thousands of deleted stacks which causes throttling. + _ACTIVE_STACK_STATUSES = [ + 'CREATE_IN_PROGRESS', 'CREATE_FAILED', 'CREATE_COMPLETE', + 'ROLLBACK_IN_PROGRESS', 'ROLLBACK_FAILED', 'ROLLBACK_COMPLETE', + 'DELETE_IN_PROGRESS', 'DELETE_FAILED', + 'UPDATE_IN_PROGRESS', 'UPDATE_COMPLETE_CLEANUP_IN_PROGRESS', + 'UPDATE_COMPLETE', 'UPDATE_FAILED', + 'UPDATE_ROLLBACK_IN_PROGRESS', 'UPDATE_ROLLBACK_FAILED', + 'UPDATE_ROLLBACK_COMPLETE_CLEANUP_IN_PROGRESS', 'UPDATE_ROLLBACK_COMPLETE', + 'REVIEW_IN_PROGRESS', 'IMPORT_IN_PROGRESS', 'IMPORT_COMPLETE', + 'IMPORT_ROLLBACK_IN_PROGRESS', 'IMPORT_ROLLBACK_FAILED', 'IMPORT_ROLLBACK_COMPLETE', + ] + try: # Prepare API call parameters list_params = {} if stack_status_filter is not None: list_params['StackStatusFilter'] = stack_status_filter + else: + # Exclude DELETE_COMPLETE at the API level to avoid paginating through + # large numbers of deleted stacks, which causes throttling errors. + list_params['StackStatusFilter'] = _ACTIVE_STACK_STATUSES response = cf.list_stacks(**list_params) @@ -406,13 +424,6 @@ def list(region: Optional[str] = None, stack_status_filter: Optional[List[str]] response = cf.list_stacks(**list_params) all_summaries.extend(response.get('StackSummaries', [])) - # Only filter DELETE_COMPLETE when no explicit filter is provided - if stack_status_filter is None: - all_summaries = [ - stack for stack in all_summaries - if stack.get('StackStatus') != 'DELETE_COMPLETE' - ] - return {'StackSummaries': all_summaries} except cf.exceptions.ClientError as e: error_code = e.response['Error']['Code'] diff --git a/test/integration_tests/cluster_management/test_hp_cluster_stack.py b/test/integration_tests/cluster_management/test_hp_cluster_stack.py index eca37ee8..7ccd84a0 100644 --- a/test/integration_tests/cluster_management/test_hp_cluster_stack.py +++ b/test/integration_tests/cluster_management/test_hp_cluster_stack.py @@ -32,15 +32,17 @@ def stack_name(self): @pytest.mark.dependency(name="list_stacks") def test_list_stacks(self): """Test listing CloudFormation stacks using HpClusterStack.list.""" - # Test listing stacks - should return a response with StackSummaries response = HpClusterStack.list() - - # Verify response structure + assert isinstance(response, dict) assert 'StackSummaries' in response assert isinstance(response['StackSummaries'], list) - - # If there are stacks, verify they have expected fields + + # Verify DELETE_COMPLETE stacks are excluded at the API level + for stack in response['StackSummaries']: + assert stack.get('StackStatus') != 'DELETE_COMPLETE', \ + f"DELETE_COMPLETE stack found in results: {stack['StackName']}" + if response['StackSummaries']: stack = response['StackSummaries'][0] assert 'StackName' in stack diff --git a/test/unit_tests/cluster_management/test_hp_cluster_stack.py b/test/unit_tests/cluster_management/test_hp_cluster_stack.py index 3c1e84ae..fc49b67b 100644 --- a/test/unit_tests/cluster_management/test_hp_cluster_stack.py +++ b/test/unit_tests/cluster_management/test_hp_cluster_stack.py @@ -94,16 +94,31 @@ def test_describe_access_denied(self, mock_boto3_client, mock_boto3_session): def test_list_success(self, mock_create_client): mock_cf_client = MagicMock() mock_create_client.return_value = mock_cf_client - - mock_response = {'StackSummaries': [{'StackName': 'stack1'}, {'StackName': 'stack2'}]} + + mock_response = {'StackSummaries': [{'StackName': 'stack1', 'StackStatus': 'CREATE_COMPLETE'}]} mock_cf_client.list_stacks.return_value = mock_response - + result = HpClusterStack.list() - + mock_create_client.assert_called_once_with('cloudformation', region_name=None) - mock_cf_client.list_stacks.assert_called_once() + # Verify DELETE_COMPLETE is excluded at the API level + call_kwargs = mock_cf_client.list_stacks.call_args[1] + self.assertIn('StackStatusFilter', call_kwargs) + self.assertNotIn('DELETE_COMPLETE', call_kwargs['StackStatusFilter']) self.assertEqual(result, mock_response) + @patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.create_boto3_client') + def test_list_with_explicit_status_filter(self, mock_create_client): + """Explicit StackStatusFilter is passed through as-is.""" + mock_cf_client = MagicMock() + mock_create_client.return_value = mock_cf_client + mock_cf_client.list_stacks.return_value = {'StackSummaries': []} + + HpClusterStack.list(stack_status_filter=['CREATE_COMPLETE']) + + call_kwargs = mock_cf_client.list_stacks.call_args[1] + self.assertEqual(call_kwargs['StackStatusFilter'], ['CREATE_COMPLETE']) + @patch('boto3.session.Session') @patch('boto3.client') def test_list_access_denied(self, mock_boto3_client, mock_boto3_session): @@ -479,29 +494,16 @@ class TestHpClusterStackList(unittest.TestCase): @patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.create_boto3_client') def test_list_default_filters_delete_complete(self, mock_create_client): - """Test that list() filters out DELETE_COMPLETE stacks by default.""" - # Arrange + """Test that list() excludes DELETE_COMPLETE at the API level via StackStatusFilter.""" mock_cf_client = MagicMock() mock_create_client.return_value = mock_cf_client - - mock_response = { - 'StackSummaries': [ - {'StackName': 'active-stack', 'StackStatus': 'CREATE_COMPLETE'}, - {'StackName': 'deleted-stack', 'StackStatus': 'DELETE_COMPLETE'}, - {'StackName': 'updating-stack', 'StackStatus': 'UPDATE_IN_PROGRESS'} - ] - } - mock_cf_client.list_stacks.return_value = mock_response - - # Act - result = HpClusterStack.list() - - # Assert - assert len(result['StackSummaries']) == 2 - stack_names = [stack['StackName'] for stack in result['StackSummaries']] - assert 'active-stack' in stack_names - assert 'updating-stack' in stack_names - assert 'deleted-stack' not in stack_names + mock_cf_client.list_stacks.return_value = {'StackSummaries': []} + + HpClusterStack.list() + + call_kwargs = mock_cf_client.list_stacks.call_args[1] + self.assertIn('StackStatusFilter', call_kwargs) + self.assertNotIn('DELETE_COMPLETE', call_kwargs['StackStatusFilter']) @patch('sagemaker.hyperpod.cluster_management.hp_cluster_stack.create_boto3_client') def test_list_paginates_through_all_pages(self, mock_create_client):