@@ -273,6 +273,8 @@ def get_mpirun_command(
"LD_LIBRARY_PATH",
"-x",
"PATH",
"-x",
"PYTHONPATH",
]

if additional_options:
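For context, the hunk above extends the environment variables that get_mpirun_command forwards to worker ranks. A minimal sketch of the effect, with the surrounding command construction assumed rather than copied from the actual implementation:

# Hypothetical reconstruction: each "-x NAME" flag tells mpirun to export that
# environment variable to the remote ranks; PYTHONPATH is now forwarded too,
# so workers resolve the same Python packages as the launcher process.
mpirun_command = [
    "mpirun",
    "--allow-run-as-root",   # illustrative option, not taken from the diff
    "-x", "LD_LIBRARY_PATH",
    "-x", "PATH",
    "-x", "PYTHONPATH",
]

additional_options = ["--verbose"]  # example only
if additional_options:
    mpirun_command.extend(additional_options)

print(" ".join(mpirun_command))
# mpirun --allow-run-as-root -x LD_LIBRARY_PATH -x PATH -x PYTHONPATH --verbose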
16 changes: 8 additions & 8 deletions sagemaker-train/tests/integ/train/test_benchmark_evaluator.py
@@ -52,12 +52,12 @@
"region": "us-west-2",
}

# Base model only evaluation configuration (from commented section in notebook)
# Base model only evaluation configuration
BASE_MODEL_ONLY_CONFIG = {
"base_model_id": "meta-textgeneration-llama-3-2-1b-instruct",
"dataset_s3_uri": "s3://sagemaker-us-west-2-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl",
"s3_output_path": "s3://mufi-test-serverless-smtj/eval/",
"mlflow_tracking_server_arn": "arn:aws:sagemaker:us-west-2:052150106756:mlflow-tracking-server/mmlu-eval-experiment",
"dataset_s3_uri": "s3://sagemaker-us-west-2-729646638167/model-customization/eval/zc_test.jsonl",
"s3_output_path": "s3://sagemaker-us-west-2-729646638167/model-customization/eval/",
"mlflow_tracking_server_arn": "arn:aws:sagemaker:us-west-2:729646638167:mlflow-app/app-W7FOBBXZANVX",
"region": "us-west-2",
}

@@ -72,7 +72,7 @@
}


@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
# @pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
class TestBenchmarkEvaluatorIntegration:
"""Integration tests for BenchmarkEvaluator with fine-tuned model package"""

@@ -286,7 +286,7 @@ def test_benchmark_subtasks_validation(self):

logger.info("Subtask validation tests passed")

@pytest.mark.skip(reason="Base model only evaluation - to be enabled when needed")
# @pytest.mark.skip(reason="Base model only evaluation - to be enabled when needed")
def test_benchmark_evaluation_base_model_only(self):
"""
Test benchmark evaluation with base model only (no fine-tuned model).
@@ -307,7 +307,7 @@ def test_benchmark_evaluation_base_model_only(self):
benchmark=Benchmark.MMLU,
model=BASE_MODEL_ONLY_CONFIG["base_model_id"],
s3_output_path=BASE_MODEL_ONLY_CONFIG["s3_output_path"],
mlflow_resource_arn=BASE_MODEL_ONLY_CONFIG["mlflow_tracking_server_arn"],
# mlflow_resource_arn=BASE_MODEL_ONLY_CONFIG["mlflow_tracking_server_arn"],
base_eval_name="integ-test-base-model-only",
# Note: model_package_group not needed for JumpStart models
)
@@ -339,7 +339,7 @@ def test_benchmark_evaluation_base_model_only(self):
assert execution.status.overall_status == "Succeeded"
logger.info("Base model only evaluation completed successfully")

@pytest.mark.skip(reason="Nova model evaluation - to be enabled when needed")
# @pytest.mark.skip(reason="Nova model evaluation - to be enabled when needed")
def test_benchmark_evaluation_nova_model(self):
"""
Test benchmark evaluation with Nova model.
@@ -55,7 +55,8 @@
}


@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
# @pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
@pytest.mark.xdist_group("custom_scorer_evaluator")
class TestCustomScorerEvaluatorIntegration:
"""Integration tests for CustomScorerEvaluator with custom evaluator"""

@@ -233,7 +234,7 @@ def test_custom_scorer_evaluator_validation(self):

logger.info("Validation tests passed")

@pytest.mark.skip(reason="Built-in metric evaluation - to be enabled when needed")
# @pytest.mark.skip(reason="Built-in metric evaluation - to be enabled when needed")
def test_custom_scorer_with_builtin_metric(self):
"""
Test custom scorer evaluation with built-in metric.
@@ -285,7 +286,7 @@ def test_custom_scorer_with_builtin_metric(self):
assert execution.status.overall_status == "Succeeded"
logger.info("Built-in metric evaluation completed successfully")

@pytest.mark.skip(reason="Base model only evaluation - not working yet per notebook")
# @pytest.mark.skip(reason="Base model only evaluation - not working yet per notebook")
def test_custom_scorer_base_model_only(self):
"""
Test custom scorer evaluation with base model only (no fine-tuned model).
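The new xdist_group marker above only changes scheduling when pytest-xdist groups tests onto workers; a hedged sketch of how that grouping would be requested (the path and worker count are illustrative, not taken from this diff):

# With pytest-xdist's loadgroup scheduler, every test marked
# xdist_group("custom_scorer_evaluator") is assigned to the same worker,
# so those integration tests never run concurrently with each other.
import pytest

exit_code = pytest.main(["-n", "4", "--dist", "loadgroup", "tests/integ/train/"])  # path is illustrative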
96 changes: 1 addition & 95 deletions sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py
@@ -84,7 +84,7 @@
}


@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
# @pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
class TestLLMAsJudgeEvaluatorIntegration:
"""Integration tests for LLMAsJudgeEvaluator"""

@@ -254,98 +254,4 @@ def test_llm_as_judge_builtin_metrics_prefix_handling(self):

logger.info("Built-in metrics prefix handling tests passed")

@pytest.mark.skip(reason="Built-in metrics only test - to be enabled when needed")
def test_llm_as_judge_builtin_metrics_only(self):
"""
Test LLM-as-Judge evaluation with only built-in metrics (no custom metrics).

This test uses only built-in metrics without custom metrics.

Note: This test is currently skipped. Remove the @pytest.mark.skip decorator
when you want to enable it.
"""
logger.info("Creating LLMAsJudgeEvaluator with built-in metrics only")

# Create evaluator with only built-in metrics
evaluator = LLMAsJudgeEvaluator(
model=TEST_CONFIG["model_package_arn"],
evaluator_model=TEST_CONFIG["evaluator_model"],
dataset=TEST_CONFIG["dataset_s3_uri"],
builtin_metrics=["Completeness", "Faithfulness", "Helpfulness"],
# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
s3_output_path=TEST_CONFIG["s3_output_path"],
evaluate_base_model=False,
)

# Verify evaluator was created
assert evaluator is not None
assert evaluator.builtin_metrics == ["Completeness", "Faithfulness", "Helpfulness"]
assert evaluator.custom_metrics is None

logger.info("Created evaluator with built-in metrics only")

# Start evaluation
logger.info("Starting evaluation execution")
execution = evaluator.evaluate()

# Verify execution was created
assert execution is not None
assert execution.arn is not None

logger.info(f"Pipeline Execution ARN: {execution.arn}")

# Wait for completion
logger.info(f"Waiting for evaluation to complete (timeout: {EVALUATION_TIMEOUT_SECONDS}s / {EVALUATION_TIMEOUT_SECONDS//3600}h)")
execution.wait(target_status="Succeeded", poll=30, timeout=EVALUATION_TIMEOUT_SECONDS)

# Verify completion
assert execution.status.overall_status == "Succeeded"
logger.info("Built-in metrics only evaluation completed successfully")

@pytest.mark.skip(reason="Custom metrics only test - to be enabled when needed")
def test_llm_as_judge_custom_metrics_only(self):
"""
Test LLM-as-Judge evaluation with only custom metrics (no built-in metrics).

This test uses only custom metrics without built-in metrics.

Note: This test is currently skipped. Remove the @pytest.mark.skip decorator
when you want to enable it.
"""
logger.info("Creating LLMAsJudgeEvaluator with custom metrics only")

# Create evaluator with only custom metrics
evaluator = LLMAsJudgeEvaluator(
model=TEST_CONFIG["model_package_arn"],
evaluator_model=TEST_CONFIG["evaluator_model"],
dataset=TEST_CONFIG["dataset_s3_uri"],
custom_metrics=TEST_CONFIG["custom_metrics_json"],
# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
s3_output_path=TEST_CONFIG["s3_output_path"],
evaluate_base_model=False,
)

# Verify evaluator was created
assert evaluator is not None
assert evaluator.custom_metrics == TEST_CONFIG["custom_metrics_json"]
assert evaluator.builtin_metrics is None

logger.info("Created evaluator with custom metrics only")

# Start evaluation
logger.info("Starting evaluation execution")
execution = evaluator.evaluate()

# Verify execution was created
assert execution is not None
assert execution.arn is not None

logger.info(f"Pipeline Execution ARN: {execution.arn}")

# Wait for completion
logger.info(f"Waiting for evaluation to complete (timeout: {EVALUATION_TIMEOUT_SECONDS}s / {EVALUATION_TIMEOUT_SECONDS//3600}h)")
execution.wait(target_status="Succeeded", poll=30, timeout=EVALUATION_TIMEOUT_SECONDS)

# Verify completion
assert execution.status.overall_status == "Succeeded"
logger.info("Custom metrics only evaluation completed successfully")
2 changes: 1 addition & 1 deletion sagemaker-train/tests/integ/train/test_model_trainer.py
@@ -96,7 +96,7 @@ def test_hp_contract_basic_sh_script(sagemaker_session):


# skip this test for now as requirements.txt is not resolved
@pytest.mark.skip
# @pytest.mark.skip
def test_hp_contract_mpi_script(sagemaker_session):
compute = Compute(instance_type="ml.m5.xlarge", instance_count=2)
model_trainer = ModelTrainer(
@@ -515,6 +515,95 @@ def test_llm_as_judge_evaluator_get_llmaj_template_additions_no_metrics(mock_art
assert additions['custom_metrics'] is None


@patch('sagemaker.train.common_utils.model_resolution._resolve_base_model')
@patch('sagemaker.core.resources.Artifact')
def test_llm_as_judge_evaluator_builtin_metrics_only_no_custom(mock_artifact, mock_resolve):
"""Test that evaluator handles builtin_metrics with custom_metrics=None correctly."""
mock_info = Mock()
mock_info.base_model_name = DEFAULT_MODEL
mock_info.base_model_arn = DEFAULT_BASE_MODEL_ARN
mock_info.source_model_package_arn = None
mock_resolve.return_value = mock_info

mock_artifact.get_all.return_value = iter([])
mock_artifact_instance = Mock()
mock_artifact_instance.artifact_arn = DEFAULT_ARTIFACT_ARN
mock_artifact.create.return_value = mock_artifact_instance

mock_session = Mock()
mock_session.boto_region_name = DEFAULT_REGION
mock_session.boto_session = Mock()
mock_session.get_caller_identity_arn.return_value = DEFAULT_ROLE

evaluator = LLMAsJudgeEvaluator(
evaluator_model=DEFAULT_EVALUATOR_MODEL,
dataset=DEFAULT_DATASET,
model=DEFAULT_MODEL,
builtin_metrics=["Completeness", "Faithfulness"],
custom_metrics=None,
s3_output_path=DEFAULT_S3_OUTPUT,
mlflow_resource_arn=DEFAULT_MLFLOW_ARN,
model_package_group=DEFAULT_MODEL_PACKAGE_GROUP_ARN,
sagemaker_session=mock_session,
)

assert evaluator.builtin_metrics == ["Completeness", "Faithfulness"]
assert evaluator.custom_metrics is None

eval_name = "test-eval"
additions = evaluator._get_llmaj_template_additions(eval_name)

assert additions['llmaj_metrics'] == json.dumps(["Completeness", "Faithfulness"])
assert additions['custom_metrics'] is None


@patch('sagemaker.core.s3.client.S3Uploader.upload_string_as_file_body')
@patch('sagemaker.train.common_utils.model_resolution._resolve_base_model')
@patch('sagemaker.core.resources.Artifact')
def test_llm_as_judge_evaluator_custom_metrics_only_no_builtin(mock_artifact, mock_resolve, mock_s3_upload):
"""Test that evaluator handles custom_metrics with builtin_metrics=None correctly."""
mock_info = Mock()
mock_info.base_model_name = DEFAULT_MODEL
mock_info.base_model_arn = DEFAULT_BASE_MODEL_ARN
mock_info.source_model_package_arn = None
mock_resolve.return_value = mock_info

mock_artifact.get_all.return_value = iter([])
mock_artifact_instance = Mock()
mock_artifact_instance.artifact_arn = DEFAULT_ARTIFACT_ARN
mock_artifact.create.return_value = mock_artifact_instance

mock_session = Mock()
mock_session.boto_region_name = DEFAULT_REGION
mock_session.boto_session = Mock()
mock_session.get_caller_identity_arn.return_value = DEFAULT_ROLE

custom_metrics_json = json.dumps([{"customMetricDefinition": {"name": "TestMetric"}}])

evaluator = LLMAsJudgeEvaluator(
evaluator_model=DEFAULT_EVALUATOR_MODEL,
dataset=DEFAULT_DATASET,
model=DEFAULT_MODEL,
builtin_metrics=None,
custom_metrics=custom_metrics_json,
s3_output_path=DEFAULT_S3_OUTPUT,
mlflow_resource_arn=DEFAULT_MLFLOW_ARN,
model_package_group=DEFAULT_MODEL_PACKAGE_GROUP_ARN,
sagemaker_session=mock_session,
)

assert evaluator.builtin_metrics is None
assert evaluator.custom_metrics == custom_metrics_json

eval_name = "test-eval"
additions = evaluator._get_llmaj_template_additions(eval_name)

assert additions['llmaj_metrics'] == json.dumps([])
assert additions['custom_metrics'] is not None
assert additions['custom_metrics'].startswith("s3://")
mock_s3_upload.assert_called_once()


@pytest.mark.skip(reason="Integration test - requires full pipeline execution setup")
@patch('sagemaker.train.evaluate.execution.Pipeline')
@patch('sagemaker.train.evaluate.llm_as_judge_evaluator.EvaluationPipelineExecution')
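Taken together, the two new unit tests pin down the contract they expect from _get_llmaj_template_additions: built-in metrics are passed through as a JSON list (empty when only custom metrics are given), while custom metrics are uploaded to S3 and referenced by URI. A minimal sketch of that contract, with the helper body and S3 key layout assumed rather than taken from the real implementation:

import json
from sagemaker.core.s3.client import S3Uploader  # the call patched in the tests above


def _template_additions_sketch(evaluator, eval_name):
    """Hypothetical illustration of the behavior the unit tests assert."""
    # builtin_metrics is always serialized to a JSON list; None becomes [].
    llmaj_metrics = json.dumps(evaluator.builtin_metrics or [])

    # custom_metrics, when present, is uploaded and referenced by an s3:// URI.
    custom_metrics_uri = None
    if evaluator.custom_metrics is not None:
        custom_metrics_uri = f"{evaluator.s3_output_path.rstrip('/')}/{eval_name}/custom_metrics.json"
        # Call signature assumed to take the body and the destination URI.
        S3Uploader.upload_string_as_file_body(evaluator.custom_metrics, custom_metrics_uri)

    return {"llmaj_metrics": llmaj_metrics, "custom_metrics": custom_metrics_uri}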