diff --git a/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java b/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java index 129957d2..f83024af 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java @@ -181,10 +181,8 @@ private void handleList(HttpExchange exchange) throws IOException { Map metadata = new LinkedHashMap<>(); - // Serialize parameters in the container format - if (eval.getParameters().isEmpty()) { - metadata.put("parameters", NullNode.getInstance()); - } else { + // Serialize parameters in the container format; omit the field when there are none. + if (!eval.getParameters().isEmpty()) { Map> schemaMap = new LinkedHashMap<>(); for (ParameterDef param : eval.getParameters()) { Map paramMetadata = new LinkedHashMap<>(); diff --git a/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java b/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java index f790887b..e1755a03 100644 --- a/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java +++ b/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java @@ -27,12 +27,7 @@ import lombok.extern.slf4j.Slf4j; import org.junit.jupiter.api.*; -/** - * NOTE: playground UI has been updated and breaks the SDK contract. 
will have to investigate and - * fixe before this test can be re-enabled - */ @Slf4j -@Disabled class DevserverTest { private static Devserver server; private static Thread serverThread; @@ -49,15 +44,27 @@ class DevserverTest { private static final BraintrustUtils.Parent PLAYGROUND_PARENT = new BraintrustUtils.Parent("playground_id", "ceea7422-3507-4d1c-a5f7-7acf41d9fac2"); - // Remote scorer from java-unit-test project (returns 1.0 for exact match, 0.0 otherwise) - private static final String REMOTE_SCORER_FUNCTION_ID = "efa5f9c3-6ece-4726-a9d6-4ba792980b3f"; - private static final String REMOTE_SCORER_NAME = "typescript_exact_match"; + // Remote scorer created in the java-unit-test project via the test harness. Its latest version + // returns 1.0 for an exact match and 0.0 otherwise. The score is keyed in results by the + // scorer's resolved name: "invoke-<project>-<slug>-<version>". + private static final String REMOTE_SCORER_SLUG = "typescript-exact-match"; + private static String remoteScorerFunctionId; + // Resolved scorer name (set in setUp once the project name is known). + private static String REMOTE_SCORER_NAME; @BeforeAll static void setUp() throws Exception { // Set up test harness with VCR (records/replays HTTP interactions) testHarness = TestHarness.setup(); + // Ensure the remote code scorer exists and resolve its function ID. The latest version + // returns {name: "typescript exact match", score: output === expected ? 1.0 : 0.0}. 
+ var scorerInfo = testHarness.ensureRemoteCodeScorer(REMOTE_SCORER_SLUG, REMOTE_SCORER_CODE); + remoteScorerFunctionId = + lookupFunctionId(TestHarness.defaultProjectName(), scorerInfo.slug()); + REMOTE_SCORER_NAME = + "invoke-" + TestHarness.defaultProjectName() + "-" + scorerInfo.slug() + "-latest"; + // Create a shared eval for all tests RemoteEval testEval = RemoteEval.builder() @@ -229,7 +236,7 @@ void testStreamingEval() throws Exception { EvalRequest.RemoteScorer remoteScorer = new EvalRequest.RemoteScorer(); remoteScorer.setName(REMOTE_SCORER_NAME); EvalRequest.FunctionId functionId = new EvalRequest.FunctionId(); - functionId.setFunctionId(REMOTE_SCORER_FUNCTION_ID); + functionId.setFunctionId(remoteScorerFunctionId); remoteScorer.setFunctionId(functionId); evalRequest.setScores(List.of(remoteScorer)); @@ -485,13 +492,13 @@ void testStreamingEval() throws Exception { assertEquals("scorer", spanAttrs.get("purpose").asText()); assertEquals("test-gen-1", spanAttrs.get("generation").asText()); - // Scorer name should be either simple_scorer or the remote scorer + // Scorer name should be either simple_scorer or the remote scorer. The remote + // scorer's span name is "invoke-<project>-<slug>-<version>". 
String scorerName = spanAttrs.get("name").asText(); assertTrue( - scorerName.contains("simple_scorer") - || scorerName.contains(REMOTE_SCORER_NAME.replaceAll("_", "")), - "Score span name should be simple_scorer or %s -- got: %s" - .formatted(REMOTE_EVAL_NAME, scorerName)); + scorerName.contains("simple_scorer") || scorerName.contains(REMOTE_SCORER_SLUG), + "Score span name should be simple_scorer or contain %s -- got: %s" + .formatted(REMOTE_SCORER_SLUG, scorerName)); // Verify braintrust.output_json contains scores String outputJson = @@ -604,6 +611,10 @@ void testListEndpoint() throws Exception { JsonNode eval = root.get(REMOTE_EVAL_NAME); + assertFalse( + eval.has("parameters"), + "parameters field must be omitted (not null) when the eval has no parameters"); + // Check scores assertTrue(eval.has("scores")); JsonNode scores = eval.get("scores"); @@ -1060,4 +1071,83 @@ private List> readSSEEvents(HttpURLConnection conn) throws E reader.close(); return events; } + + /** Resolve a function's ID by project name + slug; throws if no function matches. */ + private static String lookupFunctionId(String projectName, String slug) { + var functionsApi = + new dev.braintrust.openapi.api.FunctionsApi( + testHarness.braintrust().openApiClient()); + var response = + functionsApi.getFunction( + 1, null, null, null, null, projectName, null, slug, null, null, null); + var objects = response.getObjects(); + if (objects == null || objects.isEmpty()) { + throw new IllegalStateException("function not found for slug: " + slug); + } + return objects.get(0).getId().toString(); + } + + // Code scorer source matching ScorerBrainstoreImplTest so the recorded cassettes are shared. + // First element is the oldest version (always returns 0.0); the last (latest) returns 1.0 for + // an exact match and 0.0 otherwise. 
+ private static final List<String> REMOTE_SCORER_CODE = + List.of( + // language=typescript + """ +import type { Trace } from 'braintrust'; +// an older buggy version that always returns 0.0 +async function handler({ + input, + output, + expected, + metadata, + trace, +}: { + input: any; + output: any; + expected: any; + metadata: Record; + trace: Trace; +}): Promise< + | number + | { score: number; name?: string; metadata?: Record } + | null +> { + if (expected === null) return null; + + return { + name: "typescript exact match", + score: 0.0 + }; +} +""", + // language=typescript + """ +import type { Trace } from 'braintrust'; +// returns 1.0 for exact match, 0.0 otherwise +async function handler({ + input, + output, + expected, + metadata, + trace, +}: { + input: any; + output: any; + expected: any; + metadata: Record; + trace: Trace; +}): Promise< + | number + | { score: number; name?: string; metadata?: Record } + | null +> { + if (expected === null) return null; + + return { + name: "typescript exact match", + score: output === expected ? 1.0 : 0.0 + }; +} +"""); }