diff --git a/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java b/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java index f83024af..50bfde45 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java @@ -13,6 +13,8 @@ import dev.braintrust.api.BraintrustOpenApiClient; import dev.braintrust.config.BraintrustConfig; import dev.braintrust.eval.*; +import dev.braintrust.openapi.api.ExperimentsApi; +import dev.braintrust.openapi.model.CreateExperiment; import dev.braintrust.trace.BraintrustContext; import dev.braintrust.trace.BraintrustTracing; import io.opentelemetry.api.common.AttributeKey; @@ -391,7 +393,8 @@ private void handleStreamingEval( // Execute task and scorers for each case final Map> scoresByName = new ConcurrentHashMap<>(); - final var parentInfo = extractParentInfo(request); + final var parentInfo = + extractParentInfo(request, project.getId(), experimentName, apiClient); final var braintrustParent = parentInfo.braintrustParent(); final var braintrustGeneration = parentInfo.generation(); @@ -1107,21 +1110,39 @@ private record ParentInfo( /** * Extracts parent information from the eval request. * + *

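<p>There are two ways a remote eval is triggered from the Braintrust UI: + * + * <ul> + * <li>Playground: the request carries a {@code parent} object with an {@code object_type} and an {@code object_id}. Eval spans are parented to {@code playground_id:} followed by that {@code object_id}, and the generation is read from {@code propagated_event.span_attributes.generation} when present. + * <li>Experiment: the request has no usable {@code parent} but an experiment name is available. An experiment is created under {@code projectId} with {@code ensure_new=true}, and eval spans are parented to {@code experiment_id:} followed by the new experiment's ID. + * </ul>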

+ * * @param request The eval request + * @param projectId The resolved project ID (from the authenticated request) the experiment is + * created under + * @param experimentName The experiment name to use when creating an experiment + * @param apiClient The Braintrust API client used to create the experiment * @return ParentInfo containing braintrustParent and generation */ - private static ParentInfo extractParentInfo(EvalRequest request) { - String parentSpec = null; - String generation = null; - - // Extract parent spec and generation from request - if (request.getParent() != null && request.getParent() instanceof Map) { + private static ParentInfo extractParentInfo( + EvalRequest request, + UUID projectId, + String experimentName, + BraintrustOpenApiClient apiClient) { + // Playground path: the request carries an explicit parent object. + if (request.getParent() instanceof Map) { @SuppressWarnings("unchecked") Map parentMap = (Map) request.getParent(); String objectType = (String) parentMap.get("object_type"); String objectId = (String) parentMap.get("object_id"); // Extract generation from propagated_event.span_attributes.generation + String generation = null; Object propEventObj = parentMap.get("propagated_event"); if (propEventObj instanceof Map) { @SuppressWarnings("unchecked") @@ -1135,14 +1156,36 @@ private static ParentInfo extractParentInfo(EvalRequest request) { } if (objectType != null && objectId != null) { - parentSpec = "playground_id:" + objectId; + return new ParentInfo( + BraintrustUtils.parseParent("playground_id:" + objectId), generation); } } - if (parentSpec == null) { - throw new IllegalArgumentException("braintrust parent (playground_id) not found"); + // Experiment path: no parent object, so create an experiment and parent to it. 
+ if (experimentName != null) { + // ensure_new=true makes each UI-triggered run create a distinct experiment: without + // it, POST /v1/experiment returns the existing experiment for a name that already + // exists, so repeated runs would append to the first experiment instead of creating + // new ones. (The TS SDK gets the same effect from the register-experiment endpoint's + // default behavior.) + var experiment = + new ExperimentsApi(apiClient) + .postExperiment( + new CreateExperiment() + .projectId(projectId) + .name(experimentName) + .ensureNew(true)); + log.debug( + "Created experiment '{}' ({}) for experiment-triggered remote eval", + experimentName, + experiment.getId()); + return new ParentInfo( + BraintrustUtils.parseParent("experiment_id:" + experiment.getId()), null); } - return new ParentInfo(BraintrustUtils.parseParent(parentSpec), generation); + + throw new IllegalArgumentException( + "braintrust parent not found: request has neither a playground parent nor an" + + " experiment_name"); } /** diff --git a/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java b/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java index e1755a03..932e4492 100644 --- a/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java +++ b/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java @@ -43,6 +43,8 @@ class DevserverTest { private static final String SCORER_ERROR_EVAL_NAME = "scorer-error-eval"; private static final BraintrustUtils.Parent PLAYGROUND_PARENT = new BraintrustUtils.Parent("playground_id", "ceea7422-3507-4d1c-a5f7-7acf41d9fac2"); + // Experiment name used by the experiment-triggered run (parent=null) test. + private static final String EXPERIMENT_EVAL_NAME = "java-experiment-repro"; // Remote scorer created in the java-unit-test project via the test harness. Its latest version // returns 1.0 for an exact match and 0.0 otherwise. 
The score is keyed in results by the @@ -1046,6 +1048,112 @@ void testParameterDefaultsAndOverrides() throws Exception { assertEquals(1, events.stream().filter(e -> "done".equals(e.get("event"))).count()); } + /** + * Reproduces Pylon #17986: a remote eval triggered as an Experiment from the Braintrust + * UI. Unlike a Playground run, the platform sends {@code parent=null} together with {@code + * experiment_name} + {@code project_id}, and expects the dev server to create the experiment + * itself and parent the eval spans to {@code experiment_id:} (the same way {@code + * Eval.java} does for CLI-driven experiments). + * + *

<p>The TS SDK dev server does this by forwarding {@code experimentName}/{@code projectId} + * into {@code Eval()}, which creates the experiment whenever no playground parent is present + * (sdk/js/src/framework.ts: {@code options.parent ? null : initExperiment(...)}). + * + *

<p>Before the experiment-parent branch was implemented in {@code Devserver.extractParentInfo}, + * this run failed: the server threw {@code IllegalArgumentException: braintrust parent + * (playground_id) not found} and emitted an SSE {@code error} event instead of {@code + * summary}/{@code done}. + */ + @Test + void testExperimentEval() throws Exception { + EvalRequest evalRequest = new EvalRequest(); + evalRequest.setName(REMOTE_EVAL_NAME); + evalRequest.setStream(true); + + // Experiment runs carry experiment_name + project_id but NO parent. + evalRequest.setExperimentName(EXPERIMENT_EVAL_NAME); + evalRequest.setProjectId(TestHarness.defaultProjectId()); + assertNull(evalRequest.getParent(), "experiment runs send parent=null"); + + // The dev server must create the experiment with ensure_new=true so repeated UI runs each + // get a distinct experiment instead of appending to the first (POST /v1/experiment reuses + // an experiment of the same name by default). This is enforced by the cassette stub for + // POST /v1/experiment, whose body matcher requires {"...","ensure_new":true} with + // ignoreExtraElements=false: if extractParentInfo stops sending ensure_new (or changes the + // request shape), the stub won't match, postExperiment fails, and this test fails on the + // assertions below. Keep that matcher in sync with the request the SDK sends. 
+ + EvalRequest.DataSpec dataSpec = new EvalRequest.DataSpec(); + EvalRequest.EvalCaseData case1 = new EvalRequest.EvalCaseData(); + case1.setInput("apple"); + case1.setExpected("fruit"); + dataSpec.setData(List.of(case1)); + evalRequest.setData(dataSpec); + + String requestBody = JSON_MAPPER.writeValueAsString(evalRequest); + + HttpURLConnection conn = + (HttpURLConnection) new URI(TEST_URL + "/eval").toURL().openConnection(); + conn.setRequestMethod("POST"); + conn.setRequestProperty("Content-Type", "application/json"); + conn.setRequestProperty("x-bt-auth-token", testHarness.braintrustApiKey()); + conn.setRequestProperty("x-bt-project-id", TestHarness.defaultProjectId()); + conn.setRequestProperty("x-bt-org-name", TestHarness.defaultOrgName()); + conn.setDoOutput(true); + + conn.getOutputStream().write(requestBody.getBytes(StandardCharsets.UTF_8)); + conn.getOutputStream().flush(); + + assertEquals(200, conn.getResponseCode()); + assertEquals("text/event-stream", conn.getHeaderField("Content-Type")); + + List> events = readSSEEvents(conn); + + // The run must complete cleanly -- no error event, exactly one summary + done. + List> errorEvents = + events.stream().filter(e -> "error".equals(e.get("event"))).toList(); + assertTrue( + errorEvents.isEmpty(), + "experiment run should not emit an error event, got: " + errorEvents); + assertEquals( + 1, + events.stream().filter(e -> "summary".equals(e.get("event"))).count(), + "Should have 1 summary event"); + assertEquals( + 1, + events.stream().filter(e -> "done".equals(e.get("event"))).count(), + "Should have 1 done event"); + + // Summary should report the experiment name from the request. 
+ Map summaryEvent = + events.stream() + .filter(e -> "summary".equals(e.get("event"))) + .findFirst() + .orElseThrow(); + JsonNode summaryData = JSON_MAPPER.readTree(summaryEvent.get("data")); + assertEquals(EXPERIMENT_EVAL_NAME, summaryData.get("experimentName").asText()); + + // Eval spans must be parented to an experiment (experiment_id:), not a playground. + List allSpans = testHarness.awaitExportedSpans(); + List experimentEvalSpans = + allSpans.stream() + .filter(s -> s.getName().equals("eval")) + .filter( + s -> { + String parent = + s.getAttributes() + .get( + AttributeKey.stringKey( + "braintrust.parent")); + return parent != null && parent.startsWith("experiment_id:"); + }) + .toList(); + assertEquals( + 1, + experimentEvalSpans.size(), + "Should have 1 eval span parented to experiment_id:"); + } + /** Helper to read SSE events from an HttpURLConnection response. */ private List> readSSEEvents(HttpURLConnection conn) throws Exception { BufferedReader reader = diff --git a/test-harness/src/testFixtures/resources/cassettes/braintrust/__files/v1_experiment-java-experiment-repro.json b/test-harness/src/testFixtures/resources/cassettes/braintrust/__files/v1_experiment-java-experiment-repro.json new file mode 100644 index 00000000..b24dfbd0 --- /dev/null +++ b/test-harness/src/testFixtures/resources/cassettes/braintrust/__files/v1_experiment-java-experiment-repro.json @@ -0,0 +1 @@ +{"id":"11111111-2222-3333-4444-555555555555","project_id":"f1e858a4-58e3-408f-983f-016760d7fa25","name":"java-experiment-repro","description":null,"created":"2026-06-18T00:00:00.000Z","repo_info":{},"commit":null,"base_exp_id":null,"deleted_at":null,"dataset_id":null,"dataset_version":null,"internal_metadata":null,"parameters_id":null,"parameters_version":null,"public":false,"user_id":"a5ca7f9c-bf20-40c4-a82b-5c992f6a38f5","metadata":null,"tags":null} diff --git a/test-harness/src/testFixtures/resources/cassettes/braintrust/mappings/v1_experiment-java-experiment-repro.json 
b/test-harness/src/testFixtures/resources/cassettes/braintrust/mappings/v1_experiment-java-experiment-repro.json new file mode 100644 index 00000000..a772b207 --- /dev/null +++ b/test-harness/src/testFixtures/resources/cassettes/braintrust/mappings/v1_experiment-java-experiment-repro.json @@ -0,0 +1,28 @@ +{ + "id" : "b1a2c3d4-0000-4000-8000-000000000001", + "name" : "v1_experiment", + "request" : { + "url" : "/v1/experiment", + "method" : "POST", + "headers" : { + "Content-Type" : { + "equalTo" : "application/json" + } + }, + "bodyPatterns" : [ { + "equalToJson" : "{\"project_id\":\"f1e858a4-58e3-408f-983f-016760d7fa25\",\"name\":\"java-experiment-repro\",\"ensure_new\":true}", + "ignoreArrayOrder" : true, + "ignoreExtraElements" : false + } ] + }, + "response" : { + "status" : 200, + "bodyFileName" : "v1_experiment-java-experiment-repro.json", + "headers" : { + "Content-Type" : "application/json; charset=utf-8" + } + }, + "uuid" : "b1a2c3d4-0000-4000-8000-000000000001", + "persistent" : true, + "insertionIndex" : 200 +}