Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import dev.braintrust.api.BraintrustOpenApiClient;
import dev.braintrust.config.BraintrustConfig;
import dev.braintrust.eval.*;
import dev.braintrust.openapi.api.ExperimentsApi;
import dev.braintrust.openapi.model.CreateExperiment;
import dev.braintrust.trace.BraintrustContext;
import dev.braintrust.trace.BraintrustTracing;
import io.opentelemetry.api.common.AttributeKey;
Expand Down Expand Up @@ -391,7 +393,8 @@ private <I, O> void handleStreamingEval(

// Execute task and scorers for each case
final Map<String, List<Double>> scoresByName = new ConcurrentHashMap<>();
final var parentInfo = extractParentInfo(request);
final var parentInfo =
extractParentInfo(request, project.getId(), experimentName, apiClient);
final var braintrustParent = parentInfo.braintrustParent();
final var braintrustGeneration = parentInfo.generation();

Expand Down Expand Up @@ -1107,21 +1110,39 @@ private record ParentInfo(
/**
* Extracts parent information from the eval request.
*
* <p>There are two ways a remote eval is triggered from the Braintrust UI:
*
* <ul>
* <li><b>Playground</b>: the request carries a {@code parent} object ({@code object_type} +
* {@code object_id}); the eval spans are parented to {@code playground_id:<id>}.
* <li><b>Experiment</b>: the request carries {@code parent=null} plus {@code experiment_name}
* + {@code project_id}. In this case the dev server creates the experiment and parents
* the eval spans to {@code experiment_id:<id>} -- mirroring {@code Eval.java} (the CLI
* runner) and the TS dev server (sdk/js/src/framework.ts creates an experiment whenever
* no parent is present).
* </ul>
*
* @param request The eval request
* @param projectId The resolved project ID (from the authenticated request) the experiment is
* created under
* @param experimentName The experiment name to use when creating an experiment
* @param apiClient The Braintrust API client used to create the experiment
* @return ParentInfo containing braintrustParent and generation
*/
private static ParentInfo extractParentInfo(EvalRequest request) {
String parentSpec = null;
String generation = null;

// Extract parent spec and generation from request
if (request.getParent() != null && request.getParent() instanceof Map) {
private static ParentInfo extractParentInfo(
EvalRequest request,
UUID projectId,
String experimentName,
BraintrustOpenApiClient apiClient) {
// Playground path: the request carries an explicit parent object.
if (request.getParent() instanceof Map) {
@SuppressWarnings("unchecked")
Map<String, Object> parentMap = (Map<String, Object>) request.getParent();
String objectType = (String) parentMap.get("object_type");
String objectId = (String) parentMap.get("object_id");

// Extract generation from propagated_event.span_attributes.generation
String generation = null;
Object propEventObj = parentMap.get("propagated_event");
if (propEventObj instanceof Map) {
@SuppressWarnings("unchecked")
Expand All @@ -1135,14 +1156,36 @@ private static ParentInfo extractParentInfo(EvalRequest request) {
}

if (objectType != null && objectId != null) {
parentSpec = "playground_id:" + objectId;
return new ParentInfo(
BraintrustUtils.parseParent("playground_id:" + objectId), generation);
}
}

if (parentSpec == null) {
throw new IllegalArgumentException("braintrust parent (playground_id) not found");
// Experiment path: no parent object, so create an experiment and parent to it.
if (experimentName != null) {
// ensure_new=true makes each UI-triggered run create a distinct experiment: without
// it, POST /v1/experiment returns the existing experiment for a name that already
// exists, so repeated runs would append to the first experiment instead of creating
// new ones. (The TS SDK gets the same effect from the register-experiment endpoint's
// default behavior.)
var experiment =
new ExperimentsApi(apiClient)
.postExperiment(
new CreateExperiment()
.projectId(projectId)
.name(experimentName)
.ensureNew(true));
Comment on lines +1174 to +1177

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Link dataset runs to the created experiment

When the Experiment trigger is run against a Braintrust dataset (data.dataset_id or project_name/dataset_name), this creates the experiment with only project/name/ensure_new before extractDataset() opens the dataset cursor. Unlike Eval.run(), which copies the dataset id and cursor version into CreateExperiment, the new remote-eval experiment is not linked to the dataset/version, so the Experiment page loses the dataset association even though the rows were fetched from it.

Useful? React with 👍 / 👎.

log.debug(
"Created experiment '{}' ({}) for experiment-triggered remote eval",
experimentName,
experiment.getId());
return new ParentInfo(
BraintrustUtils.parseParent("experiment_id:" + experiment.getId()), null);
}
return new ParentInfo(BraintrustUtils.parseParent(parentSpec), generation);

throw new IllegalArgumentException(
"braintrust parent not found: request has neither a playground parent nor an"
+ " experiment_name");
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ class DevserverTest {
private static final String SCORER_ERROR_EVAL_NAME = "scorer-error-eval";
private static final BraintrustUtils.Parent PLAYGROUND_PARENT =
new BraintrustUtils.Parent("playground_id", "ceea7422-3507-4d1c-a5f7-7acf41d9fac2");
// Experiment name used by the experiment-triggered run (parent=null) test.
private static final String EXPERIMENT_EVAL_NAME = "java-experiment-repro";

// Remote scorer created in the java-unit-test project via the test harness. Its latest version
// returns 1.0 for an exact match and 0.0 otherwise. The score is keyed in results by the
Expand Down Expand Up @@ -1046,6 +1048,112 @@ void testParameterDefaultsAndOverrides() throws Exception {
assertEquals(1, events.stream().filter(e -> "done".equals(e.get("event"))).count());
}

/**
 * Regression test for Pylon #17986: a remote eval triggered as an <b>Experiment</b> from the
 * Braintrust UI. Unlike a Playground run, the platform sends {@code parent=null} together with
 * {@code experiment_name} + {@code project_id}, and expects the dev server to create the
 * experiment itself and parent the eval spans to {@code experiment_id:<id>} (the same way
 * {@code Eval.java} does for CLI-driven experiments).
 *
 * <p>The TS SDK dev server does this by forwarding {@code experimentName}/{@code projectId}
 * into {@code Eval()}, which creates the experiment whenever no playground parent is present
 * (sdk/js/src/framework.ts: {@code options.parent ? null : initExperiment(...)}).
 *
 * <p>Before {@code Devserver.extractParentInfo} had an experiment-parent branch, this run
 * failed: the server threw {@code IllegalArgumentException: braintrust parent (playground_id)
 * not found} and emitted an SSE {@code error} event instead of {@code summary}/{@code done}.
 * This test guards against that behavior returning.
 *
 * @throws Exception if building the request, talking to the dev server, or parsing the SSE
 *     payload fails
 */
@Test
void testExperimentEval() throws Exception {
    EvalRequest evalRequest = new EvalRequest();
    evalRequest.setName(REMOTE_EVAL_NAME);
    evalRequest.setStream(true);

    // Experiment runs carry experiment_name + project_id but NO parent.
    evalRequest.setExperimentName(EXPERIMENT_EVAL_NAME);
    evalRequest.setProjectId(TestHarness.defaultProjectId());
    assertNull(evalRequest.getParent(), "experiment runs send parent=null");

    // The dev server must create the experiment with ensure_new=true so repeated UI runs each
    // get a distinct experiment instead of appending to the first (POST /v1/experiment reuses
    // an experiment of the same name by default). This is enforced by the cassette stub for
    // POST /v1/experiment, whose body matcher requires {"...","ensure_new":true} with
    // ignoreExtraElements=false: if extractParentInfo stops sending ensure_new (or changes the
    // request shape), the stub won't match, postExperiment fails, and this test fails on the
    // assertions below. Keep that matcher in sync with the request the SDK sends.

    EvalRequest.DataSpec dataSpec = new EvalRequest.DataSpec();
    EvalRequest.EvalCaseData case1 = new EvalRequest.EvalCaseData();
    case1.setInput("apple");
    case1.setExpected("fruit");
    dataSpec.setData(List.of(case1));
    evalRequest.setData(dataSpec);

    String requestBody = JSON_MAPPER.writeValueAsString(evalRequest);

    HttpURLConnection conn =
            (HttpURLConnection) new URI(TEST_URL + "/eval").toURL().openConnection();
    conn.setRequestMethod("POST");
    conn.setRequestProperty("Content-Type", "application/json");
    conn.setRequestProperty("x-bt-auth-token", testHarness.braintrustApiKey());
    conn.setRequestProperty("x-bt-project-id", TestHarness.defaultProjectId());
    conn.setRequestProperty("x-bt-org-name", TestHarness.defaultOrgName());
    conn.setDoOutput(true);

    // Close the request body stream deterministically; closing also flushes it, so the full
    // payload is handed to the connection before the response is read.
    try (var requestStream = conn.getOutputStream()) {
        requestStream.write(requestBody.getBytes(StandardCharsets.UTF_8));
    }

    assertEquals(200, conn.getResponseCode());
    assertEquals("text/event-stream", conn.getHeaderField("Content-Type"));

    List<Map<String, String>> events = readSSEEvents(conn);

    // The run must complete cleanly -- no error event, exactly one summary + done.
    List<Map<String, String>> errorEvents =
            events.stream().filter(e -> "error".equals(e.get("event"))).toList();
    assertTrue(
            errorEvents.isEmpty(),
            "experiment run should not emit an error event, got: " + errorEvents);

    // Filter the summary events once; the same list backs the count check and the payload
    // check below.
    List<Map<String, String>> summaryEvents =
            events.stream().filter(e -> "summary".equals(e.get("event"))).toList();
    assertEquals(1, summaryEvents.size(), "Should have 1 summary event");
    assertEquals(
            1,
            events.stream().filter(e -> "done".equals(e.get("event"))).count(),
            "Should have 1 done event");

    // Summary should report the experiment name from the request.
    JsonNode summaryData = JSON_MAPPER.readTree(summaryEvents.get(0).get("data"));
    assertEquals(EXPERIMENT_EVAL_NAME, summaryData.get("experimentName").asText());

    // Eval spans must be parented to an experiment (experiment_id:<id>), not a playground.
    final AttributeKey<String> parentKey = AttributeKey.stringKey("braintrust.parent");
    List<SpanData> allSpans = testHarness.awaitExportedSpans();
    List<SpanData> experimentEvalSpans =
            allSpans.stream()
                    .filter(s -> s.getName().equals("eval"))
                    .filter(
                            s -> {
                                String parent = s.getAttributes().get(parentKey);
                                return parent != null && parent.startsWith("experiment_id:");
                            })
                    .toList();
    assertEquals(
            1,
            experimentEvalSpans.size(),
            "Should have 1 eval span parented to experiment_id:<id>");
}

/** Helper to read SSE events from an HttpURLConnection response. */
private List<Map<String, String>> readSSEEvents(HttpURLConnection conn) throws Exception {
BufferedReader reader =
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"11111111-2222-3333-4444-555555555555","project_id":"f1e858a4-58e3-408f-983f-016760d7fa25","name":"java-experiment-repro","description":null,"created":"2026-06-18T00:00:00.000Z","repo_info":{},"commit":null,"base_exp_id":null,"deleted_at":null,"dataset_id":null,"dataset_version":null,"internal_metadata":null,"parameters_id":null,"parameters_version":null,"public":false,"user_id":"a5ca7f9c-bf20-40c4-a82b-5c992f6a38f5","metadata":null,"tags":null}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"id" : "b1a2c3d4-0000-4000-8000-000000000001",
"name" : "v1_experiment",
"request" : {
"url" : "/v1/experiment",
"method" : "POST",
"headers" : {
"Content-Type" : {
"equalTo" : "application/json"
}
},
"bodyPatterns" : [ {
"equalToJson" : "{\"project_id\":\"f1e858a4-58e3-408f-983f-016760d7fa25\",\"name\":\"java-experiment-repro\",\"ensure_new\":true}",
"ignoreArrayOrder" : true,
"ignoreExtraElements" : false
} ]
},
"response" : {
"status" : 200,
"bodyFileName" : "v1_experiment-java-experiment-repro.json",
"headers" : {
"Content-Type" : "application/json; charset=utf-8"
}
},
"uuid" : "b1a2c3d4-0000-4000-8000-000000000001",
"persistent" : true,
"insertionIndex" : 200
}
Loading