Skip to content
Merged
150 changes: 87 additions & 63 deletions packages/junior-evals/evals/behavior-harness.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ import { fileURLToPath } from "node:url";
import type { Message } from "chat";
import { executeWithReplay } from "vitest-evals/replay";
import type { JsonValue } from "vitest-evals/harness";
import {
createPluginAppFixture,
type PluginAppFixture,
} from "@junior-tests/fixtures/plugin-app";
import { createSlackRuntime } from "@/chat/app/factory";
import type { AssistantLifecycleEvent } from "@/chat/runtime/slack-runtime";
import type { JuniorRuntimeServiceOverrides } from "@/chat/app/services";
Expand All @@ -19,8 +23,7 @@ import {
deleteMcpStoredOAuthCredentials,
getLatestMcpAuthSessionForUserProvider,
} from "@/chat/mcp/auth-store";
import { setPluginPackages } from "@/chat/plugins/package-discovery";
import { getPluginOAuthConfig } from "@/chat/plugins/registry";
import { getPluginOAuthConfig, setPluginConfig } from "@/chat/plugins/registry";
import { generateAssistantReply } from "@/chat/respond";
import { getStateAdapter } from "@/chat/state/adapter";
import { resetSkillDiscoveryCache } from "@/chat/skills";
Expand Down Expand Up @@ -399,7 +402,6 @@ const HARNESS_ENV_KEYS = [
"EVAL_ENABLE_TEST_CREDENTIALS",
"EVAL_TEST_CREDENTIAL_TOKEN",
"JUNIOR_BASE_URL",
"JUNIOR_EXTRA_PLUGIN_ROOTS",
"JUNIOR_EVAL_ENABLE_FAULTS",
"JUNIOR_EVAL_FAULT_SANDBOX_BASH_STREAM_INTERRUPTS",
"JUNIOR_STATE_ADAPTER",
Expand Down Expand Up @@ -909,85 +911,106 @@ interface HarnessEnvironment {
configuredPluginDirs: string[];
configuredSkillDirs: string[];
envSnapshot: EnvSnapshot;
pluginApp?: PluginAppFixture;
stateAdapter: HarnessStateAdapter;
}

/**
 * Prepares process-wide state for a single eval scenario and returns the
 * handles needed to undo it later.
 *
 * Visible steps: snapshot the HARNESS_ENV_KEYS environment variables; resolve
 * the scenario's skill and plugin directories; collect the OAuth / MCP OAuth
 * providers to auto-complete; derive the set of requester user ids from the
 * scenario events (defaulting to "U-test"); set the eval-related environment
 * variables; connect the state adapter; reset the skill discovery cache; and
 * clear thread, MCP auth and OAuth token state.
 *
 * In the try/catch form, a failure during setup resets the skill discovery
 * cache, clears the plugin config, restores the environment snapshot, cleans
 * up the plugin app fixture if one was created, and rethrows the error.
 *
 * NOTE(review): this span appears to be a rendered diff without +/- markers,
 * so most statements occur twice — once from the flat (apparently removed)
 * body that uses JUNIOR_EXTRA_PLUGIN_ROOTS / setPluginPackages, and once from
 * the try/catch (apparently added) body that uses createPluginAppFixture /
 * setPluginConfig. Confirm against the real file before editing the code.
 */
async function setupHarnessEnvironment(
scenario: EvalScenario,
): Promise<HarnessEnvironment> {
const envSnapshot = snapshotEnv(HARNESS_ENV_KEYS);
let pluginApp: PluginAppFixture | undefined;

const configuredSkillDirs =
scenario.overrides?.skill_dirs?.map(resolveEvalRelativePath) ?? [];
const configuredPluginDirs =
scenario.overrides?.plugin_dirs?.map(resolveEvalRelativePath) ?? [];
const autoCompleteMcpOauthProviders = new Set(
scenario.overrides?.auto_complete_mcp_oauth?.map((p) => p.trim()) ?? [],
);
const autoCompleteOauthProviders = new Set(
scenario.overrides?.auto_complete_oauth?.map((p) => p.trim()) ?? [],
);
const authRequesterUsers = new Set(
scenario.events.flatMap((event) =>
"message" in event
? [event.message.author?.user_id?.trim() || "U-test"]
: event.user_id
? [event.user_id]
: [],
),
);
if (authRequesterUsers.size === 0) {
authRequesterUsers.add("U-test");
}
try {
const configuredSkillDirs =
scenario.overrides?.skill_dirs?.map(resolveEvalRelativePath) ?? [];
const configuredPluginDirs =
scenario.overrides?.plugin_dirs?.map(resolveEvalRelativePath) ?? [];
const autoCompleteMcpOauthProviders = new Set(
scenario.overrides?.auto_complete_mcp_oauth?.map((p) => p.trim()) ?? [],
);
const autoCompleteOauthProviders = new Set(
scenario.overrides?.auto_complete_oauth?.map((p) => p.trim()) ?? [],
);
const authRequesterUsers = new Set(
scenario.events.flatMap((event) =>
"message" in event
? [event.message.author?.user_id?.trim() || "U-test"]
: event.user_id
? [event.user_id]
: [],
),
);
if (authRequesterUsers.size === 0) {
authRequesterUsers.add("U-test");
}

if (scenario.overrides?.enable_test_credentials) {
process.env.EVAL_ENABLE_TEST_CREDENTIALS = "1";
if (scenario.overrides.test_credential_token) {
process.env.EVAL_TEST_CREDENTIAL_TOKEN =
scenario.overrides.test_credential_token;
if (scenario.overrides?.enable_test_credentials) {
process.env.EVAL_ENABLE_TEST_CREDENTIALS = "1";
if (scenario.overrides.test_credential_token) {
process.env.EVAL_TEST_CREDENTIAL_TOKEN =
scenario.overrides.test_credential_token;
}
}
}
const sandboxBashStreamInterrupts =
scenario.overrides?.faults?.sandbox_bash_stream_interrupts;
if (
typeof sandboxBashStreamInterrupts === "number" &&
Number.isFinite(sandboxBashStreamInterrupts) &&
sandboxBashStreamInterrupts > 0
) {
process.env.JUNIOR_EVAL_ENABLE_FAULTS = "1";
process.env.JUNIOR_EVAL_FAULT_SANDBOX_BASH_STREAM_INTERRUPTS = String(
Math.floor(sandboxBashStreamInterrupts),
const sandboxBashStreamInterrupts =
scenario.overrides?.faults?.sandbox_bash_stream_interrupts;
if (
typeof sandboxBashStreamInterrupts === "number" &&
Number.isFinite(sandboxBashStreamInterrupts) &&
sandboxBashStreamInterrupts > 0
) {
process.env.JUNIOR_EVAL_ENABLE_FAULTS = "1";
process.env.JUNIOR_EVAL_FAULT_SANDBOX_BASH_STREAM_INTERRUPTS = String(
Math.floor(sandboxBashStreamInterrupts),
);
}
process.env.JUNIOR_BASE_URL = "https://junior.example.com";
process.env.JUNIOR_STATE_ADAPTER = "memory";
pluginApp =
configuredPluginDirs.length > 0
? await createPluginAppFixture(configuredPluginDirs, {
linkNodeModules: Boolean(
scenario.overrides?.plugin_packages?.length,
),
})
: undefined;
setPluginConfig({ packages: scenario.overrides?.plugin_packages ?? [] });

const stateAdapter = getStateAdapter();
await stateAdapter.connect();
resetSkillDiscoveryCache();
await cleanupHarnessThreadState(stateAdapter, scenario.events);
await cleanupMcpAuthState(
authRequesterUsers,
autoCompleteMcpOauthProviders,
);
}
process.env.JUNIOR_BASE_URL = "https://junior.example.com";
process.env.JUNIOR_STATE_ADAPTER = "memory";
process.env.JUNIOR_EXTRA_PLUGIN_ROOTS = JSON.stringify(configuredPluginDirs);
setPluginPackages(scenario.overrides?.plugin_packages ?? []);
await cleanupOAuthTokens(authRequesterUsers, autoCompleteOauthProviders);

const stateAdapter = getStateAdapter();
await stateAdapter.connect();
resetSkillDiscoveryCache();
await cleanupHarnessThreadState(stateAdapter, scenario.events);
await cleanupMcpAuthState(authRequesterUsers, autoCompleteMcpOauthProviders);
await cleanupOAuthTokens(authRequesterUsers, autoCompleteOauthProviders);

return {
authRequesterUsers,
autoCompleteMcpOauthProviders,
autoCompleteOauthProviders,
configuredPluginDirs,
configuredSkillDirs,
envSnapshot,
stateAdapter,
};
return {
authRequesterUsers,
autoCompleteMcpOauthProviders,
autoCompleteOauthProviders,
configuredPluginDirs,
configuredSkillDirs,
envSnapshot,
...(pluginApp ? { pluginApp } : {}),
stateAdapter,
};
} catch (error) {
resetSkillDiscoveryCache();
setPluginConfig(undefined);
envSnapshot.restore();
await pluginApp?.cleanup();
throw error;
}
}

async function teardownHarnessEnvironment(
scenario: EvalScenario,
env: HarnessEnvironment,
): Promise<void> {
resetSkillDiscoveryCache();
setPluginPackages(undefined);
setPluginConfig(undefined);
await cleanupHarnessThreadState(env.stateAdapter, scenario.events);
await cleanupMcpAuthState(
env.authRequesterUsers,
Expand All @@ -998,6 +1021,7 @@ async function teardownHarnessEnvironment(
env.autoCompleteOauthProviders,
);
env.envSnapshot.restore();
await env.pluginApp?.cleanup();
}

// ---------------------------------------------------------------------------
Expand Down
16 changes: 16 additions & 0 deletions packages/junior-evals/tests/unit/harness/behavior-harness.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,22 @@ describe("behavior harness", () => {
]);
});

it("restores cwd when setup fails after creating a plugin fixture", async () => {
  // Remember where the process started so the post-failure check has a baseline.
  const originalCwd = process.cwd();

  // Plugin dirs are configured and the package name is not a valid npm name,
  // so the scenario run is expected to reject.
  const failingRun = runEvalScenario({
    events: [],
    overrides: {
      plugin_dirs: ["evals/fixtures/plugins"],
      plugin_packages: ["../bad-package"],
    },
  });

  await expect(failingRun).rejects.toThrow(
    "Plugin package names must be valid npm package names",
  );

  // The rejected run must leave the working directory where it was.
  expect(process.cwd()).toBe(originalCwd);
});

it("collects created canvas metadata from captured Slack API calls", () => {
const artifacts = collectSlackArtifactsFromCapturedCalls([
{
Expand Down
94 changes: 80 additions & 14 deletions packages/junior/src/app.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import { Hono } from "hono";
import { setConfigDefaults } from "@/chat/configuration/defaults";
import {
getConfigDefaults,
setConfigDefaults,
} from "@/chat/configuration/defaults";
import { logException } from "@/chat/logging";
import { setPluginPackages } from "@/chat/plugins/package-discovery";
import { setPluginConfig } from "@/chat/plugins/registry";
import {
getPluginCatalogSignature,
setPluginConfig,
} from "@/chat/plugins/registry";
import type { PluginConfig } from "@/chat/plugins/types";
import { GET as diagnosticsGET } from "@/handlers/diagnostics";
import { GET as dashboardGET } from "@/handlers/diagnostics-dashboard";
Expand Down Expand Up @@ -47,25 +52,86 @@ async function resolveBuildPluginConfig(): Promise<PluginConfig | undefined> {
try {
const mod: { plugins?: PluginConfig } = await import("#junior/config");
return mod.plugins;
} catch {
// Virtual module unavailable (not running in Nitro context).
// Fall back to env var for dev mode and tests.
const env = process.env.JUNIOR_PLUGIN_PACKAGES;
if (env) {
try {
return { packages: JSON.parse(env) };
} catch {}
} catch (error) {
if (!isMissingVirtualConfig(error)) {
throw error;
}
const packages = readEnvPluginPackages();
if (packages) {
return { packages };
}
return undefined;
}
}

/**
 * Reports whether `error` is a module-resolution failure that names the
 * `#junior/config` specifier.
 *
 * @param error - Any thrown value.
 * @returns `true` only when `error` is an `Error` whose `code` is one of
 *   `ERR_PACKAGE_IMPORT_NOT_DEFINED`, `ERR_MODULE_NOT_FOUND` or
 *   `MODULE_NOT_FOUND` and whose message contains `#junior/config`;
 *   `false` for everything else, including non-`Error` values.
 */
function isMissingVirtualConfig(error: unknown): boolean {
  if (error instanceof Error) {
    const { code } = error as { code?: string };
    switch (code) {
      case "ERR_PACKAGE_IMPORT_NOT_DEFINED":
      case "ERR_MODULE_NOT_FOUND":
      case "MODULE_NOT_FOUND":
        // A resolution code alone is not enough: the failure must be about
        // this specifier, not some other module that failed to resolve.
        return error.message.includes("#junior/config");
      default:
        return false;
    }
  }
  return false;
}

/**
 * Reads the plugin package list from the `JUNIOR_PLUGIN_PACKAGES` environment
 * variable.
 *
 * @returns The parsed array of package names exactly as written in the
 *   variable (entries are validated but not trimmed), or `undefined` when the
 *   variable is unset or empty.
 * @throws Error when the value is not valid JSON (the parse failure is
 *   attached as `cause`), or when it is not a JSON array whose entries are all
 *   non-blank strings.
 */
function readEnvPluginPackages(): string[] | undefined {
  const raw = process.env.JUNIOR_PLUGIN_PACKAGES;
  if (!raw) {
    return undefined;
  }

  // An entry is acceptable only if it is a string with non-whitespace content.
  const isPackageName = (entry: unknown): entry is string =>
    typeof entry === "string" && entry.trim() !== "";

  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch (parseFailure) {
    throw new Error("JUNIOR_PLUGIN_PACKAGES must be valid JSON", {
      cause: parseFailure,
    });
  }

  if (Array.isArray(decoded) && decoded.every(isPackageName)) {
    return decoded;
  }

  throw new Error(
    "JUNIOR_PLUGIN_PACKAGES must be a JSON array of package names",
  );
}

/**
 * Reports whether a plugin config declares anything to load.
 *
 * @param config - The resolved plugin config, if any.
 * @returns `true` when `config.packages` is non-empty or `config.manifests`
 *   has at least one key; `false` otherwise, including when `config` is
 *   `undefined`.
 */
function hasConfiguredPluginCatalog(config: PluginConfig | undefined): boolean {
  const packageCount = config?.packages?.length ?? 0;
  if (packageCount) {
    return true;
  }

  const manifestCount = Object.keys(config?.manifests ?? {}).length;
  return manifestCount > 0;
}

/** Create a Hono app with all Junior routes. */
export async function createApp(options?: JuniorAppOptions): Promise<Hono> {
const pluginConfig = options?.plugins ?? (await resolveBuildPluginConfig());
setPluginPackages(pluginConfig?.packages);
setPluginConfig(pluginConfig);
setConfigDefaults(options?.configDefaults);
const shouldValidatePluginCatalog =
hasConfiguredPluginCatalog(pluginConfig) ||
Boolean(Object.keys(options?.configDefaults ?? {}).length);
const previousPluginConfig = setPluginConfig(pluginConfig);
const previousConfigDefaults = getConfigDefaults();
try {
setConfigDefaults(options?.configDefaults);
if (shouldValidatePluginCatalog) {
getPluginCatalogSignature();
}
} catch (error) {
setPluginConfig(previousPluginConfig);
setConfigDefaults(previousConfigDefaults);
throw error;
}

const waitUntil = options?.waitUntil ?? (await defaultWaitUntil());

Expand Down
Loading
Loading