diff --git a/codex-rs/app-server-protocol/schema/json/ClientRequest.json b/codex-rs/app-server-protocol/schema/json/ClientRequest.json index a6fe99b35e98..7a2c8a57f1f7 100644 --- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json +++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json @@ -1173,6 +1173,26 @@ ], "title": "InputImageFunctionCallOutputContentItem", "type": "object" + }, + { + "properties": { + "input_audio": { + "$ref": "#/definitions/InputAudio" + }, + "type": { + "enum": [ + "input_audio" + ], + "title": "InputAudioFunctionCallOutputContentItemType", + "type": "string" + } + }, + "required": [ + "input_audio", + "type" + ], + "title": "InputAudioFunctionCallOutputContentItem", + "type": "object" } ] }, @@ -1289,6 +1309,21 @@ ], "type": "object" }, + "InputAudio": { + "properties": { + "data": { + "type": "string" + }, + "format": { + "type": "string" + } + }, + "required": [ + "data", + "format" + ], + "type": "object" + }, "ListMcpServerStatusParams": { "properties": { "cursor": { diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index 066e1f4870f3..ac33cfaee6eb 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -9145,6 +9145,26 @@ ], "title": "InputImageFunctionCallOutputContentItem", "type": "object" + }, + { + "properties": { + "input_audio": { + "$ref": "#/definitions/v2/InputAudio" + }, + "type": { + "enum": [ + "input_audio" + ], + "title": "InputAudioFunctionCallOutputContentItemType", + "type": "string" + } + }, + "required": [ + "input_audio", + "type" + ], + "title": "InputAudioFunctionCallOutputContentItem", + "type": "object" } ] }, @@ -9935,6 +9955,21 @@ ], "type": "string" }, + "InputAudio": { + "properties": { + "data": { + "type": "string" + }, + "format": { + "type": "string" + } + }, + "required": [ + "data", + "format" + ], + "type": "object" + }, "InputModality": { "description": "Canonical user-input modality tags advertised by a model.", "oneOf": [ @@ -9951,6 +9986,13 @@ "image" ], "type": "string" + }, + { + "description": "Audio content included in tool payloads.", + "enum": [ + "audio" + ], + "type": "string" } ] }, diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index 95ce7e4aefb8..fefb9e94aa38 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -5534,6 +5534,26 @@ ], "title": "InputImageFunctionCallOutputContentItem", "type": "object" + }, + { + "properties": { + "input_audio": { + "$ref": "#/definitions/InputAudio" + }, + "type": { + "enum": [ + "input_audio" + ], + "title": "InputAudioFunctionCallOutputContentItemType", + "type": "string" + } + }, + "required": [ + "input_audio", + "type" + ], + "title": "InputAudioFunctionCallOutputContentItem", + "type": "object" } ] }, @@ -6484,6 +6504,21 @@ "title": "InitializeParams", "type": "object" }, + "InputAudio": { + "properties": { + "data": { + "type": "string" + }, + "format": { + "type": "string" + } + }, + "required": [ + "data", + "format" + ], + "type": "object" + }, "InputModality": { "description": "Canonical user-input modality tags advertised by a model.", "oneOf": [ @@ -6500,6 +6535,13 @@ "image" ], "type": "string" + }, + { + "description": "Audio content included in tool payloads.", + "enum": [ + "audio" + ], + "type": "string" } ] }, diff --git a/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json b/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json index c0221805eb08..0a97f98b6a06 100644 --- a/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json +++ b/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json @@ -17,6 +17,13 @@ "image" ], "type": "string" + }, + { + "description": "Audio content included in tool payloads.", + "enum": [ + "audio" + ], + "type": "string" } ] }, diff --git a/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json b/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json index 6973d15baa6d..b02ddff73ed4 100644 --- a/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json +++ b/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json @@ -140,6 +140,26 @@ ], "title": "InputImageFunctionCallOutputContentItem", "type": "object" + }, + { + "properties": { + "input_audio": { + "$ref": "#/definitions/InputAudio" + }, + "type": { + "enum": [ + "input_audio" + ], + "title": "InputAudioFunctionCallOutputContentItemType", + "type": "string" + } + }, + "required": [ + "input_audio", + "type" + ], + "title": "InputAudioFunctionCallOutputContentItem", + "type": "object" } ] }, @@ -152,6 +172,21 @@ ], "type": "string" }, + "InputAudio": { + "properties": { + "data": { + "type": "string" + }, + "format": { + "type": "string" + } + }, + "required": [ + "data", + "format" + ], + "type": "object" + }, "LocalShellAction": { "oneOf": [ { diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json index 5f07fe0149db..2f6dbc0f8436 100644 --- a/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json +++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json @@ -203,6 +203,26 @@ ], "title": "InputImageFunctionCallOutputContentItem", "type": "object" + }, + { + "properties": { + "input_audio": { + "$ref": "#/definitions/InputAudio" + }, + "type": { + "enum": [ + "input_audio" + ], + "title": "InputAudioFunctionCallOutputContentItemType", + "type": "string" + } + }, + "required": [ + "input_audio", + "type" + ], + "title": "InputAudioFunctionCallOutputContentItem", + "type": "object" } ] }, @@ -215,6 +235,21 @@ ], "type": "string" }, + "InputAudio": { + "properties": { + "data": { + "type": "string" + }, + "format": { + "type": "string" + } + }, + "required": [ + "data", + "format" + ], + "type": "object" + }, "LocalShellAction": { "oneOf": [ { diff --git a/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts b/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts index fb2996f1e54a..330c99f9dd27 100644 --- a/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts +++ b/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts @@ -2,9 +2,10 @@ // This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. import type { ImageDetail } from "./ImageDetail"; +import type { InputAudio } from "./InputAudio"; /** * Responses API compatible content items that can be returned by a tool call. * This is a subset of ContentItem with the types we support as function call outputs. */ -export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, }; +export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, } | { "type": "input_audio", input_audio: InputAudio, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/InputAudio.ts b/codex-rs/app-server-protocol/schema/typescript/InputAudio.ts new file mode 100644 index 000000000000..637996baae93 --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/InputAudio.ts @@ -0,0 +1,5 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +export type InputAudio = { data: string, format: string, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/InputModality.ts b/codex-rs/app-server-protocol/schema/typescript/InputModality.ts index 73661938b38a..40d598df3db5 100644 --- a/codex-rs/app-server-protocol/schema/typescript/InputModality.ts +++ b/codex-rs/app-server-protocol/schema/typescript/InputModality.ts @@ -5,4 +5,4 @@ /** * Canonical user-input modality tags advertised by a model. */ -export type InputModality = "text" | "image"; +export type InputModality = "text" | "image" | "audio"; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index 97ea43560192..c9d7a73f44b6 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -36,6 +36,7 @@ export type { ImageDetail } from "./ImageDetail"; export type { InitializeCapabilities } from "./InitializeCapabilities"; export type { InitializeParams } from "./InitializeParams"; export type { InitializeResponse } from "./InitializeResponse"; +export type { InputAudio } from "./InputAudio"; export type { InputModality } from "./InputModality"; export type { InternalSessionSource } from "./InternalSessionSource"; export type { LocalShellAction } from "./LocalShellAction"; diff --git a/codex-rs/code-mode/src/description.rs b/codex-rs/code-mode/src/description.rs index 0c2813e51a88..33c7e6b4a07f 100644 --- a/codex-rs/code-mode/src/description.rs +++ b/codex-rs/code-mode/src/description.rs @@ -9,7 +9,7 @@ use crate::PUBLIC_TOOL_NAME; const MAX_JS_SAFE_INTEGER: u64 = (1_u64 << 53) - 1; const DEFERRED_NESTED_TOOLS_GUIDANCE: &str = r#"Some nested MCP/app tools may be omitted from this description. They are still available on the global `tools` object and listed in `ALL_TOOLS`. To find one, filter `ALL_TOOLS` by `name` and `description`."#; -const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/compose tool calls +const EXEC_DESCRIPTION_TEMPLATE_PREFIX: &str = r#"Run JavaScript code to orchestrate/compose tool calls - Evaluates the provided JavaScript code in a fresh V8 isolate as an async module. - All nested tools are available on the global `tools` object, for example `await tools.exec_command(...)`. Tool names are exposed as normalized JavaScript identifiers, for example `await tools.mcp__ologs__get_profile(...)`. - Nested tool methods take either a string or an object as their input argument. @@ -24,8 +24,9 @@ const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/co - Global helpers: - `exit()`: Immediately ends the current script successfully (like an early return from the top level). - `text(value: string | number | boolean | undefined | null)`: Appends a text item. Non-string values are stringified with `JSON.stringify(...)` when possible. -- `image(imageUrlOrItem: string | { image_url: string; detail?: "auto" | "low" | "high" | "original" | null } | ImageContent, detail?: "auto" | "low" | "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument. -- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session. +- `image(imageUrlOrItem: string | { image_url: string; detail?: "auto" | "low" | "high" | "original" | null } | ImageContent, detail?: "auto" | "low" | "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument."#; +const AUDIO_HELPER_DESCRIPTION: &str = r#"- `audio(audioItem: { data: string; format?: string | null; mimeType?: string | null; mime_type?: string | null } | AudioContent)`: Appends an audio item. `data` can be raw base64 audio or a base64-encoded `data:audio/...` URL. To forward an MCP tool audio block, pass an individual `AudioContent` block from `result.content`, for example `audio(result.content[0])`."#; +const EXEC_DESCRIPTION_TEMPLATE_SUFFIX: &str = r#"- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session. - `load(key: string)`: returns the stored value for a string key, or `undefined` if it is missing. - `notify(value: string | number | boolean | undefined | null)`: immediately injects an extra `custom_tool_call_output` for the current `exec` call. Values are stringified like `text(...)`. - `setTimeout(callback: () => void, delayMs?: number)`: schedules a callback to run later and returns a timeout id. Pending timeouts do not keep `exec` alive by themselves; await an explicit promise if you need to wait for one. @@ -41,7 +42,7 @@ const WAIT_DESCRIPTION_TEMPLATE: &str = r#"- Use `wait` only after `exec` return - If the cell is still running, `wait` may yield again with the same `cell_id`. - If the cell has already finished, `wait` returns the completed result and closes the cell."#; // Based off of https://modelcontextprotocol.io/specification/draft/schema#calltoolresult -const MCP_TYPESCRIPT_PREAMBLE: &str = r#"type Role = "user" | "assistant"; +const MCP_TYPESCRIPT_PREAMBLE_PREFIX: &str = r#"type Role = "user" | "assistant"; type MetaObject = Record; type Annotations = { audience?: Role[]; @@ -79,14 +80,16 @@ type ImageContent = { annotations?: Annotations; _meta?: MetaObject; }; -type AudioContent = { +"#; +const MCP_AUDIO_CONTENT_TYPE: &str = r#"type AudioContent = { type: "audio"; data: string; mimeType: string; annotations?: Annotations; _meta?: MetaObject; }; -type ResourceLink = { +"#; +const MCP_TYPESCRIPT_PREAMBLE_SUFFIX: &str = r#"type ResourceLink = { icons?: Icon[]; name: string; title?: string; @@ -106,8 +109,10 @@ type EmbeddedResource = { }; type ContentBlock = | TextContent - | ImageContent - | AudioContent + | ImageContent"#; +const MCP_AUDIO_CONTENT_BLOCK_VARIANT: &str = r#" + | AudioContent"#; +const MCP_TYPESCRIPT_PREAMBLE_END: &str = r#" | ResourceLink | EmbeddedResource; type CallToolResult = { @@ -143,6 +148,13 @@ pub struct ToolNamespaceDescription { pub description: String, } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct ExecToolDescriptionOptions { + pub code_mode_only: bool, + pub deferred_tools_available: bool, + pub supports_audio_input: bool, +} + #[derive(Debug, Default, Deserialize, PartialEq, Eq)] #[serde(deny_unknown_fields)] struct CodeModeExecPragma { @@ -250,15 +262,21 @@ pub fn is_code_mode_nested_tool(tool_name: &str) -> bool { pub fn build_exec_tool_description( enabled_tools: &[ToolDefinition], namespace_descriptions: &BTreeMap, - code_mode_only: bool, - deferred_tools_available: bool, + options: ExecToolDescriptionOptions, ) -> String { let mut sections = Vec::new(); - sections.push(EXEC_DESCRIPTION_TEMPLATE.to_string()); - if deferred_tools_available { + let mut exec_description = String::from(EXEC_DESCRIPTION_TEMPLATE_PREFIX); + if options.supports_audio_input { + exec_description.push('\n'); + exec_description.push_str(AUDIO_HELPER_DESCRIPTION); + } + exec_description.push('\n'); + exec_description.push_str(EXEC_DESCRIPTION_TEMPLATE_SUFFIX); + sections.push(exec_description); + if options.deferred_tools_available { sections.push(DEFERRED_NESTED_TOOLS_GUIDANCE.to_string()); } - if !code_mode_only { + if !options.code_mode_only { return sections.join("\n\n"); } @@ -305,8 +323,18 @@ pub fn build_exec_tool_description( } if has_mcp_tools { + let mut mcp_typescript_preamble = String::from(MCP_TYPESCRIPT_PREAMBLE_PREFIX); + if options.supports_audio_input { + mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_TYPE); + } + mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_SUFFIX); + if options.supports_audio_input { + mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_BLOCK_VARIANT); + } + mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_END); + sections.push(format!( - "Shared MCP Types:\n```ts\n{MCP_TYPESCRIPT_PREAMBLE}\n```" + "Shared MCP Types:\n```ts\n{mcp_typescript_preamble}\n```" )); } let nested_tool_reference = nested_tool_sections.join("\n\n"); @@ -706,6 +734,7 @@ fn render_json_schema_literal(value: &JsonValue) -> String { #[cfg(test)] mod tests { use super::CodeModeToolKind; + use super::ExecToolDescriptionOptions; use super::ParsedExecSource; use super::ToolDefinition; use super::ToolNamespaceDescription; @@ -863,8 +892,11 @@ mod tests { output_schema: None, }], &BTreeMap::new(), - /*code_mode_only*/ true, - /*deferred_tools_available*/ false, + ExecToolDescriptionOptions { + code_mode_only: true, + deferred_tools_available: false, + supports_audio_input: false, + }, ); assert!(description.contains( "### `foo` @@ -878,13 +910,41 @@ bar" let description = build_exec_tool_description( &[], &BTreeMap::new(), - /*code_mode_only*/ false, - /*deferred_tools_available*/ false, + ExecToolDescriptionOptions { + code_mode_only: false, + deferred_tools_available: false, + supports_audio_input: false, + }, ); assert!(description.contains("`setTimeout(callback: () => void, delayMs?: number)`")); assert!(description.contains("`clearTimeout(timeoutId?: number)`")); } + #[test] + fn exec_description_gates_audio_helper_on_audio_input_support() { + let unsupported_description = build_exec_tool_description( + &[], + &BTreeMap::new(), + ExecToolDescriptionOptions { + code_mode_only: false, + deferred_tools_available: false, + supports_audio_input: false, + }, + ); + assert!(!unsupported_description.contains("`audio(audioItem")); + + let supported_description = build_exec_tool_description( + &[], + &BTreeMap::new(), + ExecToolDescriptionOptions { + code_mode_only: false, + deferred_tools_available: false, + supports_audio_input: true, + }, + ); + assert!(supported_description.contains("`audio(audioItem")); + } + #[test] fn code_mode_only_description_groups_namespace_instructions_once() { let namespace_descriptions = BTreeMap::from([( @@ -930,8 +990,11 @@ bar" }, ], &namespace_descriptions, - /*code_mode_only*/ true, - /*deferred_tools_available*/ false, + ExecToolDescriptionOptions { + code_mode_only: true, + deferred_tools_available: false, + supports_audio_input: false, + }, ); assert_eq!(description.matches("## mcp__sample").count(), 1); assert!(description.contains("## mcp__sample\nShared namespace guidance.")); @@ -970,8 +1033,11 @@ bar" }))), }], &namespace_descriptions, - /*code_mode_only*/ true, - /*deferred_tools_available*/ false, + ExecToolDescriptionOptions { + code_mode_only: true, + deferred_tools_available: false, + supports_audio_input: false, + }, ); assert!(!description.contains("## mcp__sample")); @@ -1069,8 +1135,11 @@ bar" }, ], &BTreeMap::new(), - /*code_mode_only*/ true, - /*deferred_tools_available*/ false, + ExecToolDescriptionOptions { + code_mode_only: true, + deferred_tools_available: false, + supports_audio_input: false, + }, ); assert_eq!( @@ -1082,13 +1151,60 @@ bar" assert_eq!(description.matches("Shared MCP Types:").count(), 1); } + #[test] + fn code_mode_only_description_gates_mcp_audio_type_on_audio_input_support() { + let tools = vec![ToolDefinition { + name: "mcp__sample__audio".to_string(), + tool_name: ToolName::namespaced("mcp__sample__", "audio"), + description: "Audio tool".to_string(), + kind: CodeModeToolKind::Function, + input_schema: Some(json!({ + "type": "object", + "properties": {}, + "additionalProperties": false + })), + output_schema: Some(mcp_call_tool_result_schema(json!({ + "type": "object", + "properties": {}, + "additionalProperties": false + }))), + }]; + + let unsupported_description = build_exec_tool_description( + &tools, + &BTreeMap::new(), + ExecToolDescriptionOptions { + code_mode_only: true, + deferred_tools_available: false, + supports_audio_input: false, + }, + ); + assert!(!unsupported_description.contains("type AudioContent")); + assert!(!unsupported_description.contains("| AudioContent")); + + let supported_description = build_exec_tool_description( + &tools, + &BTreeMap::new(), + ExecToolDescriptionOptions { + code_mode_only: true, + deferred_tools_available: false, + supports_audio_input: true, + }, + ); + assert!(supported_description.contains("type AudioContent")); + assert!(supported_description.contains("| AudioContent")); + } + #[test] fn exec_description_mentions_deferred_nested_tools_when_available() { let description = build_exec_tool_description( &[], &BTreeMap::new(), - /*code_mode_only*/ false, - /*deferred_tools_available*/ true, + ExecToolDescriptionOptions { + code_mode_only: false, + deferred_tools_available: true, + supports_audio_input: false, + }, ); assert!(description.contains("Some nested MCP/app tools may be omitted")); diff --git a/codex-rs/code-mode/src/lib.rs b/codex-rs/code-mode/src/lib.rs index 3da8c7732592..826c19d2d951 100644 --- a/codex-rs/code-mode/src/lib.rs +++ b/codex-rs/code-mode/src/lib.rs @@ -5,6 +5,7 @@ mod service; pub use description::CODE_MODE_PRAGMA_PREFIX; pub use description::CodeModeToolKind; +pub use description::ExecToolDescriptionOptions; pub use description::ToolDefinition; pub use description::ToolNamespaceDescription; pub use description::augment_tool_definition; @@ -18,6 +19,7 @@ pub use description::render_json_schema_to_typescript; pub use response::DEFAULT_IMAGE_DETAIL; pub use response::FunctionCallOutputContentItem; pub use response::ImageDetail; +pub use response::InputAudio; pub use runtime::CodeModeNestedToolCall; pub use runtime::DEFAULT_EXEC_YIELD_TIME_MS; pub use runtime::DEFAULT_MAX_OUTPUT_TOKENS_PER_EXEC_CALL; diff --git a/codex-rs/code-mode/src/response.rs b/codex-rs/code-mode/src/response.rs index 0ac3a03770e4..b6fdb83a1885 100644 --- a/codex-rs/code-mode/src/response.rs +++ b/codex-rs/code-mode/src/response.rs @@ -23,4 +23,13 @@ pub enum FunctionCallOutputContentItem { #[serde(default, skip_serializing_if = "Option::is_none")] detail: Option, }, + InputAudio { + input_audio: InputAudio, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct InputAudio { + pub data: String, + pub format: String, } diff --git a/codex-rs/code-mode/src/runtime/callbacks.rs b/codex-rs/code-mode/src/runtime/callbacks.rs index c3a648ae3297..ae8b471b306d 100644 --- a/codex-rs/code-mode/src/runtime/callbacks.rs +++ b/codex-rs/code-mode/src/runtime/callbacks.rs @@ -5,6 +5,7 @@ use super::RuntimeEvent; use super::RuntimeState; use super::timers; use super::value::json_to_v8; +use super::value::normalize_output_audio; use super::value::normalize_output_image; use super::value::serialize_output_text; use super::value::throw_type_error; @@ -129,6 +130,26 @@ pub(super) fn image_callback( retval.set(v8::undefined(scope).into()); } +pub(super) fn audio_callback( + scope: &mut v8::PinScope<'_, '_>, + args: v8::FunctionCallbackArguments, + mut retval: v8::ReturnValue, +) { + let value = if args.length() == 0 { + v8::undefined(scope).into() + } else { + args.get(0) + }; + let audio_item = match normalize_output_audio(scope, value) { + Ok(audio_item) => audio_item, + Err(()) => return, + }; + if let Some(state) = scope.get_slot::() { + let _ = state.event_tx.send(RuntimeEvent::ContentItem(audio_item)); + } + retval.set(v8::undefined(scope).into()); +} + pub(super) fn store_callback( scope: &mut v8::PinScope<'_, '_>, args: v8::FunctionCallbackArguments, diff --git a/codex-rs/code-mode/src/runtime/globals.rs b/codex-rs/code-mode/src/runtime/globals.rs index 2ec6953f093b..375dbf80d517 100644 --- a/codex-rs/code-mode/src/runtime/globals.rs +++ b/codex-rs/code-mode/src/runtime/globals.rs @@ -1,4 +1,5 @@ use super::RuntimeState; +use super::callbacks::audio_callback; use super::callbacks::clear_timeout_callback; use super::callbacks::exit_callback; use super::callbacks::image_callback; @@ -23,6 +24,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St let set_timeout = helper_function(scope, "setTimeout", set_timeout_callback)?; let text = helper_function(scope, "text", text_callback)?; let image = helper_function(scope, "image", image_callback)?; + let audio = helper_function(scope, "audio", audio_callback)?; let store = helper_function(scope, "store", store_callback)?; let load = helper_function(scope, "load", load_callback)?; let notify = helper_function(scope, "notify", notify_callback)?; @@ -35,6 +37,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St set_global(scope, global, "setTimeout", set_timeout.into())?; set_global(scope, global, "text", text.into())?; set_global(scope, global, "image", image.into())?; + set_global(scope, global, "audio", audio.into())?; set_global(scope, global, "store", store.into())?; set_global(scope, global, "load", load.into())?; set_global(scope, global, "notify", notify.into())?; diff --git a/codex-rs/code-mode/src/runtime/value.rs b/codex-rs/code-mode/src/runtime/value.rs index 8d76a832d365..57e4985c7628 100644 --- a/codex-rs/code-mode/src/runtime/value.rs +++ b/codex-rs/code-mode/src/runtime/value.rs @@ -3,8 +3,10 @@ use serde_json::Value as JsonValue; use crate::response::DEFAULT_IMAGE_DETAIL; use crate::response::FunctionCallOutputContentItem; use crate::response::ImageDetail; +use crate::response::InputAudio; const IMAGE_HELPER_EXPECTS_MESSAGE: &str = "image expects a non-empty image URL string, an object with image_url and optional detail, or a raw MCP image block"; +const AUDIO_HELPER_EXPECTS_MESSAGE: &str = "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block"; const CODEX_IMAGE_DETAIL_META_KEY: &str = "codex/imageDetail"; pub(super) fn serialize_output_text( @@ -97,6 +99,35 @@ pub(super) fn normalize_output_image( } } +pub(super) fn normalize_output_audio( + scope: &mut v8::PinScope<'_, '_>, + value: v8::Local<'_, v8::Value>, +) -> Result { + let result = (|| -> Result { + if !value.is_object() || value.is_array() { + return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()); + } + + let object = v8::Local::::try_from(value) + .map_err(|_| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())?; + let input_audio = if let Some(audio) = parse_non_mcp_output_audio(scope, object)? { + audio + } else { + parse_mcp_output_audio(scope, value)? + }; + + Ok(FunctionCallOutputContentItem::InputAudio { input_audio }) + })(); + + match result { + Ok(item) => Ok(item), + Err(error_text) => { + throw_type_error(scope, &error_text); + Err(()) + } + } +} + fn parse_non_mcp_output_image( scope: &mut v8::PinScope<'_, '_>, object: v8::Local<'_, v8::Object>, @@ -165,6 +196,90 @@ fn parse_mcp_output_image( Ok((image_url, detail)) } +fn parse_non_mcp_output_audio( + scope: &mut v8::PinScope<'_, '_>, + object: v8::Local<'_, v8::Object>, +) -> Result, String> { + let data_key = v8::String::new(scope, "data") + .ok_or_else(|| "failed to allocate audio helper keys".to_string())?; + let Some(data) = object.get(scope, data_key.into()) else { + return Ok(None); + }; + if data.is_undefined() { + return Ok(None); + } + if !data.is_string() { + return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()); + } + let data = data.to_rust_string_lossy(scope); + let format = optional_string_property(scope, object, "format")?; + let mime_type = optional_string_property(scope, object, "mimeType")? + .or(optional_string_property(scope, object, "mime_type")?); + let Some(input_audio) = codex_protocol::models::input_audio_from_data( + &data, + format.as_deref(), + mime_type.as_deref(), + ) else { + return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()); + }; + Ok(Some(InputAudio { + data: input_audio.data, + format: input_audio.format, + })) +} + +fn parse_mcp_output_audio( + scope: &mut v8::PinScope<'_, '_>, + value: v8::Local<'_, v8::Value>, +) -> Result { + let Some(result) = v8_value_to_json(scope, value)? else { + return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()); + }; + let JsonValue::Object(result) = result else { + return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()); + }; + let Some(item_type) = result.get("type").and_then(JsonValue::as_str) else { + return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()); + }; + if item_type != "audio" { + return Err(format!( + "audio only accepts MCP audio blocks, got \"{item_type}\"" + )); + } + let data = result + .get("data") + .and_then(JsonValue::as_str) + .ok_or_else(|| "audio expected MCP audio data".to_string())?; + let mime_type = result + .get("mimeType") + .or_else(|| result.get("mime_type")) + .and_then(JsonValue::as_str); + let Some(input_audio) = + codex_protocol::models::input_audio_from_data(data, /*format*/ None, mime_type) + else { + return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()); + }; + Ok(InputAudio { + data: input_audio.data, + format: input_audio.format, + }) +} + +fn optional_string_property( + scope: &mut v8::PinScope<'_, '_>, + object: v8::Local<'_, v8::Object>, + name: &str, +) -> Result, String> { + let key = v8::String::new(scope, name) + .ok_or_else(|| "failed to allocate audio helper keys".to_string())?; + match object.get(scope, key.into()) { + Some(value) if value.is_string() => Ok(Some(value.to_rust_string_lossy(scope))), + Some(value) if value.is_null() || value.is_undefined() => Ok(None), + Some(_) => Err(format!("{name} must be a string when provided")), + None => Ok(None), + } +} + fn parse_image_detail_value<'s>( scope: &mut v8::PinScope<'s, '_>, value: Option>, diff --git a/codex-rs/code-mode/src/service.rs b/codex-rs/code-mode/src/service.rs index 44e4be49396d..43a59aee98a5 100644 --- a/codex-rs/code-mode/src/service.rs +++ b/codex-rs/code-mode/src/service.rs @@ -703,6 +703,7 @@ mod tests { use super::run_session_control; use crate::CodeModeToolKind; use crate::FunctionCallOutputContentItem; + use crate::InputAudio; use crate::ToolDefinition; use crate::runtime::ExecuteRequest; use crate::runtime::ExecuteToPendingOutcome; @@ -1230,6 +1231,7 @@ text(formatter.format(new Date("2025-01-02T03:04:05Z"))); const returnsUndefined = [ text("first"), image("https://example.com/image.jpg"), + audio({ data: "BASE64", format: "wav" }), notify("ping"), ].map((value) => value === undefined); text(JSON.stringify(returnsUndefined)); @@ -1253,8 +1255,14 @@ text(JSON.stringify(returnsUndefined)); image_url: "https://example.com/image.jpg".to_string(), detail: Some(crate::DEFAULT_IMAGE_DETAIL), }, + FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "BASE64".to_string(), + format: "wav".to_string(), + }, + }, FunctionCallOutputContentItem::InputText { - text: "[true,true,true]".to_string(), + text: "[true,true,true,true]".to_string(), }, ], stored_values: HashMap::new(), @@ -1411,6 +1419,147 @@ image({ ); } + #[tokio::test] + async fn audio_helper_accepts_explicit_object() { + let service = CodeModeService::new(); + + let response = service + .execute(ExecuteRequest { + source: r#"audio({ data: "BASE64", format: "wav" });"#.to_string(), + yield_time_ms: None, + ..execute_request("") + }) + .await + .unwrap(); + + assert_eq!( + response, + RuntimeResponse::Result { + cell_id: "1".to_string(), + content_items: vec![FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "BASE64".to_string(), + format: "wav".to_string(), + }, + }], + stored_values: HashMap::new(), + error_text: None, + } + ); + } + + #[tokio::test] + async fn audio_helper_strips_data_url_and_derives_format() { + let service = CodeModeService::new(); + + let response = service + .execute(ExecuteRequest { + source: r#"audio({ data: "data:audio/mpeg;base64,BASE64" });"#.to_string(), + yield_time_ms: None, + ..execute_request("") + }) + .await + .unwrap(); + + assert_eq!( + response, + RuntimeResponse::Result { + cell_id: "1".to_string(), + content_items: vec![FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "BASE64".to_string(), + format: "mp3".to_string(), + }, + }], + stored_values: HashMap::new(), + error_text: None, + } + ); + } + + #[tokio::test] + async fn audio_helper_accepts_raw_mcp_audio_block() { + let service = CodeModeService::new(); + + let response = service + .execute(ExecuteRequest { + source: r#"audio({ type: "audio", data: "BASE64", mimeType: "audio/ogg" });"# + .to_string(), + yield_time_ms: None, + ..execute_request("") + }) + .await + .unwrap(); + + assert_eq!( + response, + RuntimeResponse::Result { + cell_id: "1".to_string(), + content_items: vec![FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "BASE64".to_string(), + format: "ogg".to_string(), + }, + }], + stored_values: HashMap::new(), + error_text: None, + } + ); + } + + #[tokio::test] + async fn audio_helper_rejects_bare_string() { + let service = CodeModeService::new(); + + let response = service + .execute(ExecuteRequest { + source: r#"audio("BASE64");"#.to_string(), + yield_time_ms: None, + ..execute_request("") + }) + .await + .unwrap(); + + assert_eq!( + response, + RuntimeResponse::Result { + cell_id: "1".to_string(), + content_items: Vec::new(), + stored_values: HashMap::new(), + error_text: Some( + "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(), + ), + } + ); + } + + #[tokio::test] + async fn audio_helper_rejects_non_audio_mime_type() { + let service = CodeModeService::new(); + + let response = service + .execute(ExecuteRequest { + source: r#"audio({ data: "BASE64", mimeType: "application/octet-stream" });"# + .to_string(), + yield_time_ms: None, + ..execute_request("") + }) + .await + .unwrap(); + + assert_eq!( + response, + RuntimeResponse::Result { + cell_id: "1".to_string(), + content_items: Vec::new(), + stored_values: HashMap::new(), + error_text: Some( + "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(), + ), + } + ); + } + #[tokio::test] async fn wait_reports_missing_cell_separately_from_runtime_results() { let service = CodeModeService::new(); diff --git a/codex-rs/core/src/context_manager/history.rs b/codex-rs/core/src/context_manager/history.rs index 80c057e0eb1d..301bf20cd71a 100644 --- a/codex-rs/core/src/context_manager/history.rs +++ b/codex-rs/core/src/context_manager/history.rs @@ -113,9 +113,8 @@ impl ContextManager { } /// Returns the history prepared for sending to the model. This applies a proper - /// normalization and drops un-suited items. When `input_modalities` does not - /// include `InputModality::Image`, images are stripped from messages and tool - /// outputs. + /// normalization and drops un-suited items. Unsupported media content is + /// stripped from messages and tool outputs according to `input_modalities`. pub(crate) fn for_prompt(mut self, input_modalities: &[InputModality]) -> Vec { self.normalize_history(input_modalities); self.items @@ -365,8 +364,8 @@ impl ContextManager { // all outputs must have a corresponding function/tool call normalize::remove_orphan_outputs(&mut self.items); - // strip images when model does not support them - normalize::strip_images_when_unsupported(input_modalities, &mut self.items); + // strip unsupported media content before sending history to the model + normalize::strip_unsupported_media_content(input_modalities, &mut self.items); } fn process_item(&self, item: &ResponseItem, policy: TruncationPolicy) -> ResponseItem { diff --git a/codex-rs/core/src/context_manager/history_tests.rs b/codex-rs/core/src/context_manager/history_tests.rs index 74f4d29bfb4e..df6c09a395d3 100644 --- a/codex-rs/core/src/context_manager/history_tests.rs +++ b/codex-rs/core/src/context_manager/history_tests.rs @@ -10,6 +10,7 @@ use codex_protocol::models::FunctionCallOutputBody; use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; use codex_protocol::models::ImageDetail; +use codex_protocol::models::InputAudio; use codex_protocol::models::LocalShellAction; use codex_protocol::models::LocalShellExecAction; use codex_protocol::models::LocalShellStatus; @@ -513,6 +514,85 @@ fn for_prompt_strips_images_when_model_does_not_support_images() { } } +#[test] +fn for_prompt_strips_audio_when_model_does_not_support_audio() { + let items = vec![ + ResponseItem::FunctionCall { + id: None, + name: "audio_tool".to_string(), + namespace: None, + arguments: "{}".to_string(), + call_id: "call-1".to_string(), + }, + ResponseItem::FunctionCallOutput { + call_id: "call-1".to_string(), + output: FunctionCallOutputPayload::from_content_items(vec![ + FunctionCallOutputContentItem::InputText { + text: "audio result".to_string(), + }, + FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "UklGRg==".to_string(), + format: "wav".to_string(), + }, + }, + ]), + }, + ]; + let history = create_history_with_items(items); + let default_modalities = default_input_modalities(); + let stripped = history.clone().for_prompt(&default_modalities); + + assert_eq!( + stripped, + vec![ + ResponseItem::FunctionCall { + id: None, + name: "audio_tool".to_string(), + namespace: None, + arguments: "{}".to_string(), + call_id: "call-1".to_string(), + }, + ResponseItem::FunctionCallOutput { + call_id: "call-1".to_string(), + output: FunctionCallOutputPayload::from_content_items(vec![ + FunctionCallOutputContentItem::InputText { + text: "audio result".to_string(), + }, + FunctionCallOutputContentItem::InputText { + text: "audio content omitted because you do not support audio input" + .to_string(), + }, + ]), + }, + ] + ); + + let audio_modalities = vec![ + InputModality::Text, + InputModality::Image, + InputModality::Audio, + ]; + let with_audio = history.for_prompt(&audio_modalities); + assert_eq!( + with_audio[1], + ResponseItem::FunctionCallOutput { + call_id: "call-1".to_string(), + output: FunctionCallOutputPayload::from_content_items(vec![ + FunctionCallOutputContentItem::InputText { + text: "audio result".to_string(), + }, + FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "UklGRg==".to_string(), + format: "wav".to_string(), + }, + }, + ]), + } + ); +} + #[test] fn for_prompt_preserves_image_generation_calls_when_images_are_supported() { let history = create_history_with_items(vec![ diff --git a/codex-rs/core/src/context_manager/normalize.rs b/codex-rs/core/src/context_manager/normalize.rs index 839bae331ed2..d9cb5423ea38 100644 --- a/codex-rs/core/src/context_manager/normalize.rs +++ b/codex-rs/core/src/context_manager/normalize.rs @@ -10,6 +10,8 @@ use tracing::info; const IMAGE_CONTENT_OMITTED_PLACEHOLDER: &str = "image content omitted because you do not support image input"; +const AUDIO_CONTENT_OMITTED_PLACEHOLDER: &str = + "audio content omitted because you do not support audio input"; pub(crate) fn ensure_call_outputs_present(items: &mut Vec) { // Collect synthetic outputs to insert immediately after their calls. @@ -290,14 +292,14 @@ where } } -/// Strip image content from messages and tool outputs when the model does not support images. -/// When `input_modalities` contains `InputModality::Image`, no stripping is performed. -pub(crate) fn strip_images_when_unsupported( +/// Strip unsupported media content from messages and tool outputs. +pub(crate) fn strip_unsupported_media_content( input_modalities: &[InputModality], items: &mut [ResponseItem], ) { let supports_images = input_modalities.contains(&InputModality::Image); - if supports_images { + let supports_audio = input_modalities.contains(&InputModality::Audio); + if supports_images && supports_audio { return; } @@ -307,7 +309,7 @@ pub(crate) fn strip_images_when_unsupported( let mut normalized_content = Vec::with_capacity(content.len()); for content_item in content.iter() { match content_item { - ContentItem::InputImage { .. } => { + ContentItem::InputImage { .. } if !supports_images => { normalized_content.push(ContentItem::InputText { text: IMAGE_CONTENT_OMITTED_PLACEHOLDER.to_string(), }); @@ -323,20 +325,29 @@ pub(crate) fn strip_images_when_unsupported( let mut normalized_content_items = Vec::with_capacity(content_items.len()); for content_item in content_items.iter() { match content_item { - FunctionCallOutputContentItem::InputImage { .. } => { + FunctionCallOutputContentItem::InputImage { .. } + if !supports_images => + { normalized_content_items.push( FunctionCallOutputContentItem::InputText { text: IMAGE_CONTENT_OMITTED_PLACEHOLDER.to_string(), }, ); } + FunctionCallOutputContentItem::InputAudio { .. } if !supports_audio => { + normalized_content_items.push( + FunctionCallOutputContentItem::InputText { + text: AUDIO_CONTENT_OMITTED_PLACEHOLDER.to_string(), + }, + ); + } _ => normalized_content_items.push(content_item.clone()), } } *content_items = normalized_content_items; } } - ResponseItem::ImageGenerationCall { result, .. } => { + ResponseItem::ImageGenerationCall { result, .. } if !supports_images => { result.clear(); } _ => {} diff --git a/codex-rs/core/src/mcp_tool_call.rs b/codex-rs/core/src/mcp_tool_call.rs index fc8ce4d8ca32..ead0e933a7b4 100644 --- a/codex-rs/core/src/mcp_tool_call.rs +++ b/codex-rs/core/src/mcp_tool_call.rs @@ -589,13 +589,8 @@ async fn execute_mcp_tool_call( ) .await .map_err(|e| format!("tool call error: {e:?}"))?; - let result = sanitize_mcp_tool_result_for_model( - turn_context - .model_info - .input_modalities - .contains(&InputModality::Image), - Ok(result), - )?; + let result = + sanitize_mcp_tool_result_for_model(&turn_context.model_info.input_modalities, Ok(result))?; Ok(maybe_request_codex_apps_auth_elicitation( sess, turn_context, @@ -783,36 +778,61 @@ async fn maybe_mark_thread_memory_mode_polluted( } fn sanitize_mcp_tool_result_for_model( - supports_image_input: bool, + input_modalities: &[InputModality], result: Result, ) -> Result { - if supports_image_input { - return result; - } + let supports_image_input = input_modalities.contains(&InputModality::Image); + let supports_audio_input = input_modalities.contains(&InputModality::Audio); + + result.and_then(|call_tool_result| { + if !supports_audio_input + && !has_non_null_structured_content(&call_tool_result) + && call_tool_result + .content + .iter() + .any(|block| block.get("type").and_then(serde_json::Value::as_str) == Some("audio")) + { + return Err( + "audio content returned by MCP tool but the selected model does not support audio input" + .to_string(), + ); + } - result.map(|call_tool_result| CallToolResult { - content: call_tool_result - .content - .iter() - .map(|block| { - if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str) - && content_type == "image" - { - return serde_json::json!({ - "type": "text", - "text": "", - }); - } + if supports_image_input { + return Ok(call_tool_result); + } - block.clone() - }) - .collect::>(), - structured_content: call_tool_result.structured_content, - is_error: call_tool_result.is_error, - meta: call_tool_result.meta, + Ok(CallToolResult { + content: call_tool_result + .content + .iter() + .map(|block| { + if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str) + && content_type == "image" + { + return serde_json::json!({ + "type": "text", + "text": "", + }); + } + + block.clone() + }) + .collect::>(), + structured_content: call_tool_result.structured_content, + is_error: call_tool_result.is_error, + meta: call_tool_result.meta, + }) }) } +fn has_non_null_structured_content(call_tool_result: &CallToolResult) -> bool { + call_tool_result + .structured_content + .as_ref() + .is_some_and(|structured_content| !structured_content.is_null()) +} + fn truncate_mcp_tool_result_for_event( result: &Result, ) -> Result { diff --git a/codex-rs/core/src/mcp_tool_call_tests.rs b/codex-rs/core/src/mcp_tool_call_tests.rs index d8b6326cd0b9..dbbef3cf8643 100644 --- a/codex-rs/core/src/mcp_tool_call_tests.rs +++ b/codex-rs/core/src/mcp_tool_call_tests.rs @@ -924,7 +924,7 @@ fn sanitize_mcp_tool_result_for_model_rewrites_image_content() { meta: None, }); - let got = sanitize_mcp_tool_result_for_model(/*supports_image_input*/ false, result) + let got = sanitize_mcp_tool_result_for_model(&[InputModality::Text], result) .expect("sanitized result"); assert_eq!( @@ -956,7 +956,7 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() { }; let got = sanitize_mcp_tool_result_for_model( - /*supports_image_input*/ true, + &[InputModality::Text, InputModality::Image], Ok(original.clone()), ) .expect("unsanitized result"); @@ -964,6 +964,73 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() { assert_eq!(got, original); } +#[test] +fn sanitize_mcp_tool_result_for_model_rejects_audio_when_unsupported() { + let result = Ok(CallToolResult { + content: vec![serde_json::json!({ + "type": "audio", + "data": "UklGRg==", + "mimeType": "audio/wav", + })], + structured_content: None, + is_error: Some(false), + meta: None, + }); + + let err = sanitize_mcp_tool_result_for_model(&[InputModality::Text], result) + .expect_err("unsupported audio should fail"); + + assert_eq!( + err, + "audio content returned by MCP tool but the selected model does not support audio input" + ); +} + +#[test] +fn sanitize_mcp_tool_result_for_model_preserves_audio_when_supported() { + let original = CallToolResult { + content: vec![serde_json::json!({ + "type": "audio", + "data": "UklGRg==", + "mimeType": "audio/wav", + })], + structured_content: None, + is_error: Some(false), + meta: Some(serde_json::json!({"k": "v"})), + }; + + let got = sanitize_mcp_tool_result_for_model( + &[ + InputModality::Text, + InputModality::Image, + InputModality::Audio, + ], + Ok(original.clone()), + ) + .expect("supported audio should remain unchanged"); + + assert_eq!(got, original); +} + +#[test] +fn sanitize_mcp_tool_result_for_model_lets_structured_content_take_precedence_over_audio() { + let original = CallToolResult { + content: vec![serde_json::json!({ + "type": "audio", + "data": "UklGRg==", + "mimeType": "audio/wav", + })], + structured_content: Some(serde_json::json!({"answer": "structured"})), + is_error: Some(false), + meta: None, + }; + + let got = sanitize_mcp_tool_result_for_model(&[InputModality::Text], Ok(original.clone())) + .expect("structured content should take precedence"); + + assert_eq!(got, original); +} + #[test] fn truncate_mcp_tool_result_for_event_preserves_small_result() { let original = CallToolResult { diff --git a/codex-rs/core/src/tools/code_mode/execute_spec.rs b/codex-rs/core/src/tools/code_mode/execute_spec.rs index 0a858bd2060e..fcad41e01410 100644 --- a/codex-rs/core/src/tools/code_mode/execute_spec.rs +++ b/codex-rs/core/src/tools/code_mode/execute_spec.rs @@ -7,8 +7,7 @@ use std::collections::BTreeMap; pub(crate) fn create_code_mode_tool( enabled_tools: &[CodeModeToolDefinition], namespace_descriptions: &BTreeMap, - code_mode_only: bool, - deferred_tools_available: bool, + options: codex_code_mode::ExecToolDescriptionOptions, ) -> ToolSpec { const CODE_MODE_FREEFORM_GRAMMAR: &str = r#" start: pragma_source | plain_source @@ -25,8 +24,7 @@ SOURCE: /[\s\S]+/ description: codex_code_mode::build_exec_tool_description( enabled_tools, namespace_descriptions, - code_mode_only, - deferred_tools_available, + options, ), format: FreeformToolFormat { r#type: "grammar".to_string(), @@ -57,16 +55,22 @@ mod tests { create_code_mode_tool( &enabled_tools, &BTreeMap::new(), - /*code_mode_only*/ true, - /*deferred_tools_available*/ false, + codex_code_mode::ExecToolDescriptionOptions { + code_mode_only: true, + deferred_tools_available: false, + supports_audio_input: false, + }, ), ToolSpec::Freeform(FreeformTool { name: codex_code_mode::PUBLIC_TOOL_NAME.to_string(), description: codex_code_mode::build_exec_tool_description( &enabled_tools, &BTreeMap::new(), - /*code_mode_only*/ true, - /*deferred_tools_available*/ false + codex_code_mode::ExecToolDescriptionOptions { + code_mode_only: true, + deferred_tools_available: false, + supports_audio_input: false, + } ), format: FreeformToolFormat { r#type: "grammar".to_string(), diff --git a/codex-rs/core/src/tools/code_mode/mod.rs b/codex-rs/core/src/tools/code_mode/mod.rs index ff9f8c889384..c4340af29231 100644 --- a/codex-rs/core/src/tools/code_mode/mod.rs +++ b/codex-rs/core/src/tools/code_mode/mod.rs @@ -14,6 +14,7 @@ use codex_code_mode::RuntimeResponse; use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; use codex_protocol::models::ResponseInputItem; +use codex_protocol::openai_models::InputModality; use serde_json::Value as JsonValue; use tokio_util::sync::CancellationToken; @@ -168,6 +169,9 @@ pub(super) async fn handle_runtime_response( match response { RuntimeResponse::Yielded { content_items, .. } => { let mut content_items = into_function_call_output_content_items(content_items); + if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) { + return Ok(output); + } sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items); content_items = truncate_code_mode_result(content_items, max_output_tokens); prepend_script_status(&mut content_items, &script_status, started_at.elapsed()); @@ -175,6 +179,9 @@ pub(super) async fn handle_runtime_response( } RuntimeResponse::Terminated { content_items, .. } => { let mut content_items = into_function_call_output_content_items(content_items); + if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) { + return Ok(output); + } sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items); content_items = truncate_code_mode_result(content_items, max_output_tokens); prepend_script_status(&mut content_items, &script_status, started_at.elapsed()); @@ -187,12 +194,15 @@ pub(super) async fn handle_runtime_response( .. } => { let mut content_items = into_function_call_output_content_items(content_items); - sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items); exec.session .services .code_mode_service .replace_stored_values(stored_values) .await; + if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) { + return Ok(output); + } + sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items); let success = error_text.is_none(); if let Some(error_text) = error_text { content_items.push(FunctionCallOutputContentItem::InputText { @@ -209,6 +219,29 @@ pub(super) async fn handle_runtime_response( } } +fn unsupported_audio_output( + turn: &TurnContext, + items: &[FunctionCallOutputContentItem], +) -> Option { + let supports_audio = turn + .model_info + .input_modalities + .contains(&InputModality::Audio); + if supports_audio + || !items + .iter() + .any(|item| matches!(item, FunctionCallOutputContentItem::InputAudio { .. })) + { + return None; + } + + Some(FunctionToolOutput::from_text( + "audio content emitted by code mode but the selected model does not support audio input" + .to_string(), + Some(false), + )) +} + fn sanitize_runtime_image_detail(turn: &TurnContext, items: &mut [FunctionCallOutputContentItem]) { sanitize_image_detail_items(can_request_original_image_detail(&turn.model_info), items); } diff --git a/codex-rs/core/src/tools/code_mode/response_adapter.rs b/codex-rs/core/src/tools/code_mode/response_adapter.rs index e20cf6a0713e..f964dab900ba 100644 --- a/codex-rs/core/src/tools/code_mode/response_adapter.rs +++ b/codex-rs/core/src/tools/code_mode/response_adapter.rs @@ -42,6 +42,14 @@ impl IntoProtocol .or(Some(DEFAULT_IMAGE_DETAIL)), } } + codex_code_mode::FunctionCallOutputContentItem::InputAudio { input_audio } => { + FunctionCallOutputContentItem::InputAudio { + input_audio: codex_protocol::models::InputAudio { + data: input_audio.data, + format: input_audio.format, + }, + } + } } } } diff --git a/codex-rs/core/src/tools/spec_plan.rs b/codex-rs/core/src/tools/spec_plan.rs index 24f59a0f04e8..d4d1835f9438 100644 --- a/codex-rs/core/src/tools/spec_plan.rs +++ b/codex-rs/core/src/tools/spec_plan.rs @@ -177,8 +177,11 @@ fn build_code_mode_executors( create_code_mode_tool( &enabled_tools, &namespace_descriptions, - config.code_mode_only_enabled, - deferred_tools_available, + codex_code_mode::ExecToolDescriptionOptions { + code_mode_only: config.code_mode_only_enabled, + deferred_tools_available, + supports_audio_input: config.supports_audio_input, + }, ), code_mode_nested_tool_specs, )), diff --git a/codex-rs/core/src/tools/spec_plan_tests.rs b/codex-rs/core/src/tools/spec_plan_tests.rs index dd690b5631c0..b28a170c14a5 100644 --- a/codex-rs/core/src/tools/spec_plan_tests.rs +++ b/codex-rs/core/src/tools/spec_plan_tests.rs @@ -2350,6 +2350,70 @@ fn code_mode_exec_description_omits_nested_tool_details_when_not_code_mode_only( assert!(!description.contains("### `view_image`")); } +#[test] +fn code_mode_exec_audio_helper_docs_require_audio_input_support() { + let unsupported_model_info = model_info(); + let mut supported_model_info = unsupported_model_info.clone(); + supported_model_info.input_modalities = vec![ + InputModality::Text, + InputModality::Image, + InputModality::Audio, + ]; + let mut features = Features::with_defaults(); + features.enable(Feature::CodeMode); + let available_models = Vec::new(); + let unsupported_tools_config = ToolsConfig::new(&ToolsConfigParams { + model_info: &unsupported_model_info, + available_models: &available_models, + features: &features, + image_generation_tool_auth_allowed: true, + web_search_mode: Some(WebSearchMode::Cached), + session_source: SessionSource::Cli, + permission_profile: &PermissionProfile::Disabled, + windows_sandbox_level: WindowsSandboxLevel::Disabled, + }); + let supported_tools_config = ToolsConfig::new(&ToolsConfigParams { + model_info: &supported_model_info, + available_models: &available_models, + features: &features, + image_generation_tool_auth_allowed: true, + web_search_mode: Some(WebSearchMode::Cached), + session_source: SessionSource::Cli, + permission_profile: &PermissionProfile::Disabled, + windows_sandbox_level: WindowsSandboxLevel::Disabled, + }); + + let (unsupported_tools, _) = build_specs( + &unsupported_tools_config, + /*mcp_tools*/ None, + /*deferred_mcp_tools*/ None, + &[], + ); + let ToolSpec::Freeform(FreeformTool { + description: unsupported_description, + .. + }) = find_tool(&unsupported_tools, "exec") + else { + panic!("expected freeform tool"); + }; + assert!(!unsupported_description.contains("`audio(audioItem")); + + let (supported_tools, _) = build_specs( + &supported_tools_config, + /*mcp_tools*/ None, + /*deferred_mcp_tools*/ None, + &[], + ); + let ToolSpec::Freeform(FreeformTool { + description: supported_description, + .. + }) = find_tool(&supported_tools, "exec") + else { + panic!("expected freeform tool"); + }; + assert!(supported_description.contains("`audio(audioItem")); +} + fn model_info() -> ModelInfo { serde_json::from_value(json!({ "slug": "gpt-5-codex", diff --git a/codex-rs/core/tests/suite/code_mode.rs b/codex-rs/core/tests/suite/code_mode.rs index af63d092f8cc..eac24781427d 100644 --- a/codex-rs/core/tests/suite/code_mode.rs +++ b/codex-rs/core/tests/suite/code_mode.rs @@ -12,6 +12,7 @@ use codex_protocol::dynamic_tools::DynamicToolCallOutputContentItem; use codex_protocol::dynamic_tools::DynamicToolResponse; use codex_protocol::dynamic_tools::DynamicToolSpec; use codex_protocol::models::PermissionProfile; +use codex_protocol::openai_models::InputModality; use codex_protocol::protocol::AskForApproval; use codex_protocol::protocol::EventMsg; use codex_protocol::protocol::Op; @@ -177,6 +178,54 @@ async fn run_code_mode_turn( Ok((test, second_mock)) } +async fn run_code_mode_turn_with_audio_model( + server: &MockServer, + prompt: &str, + code: &str, +) -> Result<(TestCodex, ResponseMock)> { + let mut builder = test_codex() + .with_model("gpt-5.4") + .with_config(move |config| { + let _ = config.features.enable(Feature::CodeMode); + let mut model_catalog = bundled_models_response() + .unwrap_or_else(|err| panic!("bundled models.json should parse: {err}")); + let model = model_catalog + .models + .iter_mut() + .find(|model| model.slug == "gpt-5.4") + .expect("gpt-5.4 exists in bundled models.json"); + model.input_modalities = vec![ + InputModality::Text, + InputModality::Image, + InputModality::Audio, + ]; + config.model_catalog = Some(model_catalog); + }); + let test = builder.build(server).await?; + + responses::mount_sse_once( + server, + sse(vec![ + ev_response_created("resp-1"), + ev_custom_tool_call("call-1", "exec", code), + ev_completed("resp-1"), + ]), + ) + .await; + + let second_mock = responses::mount_sse_once( + server, + sse(vec![ + ev_assistant_message("msg-1", "done"), + ev_completed("resp-2"), + ]), + ) + .await; + + test.submit_turn(prompt).await?; + Ok((test, second_mock)) +} + async fn run_code_mode_turn_with_rmcp( server: &MockServer, prompt: &str, @@ -1987,6 +2036,78 @@ image("data:image/png;base64,AAA"); Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn code_mode_can_output_audio_via_global_helper_for_audio_model() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + let (_test, second_mock) = run_code_mode_turn_with_audio_model( + &server, + "use exec to return audio", + r#" +audio({ data: "BASE64", format: "wav" }); +audio({ data: "data:audio/mpeg;base64,MP3BASE64" }); +"#, + ) + .await?; + + let req = second_mock.single_request(); + let items = custom_tool_output_items(&req, "call-1"); + let (_, success) = custom_tool_output_body_and_success(&req, "call-1"); + assert_ne!( + success, + Some(false), + "code_mode audio output failed unexpectedly" + ); + assert_eq!(items.len(), 3); + assert_regex_match( + concat!( + r"(?s)\A", + r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z" + ), + text_item(&items, /*index*/ 0), + ); + assert_eq!( + items[1], + serde_json::json!({ + "type": "input_audio", + "input_audio": { "data": "BASE64", "format": "wav" } + }), + ); + assert_eq!( + items[2], + serde_json::json!({ + "type": "input_audio", + "input_audio": { "data": "MP3BASE64", "format": "mp3" } + }), + ); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn code_mode_audio_output_fails_for_non_audio_model() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + let (_test, second_mock) = run_code_mode_turn( + &server, + "use exec to return audio", + r#"audio({ data: "BASE64", format: "wav" });"#, + /*include_apply_patch*/ false, + ) + .await?; + + let req = second_mock.single_request(); + let (output, _success) = custom_tool_output_body_and_success(&req, "call-1"); + assert_eq!( + output, + "audio content emitted by code mode but the selected model does not support audio input" + ); + + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn code_mode_can_use_view_image_result_with_image_helper() -> Result<()> { skip_if_no_network!(Ok(())); @@ -2422,6 +2543,7 @@ text(JSON.stringify(Object.getOwnPropertyNames(globalThis).sort())); "WeakSet", "__codexContentItems", "add_content", + "audio", "decodeURI", "decodeURIComponent", "encodeURI", diff --git a/codex-rs/core/tests/suite/rmcp_client.rs b/codex-rs/core/tests/suite/rmcp_client.rs index d1973d97a99d..40ba39b93876 100644 --- a/codex-rs/core/tests/suite/rmcp_client.rs +++ b/codex-rs/core/tests/suite/rmcp_client.rs @@ -93,6 +93,50 @@ fn assert_wall_time_header(output: &str) { assert_eq!(marker, "Output:"); } +fn test_model_info_with_modalities( + slug: &str, + description: &str, + input_modalities: Vec, +) -> ModelInfo { + ModelInfo { + slug: slug.to_string(), + display_name: slug.to_string(), + description: Some(description.to_string()), + default_reasoning_level: None, + supported_reasoning_levels: vec![ReasoningEffortPreset { + effort: codex_protocol::openai_models::ReasoningEffort::Medium, + description: "Medium".to_string(), + }], + shell_type: ConfigShellToolType::Default, + visibility: ModelVisibility::List, + supported_in_api: true, + priority: 1, + additional_speed_tiers: Vec::new(), + service_tiers: Vec::new(), + upgrade: None, + base_instructions: "base instructions".to_string(), + model_messages: None, + supports_reasoning_summaries: false, + default_reasoning_summary: ReasoningSummary::Auto, + support_verbosity: false, + default_verbosity: None, + availability_nux: None, + apply_patch_tool_type: None, + web_search_tool_type: Default::default(), + truncation_policy: TruncationPolicyConfig::bytes(/*limit*/ 10_000), + supports_parallel_tool_calls: false, + supports_image_detail_original: false, + context_window: Some(272_000), + max_context_window: None, + auto_compact_token_limit: None, + effective_context_window_percent: 95, + experimental_supported_tools: Vec::new(), + input_modalities, + used_fallback_model_metadata: false, + supports_search_tool: false, + } +} + fn read_only_user_turn(fixture: &TestCodex, text: impl Into) -> Op { read_only_user_turn_with_model(fixture, text, fixture.session_configured.model.clone()) } @@ -1386,6 +1430,257 @@ async fn stdio_image_responses_are_sanitized_for_text_only_model() -> anyhow::Re Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +#[serial(mcp_test_value)] +async fn stdio_audio_responses_are_forwarded_for_audio_model() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + + let call_id = "audio-supported-1"; + let server_name = "rmcp"; + let namespace = format!("mcp__{server_name}__"); + let audio_model_slug = "rmcp-audio-model"; + + let models_mock = mount_models_once( + &server, + ModelsResponse { + models: vec![test_model_info_with_modalities( + audio_model_slug, + "Test model with audio input support", + vec![ + InputModality::Text, + InputModality::Image, + InputModality::Audio, + ], + )], + }, + ) + .await; + + mount_sse_once( + &server, + responses::sse(vec![ + responses::ev_response_created("resp-1"), + responses::ev_function_call_with_namespace(call_id, &namespace, "audio", "{}"), + responses::ev_completed("resp-1"), + ]), + ) + .await; + let final_mock = mount_sse_once( + &server, + responses::sse(vec![ + responses::ev_assistant_message("msg-1", "rmcp audio tool completed successfully."), + responses::ev_completed("resp-2"), + ]), + ) + .await; + + let rmcp_test_server_bin = remote_aware_stdio_server_bin()?; + + let fixture = test_codex() + .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing()) + .with_config(move |config| { + insert_mcp_server( + config, + server_name, + stdio_transport( + rmcp_test_server_bin, + Some(HashMap::from([ + ("MCP_TEST_AUDIO_DATA".to_string(), "UklGRg==".to_string()), + ( + "MCP_TEST_AUDIO_MIME_TYPE".to_string(), + "audio/mpeg".to_string(), + ), + ])), + Vec::new(), + ), + TestMcpServerOptions { + experimental_environment: remote_aware_experimental_environment(), + ..Default::default() + }, + ); + }) + .build_remote_aware(&server) + .await?; + + fixture + .thread_manager + .get_models_manager() + .list_models(RefreshStrategy::Online) + .await; + assert_eq!(models_mock.requests().len(), 1); + + fixture + .codex + .submit(read_only_user_turn_with_model( + &fixture, + "call the rmcp audio tool", + audio_model_slug.to_string(), + )) + .await?; + + wait_for_event(&fixture.codex, |ev| { + matches!(ev, EventMsg::McpToolCallBegin(_)) + }) + .await; + wait_for_event(&fixture.codex, |ev| { + matches!(ev, EventMsg::McpToolCallEnd(_)) + }) + .await; + wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await; + + let output_item = final_mock.single_request().function_call_output(call_id); + let output = output_item["output"] + .as_array() + .expect("audio MCP output should be content items"); + assert_eq!(output.len(), 2); + assert_wall_time_header( + output[0]["text"] + .as_str() + .expect("first MCP audio output item should be wall-time text"), + ); + assert_eq!( + output[1], + json!({ + "type": "input_audio", + "input_audio": { + "data": "UklGRg==", + "format": "mp3", + }, + }) + ); + + server.verify().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +#[serial(mcp_test_value)] +async fn stdio_audio_responses_fail_for_text_only_model() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + + let call_id = "audio-text-only-1"; + let server_name = "rmcp"; + let namespace = format!("mcp__{server_name}__"); + let text_only_model_slug = "rmcp-audio-text-only-model"; + + let models_mock = mount_models_once( + &server, + ModelsResponse { + models: vec![test_model_info_with_modalities( + text_only_model_slug, + "Test model without audio input support", + vec![InputModality::Text, InputModality::Image], + )], + }, + ) + .await; + + mount_sse_once( + &server, + responses::sse(vec![ + responses::ev_response_created("resp-1"), + responses::ev_function_call_with_namespace(call_id, &namespace, "audio", "{}"), + responses::ev_completed("resp-1"), + ]), + ) + .await; + let final_mock = mount_sse_once( + &server, + responses::sse(vec![ + responses::ev_assistant_message("msg-1", "rmcp audio tool failed."), + responses::ev_completed("resp-2"), + ]), + ) + .await; + + let rmcp_test_server_bin = remote_aware_stdio_server_bin()?; + + let fixture = test_codex() + .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing()) + .with_config(move |config| { + insert_mcp_server( + config, + server_name, + stdio_transport( + rmcp_test_server_bin, + Some(HashMap::from([ + ("MCP_TEST_AUDIO_DATA".to_string(), "UklGRg==".to_string()), + ( + "MCP_TEST_AUDIO_MIME_TYPE".to_string(), + "audio/wav".to_string(), + ), + ])), + Vec::new(), + ), + TestMcpServerOptions { + experimental_environment: remote_aware_experimental_environment(), + ..Default::default() + }, + ); + }) + .build_remote_aware(&server) + .await?; + + fixture + .thread_manager + .get_models_manager() + .list_models(RefreshStrategy::Online) + .await; + assert_eq!(models_mock.requests().len(), 1); + + fixture + .codex + .submit(read_only_user_turn_with_model( + &fixture, + "call the rmcp audio tool", + text_only_model_slug.to_string(), + )) + .await?; + + wait_for_event(&fixture.codex, |ev| { + matches!(ev, EventMsg::McpToolCallBegin(_)) + }) + .await; + let end_event = wait_for_event(&fixture.codex, |ev| { + matches!(ev, EventMsg::McpToolCallEnd(_)) + }) + .await; + let EventMsg::McpToolCallEnd(end) = end_event else { + unreachable!("event guard guarantees McpToolCallEnd"); + }; + assert_eq!( + end.result, + Err( + "audio content returned by MCP tool but the selected model does not support audio input" + .to_string() + ) + ); + wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await; + + let output_item = final_mock.single_request().function_call_output(call_id); + let output_text = output_item + .get("output") + .and_then(Value::as_str) + .expect("function_call_output output should be a JSON string"); + let wrapped_payload = split_wall_time_wrapped_output(output_text); + let output_json: Value = serde_json::from_str(wrapped_payload) + .expect("function_call_output output should be valid JSON"); + assert_eq!( + output_json, + json!([{ + "type": "text", + "text": "audio content returned by MCP tool but the selected model does not support audio input" + }]) + ); + + server.verify().await; + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] #[serial(mcp_test_value)] async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> { diff --git a/codex-rs/protocol/src/models.rs b/codex-rs/protocol/src/models.rs index 6919ee43e770..68d8a91b4523 100644 --- a/codex-rs/protocol/src/models.rs +++ b/codex-rs/protocol/src/models.rs @@ -1304,6 +1304,98 @@ pub enum FunctionCallOutputContentItem { #[ts(optional)] detail: Option, }, + // Do not rename, these are serialized and used directly in the responses API. + InputAudio { + input_audio: InputAudio, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema, TS)] +pub struct InputAudio { + pub data: String, + pub format: String, +} + +pub fn input_audio_from_data( + data: &str, + format: Option<&str>, + mime_type: Option<&str>, +) -> Option { + if data.is_empty() { + return None; + } + + let (data, data_url_format) = if let Some((data, format)) = parse_audio_data_url(data) { + (data, Some(format)) + } else if data + .get(.."data:".len()) + .is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:")) + { + return None; + } else { + (data.to_string(), None) + }; + if data.is_empty() { + return None; + } + + let mime_type_format = match mime_type { + Some(mime_type) => Some(audio_format_from_mime_type(mime_type)?), + None => None, + }; + + let format = format + .and_then(normalize_audio_format) + .or(data_url_format) + .or(mime_type_format)?; + + Some(InputAudio { data, format }) +} + +fn parse_audio_data_url(data_url: &str) -> Option<(String, String)> { + if data_url.len() < "data:".len() + || !data_url + .get(.."data:".len()) + .is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:")) + { + return None; + } + + let (metadata, data) = data_url["data:".len()..].split_once(',')?; + if !metadata + .split(';') + .any(|part| part.eq_ignore_ascii_case("base64")) + { + return None; + } + + let mime_type = metadata.split(';').next()?; + let format = audio_format_from_mime_type(mime_type)?; + Some((data.to_string(), format)) +} + +fn audio_format_from_mime_type(mime_type: &str) -> Option { + let media_type = mime_type.split(';').next()?.trim().to_ascii_lowercase(); + let subtype = media_type.strip_prefix("audio/")?; + normalize_audio_format(subtype) +} + +fn normalize_audio_format(format: &str) -> Option { + let format = format.trim().to_ascii_lowercase(); + if format.is_empty() { + return None; + } + if format.contains('/') { + return audio_format_from_mime_type(&format); + } + + let format = format.strip_prefix("x-").unwrap_or(&format); + let format = match format { + "mpeg" => "mp3", + "wave" => "wav", + _ => format, + }; + Some(format.to_string()) } /// Converts structured function-call output content into plain text for @@ -1311,7 +1403,7 @@ pub enum FunctionCallOutputContentItem { /// /// This conversion is intentionally lossy: /// - only `input_text` items are included -/// - image items are ignored +/// - image and audio items are ignored /// /// We use this helper where callers still need a string representation (for /// example telemetry previews or legacy string-only output paths) while keeping @@ -1327,7 +1419,8 @@ pub fn function_call_output_content_items_to_text( Some(text.as_str()) } FunctionCallOutputContentItem::InputText { .. } - | FunctionCallOutputContentItem::InputImage { .. } => None, + | FunctionCallOutputContentItem::InputImage { .. } + | FunctionCallOutputContentItem::InputAudio { .. } => None, }) .collect::>(); @@ -1378,7 +1471,7 @@ impl FunctionCallOutputBody { /// human-readable surfaces. /// /// This conversion is intentionally lossy when the body contains content - /// items: image entries are dropped and text entries are joined with + /// items: image and audio entries are dropped and text entries are joined with /// newlines. pub fn to_text(&self) -> Option { match self { @@ -1556,11 +1649,18 @@ fn convert_mcp_content_to_items( #[serde(rename = "_meta", default)] meta: Option, }, + #[serde(rename = "audio")] + Audio { + data: String, + #[serde(rename = "mimeType", alias = "mime_type")] + mime_type: Option, + }, #[serde(other)] Unknown, } let mut saw_image = false; + let mut saw_audio = false; let mut items = Vec::with_capacity(contents.len()); for content in contents { @@ -1595,6 +1695,19 @@ fn convert_mcp_content_to_items( .or(Some(DEFAULT_IMAGE_DETAIL)), } } + Ok(McpContent::Audio { data, mime_type }) => { + if let Some(input_audio) = + input_audio_from_data(&data, /*format*/ None, mime_type.as_deref()) + { + saw_audio = true; + FunctionCallOutputContentItem::InputAudio { input_audio } + } else { + FunctionCallOutputContentItem::InputText { + text: serde_json::to_string(content) + .unwrap_or_else(|_| "".to_string()), + } + } + } Ok(McpContent::Unknown) | Err(_) => FunctionCallOutputContentItem::InputText { text: serde_json::to_string(content).unwrap_or_else(|_| "".to_string()), }, @@ -1602,7 +1715,11 @@ fn convert_mcp_content_to_items( items.push(item); } - if saw_image { Some(items) } else { None } + if saw_image || saw_audio { + Some(items) + } else { + None + } } // Implement Display so callers can treat the payload like a plain string when logging or doing @@ -2232,6 +2349,198 @@ mod tests { Ok(()) } + #[test] + fn serializes_audio_outputs_as_array() -> Result<()> { + let call_tool_result = CallToolResult { + content: vec![ + serde_json::json!({"type":"text","text":"caption"}), + serde_json::json!({"type":"audio","data":"BASE64","mimeType":"audio/mpeg"}), + ], + structured_content: None, + is_error: Some(false), + meta: None, + }; + + let payload = call_tool_result.into_function_call_output_payload(); + assert_eq!(payload.success, Some(true)); + let Some(items) = payload.content_items() else { + panic!("expected content items"); + }; + let items = items.to_vec(); + assert_eq!( + items, + vec![ + FunctionCallOutputContentItem::InputText { + text: "caption".into(), + }, + FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "BASE64".into(), + format: "mp3".into(), + }, + }, + ] + ); + + let item = ResponseInputItem::FunctionCallOutput { + call_id: "call1".into(), + output: payload, + }; + + let json = serde_json::to_string(&item)?; + let v: serde_json::Value = serde_json::from_str(&json)?; + + assert_eq!( + v.get("output").expect("output field"), + &serde_json::json!([ + { "type": "input_text", "text": "caption" }, + { "type": "input_audio", "input_audio": { "data": "BASE64", "format": "mp3" } } + ]) + ); + + Ok(()) + } + + #[test] + fn serializes_mixed_image_and_audio_outputs_as_array() { + let call_tool_result = CallToolResult { + content: vec![ + serde_json::json!({"type":"image","data":"IMAGE","mimeType":"image/png"}), + serde_json::json!({"type":"audio","data":"AUDIO","mimeType":"audio/wav"}), + ], + structured_content: None, + is_error: Some(false), + meta: None, + }; + + let payload = call_tool_result.into_function_call_output_payload(); + let Some(items) = payload.content_items() else { + panic!("expected content items"); + }; + assert_eq!( + items, + [ + FunctionCallOutputContentItem::InputImage { + image_url: "data:image/png;base64,IMAGE".into(), + detail: Some(DEFAULT_IMAGE_DETAIL), + }, + FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "AUDIO".into(), + format: "wav".into(), + }, + }, + ] + ); + } + + #[test] + fn strips_audio_data_urls_and_derives_format() { + let call_tool_result = CallToolResult { + content: vec![serde_json::json!({ + "type": "audio", + "data": "data:audio/ogg;base64,T2dnUw", + })], + structured_content: None, + is_error: Some(false), + meta: None, + }; + + let payload = call_tool_result.into_function_call_output_payload(); + let Some(items) = payload.content_items() else { + panic!("expected content items"); + }; + assert_eq!( + items, + [FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "T2dnUw".into(), + format: "ogg".into(), + }, + }] + ); + } + + #[test] + fn audio_without_derivable_format_falls_back_to_text_payload() { + let content = vec![serde_json::json!({ + "type": "audio", + "data": "BASE64", + })]; + let call_tool_result = CallToolResult { + content: content.clone(), + structured_content: None, + is_error: Some(false), + meta: None, + }; + + let payload = call_tool_result.into_function_call_output_payload(); + assert_eq!( + payload, + FunctionCallOutputPayload { + body: FunctionCallOutputBody::Text(serde_json::to_string(&content).unwrap()), + success: Some(true), + } + ); + } + + #[test] + fn malformed_audio_block_falls_back_to_text_inside_structured_payload() { + let malformed_audio = serde_json::json!({ + "type": "audio", + "data": "data:image/png;base64,NOT_AUDIO", + }); + let call_tool_result = CallToolResult { + content: vec![ + serde_json::json!({"type":"image","data":"IMAGE","mimeType":"image/png"}), + malformed_audio.clone(), + ], + structured_content: None, + is_error: Some(false), + meta: None, + }; + + let payload = call_tool_result.into_function_call_output_payload(); + let Some(items) = payload.content_items() else { + panic!("expected content items"); + }; + assert_eq!( + items, + [ + FunctionCallOutputContentItem::InputImage { + image_url: "data:image/png;base64,IMAGE".into(), + detail: Some(DEFAULT_IMAGE_DETAIL), + }, + FunctionCallOutputContentItem::InputText { + text: serde_json::to_string(&malformed_audio).unwrap(), + }, + ] + ); + } + + #[test] + fn structured_content_precedence_ignores_audio_content() { + let call_tool_result = CallToolResult { + content: vec![serde_json::json!({ + "type": "audio", + "data": "BASE64", + "mimeType": "audio/wav", + })], + structured_content: Some(serde_json::json!({ "ok": true })), + is_error: Some(false), + meta: None, + }; + + let payload = call_tool_result.into_function_call_output_payload(); + assert_eq!( + payload, + FunctionCallOutputPayload { + body: FunctionCallOutputBody::Text("{\"ok\":true}".to_string()), + success: Some(true), + } + ); + } + #[test] fn serializes_custom_tool_image_outputs_as_array() -> Result<()> { let item = ResponseInputItem::CustomToolCallOutput { diff --git a/codex-rs/protocol/src/openai_models.rs b/codex-rs/protocol/src/openai_models.rs index d51e70ddf16f..b130d58c5dbd 100644 --- a/codex-rs/protocol/src/openai_models.rs +++ b/codex-rs/protocol/src/openai_models.rs @@ -82,6 +82,8 @@ pub enum InputModality { Text, /// Image attachments included in user turns. Image, + /// Audio content included in tool payloads. + Audio, } /// Backward-compatible default when `input_modalities` is omitted on the wire. diff --git a/codex-rs/rmcp-client/src/bin/test_stdio_server.rs b/codex-rs/rmcp-client/src/bin/test_stdio_server.rs index 7add4d05f5af..fea5078b8d0c 100644 --- a/codex-rs/rmcp-client/src/bin/test_stdio_server.rs +++ b/codex-rs/rmcp-client/src/bin/test_stdio_server.rs @@ -71,6 +71,7 @@ impl TestToolServer { Self::cwd_tool(), Self::sync_tool(), Self::image_tool(), + Self::audio_tool(), Self::image_scenario_tool(), sandbox_meta_tool, ]; @@ -227,6 +228,24 @@ impl TestToolServer { tool } + fn audio_tool() -> Tool { + #[expect(clippy::expect_used)] + let schema: JsonObject = serde_json::from_value(serde_json::json!({ + "type": "object", + "properties": {}, + "additionalProperties": false + })) + .expect("audio tool schema should deserialize"); + + let mut tool = Tool::new( + Cow::Borrowed("audio"), + Cow::Borrowed("Return a single audio content block."), + Arc::new(schema), + ); + tool.annotations = Some(ToolAnnotations::new().read_only(true)); + tool + } + /// Tool intended for manual testing of Codex TUI rendering for MCP image tool results. /// /// This exists to exercise edge cases where a `CallToolResult.content` includes image blocks @@ -543,6 +562,20 @@ impl ServerHandler for TestToolServer { data_b64, mime_type, )])) } + "audio" => { + let data = + std::env::var("MCP_TEST_AUDIO_DATA").unwrap_or_else(|_| "QkFTRTY0".to_string()); + let mime_type = std::env::var("MCP_TEST_AUDIO_MIME_TYPE") + .unwrap_or_else(|_| "audio/wav".to_string()); + + Ok(CallToolResult::success(vec![rmcp::model::Annotated::new( + rmcp::model::RawContent::Audio(rmcp::model::RawAudioContent { + data, + mime_type, + }), + None, + )])) + } "image_scenario" => { let args = Self::parse_call_args::(&request, "image_scenario")?; Self::image_scenario_result(args) diff --git a/codex-rs/tools/src/tool_config.rs b/codex-rs/tools/src/tool_config.rs index ad884e5be023..5b513b469600 100644 --- a/codex-rs/tools/src/tool_config.rs +++ b/codex-rs/tools/src/tool_config.rs @@ -113,6 +113,7 @@ pub struct ToolsConfig { pub request_permissions_tool_enabled: bool, pub code_mode_enabled: bool, pub code_mode_only_enabled: bool, + pub supports_audio_input: bool, pub can_request_original_image_detail: bool, pub collab_tools: bool, pub goal_tools: bool, @@ -188,6 +189,7 @@ impl ToolsConfig { && features.enabled(Feature::Apps) && features.enabled(Feature::Plugins); let include_original_image_detail = can_request_original_image_detail(model_info); + let supports_audio_input = model_info.input_modalities.contains(&InputModality::Audio); // API-key auth bypasses Codex backend entitlement/tool normalization, so // callers must confirm ChatGPT auth before exposing the built-in tool. let include_image_gen_tool = *image_generation_tool_auth_allowed @@ -256,6 +258,7 @@ impl ToolsConfig { request_permissions_tool_enabled, code_mode_enabled: include_code_mode, code_mode_only_enabled: include_code_mode_only, + supports_audio_input, can_request_original_image_detail: include_original_image_detail, collab_tools: include_collab_tools, goal_tools: include_goal_tools, diff --git a/codex-rs/tools/src/tool_config_tests.rs b/codex-rs/tools/src/tool_config_tests.rs index 252ad7a3205a..496474090d92 100644 --- a/codex-rs/tools/src/tool_config_tests.rs +++ b/codex-rs/tools/src/tool_config_tests.rs @@ -265,6 +265,48 @@ fn image_generation_requires_feature_and_supported_model() { assert!(!unsupported_tools_config.image_gen_tool); } +#[test] +fn audio_input_support_tracks_model_modalities() { + let supported_model_info = ModelInfo { + input_modalities: vec![ + InputModality::Text, + InputModality::Image, + InputModality::Audio, + ], + ..model_info() + }; + let unsupported_model_info = ModelInfo { + input_modalities: vec![InputModality::Text, InputModality::Image], + ..model_info() + }; + let features = Features::with_defaults(); + let available_models = Vec::new(); + + let supported_tools_config = ToolsConfig::new(&ToolsConfigParams { + model_info: &supported_model_info, + available_models: &available_models, + features: &features, + image_generation_tool_auth_allowed: true, + web_search_mode: Some(WebSearchMode::Cached), + session_source: SessionSource::Cli, + permission_profile: &PermissionProfile::Disabled, + windows_sandbox_level: WindowsSandboxLevel::Disabled, + }); + let unsupported_tools_config = ToolsConfig::new(&ToolsConfigParams { + model_info: &unsupported_model_info, + available_models: &available_models, + features: &features, + image_generation_tool_auth_allowed: true, + web_search_mode: Some(WebSearchMode::Cached), + session_source: SessionSource::Cli, + permission_profile: &PermissionProfile::Disabled, + windows_sandbox_level: WindowsSandboxLevel::Disabled, + }); + + assert!(supported_tools_config.supports_audio_input); + assert!(!unsupported_tools_config.supports_audio_input); +} + #[test] fn provider_capability_methods_disable_provider_bound_tool_surfaces() { let model_info = model_info(); diff --git a/codex-rs/tools/src/tool_output.rs b/codex-rs/tools/src/tool_output.rs index 2044295174b7..ef13f30400b5 100644 --- a/codex-rs/tools/src/tool_output.rs +++ b/codex-rs/tools/src/tool_output.rs @@ -191,7 +191,8 @@ fn content_items_to_code_mode_result(items: &[FunctionCallOutputContentItem]) -> Some(image_url.clone()) } FunctionCallOutputContentItem::InputText { .. } - | FunctionCallOutputContentItem::InputImage { .. } => None, + | FunctionCallOutputContentItem::InputImage { .. } + | FunctionCallOutputContentItem::InputAudio { .. } => None, }) .collect::>() .join("\n"), diff --git a/codex-rs/utils/output-truncation/src/lib.rs b/codex-rs/utils/output-truncation/src/lib.rs index 24b1630da134..906d981ac333 100644 --- a/codex-rs/utils/output-truncation/src/lib.rs +++ b/codex-rs/utils/output-truncation/src/lib.rs @@ -34,7 +34,8 @@ pub fn formatted_truncate_text_content_items_with_policy( .iter() .filter_map(|item| match item { FunctionCallOutputContentItem::InputText { text } => Some(text.as_str()), - FunctionCallOutputContentItem::InputImage { .. } => None, + FunctionCallOutputContentItem::InputImage { .. } + | FunctionCallOutputContentItem::InputAudio { .. } => None, }) .collect::>(); @@ -64,6 +65,11 @@ pub fn formatted_truncate_text_content_items_with_policy( detail: *detail, }) } + FunctionCallOutputContentItem::InputAudio { input_audio } => { + Some(FunctionCallOutputContentItem::InputAudio { + input_audio: input_audio.clone(), + }) + } FunctionCallOutputContentItem::InputText { .. } => None, })); @@ -117,6 +123,11 @@ pub fn truncate_function_output_items_with_policy( detail: *detail, }); } + FunctionCallOutputContentItem::InputAudio { input_audio } => { + out.push(FunctionCallOutputContentItem::InputAudio { + input_audio: input_audio.clone(), + }); + } } } diff --git a/codex-rs/utils/output-truncation/src/truncate_tests.rs b/codex-rs/utils/output-truncation/src/truncate_tests.rs index 74acb15ca3d2..ce1afcb0ca21 100644 --- a/codex-rs/utils/output-truncation/src/truncate_tests.rs +++ b/codex-rs/utils/output-truncation/src/truncate_tests.rs @@ -7,6 +7,7 @@ use crate::truncate_function_output_items_with_policy; use crate::truncate_text; use codex_protocol::models::DEFAULT_IMAGE_DETAIL; use codex_protocol::models::FunctionCallOutputContentItem; +use codex_protocol::models::InputAudio; use pretty_assertions::assert_eq; #[test] @@ -251,6 +252,43 @@ fn formatted_truncate_text_content_items_with_policy_merges_text_and_appends_ima assert_eq!(original_token_count, Some(4)); } +#[test] +fn formatted_truncate_text_content_items_with_policy_preserves_audio_like_images() { + let items = vec![ + FunctionCallOutputContentItem::InputText { + text: "abcd".to_string(), + }, + FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "UklGRg==".to_string(), + format: "wav".to_string(), + }, + }, + FunctionCallOutputContentItem::InputText { + text: "efgh".to_string(), + }, + ]; + + let (output, original_token_count) = + formatted_truncate_text_content_items_with_policy(&items, TruncationPolicy::Bytes(4)); + + assert_eq!( + output, + vec![ + FunctionCallOutputContentItem::InputText { + text: "Total output lines: 2\n\nab…5 chars truncated…gh".to_string(), + }, + FunctionCallOutputContentItem::InputAudio { + input_audio: InputAudio { + data: "UklGRg==".to_string(), + format: "wav".to_string(), + }, + }, + ] + ); + assert_eq!(original_token_count, Some(3)); +} + #[test] fn formatted_truncate_text_content_items_with_policy_merges_all_text_for_token_budget() { let items = vec![