diff --git a/codex-rs/app-server-protocol/schema/json/ClientRequest.json b/codex-rs/app-server-protocol/schema/json/ClientRequest.json
index a6fe99b35e98..7a2c8a57f1f7 100644
--- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json
+++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json
@@ -1173,6 +1173,26 @@
           ],
           "title": "InputImageFunctionCallOutputContentItem",
           "type": "object"
+        },
+        {
+          "properties": {
+            "input_audio": {
+              "$ref": "#/definitions/InputAudio"
+            },
+            "type": {
+              "enum": [
+                "input_audio"
+              ],
+              "title": "InputAudioFunctionCallOutputContentItemType",
+              "type": "string"
+            }
+          },
+          "required": [
+            "input_audio",
+            "type"
+          ],
+          "title": "InputAudioFunctionCallOutputContentItem",
+          "type": "object"
         }
       ]
     },
@@ -1289,6 +1309,21 @@
       ],
       "type": "object"
     },
+    "InputAudio": {
+      "properties": {
+        "data": {
+          "type": "string"
+        },
+        "format": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "format"
+      ],
+      "type": "object"
+    },
     "ListMcpServerStatusParams": {
       "properties": {
         "cursor": {
diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json
index 066e1f4870f3..ac33cfaee6eb 100644
--- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json
+++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json
@@ -9145,6 +9145,26 @@
             ],
             "title": "InputImageFunctionCallOutputContentItem",
             "type": "object"
+          },
+          {
+            "properties": {
+              "input_audio": {
+                "$ref": "#/definitions/v2/InputAudio"
+              },
+              "type": {
+                "enum": [
+                  "input_audio"
+                ],
+                "title": "InputAudioFunctionCallOutputContentItemType",
+                "type": "string"
+              }
+            },
+            "required": [
+              "input_audio",
+              "type"
+            ],
+            "title": "InputAudioFunctionCallOutputContentItem",
+            "type": "object"
           }
         ]
       },
@@ -9935,6 +9955,21 @@
         ],
         "type": "string"
       },
+      "InputAudio": {
+        "properties": {
+          "data": {
+            "type": "string"
+          },
+          "format": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "data",
+          "format"
+        ],
+        "type": "object"
+      },
       "InputModality": {
         "description": "Canonical user-input modality tags advertised by a model.",
         "oneOf": [
@@ -9951,6 +9986,13 @@
               "image"
             ],
             "type": "string"
+          },
+          {
+            "description": "Audio content included in tool payloads.",
+            "enum": [
+              "audio"
+            ],
+            "type": "string"
           }
         ]
       },
diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json
index 95ce7e4aefb8..fefb9e94aa38 100644
--- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json
+++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json
@@ -5534,6 +5534,26 @@
           ],
           "title": "InputImageFunctionCallOutputContentItem",
           "type": "object"
+        },
+        {
+          "properties": {
+            "input_audio": {
+              "$ref": "#/definitions/InputAudio"
+            },
+            "type": {
+              "enum": [
+                "input_audio"
+              ],
+              "title": "InputAudioFunctionCallOutputContentItemType",
+              "type": "string"
+            }
+          },
+          "required": [
+            "input_audio",
+            "type"
+          ],
+          "title": "InputAudioFunctionCallOutputContentItem",
+          "type": "object"
         }
       ]
     },
@@ -6484,6 +6504,21 @@
       "title": "InitializeParams",
       "type": "object"
     },
+    "InputAudio": {
+      "properties": {
+        "data": {
+          "type": "string"
+        },
+        "format": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "format"
+      ],
+      "type": "object"
+    },
     "InputModality": {
       "description": "Canonical user-input modality tags advertised by a model.",
       "oneOf": [
@@ -6500,6 +6535,13 @@
             "image"
           ],
           "type": "string"
+        },
+        {
+          "description": "Audio content included in tool payloads.",
+          "enum": [
+            "audio"
+          ],
+          "type": "string"
         }
       ]
     },
diff --git a/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json b/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json
index c0221805eb08..0a97f98b6a06 100644
--- a/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json
+++ b/codex-rs/app-server-protocol/schema/json/v2/ModelListResponse.json
@@ -17,6 +17,13 @@
             "image"
           ],
           "type": "string"
+        },
+        {
+          "description": "Audio content included in tool payloads.",
+          "enum": [
+            "audio"
+          ],
+          "type": "string"
         }
       ]
     },
diff --git a/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json b/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json
index 6973d15baa6d..b02ddff73ed4 100644
--- a/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json
+++ b/codex-rs/app-server-protocol/schema/json/v2/RawResponseItemCompletedNotification.json
@@ -140,6 +140,26 @@
           ],
           "title": "InputImageFunctionCallOutputContentItem",
           "type": "object"
+        },
+        {
+          "properties": {
+            "input_audio": {
+              "$ref": "#/definitions/InputAudio"
+            },
+            "type": {
+              "enum": [
+                "input_audio"
+              ],
+              "title": "InputAudioFunctionCallOutputContentItemType",
+              "type": "string"
+            }
+          },
+          "required": [
+            "input_audio",
+            "type"
+          ],
+          "title": "InputAudioFunctionCallOutputContentItem",
+          "type": "object"
         }
       ]
     },
@@ -152,6 +172,21 @@
       ],
       "type": "string"
     },
+    "InputAudio": {
+      "properties": {
+        "data": {
+          "type": "string"
+        },
+        "format": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "format"
+      ],
+      "type": "object"
+    },
     "LocalShellAction": {
       "oneOf": [
         {
diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json
index 5f07fe0149db..2f6dbc0f8436 100644
--- a/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json
+++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadResumeParams.json
@@ -203,6 +203,26 @@
           ],
           "title": "InputImageFunctionCallOutputContentItem",
           "type": "object"
+        },
+        {
+          "properties": {
+            "input_audio": {
+              "$ref": "#/definitions/InputAudio"
+            },
+            "type": {
+              "enum": [
+                "input_audio"
+              ],
+              "title": "InputAudioFunctionCallOutputContentItemType",
+              "type": "string"
+            }
+          },
+          "required": [
+            "input_audio",
+            "type"
+          ],
+          "title": "InputAudioFunctionCallOutputContentItem",
+          "type": "object"
         }
       ]
     },
@@ -215,6 +235,21 @@
       ],
       "type": "string"
     },
+    "InputAudio": {
+      "properties": {
+        "data": {
+          "type": "string"
+        },
+        "format": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "format"
+      ],
+      "type": "object"
+    },
     "LocalShellAction": {
       "oneOf": [
         {
diff --git a/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts b/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts
index fb2996f1e54a..330c99f9dd27 100644
--- a/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts
+++ b/codex-rs/app-server-protocol/schema/typescript/FunctionCallOutputContentItem.ts
@@ -2,9 +2,10 @@
 
 // This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
 import type { ImageDetail } from "./ImageDetail";
+import type { InputAudio } from "./InputAudio";
 
 /**
  * Responses API compatible content items that can be returned by a tool call.
  * This is a subset of ContentItem with the types we support as function call outputs.
  */
-export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, };
+export type FunctionCallOutputContentItem = { "type": "input_text", text: string, } | { "type": "input_image", image_url: string, detail?: ImageDetail, } | { "type": "input_audio", input_audio: InputAudio, };
diff --git a/codex-rs/app-server-protocol/schema/typescript/InputAudio.ts b/codex-rs/app-server-protocol/schema/typescript/InputAudio.ts
new file mode 100644
index 000000000000..637996baae93
--- /dev/null
+++ b/codex-rs/app-server-protocol/schema/typescript/InputAudio.ts
@@ -0,0 +1,5 @@
+// GENERATED CODE! DO NOT MODIFY BY HAND!
+
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+
+export type InputAudio = { data: string, format: string, };
diff --git a/codex-rs/app-server-protocol/schema/typescript/InputModality.ts b/codex-rs/app-server-protocol/schema/typescript/InputModality.ts
index 73661938b38a..40d598df3db5 100644
--- a/codex-rs/app-server-protocol/schema/typescript/InputModality.ts
+++ b/codex-rs/app-server-protocol/schema/typescript/InputModality.ts
@@ -5,4 +5,4 @@
 /**
  * Canonical user-input modality tags advertised by a model.
  */
-export type InputModality = "text" | "image";
+export type InputModality = "text" | "image" | "audio";
diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts
index 97ea43560192..c9d7a73f44b6 100644
--- a/codex-rs/app-server-protocol/schema/typescript/index.ts
+++ b/codex-rs/app-server-protocol/schema/typescript/index.ts
@@ -36,6 +36,7 @@ export type { ImageDetail } from "./ImageDetail";
 export type { InitializeCapabilities } from "./InitializeCapabilities";
 export type { InitializeParams } from "./InitializeParams";
 export type { InitializeResponse } from "./InitializeResponse";
+export type { InputAudio } from "./InputAudio";
 export type { InputModality } from "./InputModality";
 export type { InternalSessionSource } from "./InternalSessionSource";
 export type { LocalShellAction } from "./LocalShellAction";
diff --git a/codex-rs/code-mode/src/description.rs b/codex-rs/code-mode/src/description.rs
index 0c2813e51a88..33c7e6b4a07f 100644
--- a/codex-rs/code-mode/src/description.rs
+++ b/codex-rs/code-mode/src/description.rs
@@ -9,7 +9,7 @@ use crate::PUBLIC_TOOL_NAME;
 const MAX_JS_SAFE_INTEGER: u64 = (1_u64 << 53) - 1;
 const DEFERRED_NESTED_TOOLS_GUIDANCE: &str = r#"Some nested MCP/app tools may be omitted from this description. They are still available on the global `tools` object and listed in `ALL_TOOLS`.
 To find one, filter `ALL_TOOLS` by `name` and `description`."#;
-const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/compose tool calls
+const EXEC_DESCRIPTION_TEMPLATE_PREFIX: &str = r#"Run JavaScript code to orchestrate/compose tool calls
 - Evaluates the provided JavaScript code in a fresh V8 isolate as an async module.
 - All nested tools are available on the global `tools` object, for example `await tools.exec_command(...)`. Tool names are exposed as normalized JavaScript identifiers, for example `await tools.mcp__ologs__get_profile(...)`.
 - Nested tool methods take either a string or an object as their input argument.
@@ -24,8 +24,9 @@ const EXEC_DESCRIPTION_TEMPLATE: &str = r#"Run JavaScript code to orchestrate/co
 - Global helpers:
 - `exit()`: Immediately ends the current script successfully (like an early return from the top level).
 - `text(value: string | number | boolean | undefined | null)`: Appends a text item. Non-string values are stringified with `JSON.stringify(...)` when possible.
-- `image(imageUrlOrItem: string | { image_url: string; detail?: "auto" | "low" | "high" | "original" | null } | ImageContent, detail?: "auto" | "low" | "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument.
-- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
+- `image(imageUrlOrItem: string | { image_url: string; detail?: "auto" | "low" | "high" | "original" | null } | ImageContent, detail?: "auto" | "low" | "high" | "original" | null)`: Appends an image item. `image_url` can be an HTTPS URL or a base64-encoded `data:` URL. To forward an MCP tool image, pass an individual `ImageContent` block from `result.content`, for example `image(result.content[0])`. MCP image blocks may request detail with `_meta: { "codex/imageDetail": "original" }`. When provided, the second `detail` argument overrides any detail embedded in the first argument."#;
+const AUDIO_HELPER_DESCRIPTION: &str = r#"- `audio(audioItem: { data: string; format?: string | null; mimeType?: string | null; mime_type?: string | null } | AudioContent)`: Appends an audio item. `data` can be raw base64 audio or a base64-encoded `data:audio/...` URL. To forward an MCP tool audio block, pass an individual `AudioContent` block from `result.content`, for example `audio(result.content[0])`."#;
+const EXEC_DESCRIPTION_TEMPLATE_SUFFIX: &str = r#"- `store(key: string, value: any)`: stores a serializable value under a string key for later `exec` calls in the same session.
 - `load(key: string)`: returns the stored value for a string key, or `undefined` if it is missing.
 - `notify(value: string | number | boolean | undefined | null)`: immediately injects an extra `custom_tool_call_output` for the current `exec` call. Values are stringified like `text(...)`.
 - `setTimeout(callback: () => void, delayMs?: number)`: schedules a callback to run later and returns a timeout id. Pending timeouts do not keep `exec` alive by themselves; await an explicit promise if you need to wait for one.
@@ -41,7 +42,7 @@ const WAIT_DESCRIPTION_TEMPLATE: &str = r#"- Use `wait` only after `exec` return
 - If the cell is still running, `wait` may yield again with the same `cell_id`.
 - If the cell has already finished, `wait` returns the completed result and closes the cell."#;
 // Based off of https://modelcontextprotocol.io/specification/draft/schema#calltoolresult
-const MCP_TYPESCRIPT_PREAMBLE: &str = r#"type Role = "user" | "assistant";
+const MCP_TYPESCRIPT_PREAMBLE_PREFIX: &str = r#"type Role = "user" | "assistant";
 type MetaObject = Record<string, unknown>;
 type Annotations = {
   audience?: Role[];
@@ -79,14 +80,16 @@ type ImageContent = {
   annotations?: Annotations;
   _meta?: MetaObject;
 };
-type AudioContent = {
+"#;
+const MCP_AUDIO_CONTENT_TYPE: &str = r#"type AudioContent = {
   type: "audio";
   data: string;
   mimeType: string;
   annotations?: Annotations;
   _meta?: MetaObject;
 };
-type ResourceLink = {
+"#;
+const MCP_TYPESCRIPT_PREAMBLE_SUFFIX: &str = r#"type ResourceLink = {
   icons?: Icon[];
   name: string;
   title?: string;
@@ -106,8 +109,10 @@ type EmbeddedResource = {
 };
 type ContentBlock =
   | TextContent
-  | ImageContent
-  | AudioContent
+  | ImageContent"#;
+const MCP_AUDIO_CONTENT_BLOCK_VARIANT: &str = r#"
+  | AudioContent"#;
+const MCP_TYPESCRIPT_PREAMBLE_END: &str = r#"
   | ResourceLink
   | EmbeddedResource;
 type CallToolResult<TStructured = { [key: string]: unknown }> = {
@@ -143,6 +148,13 @@ pub struct ToolNamespaceDescription {
     pub description: String,
 }
 
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub struct ExecToolDescriptionOptions { // switches read by `build_exec_tool_description`
+    pub code_mode_only: bool, // false: return before any nested-tool reference is rendered
+    pub deferred_tools_available: bool, // true: append `DEFERRED_NESTED_TOOLS_GUIDANCE`
+    pub supports_audio_input: bool, // true: include the `audio(...)` helper and `AudioContent`
+}
+
 #[derive(Debug, Default, Deserialize, PartialEq, Eq)]
 #[serde(deny_unknown_fields)]
 struct CodeModeExecPragma {
@@ -250,15 +262,21 @@ pub fn is_code_mode_nested_tool(tool_name: &str) -> bool {
 pub fn build_exec_tool_description(
     enabled_tools: &[ToolDefinition],
     namespace_descriptions: &BTreeMap<String, ToolNamespaceDescription>,
-    code_mode_only: bool,
-    deferred_tools_available: bool,
+    options: ExecToolDescriptionOptions,
 ) -> String {
     let mut sections = Vec::new();
-    sections.push(EXEC_DESCRIPTION_TEMPLATE.to_string());
-    if deferred_tools_available {
+    let mut exec_description = String::from(EXEC_DESCRIPTION_TEMPLATE_PREFIX);
+    if options.supports_audio_input {
+        exec_description.push('\n');
+        exec_description.push_str(AUDIO_HELPER_DESCRIPTION);
+    }
+    exec_description.push('\n');
+    exec_description.push_str(EXEC_DESCRIPTION_TEMPLATE_SUFFIX);
+    sections.push(exec_description);
+    if options.deferred_tools_available {
         sections.push(DEFERRED_NESTED_TOOLS_GUIDANCE.to_string());
     }
-    if !code_mode_only {
+    if !options.code_mode_only {
         return sections.join("\n\n");
     }
 
@@ -305,8 +323,18 @@ pub fn build_exec_tool_description(
         }
 
         if has_mcp_tools {
+            let mut mcp_typescript_preamble = String::from(MCP_TYPESCRIPT_PREAMBLE_PREFIX);
+            if options.supports_audio_input {
+                mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_TYPE);
+            }
+            mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_SUFFIX);
+            if options.supports_audio_input {
+                mcp_typescript_preamble.push_str(MCP_AUDIO_CONTENT_BLOCK_VARIANT);
+            }
+            mcp_typescript_preamble.push_str(MCP_TYPESCRIPT_PREAMBLE_END);
+
             sections.push(format!(
-                "Shared MCP Types:\n```ts\n{MCP_TYPESCRIPT_PREAMBLE}\n```"
+                "Shared MCP Types:\n```ts\n{mcp_typescript_preamble}\n```"
             ));
         }
         let nested_tool_reference = nested_tool_sections.join("\n\n");
@@ -706,6 +734,7 @@ fn render_json_schema_literal(value: &JsonValue) -> String {
 #[cfg(test)]
 mod tests {
     use super::CodeModeToolKind;
+    use super::ExecToolDescriptionOptions;
     use super::ParsedExecSource;
     use super::ToolDefinition;
     use super::ToolNamespaceDescription;
@@ -863,8 +892,11 @@ mod tests {
                 output_schema: None,
             }],
             &BTreeMap::new(),
-            /*code_mode_only*/ true,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: true,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
         );
         assert!(description.contains(
             "### `foo`
@@ -878,13 +910,41 @@ bar"
         let description = build_exec_tool_description(
             &[],
             &BTreeMap::new(),
-            /*code_mode_only*/ false,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: false,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
         );
         assert!(description.contains("`setTimeout(callback: () => void, delayMs?: number)`"));
         assert!(description.contains("`clearTimeout(timeoutId?: number)`"));
     }
 
+    // Checks that the `audio(...)` helper text appears only when `supports_audio_input` is set.
+    #[test]
+    fn exec_description_gates_audio_helper_on_audio_input_support() {
+        const AUDIO_HELPER_SIGNATURE: &str = "`audio(audioItem";
+
+        // Renders the plain (non code-mode-only) description for the given audio capability.
+        let describe = |supports_audio_input: bool| {
+            build_exec_tool_description(
+                &[],
+                &BTreeMap::new(),
+                ExecToolDescriptionOptions {
+                    code_mode_only: false,
+                    deferred_tools_available: false,
+                    supports_audio_input,
+                },
+            )
+        };
+
+        let unsupported_description = describe(false);
+        assert!(!unsupported_description.contains(AUDIO_HELPER_SIGNATURE));
+
+        let supported_description = describe(true);
+        assert!(supported_description.contains(AUDIO_HELPER_SIGNATURE));
+    }
+
     #[test]
     fn code_mode_only_description_groups_namespace_instructions_once() {
         let namespace_descriptions = BTreeMap::from([(
@@ -930,8 +990,11 @@ bar"
                 },
             ],
             &namespace_descriptions,
-            /*code_mode_only*/ true,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: true,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
         );
         assert_eq!(description.matches("## mcp__sample").count(), 1);
         assert!(description.contains("## mcp__sample\nShared namespace guidance."));
@@ -970,8 +1033,11 @@ bar"
                 }))),
             }],
             &namespace_descriptions,
-            /*code_mode_only*/ true,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: true,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
         );
 
         assert!(!description.contains("## mcp__sample"));
@@ -1069,8 +1135,11 @@ bar"
                 },
             ],
             &BTreeMap::new(),
-            /*code_mode_only*/ true,
-            /*deferred_tools_available*/ false,
+            ExecToolDescriptionOptions {
+                code_mode_only: true,
+                deferred_tools_available: false,
+                supports_audio_input: false,
+            },
         );
 
         assert_eq!(
@@ -1082,13 +1151,60 @@ bar"
         assert_eq!(description.matches("Shared MCP Types:").count(), 1);
     }
 
+    // The shared MCP type preamble must mention `AudioContent` only when
+    // `supports_audio_input` is set.
+    #[test]
+    fn code_mode_only_description_gates_mcp_audio_type_on_audio_input_support() {
+        // The same empty, closed object schema serves as input and as the wrapped output schema.
+        let empty_object_schema = || {
+            json!({
+                "type": "object",
+                "properties": {},
+                "additionalProperties": false
+            })
+        };
+
+        let tools = vec![ToolDefinition {
+            name: "mcp__sample__audio".to_string(),
+            tool_name: ToolName::namespaced("mcp__sample__", "audio"),
+            description: "Audio tool".to_string(),
+            kind: CodeModeToolKind::Function,
+            input_schema: Some(empty_object_schema()),
+            output_schema: Some(mcp_call_tool_result_schema(empty_object_schema())),
+        }];
+
+        // Renders the code-mode-only description for the given audio capability.
+        let describe = |supports_audio_input: bool| {
+            build_exec_tool_description(
+                &tools,
+                &BTreeMap::new(),
+                ExecToolDescriptionOptions {
+                    code_mode_only: true,
+                    deferred_tools_available: false,
+                    supports_audio_input,
+                },
+            )
+        };
+
+        let unsupported_description = describe(false);
+        assert!(!unsupported_description.contains("type AudioContent"));
+        assert!(!unsupported_description.contains("| AudioContent"));
+
+        let supported_description = describe(true);
+        assert!(supported_description.contains("type AudioContent"));
+        assert!(supported_description.contains("| AudioContent"));
+    }
+
     #[test]
     fn exec_description_mentions_deferred_nested_tools_when_available() {
         let description = build_exec_tool_description(
             &[],
             &BTreeMap::new(),
-            /*code_mode_only*/ false,
-            /*deferred_tools_available*/ true,
+            ExecToolDescriptionOptions {
+                code_mode_only: false,
+                deferred_tools_available: true,
+                supports_audio_input: false,
+            },
         );
 
         assert!(description.contains("Some nested MCP/app tools may be omitted"));
diff --git a/codex-rs/code-mode/src/lib.rs b/codex-rs/code-mode/src/lib.rs
index 3da8c7732592..826c19d2d951 100644
--- a/codex-rs/code-mode/src/lib.rs
+++ b/codex-rs/code-mode/src/lib.rs
@@ -5,6 +5,7 @@ mod service;
 
 pub use description::CODE_MODE_PRAGMA_PREFIX;
 pub use description::CodeModeToolKind;
+pub use description::ExecToolDescriptionOptions;
 pub use description::ToolDefinition;
 pub use description::ToolNamespaceDescription;
 pub use description::augment_tool_definition;
@@ -18,6 +19,7 @@ pub use description::render_json_schema_to_typescript;
 pub use response::DEFAULT_IMAGE_DETAIL;
 pub use response::FunctionCallOutputContentItem;
 pub use response::ImageDetail;
+pub use response::InputAudio;
 pub use runtime::CodeModeNestedToolCall;
 pub use runtime::DEFAULT_EXEC_YIELD_TIME_MS;
 pub use runtime::DEFAULT_MAX_OUTPUT_TOKENS_PER_EXEC_CALL;
diff --git a/codex-rs/code-mode/src/response.rs b/codex-rs/code-mode/src/response.rs
index 0ac3a03770e4..b6fdb83a1885 100644
--- a/codex-rs/code-mode/src/response.rs
+++ b/codex-rs/code-mode/src/response.rs
@@ -23,4 +23,13 @@ pub enum FunctionCallOutputContentItem {
         #[serde(default, skip_serializing_if = "Option::is_none")]
         detail: Option<ImageDetail>,
     },
+    InputAudio {
+        input_audio: InputAudio,
+    },
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct InputAudio { // payload carried by `FunctionCallOutputContentItem::InputAudio`
+    pub data: String, // audio payload; presumably base64 text — TODO confirm against producers
+    pub format: String, // audio format tag; accepted values are not visible here — TODO confirm
 }
diff --git a/codex-rs/code-mode/src/runtime/callbacks.rs b/codex-rs/code-mode/src/runtime/callbacks.rs
index c3a648ae3297..ae8b471b306d 100644
--- a/codex-rs/code-mode/src/runtime/callbacks.rs
+++ b/codex-rs/code-mode/src/runtime/callbacks.rs
@@ -5,6 +5,7 @@ use super::RuntimeEvent;
 use super::RuntimeState;
 use super::timers;
 use super::value::json_to_v8;
+use super::value::normalize_output_audio;
 use super::value::normalize_output_image;
 use super::value::serialize_output_text;
 use super::value::throw_type_error;
@@ -129,6 +130,26 @@ pub(super) fn image_callback(
     retval.set(v8::undefined(scope).into());
 }
 
+/// Backs the `audio(...)` global: normalizes its first argument and emits a content item.
+pub(super) fn audio_callback(
+    scope: &mut v8::PinScope<'_, '_>,
+    args: v8::FunctionCallbackArguments,
+    mut retval: v8::ReturnValue<v8::Value>,
+) {
+    let first_arg = match args.length() {
+        0 => v8::undefined(scope).into(),
+        _ => args.get(0),
+    };
+    // `normalize_output_audio` has already called `throw_type_error` when it fails.
+    let Ok(content_item) = normalize_output_audio(scope, first_arg) else {
+        return;
+    };
+    if let Some(runtime) = scope.get_slot::<RuntimeState>() {
+        let _ = runtime.event_tx.send(RuntimeEvent::ContentItem(content_item));
+    }
+    retval.set(v8::undefined(scope).into());
+}
+
 pub(super) fn store_callback(
     scope: &mut v8::PinScope<'_, '_>,
     args: v8::FunctionCallbackArguments,
diff --git a/codex-rs/code-mode/src/runtime/globals.rs b/codex-rs/code-mode/src/runtime/globals.rs
index 2ec6953f093b..375dbf80d517 100644
--- a/codex-rs/code-mode/src/runtime/globals.rs
+++ b/codex-rs/code-mode/src/runtime/globals.rs
@@ -1,4 +1,5 @@
 use super::RuntimeState;
+use super::callbacks::audio_callback;
 use super::callbacks::clear_timeout_callback;
 use super::callbacks::exit_callback;
 use super::callbacks::image_callback;
@@ -23,6 +24,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St
     let set_timeout = helper_function(scope, "setTimeout", set_timeout_callback)?;
     let text = helper_function(scope, "text", text_callback)?;
     let image = helper_function(scope, "image", image_callback)?;
+    let audio = helper_function(scope, "audio", audio_callback)?;
     let store = helper_function(scope, "store", store_callback)?;
     let load = helper_function(scope, "load", load_callback)?;
     let notify = helper_function(scope, "notify", notify_callback)?;
@@ -35,6 +37,7 @@ pub(super) fn install_globals(scope: &mut v8::PinScope<'_, '_>) -> Result<(), St
     set_global(scope, global, "setTimeout", set_timeout.into())?;
     set_global(scope, global, "text", text.into())?;
     set_global(scope, global, "image", image.into())?;
+    set_global(scope, global, "audio", audio.into())?;
     set_global(scope, global, "store", store.into())?;
     set_global(scope, global, "load", load.into())?;
     set_global(scope, global, "notify", notify.into())?;
diff --git a/codex-rs/code-mode/src/runtime/value.rs b/codex-rs/code-mode/src/runtime/value.rs
index 8d76a832d365..57e4985c7628 100644
--- a/codex-rs/code-mode/src/runtime/value.rs
+++ b/codex-rs/code-mode/src/runtime/value.rs
@@ -3,8 +3,10 @@ use serde_json::Value as JsonValue;
 use crate::response::DEFAULT_IMAGE_DETAIL;
 use crate::response::FunctionCallOutputContentItem;
 use crate::response::ImageDetail;
+use crate::response::InputAudio;
 
 const IMAGE_HELPER_EXPECTS_MESSAGE: &str = "image expects a non-empty image URL string, an object with image_url and optional detail, or a raw MCP image block";
+const AUDIO_HELPER_EXPECTS_MESSAGE: &str = "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block";
 const CODEX_IMAGE_DETAIL_META_KEY: &str = "codex/imageDetail";
 
 pub(super) fn serialize_output_text(
@@ -97,6 +99,35 @@ pub(super) fn normalize_output_image(
     }
 }
 
+/// Turns the value passed to `audio(...)` into an `InputAudio` content item.
+///
+/// On invalid input the message is handed to `throw_type_error` and `Err(())` is returned.
+pub(super) fn normalize_output_audio(
+    scope: &mut v8::PinScope<'_, '_>,
+    value: v8::Local<'_, v8::Value>,
+) -> Result<FunctionCallOutputContentItem, ()> {
+    let parsed = (|| -> Result<InputAudio, String> {
+        // Arrays are rejected up front, together with every non-object value.
+        if value.is_array() || !value.is_object() {
+            return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
+        }
+        let object = v8::Local::<v8::Object>::try_from(value)
+            .map_err(|_| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())?;
+        // Try the plain-object shape first, then fall back to the MCP audio block parser.
+        match parse_non_mcp_output_audio(scope, object)? {
+            Some(input_audio) => Ok(input_audio),
+            None => parse_mcp_output_audio(scope, value),
+        }
+    })();
+
+    // Wrap success in the content item; report failure through `throw_type_error`.
+    parsed
+        .map(|input_audio| FunctionCallOutputContentItem::InputAudio { input_audio })
+        .map_err(|error_text| {
+            throw_type_error(scope, &error_text);
+        })
+}
+
 fn parse_non_mcp_output_image(
     scope: &mut v8::PinScope<'_, '_>,
     object: v8::Local<'_, v8::Object>,
@@ -165,6 +196,90 @@ fn parse_mcp_output_image(
     Ok((image_url, detail))
 }
 
+/// Parses the explicit `{ data, format?, mimeType?/mime_type? }` helper shape.
+/// Yields `Ok(None)` when `data` is missing or `undefined` so that the caller can
+/// try the raw MCP block shape instead; a non-string `data` is an error.
+fn parse_non_mcp_output_audio(
+    scope: &mut v8::PinScope<'_, '_>,
+    object: v8::Local<'_, v8::Object>,
+) -> Result<Option<InputAudio>, String> {
+    let key = v8::String::new(scope, "data").ok_or("failed to allocate audio helper keys")?;
+    let raw = match object.get(scope, key.into()) {
+        Some(found) if !found.is_undefined() => found,
+        _ => return Ok(None),
+    };
+    if !raw.is_string() {
+        return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
+    }
+    let payload = raw.to_rust_string_lossy(scope);
+    let format = optional_string_property(scope, object, "format")?;
+    let camel_case = optional_string_property(scope, object, "mimeType")?;
+    let snake_case = optional_string_property(scope, object, "mime_type")?;
+    let mime_type = camel_case.or(snake_case);
+    let converted = codex_protocol::models::input_audio_from_data(
+        &payload,
+        format.as_deref(),
+        mime_type.as_deref(),
+    );
+    let audio = converted.ok_or_else(|| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())?;
+    Ok(Some(InputAudio {
+        data: audio.data,
+        format: audio.format,
+    }))
+}
+
+/// Parses a raw MCP audio content block such as
+/// `{ type: "audio", data: "...", mimeType: "audio/wav" }`.
+///
+/// The value is converted to JSON first. Both `mimeType` and `mime_type` are
+/// accepted and the first one holding a string wins, so a `null` `mimeType`
+/// does not hide a usable `mime_type`. No explicit format is passed on.
+fn parse_mcp_output_audio(
+    scope: &mut v8::PinScope<'_, '_>,
+    value: v8::Local<'_, v8::Value>,
+) -> Result<InputAudio, String> {
+    let Some(JsonValue::Object(block)) = v8_value_to_json(scope, value)? else {
+        return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string());
+    };
+    let string_field = |name: &str| block.get(name).and_then(JsonValue::as_str);
+
+    // A foreign block type is reported by name; a missing one gets the usage message.
+    match string_field("type") {
+        Some("audio") => {}
+        Some(other) => {
+            return Err(format!(
+                "audio only accepts MCP audio blocks, got \"{other}\""
+            ));
+        }
+        None => return Err(AUDIO_HELPER_EXPECTS_MESSAGE.to_string()),
+    }
+    let data = string_field("data").ok_or_else(|| "audio expected MCP audio data".to_string())?;
+    // Check each spelling for a string independently before falling back.
+    let mime_type = string_field("mimeType").or_else(|| string_field("mime_type"));
+    let input_audio =
+        codex_protocol::models::input_audio_from_data(data, /*format*/ None, mime_type)
+            .ok_or_else(|| AUDIO_HELPER_EXPECTS_MESSAGE.to_string())?;
+    Ok(InputAudio {
+        data: input_audio.data,
+        format: input_audio.format,
+    })
+}
+
+fn optional_string_property(
+    scope: &mut v8::PinScope<'_, '_>,
+    object: v8::Local<'_, v8::Object>,
+    name: &str,
+) -> Result<Option<String>, String> {
+    let key = v8::String::new(scope, name)
+        .ok_or_else(|| "failed to allocate audio helper keys".to_string())?;
+    match object.get(scope, key.into()) {
+        Some(value) if value.is_string() => Ok(Some(value.to_rust_string_lossy(scope))),
+        Some(value) if value.is_null() || value.is_undefined() => Ok(None),
+        Some(_) => Err(format!("{name} must be a string when provided")),
+        None => Ok(None),
+    }
+}
+
 fn parse_image_detail_value<'s>(
     scope: &mut v8::PinScope<'s, '_>,
     value: Option<v8::Local<'s, v8::Value>>,
diff --git a/codex-rs/code-mode/src/service.rs b/codex-rs/code-mode/src/service.rs
index 44e4be49396d..43a59aee98a5 100644
--- a/codex-rs/code-mode/src/service.rs
+++ b/codex-rs/code-mode/src/service.rs
@@ -703,6 +703,7 @@ mod tests {
     use super::run_session_control;
     use crate::CodeModeToolKind;
     use crate::FunctionCallOutputContentItem;
+    use crate::InputAudio;
     use crate::ToolDefinition;
     use crate::runtime::ExecuteRequest;
     use crate::runtime::ExecuteToPendingOutcome;
@@ -1230,6 +1231,7 @@ text(formatter.format(new Date("2025-01-02T03:04:05Z")));
 const returnsUndefined = [
   text("first"),
   image("https://example.com/image.jpg"),
+  audio({ data: "BASE64", format: "wav" }),
   notify("ping"),
 ].map((value) => value === undefined);
 text(JSON.stringify(returnsUndefined));
@@ -1253,8 +1255,14 @@ text(JSON.stringify(returnsUndefined));
                         image_url: "https://example.com/image.jpg".to_string(),
                         detail: Some(crate::DEFAULT_IMAGE_DETAIL),
                     },
+                    FunctionCallOutputContentItem::InputAudio {
+                        input_audio: InputAudio {
+                            data: "BASE64".to_string(),
+                            format: "wav".to_string(),
+                        },
+                    },
                     FunctionCallOutputContentItem::InputText {
-                        text: "[true,true,true]".to_string(),
+                        text: "[true,true,true,true]".to_string(),
                     },
                 ],
                 stored_values: HashMap::new(),
@@ -1411,6 +1419,147 @@ image({
         );
     }
 
+    /// An explicit `{ data, format }` object is emitted as one `InputAudio` item
+    /// that carries the same data and format.
+    #[tokio::test]
+    async fn audio_helper_accepts_explicit_object() {
+        let service = CodeModeService::new();
+
+        let request = ExecuteRequest {
+            source: r#"audio({ data: "BASE64", format: "wav" });"#.to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: vec![FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "BASE64".to_string(),
+                    format: "wav".to_string(),
+                },
+            }],
+            stored_values: HashMap::new(),
+            error_text: None,
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
+    /// A `data:` URL has its prefix stripped and its format derived from the MIME
+    /// type (`audio/mpeg` yields `mp3`).
+    #[tokio::test]
+    async fn audio_helper_strips_data_url_and_derives_format() {
+        let service = CodeModeService::new();
+
+        let request = ExecuteRequest {
+            source: r#"audio({ data: "data:audio/mpeg;base64,BASE64" });"#.to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: vec![FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "BASE64".to_string(),
+                    format: "mp3".to_string(),
+                },
+            }],
+            stored_values: HashMap::new(),
+            error_text: None,
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
+    /// A raw MCP audio block (`type: "audio"` plus `mimeType`) is accepted and its
+    /// format is taken from the MIME type (`audio/ogg` yields `ogg`).
+    #[tokio::test]
+    async fn audio_helper_accepts_raw_mcp_audio_block() {
+        let service = CodeModeService::new();
+
+        let request = ExecuteRequest {
+            source: r#"audio({ type: "audio", data: "BASE64", mimeType: "audio/ogg" });"#
+                .to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: vec![FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "BASE64".to_string(),
+                    format: "ogg".to_string(),
+                },
+            }],
+            stored_values: HashMap::new(),
+            error_text: None,
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
+    /// A bare string argument is rejected: no items are emitted and the usage
+    /// message is reported as the error text.
+    #[tokio::test]
+    async fn audio_helper_rejects_bare_string() {
+        let service = CodeModeService::new();
+
+        let request = ExecuteRequest {
+            source: r#"audio("BASE64");"#.to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: Vec::new(),
+            stored_values: HashMap::new(),
+            error_text: Some(
+                "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(),
+            ),
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
+    /// `application/octet-stream` is not an accepted MIME type: the usage message
+    /// is reported as the error text and no items are emitted.
+    #[tokio::test]
+    async fn audio_helper_rejects_non_audio_mime_type() {
+        let service = CodeModeService::new();
+
+        let request = ExecuteRequest {
+            source: r#"audio({ data: "BASE64", mimeType: "application/octet-stream" });"#
+                .to_string(),
+            yield_time_ms: None,
+            ..execute_request("")
+        };
+
+        let expected = RuntimeResponse::Result {
+            cell_id: "1".to_string(),
+            content_items: Vec::new(),
+            stored_values: HashMap::new(),
+            error_text: Some(
+                "audio expects an object with non-empty data and format/mimeType/mime_type, or a raw MCP audio block".to_string(),
+            ),
+        };
+
+        let response = service.execute(request).await.unwrap();
+
+        assert_eq!(response, expected);
+    }
+
     #[tokio::test]
     async fn wait_reports_missing_cell_separately_from_runtime_results() {
         let service = CodeModeService::new();
diff --git a/codex-rs/core/src/context_manager/history.rs b/codex-rs/core/src/context_manager/history.rs
index 80c057e0eb1d..301bf20cd71a 100644
--- a/codex-rs/core/src/context_manager/history.rs
+++ b/codex-rs/core/src/context_manager/history.rs
@@ -113,9 +113,8 @@ impl ContextManager {
     }
 
     /// Returns the history prepared for sending to the model. This applies a proper
-    /// normalization and drops un-suited items. When `input_modalities` does not
-    /// include `InputModality::Image`, images are stripped from messages and tool
-    /// outputs.
+    /// normalization and drops un-suited items. Unsupported media content is
+    /// stripped from messages and tool outputs according to `input_modalities`.
     pub(crate) fn for_prompt(mut self, input_modalities: &[InputModality]) -> Vec<ResponseItem> {
         self.normalize_history(input_modalities);
         self.items
@@ -365,8 +364,8 @@ impl ContextManager {
         // all outputs must have a corresponding function/tool call
         normalize::remove_orphan_outputs(&mut self.items);
 
-        // strip images when model does not support them
-        normalize::strip_images_when_unsupported(input_modalities, &mut self.items);
+        // strip unsupported media content before sending history to the model
+        normalize::strip_unsupported_media_content(input_modalities, &mut self.items);
     }
 
     fn process_item(&self, item: &ResponseItem, policy: TruncationPolicy) -> ResponseItem {
diff --git a/codex-rs/core/src/context_manager/history_tests.rs b/codex-rs/core/src/context_manager/history_tests.rs
index 74f4d29bfb4e..df6c09a395d3 100644
--- a/codex-rs/core/src/context_manager/history_tests.rs
+++ b/codex-rs/core/src/context_manager/history_tests.rs
@@ -10,6 +10,7 @@ use codex_protocol::models::FunctionCallOutputBody;
 use codex_protocol::models::FunctionCallOutputContentItem;
 use codex_protocol::models::FunctionCallOutputPayload;
 use codex_protocol::models::ImageDetail;
+use codex_protocol::models::InputAudio;
 use codex_protocol::models::LocalShellAction;
 use codex_protocol::models::LocalShellExecAction;
 use codex_protocol::models::LocalShellStatus;
@@ -513,6 +514,85 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
     }
 }
 
+/// `for_prompt` keeps audio tool output only when the supplied input modalities
+/// include `InputModality::Audio`.
+///
+/// Otherwise the audio item is swapped for a text placeholder; the text item next to it stays.
+#[test]
+fn for_prompt_strips_audio_when_model_does_not_support_audio() {
+    let history = create_history_with_items(vec![
+        ResponseItem::FunctionCall {
+            id: None,
+            name: "audio_tool".to_string(),
+            namespace: None,
+            arguments: "{}".to_string(),
+            call_id: "call-1".to_string(),
+        },
+        ResponseItem::FunctionCallOutput {
+            call_id: "call-1".to_string(),
+            output: FunctionCallOutputPayload::from_content_items(vec![
+                FunctionCallOutputContentItem::InputText {
+                    text: "audio result".to_string(),
+                },
+                FunctionCallOutputContentItem::InputAudio {
+                    input_audio: InputAudio {
+                        data: "UklGRg==".to_string(),
+                        format: "wav".to_string(),
+                    },
+                },
+            ]),
+        },
+    ]);
+
+    // Without audio support the audio item becomes a text placeholder.
+    let expected_stripped = vec![
+        ResponseItem::FunctionCall {
+            id: None,
+            name: "audio_tool".to_string(),
+            namespace: None,
+            arguments: "{}".to_string(),
+            call_id: "call-1".to_string(),
+        },
+        ResponseItem::FunctionCallOutput {
+            call_id: "call-1".to_string(),
+            output: FunctionCallOutputPayload::from_content_items(vec![
+                FunctionCallOutputContentItem::InputText {
+                    text: "audio result".to_string(),
+                },
+                FunctionCallOutputContentItem::InputText {
+                    text: "audio content omitted because you do not support audio input"
+                        .to_string(),
+                },
+            ]),
+        },
+    ];
+    let stripped = history.clone().for_prompt(&default_input_modalities());
+    assert_eq!(stripped, expected_stripped);
+
+    // With audio support the tool output is returned unchanged.
+    let expected_with_audio = ResponseItem::FunctionCallOutput {
+        call_id: "call-1".to_string(),
+        output: FunctionCallOutputPayload::from_content_items(vec![
+            FunctionCallOutputContentItem::InputText {
+                text: "audio result".to_string(),
+            },
+            FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "UklGRg==".to_string(),
+                    format: "wav".to_string(),
+                },
+            },
+        ]),
+    };
+    let audio_modalities = [
+        InputModality::Text,
+        InputModality::Image,
+        InputModality::Audio,
+    ];
+    let with_audio = history.for_prompt(&audio_modalities);
+    assert_eq!(with_audio[1], expected_with_audio);
+}
+
 #[test]
 fn for_prompt_preserves_image_generation_calls_when_images_are_supported() {
     let history = create_history_with_items(vec![
diff --git a/codex-rs/core/src/context_manager/normalize.rs b/codex-rs/core/src/context_manager/normalize.rs
index 839bae331ed2..d9cb5423ea38 100644
--- a/codex-rs/core/src/context_manager/normalize.rs
+++ b/codex-rs/core/src/context_manager/normalize.rs
@@ -10,6 +10,8 @@ use tracing::info;
 
 const IMAGE_CONTENT_OMITTED_PLACEHOLDER: &str =
     "image content omitted because you do not support image input";
+const AUDIO_CONTENT_OMITTED_PLACEHOLDER: &str =
+    "audio content omitted because you do not support audio input";
 
 pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
     // Collect synthetic outputs to insert immediately after their calls.
@@ -290,14 +292,14 @@ where
     }
 }
 
-/// Strip image content from messages and tool outputs when the model does not support images.
-/// When `input_modalities` contains `InputModality::Image`, no stripping is performed.
-pub(crate) fn strip_images_when_unsupported(
+/// Strip unsupported media content from messages and tool outputs.
+pub(crate) fn strip_unsupported_media_content(
     input_modalities: &[InputModality],
     items: &mut [ResponseItem],
 ) {
     let supports_images = input_modalities.contains(&InputModality::Image);
-    if supports_images {
+    let supports_audio = input_modalities.contains(&InputModality::Audio);
+    if supports_images && supports_audio {
         return;
     }
 
@@ -307,7 +309,7 @@ pub(crate) fn strip_images_when_unsupported(
                 let mut normalized_content = Vec::with_capacity(content.len());
                 for content_item in content.iter() {
                     match content_item {
-                        ContentItem::InputImage { .. } => {
+                        ContentItem::InputImage { .. } if !supports_images => {
                             normalized_content.push(ContentItem::InputText {
                                 text: IMAGE_CONTENT_OMITTED_PLACEHOLDER.to_string(),
                             });
@@ -323,20 +325,29 @@ pub(crate) fn strip_images_when_unsupported(
                     let mut normalized_content_items = Vec::with_capacity(content_items.len());
                     for content_item in content_items.iter() {
                         match content_item {
-                            FunctionCallOutputContentItem::InputImage { .. } => {
+                            FunctionCallOutputContentItem::InputImage { .. }
+                                if !supports_images =>
+                            {
                                 normalized_content_items.push(
                                     FunctionCallOutputContentItem::InputText {
                                         text: IMAGE_CONTENT_OMITTED_PLACEHOLDER.to_string(),
                                     },
                                 );
                             }
+                            FunctionCallOutputContentItem::InputAudio { .. } if !supports_audio => {
+                                normalized_content_items.push(
+                                    FunctionCallOutputContentItem::InputText {
+                                        text: AUDIO_CONTENT_OMITTED_PLACEHOLDER.to_string(),
+                                    },
+                                );
+                            }
                             _ => normalized_content_items.push(content_item.clone()),
                         }
                     }
                     *content_items = normalized_content_items;
                 }
             }
-            ResponseItem::ImageGenerationCall { result, .. } => {
+            ResponseItem::ImageGenerationCall { result, .. } if !supports_images => {
                 result.clear();
             }
             _ => {}
diff --git a/codex-rs/core/src/mcp_tool_call.rs b/codex-rs/core/src/mcp_tool_call.rs
index fc8ce4d8ca32..ead0e933a7b4 100644
--- a/codex-rs/core/src/mcp_tool_call.rs
+++ b/codex-rs/core/src/mcp_tool_call.rs
@@ -589,13 +589,8 @@ async fn execute_mcp_tool_call(
         )
         .await
         .map_err(|e| format!("tool call error: {e:?}"))?;
-    let result = sanitize_mcp_tool_result_for_model(
-        turn_context
-            .model_info
-            .input_modalities
-            .contains(&InputModality::Image),
-        Ok(result),
-    )?;
+    let result =
+        sanitize_mcp_tool_result_for_model(&turn_context.model_info.input_modalities, Ok(result))?;
     Ok(maybe_request_codex_apps_auth_elicitation(
         sess,
         turn_context,
@@ -783,36 +778,61 @@ async fn maybe_mark_thread_memory_mode_polluted(
 }
 
 fn sanitize_mcp_tool_result_for_model(
-    supports_image_input: bool,
+    input_modalities: &[InputModality],
     result: Result<CallToolResult, String>,
 ) -> Result<CallToolResult, String> {
-    if supports_image_input {
-        return result;
-    }
+    let supports_image_input = input_modalities.contains(&InputModality::Image);
+    let supports_audio_input = input_modalities.contains(&InputModality::Audio);
+    // Audio the model cannot take fails the call (unless structured content exists).
+    result.and_then(|call_tool_result| {
+        if !supports_audio_input
+            && !has_non_null_structured_content(&call_tool_result)
+            && call_tool_result
+                .content
+                .iter()
+                .any(|block| block.get("type").and_then(serde_json::Value::as_str) == Some("audio"))
+        {
+            return Err(
+                "audio content returned by MCP tool but the selected model does not support audio input"
+                    .to_string(),
+            );
+        }
 
-    result.map(|call_tool_result| CallToolResult {
-        content: call_tool_result
-            .content
-            .iter()
-            .map(|block| {
-                if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str)
-                    && content_type == "image"
-                {
-                    return serde_json::json!({
-                        "type": "text",
-                        "text": "<image content omitted because you do not support image input>",
-                    });
-                }
+        if supports_image_input {
+            return Ok(call_tool_result);
+        }
 
-                block.clone()
-            })
-            .collect::<Vec<_>>(),
-        structured_content: call_tool_result.structured_content,
-        is_error: call_tool_result.is_error,
-        meta: call_tool_result.meta,
+        Ok(CallToolResult {
+            content: call_tool_result
+                .content
+                .iter()
+                .map(|block| {
+                    if let Some(content_type) = block.get("type").and_then(serde_json::Value::as_str)
+                        && content_type == "image"
+                    {
+                        return serde_json::json!({
+                            "type": "text",
+                            "text": "<image content omitted because you do not support image input>",
+                        });
+                    }
+                    // Any block that is not an image is forwarded unchanged.
+                    block.clone()
+                })
+                .collect::<Vec<_>>(),
+            structured_content: call_tool_result.structured_content,
+            is_error: call_tool_result.is_error,
+            meta: call_tool_result.meta,
+        })
     })
 }
 
+/// Returns `true` when the result has structured content that is not JSON `null`;
+/// an absent value and an explicit `null` both count as "no structured content".
+fn has_non_null_structured_content(call_tool_result: &CallToolResult) -> bool {
+    let structured_content = call_tool_result.structured_content.as_ref();
+    structured_content.is_some_and(|content| !content.is_null())
+}
+
 fn truncate_mcp_tool_result_for_event(
     result: &Result<CallToolResult, String>,
 ) -> Result<CallToolResult, String> {
diff --git a/codex-rs/core/src/mcp_tool_call_tests.rs b/codex-rs/core/src/mcp_tool_call_tests.rs
index d8b6326cd0b9..dbbef3cf8643 100644
--- a/codex-rs/core/src/mcp_tool_call_tests.rs
+++ b/codex-rs/core/src/mcp_tool_call_tests.rs
@@ -924,7 +924,7 @@ fn sanitize_mcp_tool_result_for_model_rewrites_image_content() {
         meta: None,
     });
 
-    let got = sanitize_mcp_tool_result_for_model(/*supports_image_input*/ false, result)
+    let got = sanitize_mcp_tool_result_for_model(&[InputModality::Text], result)
         .expect("sanitized result");
 
     assert_eq!(
@@ -956,7 +956,7 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() {
     };
 
     let got = sanitize_mcp_tool_result_for_model(
-        /*supports_image_input*/ true,
+        &[InputModality::Text, InputModality::Image],
         Ok(original.clone()),
     )
     .expect("unsanitized result");
@@ -964,6 +964,73 @@ fn sanitize_mcp_tool_result_for_model_preserves_image_when_supported() {
     assert_eq!(got, original);
 }
 
+#[test]
+fn sanitize_mcp_tool_result_for_model_rejects_audio_when_unsupported() {
+    let result = Ok(CallToolResult {
+        content: vec![serde_json::json!({
+            "type": "audio",
+            "data": "UklGRg==",
+            "mimeType": "audio/wav",
+        })],
+        structured_content: None,
+        is_error: Some(false),
+        meta: None,
+    });
+
+    let err = sanitize_mcp_tool_result_for_model(&[InputModality::Text], result)
+        .expect_err("unsupported audio should fail");
+
+    assert_eq!(
+        err,
+        "audio content returned by MCP tool but the selected model does not support audio input"
+    );
+}
+
+/// With audio among the input modalities the result, `meta` included, is left as is.
+#[test]
+fn sanitize_mcp_tool_result_for_model_preserves_audio_when_supported() {
+    let audio_block = serde_json::json!({
+        "type": "audio",
+        "data": "UklGRg==",
+        "mimeType": "audio/wav",
+    });
+    let original = CallToolResult {
+        content: vec![audio_block],
+        structured_content: None,
+        is_error: Some(false),
+        meta: Some(serde_json::json!({"k": "v"})),
+    };
+    let modalities = [
+        InputModality::Text,
+        InputModality::Image,
+        InputModality::Audio,
+    ];
+
+    let got = sanitize_mcp_tool_result_for_model(&modalities, Ok(original.clone()))
+        .expect("supported audio should remain unchanged");
+
+    assert_eq!(got, original);
+}
+
+/// Audio blocks are tolerated for a text-only model when structured content is present.
+#[test]
+fn sanitize_mcp_tool_result_for_model_lets_structured_content_take_precedence_over_audio() {
+    let audio_block = serde_json::json!({
+        "type": "audio",
+        "data": "UklGRg==",
+        "mimeType": "audio/wav",
+    });
+    let original = CallToolResult {
+        content: vec![audio_block],
+        structured_content: Some(serde_json::json!({"answer": "structured"})),
+        is_error: Some(false),
+        meta: None,
+    };
+
+    let outcome = sanitize_mcp_tool_result_for_model(&[InputModality::Text], Ok(original.clone()));
+    assert_eq!(outcome.expect("structured content should take precedence"), original);
+}
+
 #[test]
 fn truncate_mcp_tool_result_for_event_preserves_small_result() {
     let original = CallToolResult {
diff --git a/codex-rs/core/src/tools/code_mode/execute_spec.rs b/codex-rs/core/src/tools/code_mode/execute_spec.rs
index 0a858bd2060e..fcad41e01410 100644
--- a/codex-rs/core/src/tools/code_mode/execute_spec.rs
+++ b/codex-rs/core/src/tools/code_mode/execute_spec.rs
@@ -7,8 +7,7 @@ use std::collections::BTreeMap;
 pub(crate) fn create_code_mode_tool(
     enabled_tools: &[CodeModeToolDefinition],
     namespace_descriptions: &BTreeMap<String, codex_code_mode::ToolNamespaceDescription>,
-    code_mode_only: bool,
-    deferred_tools_available: bool,
+    options: codex_code_mode::ExecToolDescriptionOptions,
 ) -> ToolSpec {
     const CODE_MODE_FREEFORM_GRAMMAR: &str = r#"
 start: pragma_source | plain_source
@@ -25,8 +24,7 @@ SOURCE: /[\s\S]+/
         description: codex_code_mode::build_exec_tool_description(
             enabled_tools,
             namespace_descriptions,
-            code_mode_only,
-            deferred_tools_available,
+            options,
         ),
         format: FreeformToolFormat {
             r#type: "grammar".to_string(),
@@ -57,16 +55,22 @@ mod tests {
             create_code_mode_tool(
                 &enabled_tools,
                 &BTreeMap::new(),
-                /*code_mode_only*/ true,
-                /*deferred_tools_available*/ false,
+                codex_code_mode::ExecToolDescriptionOptions {
+                    code_mode_only: true,
+                    deferred_tools_available: false,
+                    supports_audio_input: false,
+                },
             ),
             ToolSpec::Freeform(FreeformTool {
                 name: codex_code_mode::PUBLIC_TOOL_NAME.to_string(),
                 description: codex_code_mode::build_exec_tool_description(
                     &enabled_tools,
                     &BTreeMap::new(),
-                    /*code_mode_only*/ true,
-                    /*deferred_tools_available*/ false
+                    codex_code_mode::ExecToolDescriptionOptions {
+                        code_mode_only: true,
+                        deferred_tools_available: false,
+                        supports_audio_input: false,
+                    }
                 ),
                 format: FreeformToolFormat {
                     r#type: "grammar".to_string(),
diff --git a/codex-rs/core/src/tools/code_mode/mod.rs b/codex-rs/core/src/tools/code_mode/mod.rs
index ff9f8c889384..c4340af29231 100644
--- a/codex-rs/core/src/tools/code_mode/mod.rs
+++ b/codex-rs/core/src/tools/code_mode/mod.rs
@@ -14,6 +14,7 @@ use codex_code_mode::RuntimeResponse;
 use codex_protocol::models::FunctionCallOutputContentItem;
 use codex_protocol::models::FunctionCallOutputPayload;
 use codex_protocol::models::ResponseInputItem;
+use codex_protocol::openai_models::InputModality;
 use serde_json::Value as JsonValue;
 use tokio_util::sync::CancellationToken;
 
@@ -168,6 +169,9 @@ pub(super) async fn handle_runtime_response(
     match response {
         RuntimeResponse::Yielded { content_items, .. } => {
             let mut content_items = into_function_call_output_content_items(content_items);
+            if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
+                return Ok(output);
+            }
             sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
             content_items = truncate_code_mode_result(content_items, max_output_tokens);
             prepend_script_status(&mut content_items, &script_status, started_at.elapsed());
@@ -175,6 +179,9 @@ pub(super) async fn handle_runtime_response(
         }
         RuntimeResponse::Terminated { content_items, .. } => {
             let mut content_items = into_function_call_output_content_items(content_items);
+            if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
+                return Ok(output);
+            }
             sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
             content_items = truncate_code_mode_result(content_items, max_output_tokens);
             prepend_script_status(&mut content_items, &script_status, started_at.elapsed());
@@ -187,12 +194,15 @@ pub(super) async fn handle_runtime_response(
             ..
         } => {
             let mut content_items = into_function_call_output_content_items(content_items);
-            sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
             exec.session
                 .services
                 .code_mode_service
                 .replace_stored_values(stored_values)
                 .await;
+            if let Some(output) = unsupported_audio_output(exec.turn.as_ref(), &content_items) {
+                return Ok(output);
+            }
+            sanitize_runtime_image_detail(exec.turn.as_ref(), &mut content_items);
             let success = error_text.is_none();
             if let Some(error_text) = error_text {
                 content_items.push(FunctionCallOutputContentItem::InputText {
@@ -209,6 +219,29 @@ pub(super) async fn handle_runtime_response(
     }
 }
 
+/// Returns the replacement output for code-mode results that contain audio when the
+/// model's input modalities lack `InputModality::Audio`; otherwise returns `None`.
+fn unsupported_audio_output(
+    turn: &TurnContext,
+    items: &[FunctionCallOutputContentItem],
+) -> Option<FunctionToolOutput> {
+    let modalities = &turn.model_info.input_modalities;
+    if modalities.contains(&InputModality::Audio) {
+        return None;
+    }
+
+    let emitted_audio = items
+        .iter()
+        .any(|item| matches!(item, FunctionCallOutputContentItem::InputAudio { .. }));
+    emitted_audio.then(|| {
+        FunctionToolOutput::from_text(
+            "audio content emitted by code mode but the selected model does not support audio input"
+                .to_string(),
+            Some(false),
+        )
+    })
+}
+
 fn sanitize_runtime_image_detail(turn: &TurnContext, items: &mut [FunctionCallOutputContentItem]) {
     sanitize_image_detail_items(can_request_original_image_detail(&turn.model_info), items);
 }
diff --git a/codex-rs/core/src/tools/code_mode/response_adapter.rs b/codex-rs/core/src/tools/code_mode/response_adapter.rs
index e20cf6a0713e..f964dab900ba 100644
--- a/codex-rs/core/src/tools/code_mode/response_adapter.rs
+++ b/codex-rs/core/src/tools/code_mode/response_adapter.rs
@@ -42,6 +42,14 @@ impl IntoProtocol<FunctionCallOutputContentItem>
                         .or(Some(DEFAULT_IMAGE_DETAIL)),
                 }
             }
+            codex_code_mode::FunctionCallOutputContentItem::InputAudio { input_audio } => {
+                FunctionCallOutputContentItem::InputAudio {
+                    input_audio: codex_protocol::models::InputAudio {
+                        data: input_audio.data,
+                        format: input_audio.format,
+                    },
+                }
+            }
         }
     }
 }
diff --git a/codex-rs/core/src/tools/spec_plan.rs b/codex-rs/core/src/tools/spec_plan.rs
index 24f59a0f04e8..d4d1835f9438 100644
--- a/codex-rs/core/src/tools/spec_plan.rs
+++ b/codex-rs/core/src/tools/spec_plan.rs
@@ -177,8 +177,11 @@ fn build_code_mode_executors(
             create_code_mode_tool(
                 &enabled_tools,
                 &namespace_descriptions,
-                config.code_mode_only_enabled,
-                deferred_tools_available,
+                codex_code_mode::ExecToolDescriptionOptions {
+                    code_mode_only: config.code_mode_only_enabled,
+                    deferred_tools_available,
+                    supports_audio_input: config.supports_audio_input,
+                },
             ),
             code_mode_nested_tool_specs,
         )),
diff --git a/codex-rs/core/src/tools/spec_plan_tests.rs b/codex-rs/core/src/tools/spec_plan_tests.rs
index dd690b5631c0..b28a170c14a5 100644
--- a/codex-rs/core/src/tools/spec_plan_tests.rs
+++ b/codex-rs/core/src/tools/spec_plan_tests.rs
@@ -2350,6 +2350,70 @@ fn code_mode_exec_description_omits_nested_tool_details_when_not_code_mode_only(
     assert!(!description.contains("### `view_image`"));
 }
 
+#[test]
+fn code_mode_exec_audio_helper_docs_require_audio_input_support() {
+    let unsupported_model_info = model_info();
+    let mut supported_model_info = unsupported_model_info.clone(); // differs only in modalities
+    supported_model_info.input_modalities = vec![
+        InputModality::Text,
+        InputModality::Image,
+        InputModality::Audio,
+    ];
+    let mut features = Features::with_defaults();
+    features.enable(Feature::CodeMode);
+    let available_models = Vec::new();
+    let unsupported_tools_config = ToolsConfig::new(&ToolsConfigParams {
+        model_info: &unsupported_model_info,
+        available_models: &available_models,
+        features: &features,
+        image_generation_tool_auth_allowed: true,
+        web_search_mode: Some(WebSearchMode::Cached),
+        session_source: SessionSource::Cli,
+        permission_profile: &PermissionProfile::Disabled,
+        windows_sandbox_level: WindowsSandboxLevel::Disabled,
+    });
+    let supported_tools_config = ToolsConfig::new(&ToolsConfigParams {
+        model_info: &supported_model_info,
+        available_models: &available_models,
+        features: &features,
+        image_generation_tool_auth_allowed: true,
+        web_search_mode: Some(WebSearchMode::Cached),
+        session_source: SessionSource::Cli,
+        permission_profile: &PermissionProfile::Disabled,
+        windows_sandbox_level: WindowsSandboxLevel::Disabled,
+    });
+    // Unmodified `model_info()`: the `exec` description must not mention the audio helper.
+    let (unsupported_tools, _) = build_specs(
+        &unsupported_tools_config,
+        /*mcp_tools*/ None,
+        /*deferred_mcp_tools*/ None,
+        &[],
+    );
+    let ToolSpec::Freeform(FreeformTool {
+        description: unsupported_description,
+        ..
+    }) = find_tool(&unsupported_tools, "exec")
+    else {
+        panic!("expected freeform tool");
+    };
+    assert!(!unsupported_description.contains("`audio(audioItem"));
+    // With `InputModality::Audio` added, the same description must document the helper.
+    let (supported_tools, _) = build_specs(
+        &supported_tools_config,
+        /*mcp_tools*/ None,
+        /*deferred_mcp_tools*/ None,
+        &[],
+    );
+    let ToolSpec::Freeform(FreeformTool {
+        description: supported_description,
+        ..
+    }) = find_tool(&supported_tools, "exec")
+    else {
+        panic!("expected freeform tool");
+    };
+    assert!(supported_description.contains("`audio(audioItem"));
+}
+
 fn model_info() -> ModelInfo {
     serde_json::from_value(json!({
         "slug": "gpt-5-codex",
diff --git a/codex-rs/core/tests/suite/code_mode.rs b/codex-rs/core/tests/suite/code_mode.rs
index af63d092f8cc..eac24781427d 100644
--- a/codex-rs/core/tests/suite/code_mode.rs
+++ b/codex-rs/core/tests/suite/code_mode.rs
@@ -12,6 +12,7 @@ use codex_protocol::dynamic_tools::DynamicToolCallOutputContentItem;
 use codex_protocol::dynamic_tools::DynamicToolResponse;
 use codex_protocol::dynamic_tools::DynamicToolSpec;
 use codex_protocol::models::PermissionProfile;
+use codex_protocol::openai_models::InputModality;
 use codex_protocol::protocol::AskForApproval;
 use codex_protocol::protocol::EventMsg;
 use codex_protocol::protocol::Op;
@@ -177,6 +178,54 @@ async fn run_code_mode_turn(
     Ok((test, second_mock))
 }
 
+async fn run_code_mode_turn_with_audio_model(
+    server: &MockServer,
+    prompt: &str,
+    code: &str,
+) -> Result<(TestCodex, ResponseMock)> {
+    let mut builder = test_codex()
+        .with_model("gpt-5.4")
+        .with_config(move |config| {
+            let _ = config.features.enable(Feature::CodeMode);
+            let mut model_catalog = bundled_models_response()
+                .unwrap_or_else(|err| panic!("bundled models.json should parse: {err}"));
+            let model = model_catalog
+                .models
+                .iter_mut()
+                .find(|model| model.slug == "gpt-5.4")
+                .expect("gpt-5.4 exists in bundled models.json");
+            model.input_modalities = vec![
+                InputModality::Text,
+                InputModality::Image,
+                InputModality::Audio,
+            ];
+            config.model_catalog = Some(model_catalog); // bundled catalog, audio added to gpt-5.4
+        });
+    let test = builder.build(server).await?;
+    // First mocked response calls `exec` with `code`; the second one ends the turn.
+    responses::mount_sse_once(
+        server,
+        sse(vec![
+            ev_response_created("resp-1"),
+            ev_custom_tool_call("call-1", "exec", code),
+            ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+
+    let second_mock = responses::mount_sse_once(
+        server,
+        sse(vec![
+            ev_assistant_message("msg-1", "done"),
+            ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+    // Run the turn; the second mock is returned so callers can inspect its request.
+    test.submit_turn(prompt).await?;
+    Ok((test, second_mock))
+}
+
 async fn run_code_mode_turn_with_rmcp(
     server: &MockServer,
     prompt: &str,
@@ -1987,6 +2036,78 @@ image("data:image/png;base64,AAA");
     Ok(())
 }
 
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_can_output_audio_via_global_helper_for_audio_model() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+    // With an audio-capable model, `audio(...)` calls must surface as `input_audio` items.
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn_with_audio_model(
+        &server,
+        "use exec to return audio",
+        r#"
+audio({ data: "BASE64", format: "wav" });
+audio({ data: "data:audio/mpeg;base64,MP3BASE64" });
+"#,
+    )
+    .await?;
+    // Expected: a header text item, then both audio items; the data URL arrives as `mp3`.
+    let req = second_mock.single_request();
+    let items = custom_tool_output_items(&req, "call-1");
+    let (_, success) = custom_tool_output_body_and_success(&req, "call-1");
+    assert_ne!(
+        success,
+        Some(false),
+        "code_mode audio output failed unexpectedly"
+    );
+    assert_eq!(items.len(), 3);
+    assert_regex_match(
+        concat!(
+            r"(?s)\A",
+            r"Script completed\nWall time \d+\.\d seconds\nOutput:\n\z"
+        ),
+        text_item(&items, /*index*/ 0),
+    );
+    assert_eq!(
+        items[1],
+        serde_json::json!({
+            "type": "input_audio",
+            "input_audio": { "data": "BASE64", "format": "wav" }
+        }),
+    );
+    assert_eq!(
+        items[2],
+        serde_json::json!({
+            "type": "input_audio",
+            "input_audio": { "data": "MP3BASE64", "format": "mp3" }
+        }),
+    );
+
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn code_mode_audio_output_fails_for_non_audio_model() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+    // `run_code_mode_turn` presumably uses a model without audio input (see expected text).
+    let server = responses::start_mock_server().await;
+    let (_test, second_mock) = run_code_mode_turn(
+        &server,
+        "use exec to return audio",
+        r#"audio({ data: "BASE64", format: "wav" });"#,
+        /*include_apply_patch*/ false,
+    )
+    .await?;
+    // The whole tool output is replaced by the error text; the success flag is not checked.
+    let req = second_mock.single_request();
+    let (output, _success) = custom_tool_output_body_and_success(&req, "call-1");
+    assert_eq!(
+        output,
+        "audio content emitted by code mode but the selected model does not support audio input"
+    );
+
+    Ok(())
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn code_mode_can_use_view_image_result_with_image_helper() -> Result<()> {
     skip_if_no_network!(Ok(()));
@@ -2422,6 +2543,7 @@ text(JSON.stringify(Object.getOwnPropertyNames(globalThis).sort()));
         "WeakSet",
         "__codexContentItems",
         "add_content",
+        "audio",
         "decodeURI",
         "decodeURIComponent",
         "encodeURI",
diff --git a/codex-rs/core/tests/suite/rmcp_client.rs b/codex-rs/core/tests/suite/rmcp_client.rs
index d1973d97a99d..40ba39b93876 100644
--- a/codex-rs/core/tests/suite/rmcp_client.rs
+++ b/codex-rs/core/tests/suite/rmcp_client.rs
@@ -93,6 +93,50 @@ fn assert_wall_time_header(output: &str) {
     assert_eq!(marker, "Output:");
 }
 
+fn test_model_info_with_modalities(
+    slug: &str,
+    description: &str,
+    input_modalities: Vec<InputModality>,
+) -> ModelInfo { // test fixture: only slug, description and modalities come from the caller
+    ModelInfo {
+        slug: slug.to_string(),
+        display_name: slug.to_string(), // display name mirrors the slug
+        description: Some(description.to_string()),
+        default_reasoning_level: None,
+        supported_reasoning_levels: vec![ReasoningEffortPreset {
+            effort: codex_protocol::openai_models::ReasoningEffort::Medium,
+            description: "Medium".to_string(),
+        }],
+        shell_type: ConfigShellToolType::Default,
+        visibility: ModelVisibility::List,
+        supported_in_api: true,
+        priority: 1,
+        additional_speed_tiers: Vec::new(),
+        service_tiers: Vec::new(),
+        upgrade: None,
+        base_instructions: "base instructions".to_string(),
+        model_messages: None,
+        supports_reasoning_summaries: false,
+        default_reasoning_summary: ReasoningSummary::Auto,
+        support_verbosity: false,
+        default_verbosity: None,
+        availability_nux: None,
+        apply_patch_tool_type: None,
+        web_search_tool_type: Default::default(),
+        truncation_policy: TruncationPolicyConfig::bytes(/*limit*/ 10_000),
+        supports_parallel_tool_calls: false,
+        supports_image_detail_original: false,
+        context_window: Some(272_000),
+        max_context_window: None,
+        auto_compact_token_limit: None,
+        effective_context_window_percent: 95,
+        experimental_supported_tools: Vec::new(),
+        input_modalities, // passed through unchanged from the caller
+        used_fallback_model_metadata: false,
+        supports_search_tool: false,
+    }
+}
+
 fn read_only_user_turn(fixture: &TestCodex, text: impl Into<String>) -> Op {
     read_only_user_turn_with_model(fixture, text, fixture.session_configured.model.clone())
 }
@@ -1386,6 +1430,257 @@ async fn stdio_image_responses_are_sanitized_for_text_only_model() -> anyhow::Re
     Ok(())
 }
 
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+#[serial(mcp_test_value)]
+async fn stdio_audio_responses_are_forwarded_for_audio_model() -> anyhow::Result<()> {
+    skip_if_no_network!(Ok(()));
+    // An MCP `audio` block must reach an audio-capable model as an `input_audio` item.
+    let server = responses::start_mock_server().await;
+
+    let call_id = "audio-supported-1";
+    let server_name = "rmcp";
+    let namespace = format!("mcp__{server_name}__");
+    let audio_model_slug = "rmcp-audio-model";
+
+    let models_mock = mount_models_once(
+        &server,
+        ModelsResponse {
+            models: vec![test_model_info_with_modalities(
+                audio_model_slug,
+                "Test model with audio input support",
+                vec![
+                    InputModality::Text,
+                    InputModality::Image,
+                    InputModality::Audio,
+                ],
+            )],
+        },
+    )
+    .await;
+    // First mocked response calls the MCP `audio` tool; the second one ends the turn.
+    mount_sse_once(
+        &server,
+        responses::sse(vec![
+            responses::ev_response_created("resp-1"),
+            responses::ev_function_call_with_namespace(call_id, &namespace, "audio", "{}"),
+            responses::ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+    let final_mock = mount_sse_once(
+        &server,
+        responses::sse(vec![
+            responses::ev_assistant_message("msg-1", "rmcp audio tool completed successfully."),
+            responses::ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    let rmcp_test_server_bin = remote_aware_stdio_server_bin()?;
+
+    let fixture = test_codex()
+        .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
+        .with_config(move |config| {
+            insert_mcp_server(
+                config,
+                server_name,
+                stdio_transport(
+                    rmcp_test_server_bin,
+                    Some(HashMap::from([
+                        ("MCP_TEST_AUDIO_DATA".to_string(), "UklGRg==".to_string()),
+                        (
+                            "MCP_TEST_AUDIO_MIME_TYPE".to_string(),
+                            "audio/mpeg".to_string(),
+                        ),
+                    ])),
+                    Vec::new(),
+                ),
+                TestMcpServerOptions {
+                    experimental_environment: remote_aware_experimental_environment(),
+                    ..Default::default()
+                },
+            );
+        })
+        .build_remote_aware(&server)
+        .await?;
+    // Fetch the mocked model list once; presumably this makes the audio model selectable.
+    fixture
+        .thread_manager
+        .get_models_manager()
+        .list_models(RefreshStrategy::Online)
+        .await;
+    assert_eq!(models_mock.requests().len(), 1);
+
+    fixture
+        .codex
+        .submit(read_only_user_turn_with_model(
+            &fixture,
+            "call the rmcp audio tool",
+            audio_model_slug.to_string(),
+        ))
+        .await?;
+
+    wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallBegin(_))
+    })
+    .await;
+    wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallEnd(_))
+    })
+    .await;
+    wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
+    // Expected output: the wall-time text item, then the audio with `audio/mpeg` as `mp3`.
+    let output_item = final_mock.single_request().function_call_output(call_id);
+    let output = output_item["output"]
+        .as_array()
+        .expect("audio MCP output should be content items");
+    assert_eq!(output.len(), 2);
+    assert_wall_time_header(
+        output[0]["text"]
+            .as_str()
+            .expect("first MCP audio output item should be wall-time text"),
+    );
+    assert_eq!(
+        output[1],
+        json!({
+            "type": "input_audio",
+            "input_audio": {
+                "data": "UklGRg==",
+                "format": "mp3",
+            },
+        })
+    );
+
+    server.verify().await;
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+#[serial(mcp_test_value)]
+async fn stdio_audio_responses_fail_for_text_only_model() -> anyhow::Result<()> {
+    skip_if_no_network!(Ok(()));
+    // A model without `InputModality::Audio` must get an error instead of the MCP audio.
+    let server = responses::start_mock_server().await;
+
+    let call_id = "audio-text-only-1";
+    let server_name = "rmcp";
+    let namespace = format!("mcp__{server_name}__");
+    let text_only_model_slug = "rmcp-audio-text-only-model";
+
+    let models_mock = mount_models_once(
+        &server,
+        ModelsResponse {
+            models: vec![test_model_info_with_modalities(
+                text_only_model_slug,
+                "Test model without audio input support",
+                vec![InputModality::Text, InputModality::Image],
+            )],
+        },
+    )
+    .await;
+
+    mount_sse_once(
+        &server,
+        responses::sse(vec![
+            responses::ev_response_created("resp-1"),
+            responses::ev_function_call_with_namespace(call_id, &namespace, "audio", "{}"),
+            responses::ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+    let final_mock = mount_sse_once(
+        &server,
+        responses::sse(vec![
+            responses::ev_assistant_message("msg-1", "rmcp audio tool failed."),
+            responses::ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    let rmcp_test_server_bin = remote_aware_stdio_server_bin()?;
+
+    let fixture = test_codex()
+        .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing())
+        .with_config(move |config| {
+            insert_mcp_server(
+                config,
+                server_name,
+                stdio_transport(
+                    rmcp_test_server_bin,
+                    Some(HashMap::from([
+                        ("MCP_TEST_AUDIO_DATA".to_string(), "UklGRg==".to_string()),
+                        (
+                            "MCP_TEST_AUDIO_MIME_TYPE".to_string(),
+                            "audio/wav".to_string(),
+                        ),
+                    ])),
+                    Vec::new(),
+                ),
+                TestMcpServerOptions {
+                    experimental_environment: remote_aware_experimental_environment(),
+                    ..Default::default()
+                },
+            );
+        })
+        .build_remote_aware(&server)
+        .await?;
+
+    fixture
+        .thread_manager
+        .get_models_manager()
+        .list_models(RefreshStrategy::Online)
+        .await;
+    assert_eq!(models_mock.requests().len(), 1);
+
+    fixture
+        .codex
+        .submit(read_only_user_turn_with_model(
+            &fixture,
+            "call the rmcp audio tool",
+            text_only_model_slug.to_string(),
+        ))
+        .await?;
+    // The tool-call end event itself must carry the unsupported-audio error.
+    wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallBegin(_))
+    })
+    .await;
+    let end_event = wait_for_event(&fixture.codex, |ev| {
+        matches!(ev, EventMsg::McpToolCallEnd(_))
+    })
+    .await;
+    let EventMsg::McpToolCallEnd(end) = end_event else {
+        unreachable!("event guard guarantees McpToolCallEnd");
+    };
+    assert_eq!(
+        end.result,
+        Err(
+            "audio content returned by MCP tool but the selected model does not support audio input"
+                .to_string()
+        )
+    );
+    wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TurnComplete(_))).await;
+    // The model sees the same message as a JSON text block inside the wall-time wrapper.
+    let output_item = final_mock.single_request().function_call_output(call_id);
+    let output_text = output_item
+        .get("output")
+        .and_then(Value::as_str)
+        .expect("function_call_output output should be a JSON string");
+    let wrapped_payload = split_wall_time_wrapped_output(output_text);
+    let output_json: Value = serde_json::from_str(wrapped_payload)
+        .expect("function_call_output output should be valid JSON");
+    assert_eq!(
+        output_json,
+        json!([{
+            "type": "text",
+            "text": "audio content returned by MCP tool but the selected model does not support audio input"
+        }])
+    );
+
+    server.verify().await;
+    Ok(())
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
 #[serial(mcp_test_value)]
 async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> {
diff --git a/codex-rs/protocol/src/models.rs b/codex-rs/protocol/src/models.rs
index 6919ee43e770..68d8a91b4523 100644
--- a/codex-rs/protocol/src/models.rs
+++ b/codex-rs/protocol/src/models.rs
@@ -1304,6 +1304,98 @@ pub enum FunctionCallOutputContentItem {
         #[ts(optional)]
         detail: Option<ImageDetail>,
     },
+    // Do not rename, these are serialized and used directly in the responses API.
+    InputAudio {
+        input_audio: InputAudio,
+    },
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema, TS)]
+pub struct InputAudio {
+    pub data: String, // payload; `input_audio_from_data` strips any `data:` URL prefix
+    pub format: String, // short format name; `input_audio_from_data` normalizes it
+}
+
+/// Builds an [`InputAudio`] from a raw payload string or a base64 audio `data:` URL.
+///
+/// The format comes from `format` (normalized), else the data URL media type, else
+/// `mime_type`. Returns `None` for an empty payload, a `data:` input that is not a base64
+/// audio URL, a supplied `mime_type` that yields no audio format, or no format at all.
+pub fn input_audio_from_data(
+    data: &str,
+    format: Option<&str>,
+    mime_type: Option<&str>,
+) -> Option<InputAudio> {
+    let (payload, data_url_format) = match parse_audio_data_url(data) {
+        Some((payload, url_format)) => (payload, Some(url_format)),
+        None => {
+            let scheme = data.get(.."data:".len());
+            // A `data:` URL that failed to parse must not be passed on as a raw payload.
+            if scheme.is_some_and(|prefix| prefix.eq_ignore_ascii_case("data:")) {
+                return None;
+            }
+            (data.to_string(), None)
+        }
+    };
+    if payload.is_empty() {
+        return None;
+    }
+
+    // An unusable `mime_type` rejects the item even when another source has a format.
+    let mime_type_format = match mime_type {
+        Some(mime_type) => Some(audio_format_from_mime_type(mime_type)?),
+        None => None,
+    };
+    let explicit_format = format.and_then(normalize_audio_format);
+    let format = explicit_format.or(data_url_format).or(mime_type_format)?;
+
+    Some(InputAudio { data: payload, format })
+}
+
+/// Splits a base64 `data:` URL into `(payload, audio format)`.
+///
+/// Returns `None` unless the input starts with `data:` (any ASCII case), has a `,`
+/// separator, lists a `base64` parameter, and names a media type that
+/// `audio_format_from_mime_type` accepts.
+fn parse_audio_data_url(data_url: &str) -> Option<(String, String)> {
+    const SCHEME: &str = "data:";
+    // `get` is `None` for short input or a cut inside a multi-byte character.
+    let scheme = data_url.get(..SCHEME.len())?;
+    if !scheme.eq_ignore_ascii_case(SCHEME) {
+        return None;
+    }
+
+    let (metadata, payload) = data_url[SCHEME.len()..].split_once(',')?;
+    let is_base64 = |part: &str| part.eq_ignore_ascii_case("base64");
+    if !metadata.split(';').any(is_base64) {
+        return None;
+    }
+    let format = audio_format_from_mime_type(metadata.split(';').next()?)?;
+    Some((payload.to_string(), format))
+}
+
+/// Maps an `audio/*` media type (parameters after `;` ignored) to a normalized format.
+fn audio_format_from_mime_type(mime_type: &str) -> Option<String> {
+    let media_type = mime_type.split(';').next()?.trim().to_ascii_lowercase();
+    normalize_audio_format(media_type.strip_prefix("audio/")?)
+}
+
+/// Normalizes a format name or media type: trims, lowercases, drops one `x-` prefix, and
+/// maps `mpeg`/`wave` to `mp3`/`wav`. `None` if nothing remains or a media type is not audio.
+fn normalize_audio_format(format: &str) -> Option<String> {
+    let lowered = format.trim().to_ascii_lowercase();
+    if lowered.contains('/') {
+        // Callers may pass a full media type such as `audio/wav` as the format.
+        return audio_format_from_mime_type(&lowered);
+    }
+    // Check emptiness after stripping so a bare `x-` is rejected, not returned as "".
+    let name = match lowered.strip_prefix("x-").unwrap_or(&lowered) {
+        "" => return None,
+        "mpeg" => "mp3",
+        "wave" => "wav",
+        other => other,
+    };
+    Some(name.to_string())
 }
 
 /// Converts structured function-call output content into plain text for
@@ -1311,7 +1403,7 @@ pub enum FunctionCallOutputContentItem {
 ///
 /// This conversion is intentionally lossy:
 /// - only `input_text` items are included
-/// - image items are ignored
+/// - image and audio items are ignored
 ///
 /// We use this helper where callers still need a string representation (for
 /// example telemetry previews or legacy string-only output paths) while keeping
@@ -1327,7 +1419,8 @@ pub fn function_call_output_content_items_to_text(
                 Some(text.as_str())
             }
             FunctionCallOutputContentItem::InputText { .. }
-            | FunctionCallOutputContentItem::InputImage { .. } => None,
+            | FunctionCallOutputContentItem::InputImage { .. }
+            | FunctionCallOutputContentItem::InputAudio { .. } => None,
         })
         .collect::<Vec<_>>();
 
@@ -1378,7 +1471,7 @@ impl FunctionCallOutputBody {
     /// human-readable surfaces.
     ///
     /// This conversion is intentionally lossy when the body contains content
-    /// items: image entries are dropped and text entries are joined with
+    /// items: image and audio entries are dropped and text entries are joined with
     /// newlines.
     pub fn to_text(&self) -> Option<String> {
         match self {
@@ -1556,11 +1649,18 @@ fn convert_mcp_content_to_items(
             #[serde(rename = "_meta", default)]
             meta: Option<serde_json::Value>,
         },
+        #[serde(rename = "audio")]
+        Audio {
+            data: String,
+            #[serde(rename = "mimeType", alias = "mime_type")]
+            mime_type: Option<String>,
+        },
         #[serde(other)]
         Unknown,
     }
 
     let mut saw_image = false;
+    let mut saw_audio = false;
     let mut items = Vec::with_capacity(contents.len());
 
     for content in contents {
@@ -1595,6 +1695,19 @@ fn convert_mcp_content_to_items(
                         .or(Some(DEFAULT_IMAGE_DETAIL)),
                 }
             }
+            Ok(McpContent::Audio { data, mime_type }) => {
+                if let Some(input_audio) =
+                    input_audio_from_data(&data, /*format*/ None, mime_type.as_deref())
+                {
+                    saw_audio = true;
+                    FunctionCallOutputContentItem::InputAudio { input_audio }
+                } else {
+                    FunctionCallOutputContentItem::InputText {
+                        text: serde_json::to_string(content)
+                            .unwrap_or_else(|_| "<content>".to_string()),
+                    }
+                }
+            }
             Ok(McpContent::Unknown) | Err(_) => FunctionCallOutputContentItem::InputText {
                 text: serde_json::to_string(content).unwrap_or_else(|_| "<content>".to_string()),
             },
@@ -1602,7 +1715,11 @@ fn convert_mcp_content_to_items(
         items.push(item);
     }
 
-    if saw_image { Some(items) } else { None }
+    if saw_image || saw_audio {
+        Some(items)
+    } else {
+        None
+    }
 }
 
 // Implement Display so callers can treat the payload like a plain string when logging or doing
@@ -2232,6 +2349,198 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn serializes_audio_outputs_as_array() -> Result<()> {
+        let call_tool_result = CallToolResult {
+            content: vec![
+                serde_json::json!({"type":"text","text":"caption"}),
+                serde_json::json!({"type":"audio","data":"BASE64","mimeType":"audio/mpeg"}),
+            ],
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+        // The text block is kept and `audio/mpeg` is expected to come out as `mp3`.
+        let payload = call_tool_result.into_function_call_output_payload();
+        assert_eq!(payload.success, Some(true));
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        let items = items.to_vec();
+        assert_eq!(
+            items,
+            vec![
+                FunctionCallOutputContentItem::InputText {
+                    text: "caption".into(),
+                },
+                FunctionCallOutputContentItem::InputAudio {
+                    input_audio: InputAudio {
+                        data: "BASE64".into(),
+                        format: "mp3".into(),
+                    },
+                },
+            ]
+        );
+
+        let item = ResponseInputItem::FunctionCallOutput {
+            call_id: "call1".into(),
+            output: payload,
+        };
+        // Serialized, `output` must be a JSON array of typed items rather than a string.
+        let json = serde_json::to_string(&item)?;
+        let v: serde_json::Value = serde_json::from_str(&json)?;
+
+        assert_eq!(
+            v.get("output").expect("output field"),
+            &serde_json::json!([
+                { "type": "input_text", "text": "caption" },
+                { "type": "input_audio", "input_audio": { "data": "BASE64", "format": "mp3" } }
+            ])
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn serializes_mixed_image_and_audio_outputs_as_array() {
+        let call_tool_result = CallToolResult {
+            content: vec![
+                serde_json::json!({"type":"image","data":"IMAGE","mimeType":"image/png"}),
+                serde_json::json!({"type":"audio","data":"AUDIO","mimeType":"audio/wav"}),
+            ],
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+        // Image and audio blocks are both converted, in their original order.
+        let payload = call_tool_result.into_function_call_output_payload();
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        assert_eq!(
+            items,
+            [
+                FunctionCallOutputContentItem::InputImage {
+                    image_url: "data:image/png;base64,IMAGE".into(),
+                    detail: Some(DEFAULT_IMAGE_DETAIL),
+                },
+                FunctionCallOutputContentItem::InputAudio {
+                    input_audio: InputAudio {
+                        data: "AUDIO".into(),
+                        format: "wav".into(),
+                    },
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn strips_audio_data_urls_and_derives_format() {
+        let call_tool_result = CallToolResult {
+            content: vec![serde_json::json!({
+                "type": "audio",
+                "data": "data:audio/ogg;base64,T2dnUw",
+            })],
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+        // No `mimeType` here: the format must come from the data URL, which is stripped.
+        let payload = call_tool_result.into_function_call_output_payload();
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        assert_eq!(
+            items,
+            [FunctionCallOutputContentItem::InputAudio {
+                input_audio: InputAudio {
+                    data: "T2dnUw".into(),
+                    format: "ogg".into(),
+                },
+            }]
+        );
+    }
+
+    #[test]
+    fn audio_without_derivable_format_falls_back_to_text_payload() {
+        let content = vec![serde_json::json!({
+            "type": "audio",
+            "data": "BASE64",
+        })];
+        let call_tool_result = CallToolResult {
+            content: content.clone(),
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+        // With no format source, the body is the serialized `content` as plain text.
+        let payload = call_tool_result.into_function_call_output_payload();
+        assert_eq!(
+            payload,
+            FunctionCallOutputPayload {
+                body: FunctionCallOutputBody::Text(serde_json::to_string(&content).unwrap()),
+                success: Some(true),
+            }
+        );
+    }
+
+    #[test]
+    fn malformed_audio_block_falls_back_to_text_inside_structured_payload() {
+        let malformed_audio = serde_json::json!({
+            "type": "audio",
+            "data": "data:image/png;base64,NOT_AUDIO",
+        });
+        let call_tool_result = CallToolResult {
+            content: vec![
+                serde_json::json!({"type":"image","data":"IMAGE","mimeType":"image/png"}),
+                malformed_audio.clone(),
+            ],
+            structured_content: None,
+            is_error: Some(false),
+            meta: None,
+        };
+        // The image keeps the payload as items; the non-audio data URL becomes a text item.
+        let payload = call_tool_result.into_function_call_output_payload();
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        assert_eq!(
+            items,
+            [
+                FunctionCallOutputContentItem::InputImage {
+                    image_url: "data:image/png;base64,IMAGE".into(),
+                    detail: Some(DEFAULT_IMAGE_DETAIL),
+                },
+                FunctionCallOutputContentItem::InputText {
+                    text: serde_json::to_string(&malformed_audio).unwrap(),
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn structured_content_precedence_ignores_audio_content() {
+        let call_tool_result = CallToolResult {
+            content: vec![serde_json::json!({
+                "type": "audio",
+                "data": "BASE64",
+                "mimeType": "audio/wav",
+            })],
+            structured_content: Some(serde_json::json!({ "ok": true })),
+            is_error: Some(false),
+            meta: None,
+        };
+        // `structured_content` wins: the body is its JSON text and no audio item appears.
+        let payload = call_tool_result.into_function_call_output_payload();
+        assert_eq!(
+            payload,
+            FunctionCallOutputPayload {
+                body: FunctionCallOutputBody::Text("{\"ok\":true}".to_string()),
+                success: Some(true),
+            }
+        );
+    }
+
     #[test]
     fn serializes_custom_tool_image_outputs_as_array() -> Result<()> {
         let item = ResponseInputItem::CustomToolCallOutput {
diff --git a/codex-rs/protocol/src/openai_models.rs b/codex-rs/protocol/src/openai_models.rs
index d51e70ddf16f..b130d58c5dbd 100644
--- a/codex-rs/protocol/src/openai_models.rs
+++ b/codex-rs/protocol/src/openai_models.rs
@@ -82,6 +82,8 @@ pub enum InputModality {
     Text,
     /// Image attachments included in user turns.
     Image,
+    /// Audio content included in tool payloads.
+    Audio,
 }
 
 /// Backward-compatible default when `input_modalities` is omitted on the wire.
diff --git a/codex-rs/rmcp-client/src/bin/test_stdio_server.rs b/codex-rs/rmcp-client/src/bin/test_stdio_server.rs
index 7add4d05f5af..fea5078b8d0c 100644
--- a/codex-rs/rmcp-client/src/bin/test_stdio_server.rs
+++ b/codex-rs/rmcp-client/src/bin/test_stdio_server.rs
@@ -71,6 +71,7 @@ impl TestToolServer {
             Self::cwd_tool(),
             Self::sync_tool(),
             Self::image_tool(),
+            Self::audio_tool(),
             Self::image_scenario_tool(),
             sandbox_meta_tool,
         ];
@@ -227,6 +228,24 @@ impl TestToolServer {
         tool
     }
 
+    /// Describes the argument-less, read-only `audio` test tool.
+    fn audio_tool() -> Tool {
+        let input_schema = serde_json::json!({
+            "type": "object",
+            "properties": {},
+            "additionalProperties": false
+        });
+        #[expect(clippy::expect_used)]
+        let schema: JsonObject =
+            serde_json::from_value(input_schema).expect("audio tool schema should deserialize");
+
+        let name = Cow::Borrowed("audio");
+        let description = Cow::Borrowed("Return a single audio content block.");
+        let mut tool = Tool::new(name, description, Arc::new(schema));
+        tool.annotations = Some(ToolAnnotations::new().read_only(true));
+        tool
+    }
+
     /// Tool intended for manual testing of Codex TUI rendering for MCP image tool results.
     ///
     /// This exists to exercise edge cases where a `CallToolResult.content` includes image blocks
@@ -543,6 +562,20 @@ impl ServerHandler for TestToolServer {
                     data_b64, mime_type,
                 )]))
             }
+            "audio" => {
+                let data =
+                    std::env::var("MCP_TEST_AUDIO_DATA").unwrap_or_else(|_| "QkFTRTY0".to_string());
+                let mime_type = std::env::var("MCP_TEST_AUDIO_MIME_TYPE")
+                    .unwrap_or_else(|_| "audio/wav".to_string());
+
+                Ok(CallToolResult::success(vec![rmcp::model::Annotated::new(
+                    rmcp::model::RawContent::Audio(rmcp::model::RawAudioContent {
+                        data,
+                        mime_type,
+                    }),
+                    None,
+                )]))
+            }
             "image_scenario" => {
                 let args = Self::parse_call_args::<ImageScenarioArgs>(&request, "image_scenario")?;
                 Self::image_scenario_result(args)
diff --git a/codex-rs/tools/src/tool_config.rs b/codex-rs/tools/src/tool_config.rs
index ad884e5be023..5b513b469600 100644
--- a/codex-rs/tools/src/tool_config.rs
+++ b/codex-rs/tools/src/tool_config.rs
@@ -113,6 +113,7 @@ pub struct ToolsConfig {
     pub request_permissions_tool_enabled: bool,
     pub code_mode_enabled: bool,
     pub code_mode_only_enabled: bool,
+    pub supports_audio_input: bool,
     pub can_request_original_image_detail: bool,
     pub collab_tools: bool,
     pub goal_tools: bool,
@@ -188,6 +189,7 @@ impl ToolsConfig {
             && features.enabled(Feature::Apps)
             && features.enabled(Feature::Plugins);
         let include_original_image_detail = can_request_original_image_detail(model_info);
+        let supports_audio_input = model_info.input_modalities.contains(&InputModality::Audio);
         // API-key auth bypasses Codex backend entitlement/tool normalization, so
         // callers must confirm ChatGPT auth before exposing the built-in tool.
         let include_image_gen_tool = *image_generation_tool_auth_allowed
@@ -256,6 +258,7 @@ impl ToolsConfig {
             request_permissions_tool_enabled,
             code_mode_enabled: include_code_mode,
             code_mode_only_enabled: include_code_mode_only,
+            supports_audio_input,
             can_request_original_image_detail: include_original_image_detail,
             collab_tools: include_collab_tools,
             goal_tools: include_goal_tools,
diff --git a/codex-rs/tools/src/tool_config_tests.rs b/codex-rs/tools/src/tool_config_tests.rs
index 252ad7a3205a..496474090d92 100644
--- a/codex-rs/tools/src/tool_config_tests.rs
+++ b/codex-rs/tools/src/tool_config_tests.rs
@@ -265,6 +265,48 @@ fn image_generation_requires_feature_and_supported_model() {
     assert!(!unsupported_tools_config.image_gen_tool);
 }
 
+/// `ToolsConfig::supports_audio_input` must be set exactly when the model
+/// lists `InputModality::Audio` among its input modalities; text and image
+/// support alone is not enough.
+#[test]
+fn audio_input_support_tracks_model_modalities() {
+    let features = Features::with_defaults();
+    let available_models = Vec::new();
+
+    // Builds the tools config for a model advertising `input_modalities`;
+    // every other parameter is identical across both cases.
+    let tools_config_for = |input_modalities: Vec<InputModality>| {
+        let info = ModelInfo {
+            input_modalities,
+            ..model_info()
+        };
+        ToolsConfig::new(&ToolsConfigParams {
+            model_info: &info,
+            available_models: &available_models,
+            features: &features,
+            image_generation_tool_auth_allowed: true,
+            web_search_mode: Some(WebSearchMode::Cached),
+            session_source: SessionSource::Cli,
+            permission_profile: &PermissionProfile::Disabled,
+            windows_sandbox_level: WindowsSandboxLevel::Disabled,
+        })
+    };
+
+    let audio_capable = vec![
+        InputModality::Text,
+        InputModality::Image,
+        InputModality::Audio,
+    ];
+    let text_and_image_only = vec![InputModality::Text, InputModality::Image];
+
+    let with_audio = tools_config_for(audio_capable);
+    let without_audio = tools_config_for(text_and_image_only);
+
+    // Only the model that lists the audio modality reports support.
+    assert!(with_audio.supports_audio_input);
+    assert!(!without_audio.supports_audio_input);
+}
+
 #[test]
 fn provider_capability_methods_disable_provider_bound_tool_surfaces() {
     let model_info = model_info();
diff --git a/codex-rs/tools/src/tool_output.rs b/codex-rs/tools/src/tool_output.rs
index 2044295174b7..ef13f30400b5 100644
--- a/codex-rs/tools/src/tool_output.rs
+++ b/codex-rs/tools/src/tool_output.rs
@@ -191,7 +191,8 @@ fn content_items_to_code_mode_result(items: &[FunctionCallOutputContentItem]) ->
                     Some(image_url.clone())
                 }
                 FunctionCallOutputContentItem::InputText { .. }
-                | FunctionCallOutputContentItem::InputImage { .. } => None,
+                | FunctionCallOutputContentItem::InputImage { .. }
+                | FunctionCallOutputContentItem::InputAudio { .. } => None,
             })
             .collect::<Vec<_>>()
             .join("\n"),
diff --git a/codex-rs/utils/output-truncation/src/lib.rs b/codex-rs/utils/output-truncation/src/lib.rs
index 24b1630da134..906d981ac333 100644
--- a/codex-rs/utils/output-truncation/src/lib.rs
+++ b/codex-rs/utils/output-truncation/src/lib.rs
@@ -34,7 +34,8 @@ pub fn formatted_truncate_text_content_items_with_policy(
         .iter()
         .filter_map(|item| match item {
             FunctionCallOutputContentItem::InputText { text } => Some(text.as_str()),
-            FunctionCallOutputContentItem::InputImage { .. } => None,
+            FunctionCallOutputContentItem::InputImage { .. }
+            | FunctionCallOutputContentItem::InputAudio { .. } => None,
         })
         .collect::<Vec<_>>();
 
@@ -64,6 +65,11 @@ pub fn formatted_truncate_text_content_items_with_policy(
                 detail: *detail,
             })
         }
+        FunctionCallOutputContentItem::InputAudio { input_audio } => {
+            Some(FunctionCallOutputContentItem::InputAudio {
+                input_audio: input_audio.clone(),
+            })
+        }
         FunctionCallOutputContentItem::InputText { .. } => None,
     }));
 
@@ -117,6 +123,11 @@ pub fn truncate_function_output_items_with_policy(
                     detail: *detail,
                 });
             }
+            FunctionCallOutputContentItem::InputAudio { input_audio } => {
+                out.push(FunctionCallOutputContentItem::InputAudio {
+                    input_audio: input_audio.clone(),
+                });
+            }
         }
     }
 
diff --git a/codex-rs/utils/output-truncation/src/truncate_tests.rs b/codex-rs/utils/output-truncation/src/truncate_tests.rs
index 74acb15ca3d2..ce1afcb0ca21 100644
--- a/codex-rs/utils/output-truncation/src/truncate_tests.rs
+++ b/codex-rs/utils/output-truncation/src/truncate_tests.rs
@@ -7,6 +7,7 @@ use crate::truncate_function_output_items_with_policy;
 use crate::truncate_text;
 use codex_protocol::models::DEFAULT_IMAGE_DETAIL;
 use codex_protocol::models::FunctionCallOutputContentItem;
+use codex_protocol::models::InputAudio;
 use pretty_assertions::assert_eq;
 
 #[test]
@@ -251,6 +252,43 @@ fn formatted_truncate_text_content_items_with_policy_merges_text_and_appends_ima
     assert_eq!(original_token_count, Some(4));
 }
 
+#[test]
+fn formatted_truncate_text_content_items_with_policy_preserves_audio_like_images() {
+    // The audio item sits between two text items; the expected output has the
+    // merged text first and the audio item after it.
+    let input_audio = InputAudio {
+        data: "UklGRg==".to_string(),
+        format: "wav".to_string(),
+    };
+
+    let items = vec![
+        FunctionCallOutputContentItem::InputText {
+            text: "abcd".to_string(),
+        },
+        FunctionCallOutputContentItem::InputAudio {
+            input_audio: input_audio.clone(),
+        },
+        FunctionCallOutputContentItem::InputText {
+            text: "efgh".to_string(),
+        },
+    ];
+    let policy = TruncationPolicy::Bytes(4);
+
+    let (output, original_token_count) =
+        formatted_truncate_text_content_items_with_policy(&items, policy);
+
+    // Both text items are merged and truncated as a single string, while the
+    // audio payload comes back with the same data and format.
+    let expected = vec![
+        FunctionCallOutputContentItem::InputText {
+            text: "Total output lines: 2\n\nab…5 chars truncated…gh".to_string(),
+        },
+        FunctionCallOutputContentItem::InputAudio { input_audio },
+    ];
+    assert_eq!(output, expected);
+    assert_eq!(original_token_count, Some(3));
+}
+
 #[test]
 fn formatted_truncate_text_content_items_with_policy_merges_all_text_for_token_budget() {
     let items = vec![