diff --git a/.github/workflows/deploy-pages.yml b/.github/workflows/deploy-pages.yml
index e5b05974c..81cef85c7 100644
--- a/.github/workflows/deploy-pages.yml
+++ b/.github/workflows/deploy-pages.yml
@@ -2,7 +2,7 @@ name: Deploy to GitHub Pages
 
 on:
   push:
-    branches: [ "main" ]
+    branches: [master]
   workflow_dispatch:
 
 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
@@ -37,4 +37,4 @@ jobs:
           
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v4
\ No newline at end of file
+        uses: actions/deploy-pages@v4
diff --git a/AGENTS.md b/AGENTS.md
index d509d5b6b..ad2e36816 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -3,7 +3,7 @@
 ## Build, Lint, and Test Commands
 
 - **Build:**  
-  `bun run build` (uses tsup)
+  `bun run build` (uses tsdown)
 - **Dev:**  
   `bun run dev`
 - **Lint:**  
@@ -13,7 +13,7 @@
 - **Test all:**  
    `bun test`
 - **Test single file:**  
-   `bun test tests/claude-request.test.ts`
+   `bun test tests/anthropic-request.test.ts`
 - **Start (prod):**  
   `bun run start`
 
diff --git a/Dockerfile b/Dockerfile
index 1265220ef..1cbcabe8d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,13 +13,14 @@ WORKDIR /app
 COPY ./package.json ./bun.lock ./
 RUN bun install --frozen-lockfile --production --ignore-scripts --no-cache
 
-COPY --from=builder /app/dist ./dist
+COPY --from=builder --chown=bun:bun /app/dist ./dist
 
 EXPOSE 4141
 
 HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
   CMD wget --spider -q http://localhost:4141/ || exit 1
 
-COPY entrypoint.sh /entrypoint.sh
+COPY --chown=bun:bun entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh
+USER bun
 ENTRYPOINT ["/entrypoint.sh"]
diff --git a/README.md b/README.md
index 0d36c13c9..307995dad 100644
--- a/README.md
+++ b/README.md
@@ -74,25 +74,25 @@ mkdir -p ./copilot-data
 # Run the container with a bind mount to persist the token
 # This ensures your authentication survives container restarts
 
-docker run -p 4141:4141 -v $(pwd)/copilot-data:/root/.local/share/copilot-api copilot-api
+docker run -p 4141:4141 -v $(pwd)/copilot-data:/home/bun/.local/share/copilot-api copilot-api
 ```
 
 > **Note:**
-> The GitHub token and related data will be stored in `copilot-data` on your host. This is mapped to `/root/.local/share/copilot-api` inside the container, ensuring persistence across restarts.
+> The GitHub token and related data will be stored in `copilot-data` on your host. This is mapped to `/home/bun/.local/share/copilot-api` inside the container, ensuring persistence across restarts.
 
 ### Docker with Environment Variables
 
 You can pass the GitHub token directly to the container using environment variables:
 
 ```sh
-# Build with GitHub token
-docker build --build-arg GH_TOKEN=your_github_token_here -t copilot-api .
+# Build the image
+docker build -t copilot-api .
 
 # Run with GitHub token
 docker run -p 4141:4141 -e GH_TOKEN=your_github_token_here copilot-api
 
 # Run with additional options
-docker run -p 4141:4141 -e GH_TOKEN=your_token copilot-api start --verbose --port 4141
+docker run -p 4141:4141 -e GH_TOKEN=your_token copilot-api --verbose --port 4141
 ```
 
 ### Docker Compose Example
@@ -142,6 +142,7 @@ Copilot API now uses a subcommand structure with these main commands:
 
 - `start`: Start the Copilot API server. This command will also handle authentication if needed.
 - `auth`: Run GitHub authentication flow without starting the server. This is typically used if you need to generate a token for use with the `--github-token` option, especially in non-interactive environments.
+- `models`: List the current GitHub Copilot models available to the API. This is useful for non-interactive deployments where you want to inspect model support without starting the server.
 - `check-usage`: Show your current GitHub Copilot usage and quota information directly in the terminal (no server required).
 - `debug`: Display diagnostic information including version, runtime details, file paths, and authentication status. Useful for troubleshooting and support.
 
@@ -154,6 +155,7 @@ The following command line options are available for the `start` command:
 | Option         | Description                                                                   | Default    | Alias |
 | -------------- | ----------------------------------------------------------------------------- | ---------- | ----- |
 | --port         | Port to listen on                                                             | 4141       | -p    |
+| --host         | Host to bind to. Defaults to local-only. Use 0.0.0.0 to expose to the network (e.g. inside Docker) | 127.0.0.1  | none  |
 | --verbose      | Enable verbose logging                                                        | false      | -v    |
 | --account-type | Account type to use (individual, business, enterprise)                        | individual | -a    |
 | --manual       | Enable manual request approval                                                | false      | none  |
@@ -171,6 +173,17 @@ The following command line options are available for the `start` command:
 | --verbose    | Enable verbose logging    | false   | -v    |
 | --show-token | Show GitHub token on auth | false   | none  |
 
+### Models Command Options
+
+| Option         | Description                                                                   | Default    | Alias |
+| -------------- | ----------------------------------------------------------------------------- | ---------- | ----- |
+| --verbose      | Enable verbose logging                                                        | false      | -v    |
+| --account-type | Account type to use (individual, business, enterprise)                        | individual | -a    |
+| --github-token | Provide GitHub token directly (must be generated using the `auth` subcommand) | none       | -g    |
+| --show-token   | Show GitHub and Copilot tokens on fetch                                       | false      | none  |
+| --proxy-env    | Initialize proxy from environment variables                                   | false      | none  |
+| --json         | Output raw model response as JSON                                             | false      | none  |
+
 ### Debug Command Options
 
 | Option | Description               | Default | Alias |
@@ -342,6 +355,12 @@ bun run dev
 bun run start
 ```
 
+### List Available Models
+
+```sh
+bun run models
+```
+
 ## Usage Tips
 
 - To avoid hitting GitHub Copilot's rate limits, you can use the following flags:
@@ -349,3 +368,5 @@ bun run start
   - `--rate-limit <seconds>`: Enforces a minimum time interval between requests. For example, `copilot-api start --rate-limit 30` will ensure there's at least a 30-second gap between requests.
   - `--wait`: Use this with `--rate-limit`. It makes the server wait for the cooldown period to end instead of rejecting the request with an error. This is useful for clients that don't automatically retry on rate limit errors.
 - If you have a GitHub business or enterprise plan account with Copilot, use the `--account-type` flag (e.g., `--account-type business`). See the [official documentation](https://docs.github.com/en/enterprise-cloud@latest/copilot/managing-copilot/managing-github-copilot-in-your-organization/managing-access-to-github-copilot-in-your-organization/managing-github-copilot-access-to-your-organizations-network#configuring-copilot-subscription-based-network-routing-for-your-enterprise-or-organization) for more details.
+- This server is designed for localhost or trusted local-network use. It is intentionally single-account: all clients share the same GitHub/Copilot token and model cache.
+- The server binds to `127.0.0.1` by default, so it is not reachable from other machines unless you explicitly pass `--host` (for example `--host 0.0.0.0`). The Docker entrypoint passes `--host 0.0.0.0` so published ports work; restrict exposure on the host side with `-p 127.0.0.1:4141:4141` if you want it local-only.
diff --git a/entrypoint.sh b/entrypoint.sh
index dfe63c902..7d5cc4fdc 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -4,6 +4,8 @@ if [ "$1" = "--auth" ]; then
   exec bun run dist/main.js auth
 else
   # Default command
-  exec bun run dist/main.js start -g "$GH_TOKEN" "$@"
+  # Bind to 0.0.0.0 inside the container so published ports work;
+  # restrict exposure on the host side (e.g. -p 127.0.0.1:4141:4141).
+  exec bun run dist/main.js start --host 0.0.0.0 -g "$GH_TOKEN" "$@"
 fi
 
diff --git a/package.json b/package.json
index a5adbb8e7..3b2744fa0 100644
--- a/package.json
+++ b/package.json
@@ -27,6 +27,7 @@
     "knip": "knip-bun",
     "lint": "eslint --cache",
     "lint:all": "eslint --cache .",
+    "models": "bun run ./src/main.ts models",
     "prepack": "bun run build",
     "prepare": "simple-git-hooks",
     "release": "bumpp && bun publish --access public",
diff --git a/src/main.ts b/src/main.ts
index 4f6ca784b..1aa5a5e9f 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -5,6 +5,7 @@ import { defineCommand, runMain } from "citty"
 import { auth } from "./auth"
 import { checkUsage } from "./check-usage"
 import { debug } from "./debug"
+import { models } from "./models"
 import { start } from "./start"
 
 const main = defineCommand({
@@ -13,7 +14,7 @@ const main = defineCommand({
     description:
       "A wrapper around GitHub Copilot API to make it OpenAI compatible, making it usable for other tools.",
   },
-  subCommands: { auth, start, "check-usage": checkUsage, debug },
+  subCommands: { auth, start, models, "check-usage": checkUsage, debug },
 })
 
 await runMain(main)
diff --git a/src/models.ts b/src/models.ts
new file mode 100644
index 000000000..52b0b3e4d
--- /dev/null
+++ b/src/models.ts
@@ -0,0 +1,120 @@
+#!/usr/bin/env node
+
+import { defineCommand } from "citty"
+import consola from "consola"
+
+import { ensurePaths } from "./lib/paths"
+import { initProxyFromEnv } from "./lib/proxy"
+import { state } from "./lib/state"
+import { setupGitHubToken } from "./lib/token"
+import { cacheVSCodeVersion } from "./lib/utils"
+import { getModels } from "./services/copilot/get-models"
+import { getCopilotToken } from "./services/github/get-copilot-token"
+
+interface RunModelsOptions {
+  verbose: boolean
+  accountType: string
+  githubToken?: string
+  showToken: boolean
+  proxyEnv: boolean
+  json: boolean
+}
+
+export async function runModels(options: RunModelsOptions): Promise<void> {
+  if (options.proxyEnv) {
+    initProxyFromEnv()
+  }
+
+  if (options.verbose) {
+    consola.level = 5
+    consola.info("Verbose logging enabled")
+  }
+
+  state.accountType = options.accountType
+  state.showToken = options.showToken
+
+  await ensurePaths()
+  await cacheVSCodeVersion()
+
+  if (options.githubToken) {
+    state.githubToken = options.githubToken
+    consola.info("Using provided GitHub token")
+  } else {
+    await setupGitHubToken()
+  }
+
+  const { token } = await getCopilotToken()
+  state.copilotToken = token
+
+  if (state.showToken) {
+    consola.info("Copilot token:", token)
+  }
+
+  const models = await getModels()
+
+  if (options.json) {
+    console.log(JSON.stringify(models, null, 2))
+    return
+  }
+
+  consola.box(
+    models.data
+      .map((model) => {
+        const endpoints = model.supported_endpoints?.join(", ") ?? "default"
+        return `${model.id} (${model.vendor}; ${endpoints})`
+      })
+      .join("\n"),
+  )
+}
+
+export const models = defineCommand({
+  meta: {
+    name: "models",
+    description: "List the current GitHub Copilot models available to the API",
+  },
+  args: {
+    verbose: {
+      alias: "v",
+      type: "boolean",
+      default: false,
+      description: "Enable verbose logging",
+    },
+    "account-type": {
+      alias: "a",
+      type: "string",
+      default: "individual",
+      description: "Account type to use (individual, business, enterprise)",
+    },
+    "github-token": {
+      alias: "g",
+      type: "string",
+      description:
+        "Provide GitHub token directly (must be generated using the `auth` subcommand)",
+    },
+    "show-token": {
+      type: "boolean",
+      default: false,
+      description: "Show GitHub and Copilot tokens on fetch",
+    },
+    "proxy-env": {
+      type: "boolean",
+      default: false,
+      description: "Initialize proxy from environment variables",
+    },
+    json: {
+      type: "boolean",
+      default: false,
+      description: "Output raw model response as JSON",
+    },
+  },
+  run({ args }) {
+    return runModels({
+      verbose: args.verbose,
+      accountType: args["account-type"],
+      githubToken: args["github-token"],
+      showToken: args["show-token"],
+      proxyEnv: args["proxy-env"],
+      json: args.json,
+    })
+  },
+})
diff --git a/src/routes/chat-completions/handler.ts b/src/routes/chat-completions/handler.ts
index 04a5ae9ed..c17cccf3b 100644
--- a/src/routes/chat-completions/handler.ts
+++ b/src/routes/chat-completions/handler.ts
@@ -13,6 +13,14 @@ import {
   type ChatCompletionResponse,
   type ChatCompletionsPayload,
 } from "~/services/copilot/create-chat-completions"
+import {
+  createResponsesFromChatCompletions,
+  responseEventToChatChunks,
+  responseToChatCompletion,
+  shouldUseResponsesEndpoint,
+  type ResponseApiResponse,
+  type ResponsesStreamState,
+} from "~/services/copilot/create-responses"
 
 export async function handleCompletion(c: Context) {
   await checkRateLimit(state)
@@ -47,6 +55,36 @@ export async function handleCompletion(c: Context) {
     consola.debug("Set max_tokens to:", JSON.stringify(payload.max_tokens))
   }
 
+  if (shouldUseResponsesEndpoint(selectedModel?.supported_endpoints)) {
+    const response = await createResponsesFromChatCompletions(payload)
+
+    if (isNonStreamingResponse(response)) {
+      consola.debug("Non-streaming response:", JSON.stringify(response))
+      return c.json(responseToChatCompletion(response))
+    }
+
+    consola.debug("Streaming response from responses endpoint")
+    return streamSSE(c, async (stream) => {
+      const streamState: ResponsesStreamState = {
+        id: "",
+        model: payload.model,
+        created: Math.floor(Date.now() / 1000),
+        roleSent: false,
+      }
+
+      for await (const event of response) {
+        if (!event.data) continue
+
+        const chunks = responseEventToChatChunks(event.data, streamState)
+        for (const chunk of chunks) {
+          await stream.writeSSE({
+            data: chunk === "[DONE]" ? chunk : JSON.stringify(chunk),
+          })
+        }
+      }
+    })
+  }
+
   const response = await createChatCompletions(payload)
 
   if (isNonStreaming(response)) {
@@ -66,3 +104,7 @@ export async function handleCompletion(c: Context) {
 const isNonStreaming = (
   response: Awaited<ReturnType<typeof createChatCompletions>>,
 ): response is ChatCompletionResponse => Object.hasOwn(response, "choices")
+
+const isNonStreamingResponse = (
+  response: Awaited<ReturnType<typeof createResponsesFromChatCompletions>>,
+): response is ResponseApiResponse => !(Symbol.asyncIterator in response)
diff --git a/src/routes/messages/non-stream-translation.ts b/src/routes/messages/non-stream-translation.ts
index dc41e6382..3f3939b60 100644
--- a/src/routes/messages/non-stream-translation.ts
+++ b/src/routes/messages/non-stream-translation.ts
@@ -352,6 +352,20 @@ function getAnthropicToolUseBlocks(
     type: "tool_use",
     id: toolCall.id,
     name: toolCall.function.name,
-    input: JSON.parse(toolCall.function.arguments) as Record<string, unknown>,
+    input: safeParseToolInput(toolCall.function.arguments),
   }))
 }
+
+// Parse tool-call arguments; non-object JSON (arrays included) yields {}.
+function safeParseToolInput(args: string): Record<string, unknown> {
+  try {
+    const parsed: unknown = JSON.parse(args)
+    if (parsed === null || Array.isArray(parsed)) return {}
+    if (typeof parsed === "object") return parsed as Record<string, unknown>
+    return {}
+  } catch {
+    // Models occasionally emit malformed JSON for tool arguments.
+    // Return an empty input instead of crashing the whole response.
+    return {}
+  }
+}
diff --git a/src/services/copilot/create-responses.ts b/src/services/copilot/create-responses.ts
new file mode 100644
index 000000000..d42be3df9
--- /dev/null
+++ b/src/services/copilot/create-responses.ts
@@ -0,0 +1,463 @@
+import consola from "consola"
+import { events } from "fetch-event-stream"
+
+import { copilotBaseUrl, copilotHeaders } from "~/lib/api-config"
+import { HTTPError } from "~/lib/error"
+import { state } from "~/lib/state"
+
+import type {
+  ChatCompletionChunk,
+  ChatCompletionResponse,
+  ChatCompletionsPayload,
+  ContentPart,
+  Message,
+  Tool,
+  ToolCall,
+} from "./create-chat-completions"
+
+export const createResponsesFromChatCompletions = async (
+  payload: ChatCompletionsPayload,
+) => {
+  if (!state.copilotToken) throw new Error("Copilot token not found")
+
+  const response = await fetch(`${copilotBaseUrl(state)}/responses`, {
+    method: "POST",
+    headers: copilotHeaders(state),
+    body: JSON.stringify(toResponsesPayload(payload)),
+  })
+
+  if (!response.ok) {
+    throw new HTTPError("Failed to create response", response)
+  }
+
+  if (payload.stream) {
+    return events(response)
+  }
+
+  return (await response.json()) as ResponseApiResponse
+}
+
+// Route to /responses only for models that offer it and not /chat/completions.
+export const shouldUseResponsesEndpoint = (endpoints?: Array<string>) =>
+  Boolean(endpoints?.includes("/responses"))
+  && !endpoints?.includes("/chat/completions")
+
+export function responseToChatCompletion(
+  response: ResponseApiResponse,
+): ChatCompletionResponse {
+  const toolCalls = getResponseToolCalls(response)
+  const text = getResponseText(response)
+
+  return {
+    id: response.id,
+    object: "chat.completion",
+    created: response.created_at,
+    model: response.model,
+    choices: [
+      {
+        index: 0,
+        message: {
+          role: "assistant",
+          // Chat Completions convention: content is null when the turn is
+          // purely tool calls.
+          content: text || (toolCalls.length > 0 ? null : ""),
+          ...(toolCalls.length > 0 && { tool_calls: toolCalls }),
+        },
+        logprobs: null,
+        finish_reason: nonStreamFinishReason(response, toolCalls.length > 0),
+      },
+    ],
+    usage: toChatUsage(response.usage),
+  }
+}
+
+function nonStreamFinishReason(
+  response: ResponseApiResponse,
+  hasToolCalls: boolean,
+): "stop" | "length" | "tool_calls" {
+  if (hasToolCalls) return "tool_calls"
+  return response.status === "completed" ? "stop" : "length"
+}
+
+function getResponseToolCalls(response: ResponseApiResponse): Array<ToolCall> {
+  return (
+    response.output
+      ?.filter((item) => item.type === "function_call")
+      .map((item) => ({
+        id: item.call_id ?? item.id ?? "",
+        type: "function" as const,
+        function: {
+          name: item.name ?? "",
+          arguments: item.arguments ?? "",
+        },
+      })) ?? []
+  )
+}
+
+type ChatChunkDelta = ChatCompletionChunk["choices"][number]["delta"]
+type ChatChunkFinish = ChatCompletionChunk["choices"][number]["finish_reason"]
+
+interface ChunkParts {
+  delta: ChatChunkDelta
+  finishReason: ChatChunkFinish
+  usage?: ChatCompletionChunk["usage"]
+}
+
+function buildChunk(
+  state: ResponsesStreamState,
+  { delta, finishReason, usage }: ChunkParts,
+): ChatCompletionChunk {
+  return {
+    id: state.id,
+    object: "chat.completion.chunk",
+    created: state.created,
+    model: state.model,
+    choices: [{ index: 0, delta, finish_reason: finishReason, logprobs: null }],
+    ...(usage && { usage }),
+  }
+}
+
+function consumeRole(state: ResponsesStreamState): "assistant" | undefined {
+  const role = state.roleSent ? undefined : "assistant"
+  state.roleSent = true
+  return role
+}
+
+function handleTextDelta(
+  event: ResponseStreamEvent,
+  state: ResponsesStreamState,
+): Array<ChatCompletionChunk | "[DONE]"> {
+  if (!event.delta) return []
+  return [
+    buildChunk(state, {
+      delta: { role: consumeRole(state), content: event.delta },
+      finishReason: null,
+    }),
+  ]
+}
+
+function handleFunctionCallAdded(
+  event: ResponseStreamEvent,
+  state: ResponsesStreamState,
+): Array<ChatCompletionChunk | "[DONE]"> {
+  if (event.item?.type !== "function_call") return []
+
+  state.toolCalls ??= {}
+  const index = state.toolCallCount ?? 0
+  state.toolCallCount = index + 1
+  state.hasToolCalls = true
+  state.toolCalls[event.item.id ?? event.item_id ?? String(index)] = { index }
+
+  return [
+    buildChunk(state, {
+      delta: {
+        role: consumeRole(state),
+        tool_calls: [
+          {
+            index,
+            id: event.item.call_id ?? event.item.id ?? "",
+            type: "function",
+            function: { name: event.item.name ?? "", arguments: "" },
+          },
+        ],
+      },
+      finishReason: null,
+    }),
+  ]
+}
+
+function handleFunctionCallArgsDelta(
+  event: ResponseStreamEvent,
+  state: ResponsesStreamState,
+): Array<ChatCompletionChunk | "[DONE]"> {
+  if (!event.item_id || !event.delta) return []
+  const toolCall = state.toolCalls?.[event.item_id]
+  if (!toolCall) return []
+
+  return [
+    buildChunk(state, {
+      delta: {
+        tool_calls: [
+          { index: toolCall.index, function: { arguments: event.delta } },
+        ],
+      },
+      finishReason: null,
+    }),
+  ]
+}
+
+// Closing chunk plus "[DONE]"; an incomplete response ends as "length".
+function handleCompleted(
+  event: ResponseStreamEvent,
+  state: ResponsesStreamState,
+): Array<ChatCompletionChunk | "[DONE]"> {
+  if (!event.response) return []
+  state.id = event.response.id
+  state.model = event.response.model
+  state.created = event.response.created_at
+  state.roleSent = true
+  const endReason = event.type === "response.incomplete" ? "length" : "stop"
+  return [
+    buildChunk(state, {
+      delta: { content: null },
+      finishReason: state.hasToolCalls ? "tool_calls" : endReason,
+      usage: toChatUsage(event.response.usage),
+    }),
+    "[DONE]",
+  ]
+}
+
+/** Translates one Responses SSE data payload into zero or more chat chunks. */
+export function responseEventToChatChunks(
+  eventData: string,
+  state: ResponsesStreamState,
+): Array<ChatCompletionChunk | "[DONE]"> {
+  let event: ResponseStreamEvent
+  try {
+    event = JSON.parse(eventData) as ResponseStreamEvent
+  } catch {
+    // Skip non-JSON SSE events (e.g. keepalives); don't kill the stream.
+    consola.debug("Skipping non-JSON responses stream event:", eventData)
+    return []
+  }
+  if (event.type === "response.created" && event.response) {
+    state.id = event.response.id
+    state.model = event.response.model
+    state.created = event.response.created_at
+    return []
+  }
+  switch (event.type) {
+    case "response.output_text.delta": {
+      return handleTextDelta(event, state)
+    }
+    case "response.output_item.added": {
+      return handleFunctionCallAdded(event, state)
+    }
+    case "response.function_call_arguments.delta": {
+      return handleFunctionCallArgsDelta(event, state)
+    }
+    case "response.completed":
+    case "response.incomplete": {
+      return handleCompleted(event, state)
+    }
+    default: {
+      return []
+    }
+  }
+}
+
+export interface ResponsesStreamState {
+  created: number
+  id: string
+  model: string
+  roleSent: boolean
+  // Maps a streamed function_call item id to its chat-completions tool_calls
+  // array index. Lazily initialized so existing construction sites still work.
+  toolCalls?: Record<string, { index: number }>
+  toolCallCount?: number
+  hasToolCalls?: boolean
+}
+
+interface ResponsesPayload {
+  input: Array<ResponseInputItem>
+  max_output_tokens?: number | null
+  model: string
+  stream?: boolean | null
+  temperature?: number | null
+  top_p?: number | null
+  tools?: Array<ResponsesFunctionTool>
+  tool_choice?: ResponsesToolChoice
+}
+
+interface ResponseInputMessage {
+  type?: "message"
+  content: string
+  role: "assistant" | "developer" | "system" | "user"
+}
+
+interface ResponseFunctionCallInput {
+  type: "function_call"
+  call_id: string
+  name: string
+  arguments: string
+}
+
+interface ResponseFunctionCallOutputInput {
+  type: "function_call_output"
+  call_id: string
+  output: string
+}
+
+type ResponseInputItem =
+  | ResponseInputMessage
+  | ResponseFunctionCallInput
+  | ResponseFunctionCallOutputInput
+
+interface ResponsesFunctionTool {
+  type: "function"
+  name: string
+  description?: string
+  parameters: Record<string, unknown>
+}
+
+type ResponsesToolChoice =
+  | "auto"
+  | "none"
+  | "required"
+  | { type: "function"; name: string }
+
+export interface ResponseApiResponse {
+  created_at: number
+  error?: unknown
+  id: string
+  model: string
+  object: "response"
+  output?: Array<ResponseOutputItem>
+  status: string
+  usage?: ResponseUsage | null
+}
+
+interface ResponseOutputItem {
+  type?: string
+  content?: Array<ResponseContentPart>
+  // Present when type === "function_call"
+  id?: string
+  call_id?: string
+  name?: string
+  arguments?: string
+}
+
+interface ResponseContentPart {
+  text?: string
+  type: string
+}
+
+interface ResponseUsage {
+  input_tokens?: number
+  output_tokens?: number
+  total_tokens?: number
+}
+
+interface ResponseStreamEvent {
+  delta?: string
+  response?: ResponseApiResponse
+  type: string
+  // Function-call streaming fields
+  item?: ResponseOutputItem
+  item_id?: string
+  output_index?: number
+}
+
+export function toResponsesPayload(
+  payload: ChatCompletionsPayload,
+): ResponsesPayload {
+  return {
+    model: payload.model,
+    input: payload.messages.flatMap((message) => toResponseInputItems(message)),
+    max_output_tokens: payload.max_tokens,
+    stream: payload.stream,
+    temperature: payload.temperature,
+    top_p: payload.top_p,
+    tools: toResponsesTools(payload.tools),
+    tool_choice: toResponsesToolChoice(payload.tool_choice),
+  }
+}
+
+function toResponsesTools(
+  tools: ChatCompletionsPayload["tools"],
+): Array<ResponsesFunctionTool> | undefined {
+  if (!tools || tools.length === 0) return undefined
+  // Chat Completions nests the function under `function`; the Responses API
+  // expects the name/description/parameters flattened onto the tool itself.
+  return tools.map((tool: Tool) => ({
+    type: "function",
+    name: tool.function.name,
+    description: tool.function.description,
+    parameters: tool.function.parameters,
+  }))
+}
+
+function toResponsesToolChoice(
+  toolChoice: ChatCompletionsPayload["tool_choice"],
+): ResponsesToolChoice | undefined {
+  if (toolChoice === null || toolChoice === undefined) return undefined
+  if (typeof toolChoice === "string") return toolChoice
+  return { type: "function", name: toolChoice.function.name }
+}
+
+function toResponseInputItems(message: Message): Array<ResponseInputItem> {
+  // A tool result becomes a function_call_output keyed by its call id.
+  if (message.role === "tool") {
+    return [
+      {
+        type: "function_call_output",
+        call_id: message.tool_call_id ?? "",
+        output: contentToText(message.content),
+      },
+    ]
+  }
+
+  // An assistant turn that issued tool calls becomes optional text followed by
+  // one function_call item per call.
+  if (
+    message.role === "assistant"
+    && message.tool_calls
+    && message.tool_calls.length > 0
+  ) {
+    const items: Array<ResponseInputItem> = []
+    const text = contentToText(message.content)
+    if (text) {
+      items.push({ type: "message", role: "assistant", content: text })
+    }
+    for (const toolCall of message.tool_calls) {
+      items.push({
+        type: "function_call",
+        call_id: toolCall.id,
+        name: toolCall.function.name,
+        arguments: toolCall.function.arguments,
+      })
+    }
+    return items
+  }
+
+  return [
+    {
+      type: "message",
+      role: message.role,
+      content: contentToText(message.content),
+    },
+  ]
+}
+
+function contentToText(content: Message["content"]): string {
+  if (typeof content === "string") return content
+  if (!content) return ""
+
+  return content.map((part) => contentPartToText(part)).join("\n")
+}
+
+function contentPartToText(part: ContentPart): string {
+  if (part.type === "text") return part.text
+
+  return `[image: ${part.image_url.url}]`
+}
+
+// Concatenates every output_text part of the response, in output order.
+function getResponseText(response: ResponseApiResponse): string {
+  const parts = (response.output ?? []).flatMap((item) => item.content ?? [])
+  let text = ""
+  for (const part of parts) {
+    if (part.type === "output_text") text += part.text ?? ""
+  }
+  return text
+}
+
+// Maps Responses usage to Chat Completions usage; absent counts become 0.
+function toChatUsage(usage?: ResponseUsage | null) {
+  if (!usage) return undefined
+  const prompt_tokens = usage.input_tokens ?? 0
+  const completion_tokens = usage.output_tokens ?? 0
+  // Derive the total when upstream omits it rather than reporting 0.
+  const total_tokens = usage.total_tokens ?? prompt_tokens + completion_tokens
+  return { prompt_tokens, completion_tokens, total_tokens }
+}
diff --git a/src/services/copilot/get-models.ts b/src/services/copilot/get-models.ts
index 3cfa30af0..efd61f276 100644
--- a/src/services/copilot/get-models.ts
+++ b/src/services/copilot/get-models.ts
@@ -46,6 +46,7 @@ export interface Model {
   name: string
   object: string
   preview: boolean
+  supported_endpoints?: Array<string>
   vendor: string
   version: string
   policy?: {
diff --git a/src/services/get-vscode-version.ts b/src/services/get-vscode-version.ts
index 6078f09b5..51ea5e24f 100644
--- a/src/services/get-vscode-version.ts
+++ b/src/services/get-vscode-version.ts
@@ -29,5 +29,3 @@ export async function getVSCodeVersion() {
     clearTimeout(timeout)
   }
 }
-
-await getVSCodeVersion()
diff --git a/src/start.ts b/src/start.ts
index 14abbbdff..1e8a48320 100644
--- a/src/start.ts
+++ b/src/start.ts
@@ -16,6 +16,7 @@ import { server } from "./server"
 
 interface RunServerOptions {
   port: number
+  host: string
   verbose: boolean
   accountType: string
   manual: boolean
@@ -64,7 +65,14 @@ export async function runServer(options: RunServerOptions): Promise<void> {
     `Available models: \n${state.models?.data.map((model) => `- ${model.id}`).join("\n")}`,
   )
 
-  const serverUrl = `http://localhost:${options.port}`
+  const displayHost = options.host === "0.0.0.0" ? "localhost" : options.host
+  const serverUrl = `http://${displayHost}:${options.port}`
+
+  if (options.host !== "127.0.0.1" && options.host !== "localhost") {
+    consola.warn(
+      `Server will listen on ${options.host} and may be reachable from other machines. Use the default host (127.0.0.1) for local-only access.`,
+    )
+  }
 
   if (options.claudeCode) {
     invariant(state.models, "Models should be loaded by now")
@@ -116,6 +124,7 @@ export async function runServer(options: RunServerOptions): Promise<void> {
 
   serve({
     fetch: server.fetch as ServerHandler,
+    hostname: options.host,
     port: options.port,
   })
 }
@@ -132,6 +141,12 @@ export const start = defineCommand({
       default: "4141",
       description: "Port to listen on",
     },
+    host: {
+      type: "string",
+      default: "127.0.0.1",
+      description:
+        "Host to bind to. Defaults to 127.0.0.1 (local-only). Use 0.0.0.0 to expose to the network (e.g. inside Docker)",
+    },
     verbose: {
       alias: "v",
       type: "boolean",
@@ -193,6 +208,7 @@ export const start = defineCommand({
 
     return runServer({
       port: Number.parseInt(args.port, 10),
+      host: args.host,
       verbose: args.verbose,
       accountType: args["account-type"],
       manual: args.manual,
diff --git a/tests/anthropic-edge-cases.test.ts b/tests/anthropic-edge-cases.test.ts
new file mode 100644
index 000000000..aa4f35dc3
--- /dev/null
+++ b/tests/anthropic-edge-cases.test.ts
@@ -0,0 +1,494 @@
+import { describe, test, expect } from "bun:test"
+
+import type {
+  AnthropicMessagesPayload,
+  AnthropicStreamState,
+  AnthropicTextBlock,
+  AnthropicToolUseBlock,
+} from "~/routes/messages/anthropic-types"
+import type {
+  ChatCompletionChunk,
+  ChatCompletionResponse,
+  ContentPart,
+} from "~/services/copilot/create-chat-completions"
+
+import {
+  translateToAnthropic,
+  translateToOpenAI,
+} from "../src/routes/messages/non-stream-translation"
+import { translateChunkToAnthropicEvents } from "../src/routes/messages/stream-translation"
+
+// Translator state as it stands before any chunk is processed: message_start
+// not yet sent, block index 0, no content block open, no tool calls tracked.
+const freshStreamState = (): AnthropicStreamState => ({
+  messageStartSent: false,
+  contentBlockIndex: 0,
+  contentBlockOpen: false,
+  toolCalls: {},
+})
+
+function makeChunk(partial: Partial<ChatCompletionChunk>): ChatCompletionChunk {
+  const defaults: ChatCompletionChunk = {
+    id: "chunk-1",
+    object: "chat.completion.chunk",
+    created: 0,
+    model: "gpt-4o",
+    choices: [],
+  }
+  return { ...defaults, ...partial } // fields supplied in `partial` win
+}
+
+describe("Anthropic request edge cases", () => {
+  test("translates image blocks into image_url content parts", () => {
+    const payload: AnthropicMessagesPayload = {
+      model: "gpt-4o",
+      max_tokens: 100,
+      messages: [
+        {
+          role: "user",
+          content: [
+            { type: "text", text: "What is in this image?" },
+            {
+              type: "image",
+              source: {
+                type: "base64",
+                media_type: "image/png",
+                data: "iVBORw0KGgo=",
+              },
+            },
+          ],
+        },
+      ],
+    }
+
+    const result = translateToOpenAI(payload)
+    const userMessage = result.messages.find((m) => m.role === "user")
+    expect(Array.isArray(userMessage?.content)).toBe(true) // text + image must stay an array of parts
+
+    const parts = userMessage?.content as Array<ContentPart>
+    const imagePart = parts.find((p) => p.type === "image_url")
+    expect(imagePart).toBeDefined()
+    if (imagePart?.type === "image_url") {
+      expect(imagePart.image_url.url).toBe("data:image/png;base64,iVBORw0KGgo=") // media_type + data folded into a data URL
+    }
+    const textPart = parts.find((p) => p.type === "text")
+    expect(textPart).toBeDefined()
+  })
+
+  test("translates tool_result blocks into tool role messages before user content", () => {
+    const payload: AnthropicMessagesPayload = {
+      model: "gpt-4o",
+      max_tokens: 100,
+      messages: [
+        { role: "user", content: "What's the weather?" },
+        {
+          role: "assistant",
+          content: [
+            {
+              type: "tool_use",
+              id: "toolu_1",
+              name: "get_weather",
+              input: { location: "Boston" },
+            },
+          ],
+        },
+        {
+          role: "user",
+          content: [
+            { type: "text", text: "Thanks, summarize that." },
+            {
+              type: "tool_result",
+              tool_use_id: "toolu_1",
+              content: "Sunny, 75F",
+            },
+          ],
+        },
+      ],
+    }
+
+    const result = translateToOpenAI(payload)
+    const roles = result.messages.map((m) => m.role)
+
+    // The tool message must sit immediately after the assistant turn that issued the tool call
+    const assistantIndex = roles.indexOf("assistant")
+    expect(roles[assistantIndex + 1]).toBe("tool")
+
+    const toolMessage = result.messages[assistantIndex + 1]
+    expect(toolMessage.tool_call_id).toBe("toolu_1")
+    expect(toolMessage.content).toBe("Sunny, 75F")
+
+    // The text block from the same user turn is still delivered, after the tool message
+    expect(roles[assistantIndex + 2]).toBe("user")
+  })
+
+  test("handles multiple tool_result blocks in a single user message", () => {
+    const payload: AnthropicMessagesPayload = {
+      model: "gpt-4o",
+      max_tokens: 100,
+      messages: [
+        {
+          role: "assistant",
+          content: [
+            { type: "tool_use", id: "toolu_a", name: "a", input: {} },
+            { type: "tool_use", id: "toolu_b", name: "b", input: {} },
+          ],
+        },
+        {
+          role: "user",
+          content: [
+            { type: "tool_result", tool_use_id: "toolu_a", content: "A" },
+            { type: "tool_result", tool_use_id: "toolu_b", content: "B" },
+          ],
+        },
+      ],
+    }
+
+    const result = translateToOpenAI(payload)
+    const toolMessages = result.messages.filter((m) => m.role === "tool")
+    expect(toolMessages).toHaveLength(2) // one tool message per tool_result block
+    expect(toolMessages.map((m) => m.tool_call_id)).toEqual([
+      "toolu_a",
+      "toolu_b",
+    ])
+  })
+})
+
+function makeResponse(
+  overrides: Partial<ChatCompletionResponse>,
+): ChatCompletionResponse {
+  const defaults = {
+    id: "resp-1",
+    object: "chat.completion",
+    created: 0,
+    model: "gpt-4o",
+    choices: [],
+  }
+  return { ...defaults, ...overrides } as ChatCompletionResponse // overrides win
+}
+
+describe("Anthropic response edge cases", () => {
+  test("does not crash on invalid tool call JSON and falls back to empty input", () => {
+    const response = makeResponse({
+      choices: [
+        {
+          index: 0,
+          logprobs: null,
+          finish_reason: "tool_calls",
+          message: {
+            role: "assistant",
+            content: null,
+            tool_calls: [
+              {
+                id: "call_bad",
+                type: "function",
+                function: { name: "broken_tool", arguments: "{not json" },
+              },
+            ],
+          },
+        },
+      ],
+    })
+
+    const result = translateToAnthropic(response)
+    const toolUse = result.content.find(
+      (b): b is AnthropicToolUseBlock => b.type === "tool_use",
+    )
+    expect(toolUse).toBeDefined()
+    expect(toolUse?.input).toEqual({}) // "{not json" cannot be parsed, so input degrades to {}
+    expect(result.stop_reason).toBe("tool_use")
+  })
+
+  test("falls back to empty input when tool arguments parse to a non-object", () => {
+    const response = makeResponse({
+      choices: [
+        {
+          index: 0,
+          logprobs: null,
+          finish_reason: "tool_calls",
+          message: {
+            role: "assistant",
+            content: null,
+            tool_calls: [
+              {
+                id: "call_scalar",
+                type: "function",
+                function: { name: "scalar_tool", arguments: "42" },
+              },
+            ],
+          },
+        },
+      ],
+    })
+
+    const result = translateToAnthropic(response)
+    const toolUse = result.content.find(
+      (b): b is AnthropicToolUseBlock => b.type === "tool_use",
+    )
+    expect(toolUse?.input).toEqual({}) // "42" parses, but to a number rather than an object
+  })
+
+  test("subtracts cached tokens from input_tokens and reports cache_read_input_tokens", () => {
+    const response = makeResponse({
+      choices: [
+        {
+          index: 0,
+          logprobs: null,
+          finish_reason: "stop",
+          message: { role: "assistant", content: "Hi" },
+        },
+      ],
+      usage: {
+        prompt_tokens: 1000,
+        completion_tokens: 5,
+        total_tokens: 1005,
+        prompt_tokens_details: { cached_tokens: 800 },
+      },
+    })
+
+    const result = translateToAnthropic(response)
+    expect(result.usage.input_tokens).toBe(200) // 1000 prompt tokens minus 800 cached
+    expect(result.usage.cache_read_input_tokens).toBe(800)
+    expect(result.usage.output_tokens).toBe(5)
+  })
+
+  test("omits cache_read_input_tokens when no cached token details exist", () => {
+    const response = makeResponse({
+      choices: [
+        {
+          index: 0,
+          logprobs: null,
+          finish_reason: "stop",
+          message: { role: "assistant", content: "Hi" },
+        },
+      ],
+      usage: { prompt_tokens: 100, completion_tokens: 5, total_tokens: 105 },
+    })
+
+    const result = translateToAnthropic(response)
+    expect(result.usage.input_tokens).toBe(100)
+    expect("cache_read_input_tokens" in result.usage).toBe(false) // the key itself must be missing, not just undefined
+  })
+})
+
+describe("Anthropic stream translation edge cases", () => {
+  test("closes a text block before opening a tool block, and vice versa", () => {
+    const state = freshStreamState()
+
+    // 1. first text chunk: also triggers message_start because nothing was sent yet
+    const textEvents = translateChunkToAnthropicEvents(
+      makeChunk({
+        choices: [
+          {
+            index: 0,
+            logprobs: null,
+            finish_reason: null,
+            delta: { role: "assistant", content: "Let me check." },
+          },
+        ],
+      }),
+      state,
+    )
+    expect(textEvents.map((e) => e.type)).toEqual([
+      "message_start",
+      "content_block_start",
+      "content_block_delta",
+    ])
+
+    // 2. tool call chunk: must close the text block first
+    const toolEvents = translateChunkToAnthropicEvents(
+      makeChunk({
+        choices: [
+          {
+            index: 0,
+            logprobs: null,
+            finish_reason: null,
+            delta: {
+              tool_calls: [
+                {
+                  index: 0,
+                  id: "call_1",
+                  type: "function",
+                  function: { name: "get_weather", arguments: "" },
+                },
+              ],
+            },
+          },
+        ],
+      }),
+      state,
+    )
+    expect(toolEvents.map((e) => e.type)).toEqual([
+      "content_block_stop",
+      "content_block_start",
+    ])
+    const blockStart = toolEvents.find((e) => e.type === "content_block_start")
+    expect(
+      blockStart?.type === "content_block_start"
+        && blockStart.content_block.type === "tool_use"
+        && blockStart.content_block.name,
+    ).toBe("get_weather")
+
+    // 3. text again after the tool block: closes tool block, opens new text block
+    const backToTextEvents = translateChunkToAnthropicEvents(
+      makeChunk({
+        choices: [
+          {
+            index: 0,
+            logprobs: null,
+            finish_reason: null,
+            delta: { content: "Done." },
+          },
+        ],
+      }),
+      state,
+    )
+    expect(backToTextEvents.map((e) => e.type)).toEqual([
+      "content_block_stop",
+      "content_block_start",
+      "content_block_delta",
+    ])
+
+    // Three blocks were opened in total (text, tool, text), so the index is expected to end at 2
+    expect(state.contentBlockIndex).toBe(2)
+  })
+
+  test("streams partial tool JSON as input_json_delta tied to the right block", () => {
+    const state = freshStreamState()
+
+    translateChunkToAnthropicEvents(
+      makeChunk({
+        choices: [
+          {
+            index: 0,
+            logprobs: null,
+            finish_reason: null,
+            delta: {
+              role: "assistant",
+              tool_calls: [
+                {
+                  index: 0,
+                  id: "call_1",
+                  type: "function",
+                  function: { name: "get_weather", arguments: "" },
+                },
+              ],
+            },
+          },
+        ],
+      }),
+      state,
+    )
+
+    const argEvents = translateChunkToAnthropicEvents(
+      makeChunk({
+        choices: [
+          {
+            index: 0,
+            logprobs: null,
+            finish_reason: null,
+            delta: {
+              tool_calls: [
+                { index: 0, function: { arguments: '{"location":"Bo' } },
+              ],
+            },
+          },
+        ],
+      }),
+      state,
+    )
+
+    expect(argEvents).toHaveLength(1) // only the delta: the first chunk already announced this tool call
+    const deltaEvent = argEvents[0]
+    expect(
+      deltaEvent.type === "content_block_delta"
+        && deltaEvent.delta.type === "input_json_delta"
+        && deltaEvent.delta.partial_json,
+    ).toBe('{"location":"Bo')
+  })
+
+  test("ignores argument deltas for unknown tool call indices", () => {
+    const state = freshStreamState()
+    state.messageStartSent = true // act as if message_start was already emitted
+
+    const events = translateChunkToAnthropicEvents(
+      makeChunk({
+        choices: [
+          {
+            index: 0,
+            logprobs: null,
+            finish_reason: null,
+            delta: {
+              tool_calls: [{ index: 7, function: { arguments: '{"x":1}' } }],
+            },
+          },
+        ],
+      }),
+      state,
+    )
+
+    expect(events).toHaveLength(0) // state.toolCalls is empty, so index 7 was never registered
+  })
+
+  test("reports cache-aware usage in message_delta on finish", () => {
+    const state = freshStreamState()
+    state.messageStartSent = true
+    state.contentBlockOpen = true // a block is still open when the finish chunk arrives
+
+    const events = translateChunkToAnthropicEvents(
+      makeChunk({
+        choices: [
+          {
+            index: 0,
+            logprobs: null,
+            finish_reason: "stop",
+            delta: {},
+          },
+        ],
+        usage: {
+          prompt_tokens: 500,
+          completion_tokens: 20,
+          total_tokens: 520,
+          prompt_tokens_details: { cached_tokens: 300 },
+        },
+      }),
+      state,
+    )
+
+    const messageDelta = events.find((e) => e.type === "message_delta")
+    expect(messageDelta?.type).toBe("message_delta")
+    if (messageDelta?.type === "message_delta") {
+      expect(messageDelta.usage?.input_tokens).toBe(200) // 500 prompt tokens minus 300 cached
+      expect(messageDelta.usage?.cache_read_input_tokens).toBe(300)
+      expect(messageDelta.usage?.output_tokens).toBe(20)
+    }
+    expect(events.at(-1)?.type).toBe("message_stop")
+  })
+
+  test("handles empty choices chunks without emitting events", () => {
+    const state = freshStreamState()
+    const events = translateChunkToAnthropicEvents(
+      makeChunk({ choices: [] }),
+      state,
+    )
+    expect(events).toHaveLength(0)
+    expect(state.messageStartSent).toBe(false) // an empty chunk must not trigger message_start
+  })
+})
+
+describe("system prompt translation", () => {
+  test("joins array-form system prompts into a single system message", () => {
+    const system = [
+      { type: "text", text: "You are helpful." },
+      { type: "text", text: "Be concise." },
+    ] as Array<AnthropicTextBlock>
+    const payload: AnthropicMessagesPayload = {
+      model: "gpt-4o",
+      max_tokens: 10,
+      system,
+      messages: [{ role: "user", content: "Hi" }],
+    }
+    const [first] = translateToOpenAI(payload).messages
+    expect(first.role).toBe("system")
+    expect(first.content).toBe("You are helpful.\n\nBe concise.") // blank-line join
+  })
+})
diff --git a/tests/count-tokens-handler.test.ts b/tests/count-tokens-handler.test.ts
new file mode 100644
index 000000000..08088ac1f
--- /dev/null
+++ b/tests/count-tokens-handler.test.ts
@@ -0,0 +1,185 @@
+import { afterEach, describe, expect, test } from "bun:test"
+
+import type { AnthropicMessagesPayload } from "~/routes/messages/anthropic-types"
+import type { Model } from "~/services/copilot/get-models"
+
+import { state } from "~/lib/state"
+import { getTokenCount } from "~/lib/tokenizer"
+import { translateToOpenAI } from "~/routes/messages/non-stream-translation"
+import { messageRoutes } from "~/routes/messages/route"
+
+// Minimal Model fixture: `id` doubles as the name and the capability family;
+// every other field is a fixed placeholder (o200k_base tokenizer, tool calls on).
+const makeModel = (id: string): Model => ({
+  id,
+  name: id,
+  object: "model",
+  vendor: "test",
+  version: "1",
+  preview: false,
+  model_picker_enabled: true,
+  capabilities: {
+    family: id,
+    object: "model_capabilities",
+    type: "chat",
+    tokenizer: "o200k_base",
+    limits: { max_output_tokens: 4096 },
+    supports: { tool_calls: true },
+  },
+})
+
+function setModels(...ids: Array<string>): void {
+  state.models = { object: "list", data: ids.map((id) => makeModel(id)) } // overwrites the shared state with one fixture per id
+}
+
+async function countTokens(
+  payload: AnthropicMessagesPayload,
+  headers: Record<string, string> = {},
+): Promise<number> {
+  // POST the payload as JSON; extra headers are merged in after the content type.
+  const response = await messageRoutes.request("/count_tokens", {
+    method: "POST",
+    headers: { "content-type": "application/json", ...headers },
+    body: JSON.stringify(payload),
+  })
+  return ((await response.json()) as { input_tokens: number }).input_tokens
+}
+
+/** Re-derives the pre-overhead, pre-multiplier count: tokenizer input + output. */
+async function baseCount(payload: AnthropicMessagesPayload): Promise<number> {
+  const seeded = state.models?.data.find(({ id }) => id === payload.model)
+  if (seeded === undefined) throw new Error("model not seeded for baseCount")
+  const translated = translateToOpenAI(payload)
+  const counts = await getTokenCount(translated, seeded)
+  return counts.input + counts.output
+}
+
+afterEach(() => {
+  state.models = undefined // clear the models seeded via setModels so they do not carry into the next test
+})
+
+describe("count_tokens handler", () => {
+  test("returns default count of 1 when the model is unknown", async () => {
+    setModels("some-other-model")
+    const result = await countTokens({
+      model: "nonexistent-model",
+      max_tokens: 10,
+      messages: [{ role: "user", content: "Hello there" }],
+    })
+    expect(result).toBe(1) // "nonexistent-model" is not among the seeded models
+  })
+
+  test("applies the 1.15 multiplier for claude models", async () => {
+    setModels("claude-sonnet-4")
+    const payload: AnthropicMessagesPayload = {
+      model: "claude-sonnet-4",
+      max_tokens: 10,
+      messages: [{ role: "user", content: "Hello there, how are you?" }],
+    }
+    const base = await baseCount(payload)
+    const result = await countTokens(payload)
+    expect(result).toBe(Math.round(base * 1.15)) // payload has no tools, so no overhead term
+  })
+
+  test("applies the 1.03 multiplier for grok models", async () => {
+    setModels("grok-code")
+    const payload: AnthropicMessagesPayload = {
+      model: "grok-code",
+      max_tokens: 10,
+      messages: [{ role: "user", content: "Hello there, how are you?" }],
+    }
+    const base = await baseCount(payload)
+    const result = await countTokens(payload)
+    expect(result).toBe(Math.round(base * 1.03))
+  })
+
+  test("adds 346-token tool overhead for claude before the multiplier", async () => {
+    setModels("claude-sonnet-4")
+    const payload: AnthropicMessagesPayload = {
+      model: "claude-sonnet-4",
+      max_tokens: 10,
+      messages: [{ role: "user", content: "What's the weather?" }],
+      tools: [
+        {
+          name: "get_weather",
+          description: "Get weather",
+          input_schema: { type: "object", properties: {} },
+        },
+      ],
+    }
+    const base = await baseCount(payload)
+    const result = await countTokens(payload)
+    expect(result).toBe(Math.round((base + 346) * 1.15)) // overhead is added first, then the sum is scaled
+  })
+
+  test("adds 480-token tool overhead for grok before the multiplier", async () => {
+    setModels("grok-code")
+    const payload: AnthropicMessagesPayload = {
+      model: "grok-code",
+      max_tokens: 10,
+      messages: [{ role: "user", content: "What's the weather?" }],
+      tools: [
+        {
+          name: "get_weather",
+          description: "Get weather",
+          input_schema: { type: "object", properties: {} },
+        },
+      ],
+    }
+    const base = await baseCount(payload)
+    const result = await countTokens(payload)
+    expect(result).toBe(Math.round((base + 480) * 1.03))
+  })
+
+  test("skips tool overhead when an mcp__ tool is present under the claude-code beta", async () => {
+    setModels("claude-sonnet-4")
+    const payload: AnthropicMessagesPayload = {
+      model: "claude-sonnet-4",
+      max_tokens: 10,
+      messages: [{ role: "user", content: "Use a tool" }],
+      tools: [
+        {
+          name: "mcp__server__do_thing",
+          description: "An MCP tool",
+          input_schema: { type: "object", properties: {} },
+        },
+      ],
+    }
+    const base = await baseCount(payload)
+    const result = await countTokens(payload, {
+      "anthropic-beta": "claude-code-20250101",
+    })
+    // No 346 overhead because an mcp__ tool exists under the claude-code beta
+    expect(result).toBe(Math.round(base * 1.15))
+  })
+
+  test("still adds overhead for an mcp__ tool when the claude-code beta is absent", async () => {
+    setModels("claude-sonnet-4")
+    const payload: AnthropicMessagesPayload = {
+      model: "claude-sonnet-4",
+      max_tokens: 10,
+      messages: [{ role: "user", content: "Use a tool" }],
+      tools: [
+        {
+          name: "mcp__server__do_thing",
+          description: "An MCP tool",
+          input_schema: { type: "object", properties: {} },
+        },
+      ],
+    }
+    const base = await baseCount(payload)
+    const result = await countTokens(payload)
+    expect(result).toBe(Math.round((base + 346) * 1.15)) // same payload as the previous test, minus the anthropic-beta header
+  })
+
+  test("falls back to a count of 1 when the body is not valid JSON", async () => {
+    setModels("claude-sonnet-4")
+    const res = await messageRoutes.request("/count_tokens", {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: "{ not valid json",
+    })
+    const json = (await res.json()) as { input_tokens: number }
+    expect(json.input_tokens).toBe(1) // same default as the unknown-model case
+  })
+})
diff --git a/tests/create-responses.test.ts b/tests/create-responses.test.ts
new file mode 100644
index 000000000..468b36216
--- /dev/null
+++ b/tests/create-responses.test.ts
@@ -0,0 +1,457 @@
+import { describe, expect, test } from "bun:test"
+
+import {
+  responseEventToChatChunks,
+  responseToChatCompletion,
+  shouldUseResponsesEndpoint,
+  toResponsesPayload,
+  type ResponseApiResponse,
+  type ResponsesStreamState,
+} from "~/services/copilot/create-responses"
+
+describe("Responses API adapter", () => {
+  test("uses responses only for models that do not support chat completions", () => {
+    expect(shouldUseResponsesEndpoint(["/responses"])).toBe(true) // /responses is the only listed endpoint
+    expect(
+      shouldUseResponsesEndpoint(["/responses", "/chat/completions"]),
+    ).toBe(false)
+    expect(shouldUseResponsesEndpoint(["/chat/completions"])).toBe(false)
+    expect(shouldUseResponsesEndpoint()).toBe(false) // no endpoint list supplied at all
+  })
+
+  test("converts a non-streaming response to a chat completion", () => {
+    const response: ResponseApiResponse = {
+      id: "resp_123",
+      object: "response",
+      created_at: 1700000000,
+      model: "gpt-test",
+      status: "completed",
+      output: [
+        {
+          content: [
+            { type: "output_text", text: "Hello" },
+            { type: "output_text", text: " there" },
+          ],
+        },
+      ],
+      usage: {
+        input_tokens: 2,
+        output_tokens: 3,
+        total_tokens: 5,
+      },
+    }
+
+    const chatCompletion = responseToChatCompletion(response)
+
+    expect(chatCompletion.id).toBe("resp_123")
+    expect(chatCompletion.object).toBe("chat.completion")
+    expect(chatCompletion.choices[0].message.content).toBe("Hello there") // output_text parts concatenated in order
+    expect(chatCompletion.choices[0].finish_reason).toBe("stop")
+    expect(chatCompletion.usage).toEqual({
+      prompt_tokens: 2,
+      completion_tokens: 3,
+      total_tokens: 5,
+    })
+  })
+
+  test("converts response stream events to chat completion chunks", () => {
+    const streamState: ResponsesStreamState = {
+      id: "",
+      model: "gpt-test",
+      created: 0,
+      roleSent: false,
+    }
+
+    expect(
+      responseEventToChatChunks(
+        JSON.stringify({
+          type: "response.created",
+          response: {
+            id: "resp_123",
+            object: "response",
+            created_at: 1700000000,
+            model: "gpt-test",
+            status: "in_progress",
+          },
+        }),
+        streamState,
+      ),
+    ).toEqual([]) // response.created produces no chunk of its own
+
+    const deltaChunks = responseEventToChatChunks(
+      JSON.stringify({
+        type: "response.output_text.delta",
+        delta: "Hello",
+      }),
+      streamState,
+    )
+
+    expect(deltaChunks).toHaveLength(1)
+    expect(deltaChunks[0]).toMatchObject({
+      id: "resp_123", // streamState.id started empty; this id comes from the response.created event
+      object: "chat.completion.chunk",
+      choices: [
+        {
+          delta: {
+            role: "assistant",
+            content: "Hello",
+          },
+          finish_reason: null,
+        },
+      ],
+    })
+
+    const completedChunks = responseEventToChatChunks(
+      JSON.stringify({
+        type: "response.completed",
+        response: {
+          id: "resp_123",
+          object: "response",
+          created_at: 1700000000,
+          model: "gpt-test",
+          status: "completed",
+          usage: {
+            input_tokens: 2,
+            output_tokens: 3,
+            total_tokens: 5,
+          },
+        },
+      }),
+      streamState,
+    )
+
+    expect(completedChunks.at(-1)).toBe("[DONE]") // terminal sentinel comes last
+    expect(completedChunks[0]).toMatchObject({
+      choices: [
+        {
+          delta: {
+            content: null,
+          },
+          finish_reason: "stop",
+        },
+      ],
+      usage: {
+        prompt_tokens: 2,
+        completion_tokens: 3,
+        total_tokens: 5,
+      },
+    })
+  })
+
+  test("skips malformed stream events instead of throwing", () => {
+    const streamState: ResponsesStreamState = {
+      id: "resp_123",
+      model: "gpt-test",
+      created: 1700000000,
+      roleSent: true,
+    }
+
+    expect(responseEventToChatChunks("not json {", streamState)).toEqual([])
+    expect(responseEventToChatChunks("[DONE]", streamState)).toEqual([])
+    expect(responseEventToChatChunks("", streamState)).toEqual([])
+
+    // A valid event sent after the malformed ones must still be translated
+    const deltaChunks = responseEventToChatChunks(
+      JSON.stringify({ type: "response.output_text.delta", delta: "Hi" }),
+      streamState,
+    )
+    expect(deltaChunks).toHaveLength(1)
+  })
+
+  test("ignores unknown event types", () => {
+    const streamState: ResponsesStreamState = {
+      id: "resp_123",
+      model: "gpt-test",
+      created: 1700000000,
+      roleSent: false,
+    }
+
+    expect(
+      responseEventToChatChunks(
+        JSON.stringify({ type: "response.output_item.added" }),
+        streamState,
+      ),
+    ).toEqual([])
+    expect(streamState.roleSent).toBe(false) // an ignored event must not mark the role as sent
+  })
+})
+
+describe("Responses API tool forwarding", () => {
+  test("flattens chat-completions tools into responses function tools", () => {
+    const payload = toResponsesPayload({
+      model: "gpt-test",
+      messages: [{ role: "user", content: "hi" }],
+      tools: [
+        {
+          type: "function",
+          function: {
+            name: "get_weather",
+            description: "Get weather",
+            parameters: { type: "object", properties: { city: {} } },
+          },
+        },
+      ],
+    }) as unknown as {
+      tools?: Array<Record<string, unknown>>
+    }
+
+    expect(payload.tools).toEqual([
+      {
+        type: "function",
+        name: "get_weather",
+        description: "Get weather",
+        parameters: { type: "object", properties: { city: {} } },
+      },
+    ]) // the nested "function" wrapper is gone; its fields sit at the top level
+  })
+
+  test("maps tool_choice variants", () => {
+    const auto = toResponsesPayload({
+      model: "m",
+      messages: [],
+      tool_choice: "auto",
+    }) as unknown as { tool_choice?: unknown }
+    expect(auto.tool_choice).toBe("auto") // string choice passes through unchanged
+
+    const forced = toResponsesPayload({
+      model: "m",
+      messages: [],
+      tool_choice: { type: "function", function: { name: "get_weather" } },
+    }) as unknown as { tool_choice?: unknown }
+    expect(forced.tool_choice).toEqual({
+      type: "function",
+      name: "get_weather",
+    }) // forced choice: the function name is lifted out of the nested object
+
+    const none = toResponsesPayload({
+      model: "m",
+      messages: [],
+    }) as unknown as { tool_choice?: unknown }
+    expect(none.tool_choice).toBeUndefined() // nothing is invented when tool_choice is omitted
+  })
+
+  test("converts assistant tool_calls and tool results into input items", () => {
+    const payload = toResponsesPayload({
+      model: "m",
+      messages: [
+        { role: "user", content: "weather?" },
+        {
+          role: "assistant",
+          content: "Let me check.",
+          tool_calls: [
+            {
+              id: "call_1",
+              type: "function",
+              function: {
+                name: "get_weather",
+                arguments: '{"city":"Boston"}',
+              },
+            },
+          ],
+        },
+        { role: "tool", tool_call_id: "call_1", content: "Sunny" },
+      ],
+    }) as unknown as { input: Array<Record<string, unknown>> }
+
+    expect(payload.input).toEqual([
+      { type: "message", role: "user", content: "weather?" },
+      { type: "message", role: "assistant", content: "Let me check." },
+      {
+        type: "function_call",
+        call_id: "call_1",
+        name: "get_weather",
+        arguments: '{"city":"Boston"}',
+      },
+      { type: "function_call_output", call_id: "call_1", output: "Sunny" },
+    ]) // the assistant text and its tool call become separate input items
+  })
+
+  test("omits assistant text when the turn is purely a tool call", () => {
+    const payload = toResponsesPayload({
+      model: "m",
+      messages: [
+        {
+          role: "assistant",
+          content: null,
+          tool_calls: [
+            {
+              id: "call_9",
+              type: "function",
+              function: { name: "noop", arguments: "{}" },
+            },
+          ],
+        },
+      ],
+    }) as unknown as { input: Array<Record<string, unknown>> }
+
+    expect(payload.input).toEqual([
+      {
+        type: "function_call",
+        call_id: "call_9",
+        name: "noop",
+        arguments: "{}",
+      },
+    ]) // no message item is produced for the null content
+  })
+})
+
+describe("Responses API tool forwarding (responses)", () => {
+  test("converts a non-streaming function_call output into tool_calls", () => {
+    const response: ResponseApiResponse = {
+      id: "resp_1",
+      object: "response",
+      created_at: 1700000000,
+      model: "gpt-test",
+      status: "completed",
+      output: [
+        {
+          type: "function_call",
+          id: "fc_1",
+          call_id: "call_1",
+          name: "get_weather",
+          arguments: '{"city":"Boston"}',
+        },
+      ],
+    }
+
+    const chat = responseToChatCompletion(response)
+    expect(chat.choices[0].finish_reason).toBe("tool_calls") // the only output item is a function_call
+    expect(chat.choices[0].message.content).toBeNull()
+    expect(chat.choices[0].message.tool_calls).toEqual([
+      {
+        id: "call_1", // taken from call_id, not from the item id "fc_1"
+        type: "function",
+        function: { name: "get_weather", arguments: '{"city":"Boston"}' },
+      },
+    ])
+  })
+
+  test("streams a function call as opening + argument chunks", () => {
+    const streamState: ResponsesStreamState = {
+      id: "resp_1",
+      model: "gpt-test",
+      created: 1700000000,
+      roleSent: false,
+    }
+
+    const openChunks = responseEventToChatChunks(
+      JSON.stringify({
+        type: "response.output_item.added",
+        output_index: 0,
+        item: {
+          type: "function_call",
+          id: "fc_1",
+          call_id: "call_1",
+          name: "get_weather",
+          arguments: "",
+        },
+      }),
+      streamState,
+    )
+
+    expect(openChunks).toHaveLength(1)
+    expect(openChunks[0]).toMatchObject({
+      choices: [
+        {
+          delta: {
+            role: "assistant",
+            tool_calls: [
+              {
+                index: 0,
+                id: "call_1",
+                type: "function",
+                function: { name: "get_weather", arguments: "" },
+              },
+            ],
+          },
+          finish_reason: null,
+        },
+      ],
+    })
+
+    const argChunks = responseEventToChatChunks(
+      JSON.stringify({
+        type: "response.function_call_arguments.delta",
+        item_id: "fc_1", // the delta refers to the item id, not the call_id
+        delta: '{"city":',
+      }),
+      streamState,
+    )
+    expect(argChunks).toHaveLength(1)
+    expect(argChunks[0]).toMatchObject({
+      choices: [
+        {
+          delta: {
+            tool_calls: [{ index: 0, function: { arguments: '{"city":' } }],
+          },
+        },
+      ],
+    })
+
+    const completed = responseEventToChatChunks(
+      JSON.stringify({
+        type: "response.completed",
+        response: {
+          id: "resp_1",
+          object: "response",
+          created_at: 1700000000,
+          model: "gpt-test",
+          status: "completed",
+        },
+      }),
+      streamState,
+    )
+    expect(completed[0]).toMatchObject({
+      choices: [{ finish_reason: "tool_calls" }], // a tool call was streamed earlier, so not "stop"
+    })
+    expect(completed.at(-1)).toBe("[DONE]")
+  })
+
+  test("ignores argument deltas for unknown function-call item ids", () => {
+    const streamState: ResponsesStreamState = {
+      id: "resp_1",
+      model: "gpt-test",
+      created: 1700000000,
+      roleSent: true,
+    }
+
+    expect(
+      responseEventToChatChunks(
+        JSON.stringify({
+          type: "response.function_call_arguments.delta",
+          item_id: "unknown",
+          delta: "{}",
+        }),
+        streamState,
+      ),
+    ).toEqual([]) // no output_item.added event ever introduced the item id "unknown"
+  })
+
+  test("assigns increasing indices to parallel function calls", () => {
+    const streamState: ResponsesStreamState = {
+      id: "resp_1",
+      model: "gpt-test",
+      created: 1700000000,
+      roleSent: false,
+    }
+
+    responseEventToChatChunks(
+      JSON.stringify({
+        type: "response.output_item.added",
+        item: { type: "function_call", id: "fc_1", call_id: "c1", name: "a" },
+      }),
+      streamState,
+    )
+    const second = responseEventToChatChunks(
+      JSON.stringify({
+        type: "response.output_item.added",
+        item: { type: "function_call", id: "fc_2", call_id: "c2", name: "b" },
+      }),
+      streamState,
+    )
+
+    expect(second[0]).toMatchObject({
+      choices: [{ delta: { tool_calls: [{ index: 1, id: "c2" }] } }], // the second announced call gets index 1
+    })
+  })
+})