diff --git a/demos/continuous_batching/agentic_ai/README.md b/demos/continuous_batching/agentic_ai/README.md index 4963110568..e2d831019e 100644 --- a/demos/continuous_batching/agentic_ai/README.md +++ b/demos/continuous_batching/agentic_ai/README.md @@ -10,166 +10,32 @@ Here are presented required steps to deploy language models trained for tools su The application employing OpenAI agent SDK is using MCP server. It is equipped with a set of tools to providing context for the content generation. The tools can also be used for automation purposes based on input in text format. -> **Note:** On Windows, make sure to use the weekly or 2025.4 release packages for proper functionality. -## Export LLM model -Currently supported models: -- Qwen/Qwen3-8B -- Qwen/Qwen3-4B -- meta-llama/Llama-3.1-8B-Instruct -- meta-llama/Llama-3.2-3B-Instruct -- NousResearch/Hermes-3-Llama-3.1-8B -- mistralai/Mistral-7B-Instruct-v0.3 -- microsoft/Phi-4-mini-instruct -- Qwen/Qwen3-Coder-30B-A3B-Instruct -- openai/gpt-oss-20b - - -### Export using python script - -Use those steps to convert the model from HugginFace Hub to OpenVINO format and export it to a local storage. - -```console -# Download export script, install its dependencies and create directory for the models -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py -pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt -mkdir models -``` -Run `export_model.py` script to download and quantize the model: - -> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" or "https://www.modelscope.cn/models" before running the export script to connect to the HF Hub. 
- -::::{tab-set} -:::{tab-item} Qwen3-8B -:sync: Qwen3-8B -```console -python export_model.py text_generation --source_model Qwen/Qwen3-8B --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser hermes3 -``` -::: -:::{tab-item} Qwen3-4B -:sync: Qwen3-4B -```console -python export_model.py text_generation --source_model Qwen/Qwen3-4B --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser hermes3 -``` -::: -:::{tab-item} Llama-3.1-8B-Instruct -:sync: Llama-3.1-8B-Instruct -```console -python export_model.py text_generation --source_model meta-llama/Llama-3.1-8B-Instruct --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser llama3 -curl -L -o models/meta-llama/Llama-3.1-8B-Instruct/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_llama3.1_json.jinja -``` -::: -:::{tab-item} Llama-3.2-3B-Instruct -:sync: Llama-3.2-3B-Instruct -```console -python export_model.py text_generation --source_model meta-llama/Llama-3.2-3B-Instruct --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser llama3 -curl -L -o models/meta-llama/Llama-3.2-3B-Instruct/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_llama3.2_json.jinja -``` -::: -:::{tab-item} Hermes-3-Llama-3.1-8B -:sync: Hermes-3-Llama-3.1-8B -```console -python export_model.py text_generation --source_model NousResearch/Hermes-3-Llama-3.1-8B --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser hermes3 -curl -L -o models/NousResearch/Hermes-3-Llama-3.1-8B/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_hermes.jinja -``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3 -:sync: 
Mistral-7B-Instruct-v0.3 -```console -python export_model.py text_generation --source_model mistralai/Mistral-7B-Instruct-v0.3 --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser mistral --extra_quantization_params "--task text-generation-with-past" -curl -L -o models/mistralai/Mistral-7B-Instruct-v0.3/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.10.1.1/examples/tool_chat_template_mistral_parallel.jinja -``` -::: -:::{tab-item} Qwen3-Coder-30B-A3B-Instruct -:sync: Qwen3-Coder-30B-A3B-Instruct -```console -python export_model.py text_generation --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --weight-format int4 --config_file_path models/config.json --model_repository_path models --tool_parser qwen3coder -curl -L -o models/Qwen/Qwen3-Coder-30B-A3B-Instruct/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_qwen3coder_instruct.jinja -``` -::: -:::{tab-item} gpt-oss-20b -:sync: gpt-oss-20b -```console -python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config.json --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss -curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_gpt_oss.jinja -``` -> **Note:** Continuous batching and paged attention are supported for GPT‑OSS. However, when deployed on GPU, the model may experience reduced accuracy under high‑concurrency workloads. This issue will be resolved in version 2026.1 and in the upcoming weekly release. CPU execution is not affected. - -::: -:::{tab-item} Phi-4-mini-instruct -:sync: microsoft/Phi-4-mini-instruct -Note: This model requires a fix in optimum-intel which is currently on a fork. 
-```console -pip3 install transformers==4.53.3 --force-reinstall -pip3 install "optimum-intel[openvino]"@git+https://github.com/helena-intel/optimum-intel/@ea/lonrope_exp -python export_model.py text_generation --source_model microsoft/Phi-4-mini-instruct --weight-format int4 --config_file_path models/config.json --model_repository_path models --tool_parser phi4 --max_num_batched_tokens 99999 -curl -L -o models/microsoft/Phi-4-mini-instruct/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_phi4_mini.jinja -``` -::: -:::: - -> **Note:** To use these models on NPU, set `--weight-format` to either **int4** or **nf4**. When specifying `--extra_quantization_params`, ensure that `ratio` is set to **1.0** and `group_size` is set to **-1** or **128**. For more details, see [OpenVINO GenAI on NPU](https://docs.openvino.ai/nightly/openvino-workflow-generative/inference-with-genai/inference-with-genai-on-npu.html). 
- -### Direct pulling of pre-configured HuggingFace models from docker containers +## Start MCP server with SSE interface -This procedure can be used to pull preconfigured models from OpenVINO organization in HuggingFace Hub -::::{tab-set} -:::{tab-item} Qwen3-8B-int4-ov -:sync: Qwen3-8B-int4-ov -```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:weekly --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation --tool_parser hermes3 -``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3-int4-ov -:sync: Mistral-7B-Instruct-v0.3-int4-ov -```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:weekly --pull --model_repository_path /models --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov --task text_generation --tool_parser mistral -curl -L -o models/OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.10.1.1/examples/tool_chat_template_mistral_parallel.jinja -``` -::: -:::{tab-item} Phi-4-mini-instruct-int4-ov -:sync: Phi-4-mini-instruct-int4-ov +### Linux ```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:weekly --pull --model_repository_path /models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --task text_generation --tool_parser phi4 -curl -L -o models/OpenVINO/Phi-4-mini-instruct-int4-ov/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_phi4_mini.jinja +git clone https://github.com/isdaniel/mcp_weather_server +cd mcp_weather_server && git checkout v0.5.0 +docker build -t mcp-weather-server:sse . 
+docker run -d -p 8080:8080 -e PORT=8080 mcp-weather-server:sse uv run python -m mcp_weather_server --mode sse ``` -::: -:::: - - -### Direct pulling of pre-configured HuggingFace models on Windows -Assuming you have unpacked model server package with python enabled version, make sure to run `setupvars` script -as mentioned in [deployment guide](../../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server. - -::::{tab-set} -:::{tab-item} Qwen3-8B-int4-ov -:sync: Qwen3-8B-int4-ov -```bat -ovms.exe --pull --model_repository_path models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation --tool_parser hermes3 --enable_prefix_caching true -``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3-int4-ov -:sync: Mistral-7B-Instruct-v0.3-int4-ov -```bat -ovms.exe --pull --model_repository_path models --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov --task text_generation --tool_parser mistral --enable_prefix_caching true -curl -L -o models\OpenVINO\Mistral-7B-Instruct-v0.3-int4-ov\chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.10.1.1/examples/tool_chat_template_mistral_parallel.jinja -``` -::: -:::{tab-item} Phi-4-mini-instruct-int4-ov -:sync: Phi-4-mini-instruct-int4-ov -```bat -ovms.exe --pull --model_repository_path models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --task text_generation --tool_parser phi4 --enable_prefix_caching true -curl -L -o models\OpenVINO\Phi-4-mini-instruct-int4-ov\chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_phi4_mini.jinja +### Windows +On Windows the MCP server will be demonstrated as an instance with stdio interface inside the agent application. +File system MCP server requires NodeJS and npx, visit https://nodejs.org/en/download. 
The weather MCP should be installed as python package: +```bat +pip install python-dateutil mcp_weather_server ``` -::: -:::: -You can use similar commands for different models and precision. Change the source_model and other configuration parameters. -> **Note:** Some models give more reliable responses with tuned chat template. -> **Note:** Currently tool parsers are supported for formats compatible with Phi4, Llama3, Mistral, Devstral, Hermes3 or GPT-OSS. +## Prepare the agent +Install the application requirements +```console +curl https://raw.githubusercontent.com/openvinotoolkit/model_server/main/demos/continuous_batching/agentic_ai/openai_agent.py -O -L +pip install openai-agents openai +``` ## Start OVMS @@ -183,72 +49,102 @@ as mentioned in [deployment guide](../../../docs/deploying_server_baremetal.md), ::::{tab-set} :::{tab-item} Qwen3-8B :sync: Qwen3-8B +Pull and start OVMS: ```bat -ovms.exe --rest_port 8000 --source_model Qwen/Qwen3-8B --model_repository_path models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache --enable_prefix_caching true +ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-8B-int4-ov --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache ``` -::: -:::{tab-item} Qwen3-4B -:sync: Qwen3-4B + +Use MCP server: ```bat -ovms.exe --rest_port 8000 --source_model Qwen/Qwen3-4B --model_repository_path models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` -::: -:::{tab-item} Llama-3.1-8B-Instruct -:sync: Llama-3.1-8B-Instruct -```bat -ovms.exe --rest_port 8000 --source_model meta-llama/Llama-3.1-8B-Instruct --model_repository_path models --tool_parser llama3 --target_device GPU --task text_generation --enable_tool_guided_generation true --cache_dir .cache --enable_prefix_caching true + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is at -2.9°C. Wind is blowing from the NE at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover. Visibility is 24.1 km. ``` ::: -:::{tab-item} Llama-3.2-3B-Instruct -:sync: Llama-3.2-3B-Instruct +:::{tab-item} Qwen3-4B +:sync: Qwen3-4B +Pull and start OVMS: ```bat -ovms.exe --rest_port 8000 --source_model meta-llama/Llama-3.2-3B-Instruct --model_repository_path models --tool_parser llama3 --target_device GPU --task text_generation --enable_tool_guided_generation true --cache_dir .cache --enable_prefix_caching true +ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-4B-int4-ov --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache ``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3 -:sync: Mistral-7B-Instruct-v0.3 + +Use MCP server: ```bat -ovms.exe --rest_port 8000 --source_model mistralai/Mistral-7B-Instruct-v0.3 --model_repository_path models --tool_parser mistral --target_device GPU --task text_generation --cache_dir .cache --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Qwen3-4B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is at -2.9°C. Wind is blowing from the NE at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover. Visibility is 24.1 km. ``` ::: :::{tab-item} Phi-4-mini-instruct :sync: Phi-4-mini-instruct +Pull and start OVMS: ```bat -ovms.exe --rest_port 8000 --source_model microsoft/Phi-4-mini-instruct --model_repository_path models --tool_parser phi4 --target_device GPU --task text_generation --enable_tool_guided_generation true --cache_dir .cache --max_num_batched_tokens 99999 --enable_prefix_caching true +ovms.exe --rest_port 8000 --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --model_repository_path c:\models --tool_parser phi4 --target_device GPU --task text_generation --enable_tool_guided_generation true --cache_dir .cache --max_num_batched_tokens 99999 ``` -::: -:::{tab-item} Qwen3-8B-int4-ov -:sync: Qwen3-8B-int4-ov + +Use MCP server: ```bat -ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-8B-int4-ov --model_repository_path models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Phi-4-mini-instruct-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --tool-choice required ``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3-int4-ov -:sync: Mistral-7B-Instruct-v0.3-int4-ov -```bat -ovms.exe --rest_port 8000 --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov --model_repository_path models --tool_parser mistral --target_device GPU --task text_generation --cache_dir .cache --enable_prefix_caching true + +Exemplary output: +```text +The current weather in Tokyo is Overcast with a temperature of 9.4°C (feels like 6.4°C), relative humidity at 42%, and dew point at -2.9°C. Wind is blowing from the NE at 3.6 km/h with gusts up to 24.8 km/h. Atmospheric pressure is 1018.9 hPa with 84% cloud cover. Visibility is 24.1 km. ``` ::: -:::{tab-item} Phi-4-mini-instruct-int4-ov -:sync: Phi-4-mini-instruct-int4-ov +:::{tab-item} Qwen3-30B-A3B-Instruct-2507 +:sync: Qwen3-30B-A3B-Instruct-2507 +Pull and start OVMS: ```bat -ovms.exe --rest_port 8000 --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --model_repository_path models --tool_parser phi4 --target_device GPU --task text_generation --enable_tool_guided_generation true --cache_dir .cache --enable_prefix_caching true +set MOE_USE_MICRO_GEMM_PREFILL=0 +ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache ``` -::: -:::{tab-item} Qwen3-Coder-30B-A3B-Instruct -:sync: Qwen3-Coder-30B-A3B-Instruct + +Use MCP server: ```bat -set MOE_USE_MICRO_GEMM_PREFILL=0 -ovms.exe --rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --target_device GPU --task text_generation --cache_dir .cache --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` + +Exemplary output: +```text +The current weather in Tokyo is Overcast with a temperature of 9.4°C (feels like 6.4°C), relative humidity at 42%, and dew point at -2.9°C. The wind is blowing from the northeast at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover. Visibility is 24.1 km. ``` ::: :::{tab-item} gpt-oss-20b :sync: gpt-oss-20b +Pull and start OVMS: ```bat -ovms.exe --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true --target_device GPU +ovms.exe --rest_port 8000 --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path c:\models --tool_parser gptoss --reasoning_parser gptoss --task text_generation --target_device GPU ``` > **Note:** Continuous batching and paged attention are supported for GPT‑OSS. However, when deployed on GPU, the model may experience reduced accuracy under high‑concurrency workloads. This issue will be resolved in version 2026.1 and in the upcoming weekly release. CPU execution is not affected. +Use MCP server: +```bat +python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/gpt-oss-20b-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` + +Exemplary output: +```text +**Tokyo Current Weather** + +- **Condition:** Overcast +- **Temperature:** 9.4°C (feels like 6.4°C) +- **Humidity:** 42% +- **Dew Point:** 2.9°C +- **Wind:** 3.6km/h from the NE, gusts up to 24.8km/h +- **Pressure:** 1018.9hPa +- **Cloud Cover:** 84% +- **Visibility:** 24.1km + +Let me know if you'd like forecast details or anything else! 
+``` + ::: :::: @@ -257,44 +153,36 @@ ovms.exe --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_p ::::{tab-set} :::{tab-item} Qwen3-8B :sync: Qwen3-8B +Pull and start OVMS: ```bat -ovms.exe --rest_port 8000 --source_model Qwen/Qwen3-8B --model_repository_path models --tool_parser hermes3 --target_device NPU --task text_generation --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000 -``` -::: -:::{tab-item} Qwen3-4B -:sync: Qwen3-4B -```bat -ovms.exe --rest_port 8000 --source_model Qwen/Qwen3-4B --model_repository_path models --tool_parser hermes3 --target_device NPU --task text_generation --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000 +ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-8B-int4-cw-ov --model_repository_path c:\models --tool_parser hermes3 --target_device NPU --task text_generation --cache_dir .cache --max_prompt_len 8000 ``` -::: -:::{tab-item} Llama-3.1-8B-Instruct -:sync: Llama-3.1-8B-Instruct + +Use MCP server: ```bat -ovms.exe --rest_port 8000 --source_model meta-llama/Llama-3.1-8B-Instruct --model_repository_path models --tool_parser llama3 --target_device NPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000 +python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-8B-int4-cw-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` -::: -:::{tab-item} Llama-3.2-3B-Instruct -:sync: Llama-3.2-3B-Instruct -```bat -ovms.exe --rest_port 8000 --source_model meta-llama/Llama-3.2-3B-Instruct --model_repository_path models --tool_parser llama3 --target_device NPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000 + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). 
The relative humidity is at 42%, and the dew point is at -2.9°C. The wind is blowing from the NE at 3.6 km/h, with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, and there is 84% cloud cover. Visibility is 24.1 km.
 ```
 :::
-:::{tab-item} Mistral-7B-Instruct-v0.3
-:sync: Mistral-7B-Instruct-v0.3
+:::{tab-item} Qwen3-4B
+:sync: Qwen3-4B
+Pull and start OVMS:
 ```bat
-ovms.exe --rest_port 8000 --source_model mistralai/Mistral-7B-Instruct-v0.3 --model_repository_path models --tool_parser mistral --target_device NPU --task text_generation --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000
+ovms.exe --rest_port 8000 --source_model FluidInference/qwen3-4b-int4-ov-npu --model_repository_path c:\models --tool_parser hermes3 --target_device NPU --task text_generation --cache_dir .cache --max_prompt_len 8000
 ```
-:::
-:::{tab-item} Qwen3-4B-int4-ov
-:sync: Qwen3-4B-int4-ov
+
+Use MCP server:
 ```bat
-ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-4B-int4-ov --model_repository_path models --tool_parser hermes3 --target_device NPU --task text_generation --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000
+python openai_agent.py --query "What is the current weather in Tokyo?" --model FluidInference/qwen3-4b-int4-ov-npu --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
 ```
-:::
-:::{tab-item} Mistral-7B-Instruct-v0.3-cw-int4-ov
-:sync: Mistral-7B-Instruct-v0.3-cw-int4-ov
-```bat
-ovms.exe --rest_port 8000 --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --model_repository_path models --tool_parser mistral --target_device NPU --task text_generation --enable_prefix_caching true --cache_dir .cache --max_prompt_len 4000
+
+Exemplary output:
+```text
+The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is at -2.9°C. 
The wind is blowing from the NE at 3.6 km/h, with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, and there is 84% cloud cover. Visibility is 24.1 km. ``` ::: :::: @@ -306,79 +194,109 @@ ovms.exe --rest_port 8000 --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4- ::::{tab-set} :::{tab-item} Qwen3-8B :sync: Qwen3-8B +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model Qwen/Qwen3-8B --tool_parser hermes3 --task text_generation --enable_prefix_caching true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --task text_generation ``` -::: -:::{tab-item} Qwen3-4B -:sync: Qwen3-4B + +Use MCP server: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model Qwen/Qwen3-4B --tool_parser hermes3 --task text_generation --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` -::: -:::{tab-item} Llama-3.1-8B-Instruct -:sync: Llama-3.1-8B-Instruct -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model meta-llama/Llama-3.1-8B-Instruct --tool_parser llama3 --task text_generation --enable_prefix_caching true --enable_tool_guided_generation true + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). 
The relative humidity is at 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, with 84% cloud cover and visibility of 24.1 km. ``` ::: -:::{tab-item} Llama-3.2-3B-Instruct -:sync: Llama-3.2-3B-Instruct +:::{tab-item} Qwen3-4B +:sync: Qwen3-4B +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model meta-llama/Llama-3.2-3B-Instruct --tool_parser llama3 --task text_generation --enable_prefix_caching true --enable_tool_guided_generation true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-4B-int4-ov --tool_parser hermes3 --task text_generation ``` -::: -:::{tab-item} Hermes-3-Llama-3.1-8B -:sync: Hermes-3-Llama-3.1-8B + +Use MCP server: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model NousResearch/Hermes-3-Llama-3.1-8B --tool_parser hermes3 --task text_generation --enable_prefix_caching true --enable_tool_guided_generation true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Qwen3-4B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3 -:sync: Mistral-7B-Instruct-v0.3 -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model mistralai/Mistral-7B-Instruct-v0.3 --tool_parser mistral --task text_generation --enable_prefix_caching true + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is at -2.9°C. Wind is blowing from the NE at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover. Visibility is 24.1 km. ``` ::: :::{tab-item} Phi-4-mini-instruct :sync: Phi-4-mini-instruct +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model microsoft/Phi-4-mini-instruct --tool_parser phi4 --task text_generation --enable_prefix_caching true --max_num_batched_tokens 99999 --enable_tool_guided_generation true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --tool_parser phi4 --task text_generation --max_num_batched_tokens 99999 ``` -::: -:::{tab-item} Qwen3-8B-int4-ov -:sync: Qwen3-8B-int4-ov + +Use MCP server: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --task text_generation --enable_prefix_caching true +python 
openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Phi-4-mini-instruct-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --tool-choice required +``` + +Exemplary output: +```text +The current weather in Tokyo is as follows: The sky is mostly covered with clouds, and the temperature is 9.4°C, which feels like 6.4°C due to the humidity. The air is relatively dry with a humidity level of 42%, and the dew point is -2.9°C, indicating that the air is not very moist. The wind is coming from the northeast at a gentle pace of 3.6 km/h, but it can get quite gusty, reaching speeds of up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, which is slightly lower than average, and the sky is mostly cloudy with 84% cloud cover. Visibility is good at 24.1 km, so you can see quite a distance. ``` ::: -:::{tab-item} Mistral-7B-Instruct-v0.3-int4-ov -:sync: Mistral-7B-Instruct-v0.3-int4-ov +:::{tab-item} Qwen3-30B-A3B-Instruct-2507 +:sync: Qwen3-30B-A3B-Instruct-2507 +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov --tool_parser mistral --task text_generation --enable_prefix_caching true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +--rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --task text_generation ``` -::: -:::{tab-item} Phi-4-mini-instruct-int4-ov -:sync: Phi-4-mini-instruct-int4-ov + +Use MCP server: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model 
OpenVINO/Phi-4-mini-instruct-int4-ov --tool_parser phi4 --task text_generation --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h, with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, and there is 84% cloud cover. Visibility is 24.1 km. ``` ::: -:::{tab-item} Qwen3-Coder-30B-A3B-Instruct -:sync: Qwen3-Coder-30B-A3B-Instruct +:::{tab-item} gpt-oss-20b +:sync: gpt-oss-20b +Pull and start OVMS: +```bash +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +--rest_port 8000 --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models \ +--tool_parser gptoss --reasoning_parser gptoss --task text_generation +``` + +Use MCP server: ```bash -docker run -d --user $(id -u):$(id -g) --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --task text_generation --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/gpt-oss-20b-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` + +Exemplary output: +```text +**Tokyo Current Weather** + +- **Condition:** Overcast +- **Temperature:** 9.4°C (feels like 6.4°C) +- **Humidity:** 42% +- **Dew Point:** 2.9°C +- **Wind:** 3.6km/h from the NE, gusts up to 24.8km/h +- **Pressure:** 1018.9hPa +- **Cloud Cover:** 84% +- **Visibility:** 24.1km + +Let me know if you’d like more details (e.g., forecast, precipitation chance, or air‑quality info). ``` ::: :::: @@ -393,90 +311,111 @@ It can be applied using the commands below: ::::{tab-set} :::{tab-item} Qwen3-8B :sync: Qwen3-8B +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model Qwen/Qwen3-8B --tool_parser hermes3 --target_device GPU --task text_generation --enable_prefix_caching true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --target_device GPU --task text_generation ``` -::: -:::{tab-item} Qwen3-4B -:sync: Qwen3-4B + +Use MCP server: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model Qwen/Qwen3-4B --tool_parser hermes3 --target_device GPU --task text_generation --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` -::: -:::{tab-item} Llama-3.1-8B-Instruct -:sync: Llama-3.1-8B-Instruct -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model meta-llama/Llama-3.1-8B-Instruct --tool_parser llama3 --target_device GPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover, and visibility is 24.1 km. ``` ::: -:::{tab-item} Llama-3.2-3B-Instruct -:sync: Llama-3.2-3B-Instruct +:::{tab-item} Qwen3-4B +:sync: Qwen3-4B +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model meta-llama/Llama-3.2-3B-Instruct --tool_parser llama3 --target_device GPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-4B-int4-ov --tool_parser hermes3 --target_device GPU --task text_generation ``` -::: -:::{tab-item} Hermes-3-Llama-3.1-8B -:sync: Hermes-3-Llama-3.1-8B + +Use MCP 
server: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model NousResearch/Hermes-3-Llama-3.1-8B --tool_parser llama3 --target_device GPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-4B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3 -:sync: Mistral-7B-Instruct-v0.3 -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model mistralai/Mistral-7B-Instruct-v0.3 --tool_parser mistral --target_device GPU --task text_generation --enable_prefix_caching true + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is at -2.9°C. Wind is blowing from the NE at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover. Visibility is 24.1 km. 
``` ::: :::{tab-item} Phi-4-mini-instruct :sync: Phi-4-mini-instruct +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model microsoft/Phi-4-mini-instruct --tool_parser phi4 --target_device GPU --task text_generation --max_num_batched_tokens 99999 --enable_prefix_caching true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --tool_parser phi4 --task text_generation --target_device GPU --max_num_batched_tokens 99999 ``` -::: -:::{tab-item} Qwen3-8B-int4-ov -:sync: Qwen3-8B-int4-ov + +Use MCP server: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --target_device GPU --task text_generation --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Phi-4-mini-instruct-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --tool-choice required ``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3-int4-ov -:sync: Mistral-7B-Instruct-v0.3-int4-ov -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov --tool_parser mistral --target_device GPU --task text_generation --enable_prefix_caching true + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h, with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, and there is 84% cloud cover. Visibility is 24.1 km. 
``` ::: -:::{tab-item} Phi-4-mini-instruct-int4-ov -:sync: Phi-4-mini-instruct-int4-ov +:::{tab-item} Qwen3-30B-A3B-Instruct-2507-int4-ov +:sync: Qwen3-30B-A3B-Instruct-2507-int4-ov +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model OpenVINO/Phi-4-mini-instruct-int4-ov --tool_parser phi4 --target_device GPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +--rest_port 8000 --source_model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --model_repository_path /models --tool_parser hermes3 --target_device GPU --task text_generation --enable_tool_guided_generation true ``` -::: -:::{tab-item} Qwen3-Coder-30B-A3B-Instruct -:sync: Qwen3-Coder-30B-A3B-Instruct + +Use MCP server: ```bash -docker run -d --user $(id -u):$(id -g) -e MOE_USE_MICRO_GEMM_PREFILL=0 --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --model_repository_path models --tool_parser qwen3coder --target_device GPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Qwen3-30B-A3B-Instruct-2507-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h, with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, and there is 84% cloud cover. Visibility is 24.1 km. ``` ::: :::{tab-item} gpt-oss-20b :sync: gpt-oss-20b +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models \ ---tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation --enable_prefix_caching true +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +--rest_port 8000 --source_model OpenVINO/gpt-oss-20b-int4-ov --model_repository_path /models \ +--tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation ``` > **Note:** Continuous batching and paged attention are supported for GPT‑OSS. However, when deployed on GPU, the model may experience reduced accuracy under high‑concurrency workloads. This issue will be resolved in version 2026.1 and in the upcoming weekly release. CPU execution is not affected. +Use MCP server: +```bash +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/gpt-oss-20b-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +``` + +Exemplary output: +```text +**Tokyo Current Weather** + +- **Condition:** Overcast +- **Temperature:** 9.4°C (feels like 6.4°C) +- **Humidity:** 42% +- **Dew Point:** 2.9°C +- **Wind:** 3.6km/h from the NE, gusts up to 24.8km/h +- **Pressure:** 1018.9hPa +- **Cloud Cover:** 84% +- **Visibility:** 24.1km + +Let me know if you'd like forecast details or anything else! +``` ::: :::: @@ -489,161 +428,40 @@ It can be applied using the commands below: ::::{tab-set} :::{tab-item} Qwen3-8B :sync: Qwen3-8B +Pull and start OVMS: ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model Qwen/Qwen3-8B --tool_parser hermes3 --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 4000 -``` -::: -:::{tab-item} Qwen3-4B -:sync: Qwen3-4B -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model Qwen/Qwen3-4B --tool_parser hermes3 --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 4000 -``` -::: -:::{tab-item} Llama-3.1-8B-Instruct -:sync: Llama-3.1-8B-Instruct -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model meta-llama/Llama-3.1-8B-Instruct --tool_parser llama3 --target_device NPU --task text_generation --enable_tool_guided_generation true 
--enable_prefix_caching true --max_prompt_len 4000 -``` -::: -:::{tab-item} Llama-3.2-3B-Instruct -:sync: Llama-3.2-3B-Instruct -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model meta-llama/Llama-3.2-3B-Instruct --tool_parser llama3 --target_device NPU --task text_generation --enable_tool_guided_generation true --enable_prefix_caching true --max_prompt_len 4000 -``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3 -:sync: Mistral-7B-Instruct-v0.3 -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model mistralai/Mistral-7B-Instruct-v0.3 --tool_parser mistral --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 4000 +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -1) openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --tool_parser hermes3 --target_device NPU --task text_generation --max_prompt_len 8000 ``` -::: -:::{tab-item} Qwen3-8B-int4-cw-ov -:sync: Qwen3-8B-int4-cw-ov -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --tool_parser hermes3 --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 4000 -``` -::: -:::{tab-item} 
Mistral-7B-Instruct-v0.3-int4-cw-ov -:sync: Mistral-7B-Instruct-v0.3-int4-cw-ov -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path models --source_model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --tool_parser mistral --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 4000 -``` -::: -:::: -### Deploy all models in a single container -Those steps deploy all the models exported earlier. The python script added the models to `models/config.json` so just the remaining models pulled directly from HuggingFace Hub are to be added: +Use MCP server: ```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:weekly --add_to_config --model_name OpenVINO/Qwen3-8B-int4-ov --model_path OpenVINO/Qwen3-8B-int4-ov --config_path /models/config.json -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:weekly --add_to_config --model_name OpenVINO/Phi-4-mini-instruct-int4-ov --model_path OpenVINO/Phi-4-mini-instruct-int4-ov --config_path /models/config.json -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:weekly --add_to_config --model_name OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov --model_path OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov--config_path /models/config.json -docker run -d --rm -p 8000:8000 -v $(pwd)/models:/models:ro openvino/model_server:weekly --rest_port 8000 --config_path /models/config.json +python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Qwen3-8B-int4-cw-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather ``` - -## Start MCP server with SSE interface - -### Linux -```bash -git clone https://github.com/isdaniel/mcp_weather_server -cd mcp_weather_server && git checkout v0.5.0 -docker build -t mcp-weather-server:sse . -docker run -d -p 8080:8080 -e PORT=8080 mcp-weather-server:sse uv run python -m mcp_weather_server --mode sse -``` - -> **Note:** On Windows the MCP server will be demonstrated as an instance with stdio interface inside the agent application - -## Start the agent - -Install the application requirements - -```console -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/main/demos/continuous_batching/agentic_ai/openai_agent.py -o openai_agent.py -pip install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/main/demos/continuous_batching/agentic_ai/requirements.txt -``` -Make sure nodejs and npx are installed. On ubuntu it would require `sudo apt install nodejs npm`. On windows, visit https://nodejs.org/en/download. It is needed for the `file system` MCP server. - -Run the agentic application: - - -::::{tab-set} -:::{tab-item} Qwen3-8B -:sync: Qwen3-8B -```bash -python openai_agent.py --query "What is the current weather in Tokyo?" --model Qwen/Qwen3-8B --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --stream --enable-thinking -``` -```bash -python openai_agent.py --query "List the files in folder /root" --model Qwen/Qwen3-8B --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server all +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is at -2.9°C. The wind is blowing from the NE at 3.6 km/h with gusts up to 24.8 km/h. 
The atmospheric pressure is 1018.9 hPa with 84% cloud cover, and the visibility is 24.1 km. ``` ::: -:::{tab-item} Qwen3-4B +:::{tab-item} Qwen3-4B :sync: Qwen3-4B +Pull and start OVMS: ```bash -python openai_agent.py --query "What is the current weather in Tokyo?" --model Qwen/Qwen3-4B --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --stream -``` -```bash -python openai_agent.py --query "List the files in folder /root" --model Qwen/Qwen3-4B --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server all -``` -::: -:::{tab-item} Llama-3.1-8B-Instruct -:sync: Llama-3.1-8B-Instruct -```bash -python openai_agent.py --query "List the files in folder /root" --model meta-llama/Llama-3.1-8B-Instruct --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server all -``` -::: -:::{tab-item} Mistral-7B-Instruct-v0.3 -:sync: Mistral-7B-Instruct-v0.3 -```bash -python openai_agent.py --query "List the files in folder /root" --model mistralai/Mistral-7B-Instruct-v0.3 --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server all --tool_choice required -``` -::: -:::{tab-item} Llama-3.2-3B-Instruct -:sync: Llama-3.2-3B-Instruct -```bash -python openai_agent.py --query "List the files in folder /root" --model meta-llama/Llama-3.2-3B-Instruct --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server all -``` -::: -:::{tab-item} Phi-4-mini-instruct -:sync: Phi-4-mini-instruct -```bash -python openai_agent.py --query "What is the current weather in Tokyo?" --model microsoft/Phi-4-mini-instruct --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather -``` -::: -:::{tab-item} Qwen3-8B-int4-ov -:sync: Qwen3-8B-int4-ov -```bash -python openai_agent.py --query "What is the current weather in Tokyo?" 
--model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather -``` -::: -:::{tab-item} OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov -:sync: OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov -```bash -python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --tool-choice required +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model FluidInference/qwen3-4b-int4-ov-npu --tool_parser hermes3 --target_device NPU --task text_generation --max_prompt_len 8000 ``` -::: -:::{tab-item} Phi-4-mini-instruct-int4-ov -:sync: Phi-4-mini-instruct-int4-ov -```bash -python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Phi-4-mini-instruct-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather -``` -::: -:::{tab-item} Qwen3-Coder-30B-A3B-Instruct -:sync: Qwen3-Coder-30B-A3B-Instruct + +Use MCP server: ```bash -python openai_agent.py --query "What is the current weather in Tokyo?" --model Qwen3/Qwen3-Coder-30B-A3B-Instruct --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather +python openai_agent.py --query "What is the current weather in Tokyo?" --model FluidInference/qwen3-4b-int4-ov-npu --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --stream ``` -::: -:::{tab-item} gpt-oss-20b -:sync: gpt-oss-20b -```console -python openai_agent.py --query "What is the current weather in Tokyo?" 
--model openai/gpt-oss-20b --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather + +Exemplary output: +```text +The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is at -2.9°C. The wind is blowing from the NE at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover, and the visibility is 24.1 km. ``` ::: :::: @@ -652,69 +470,22 @@ python openai_agent.py --query "What is the current weather in Tokyo?" --model o > **Note:** For more interactive mode you can run the application with streaming enabled by providing `--stream` parameter to the script. -You can try also similar implementation based on llama_index library working the same way: -```bash -pip install llama-index-llms-openai-like==0.5.3 llama-index-core==0.14.5 llama-index-tools-mcp==0.4.2 -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/main/demos/continuous_batching/agentic_ai/llama_index_agent.py -o llama_index_agent.py -python llama_index_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --stream --enable-thinking +### Using Llama index agentic framework +Pull and start OVMS: +```bash +mkdir -p ${HOME}/models +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \ +--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --task text_generation ``` - -## Testing efficiency in agentic use case - -Using LLM models with AI agents has a unique load characteristics with multi-turn communication and resending bit parts of the prompt as the previous conversation. 
-To simulate such type of load, we should use a dedicated tool [multi_turn benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks/multi_turn). +You can try also similar implementation based on llama_index library working the same way like openai-agent: ```bash -git clone -b v0.10.2 https://github.com/vllm-project/vllm -cd vllm/benchmarks/multi_turn -pip install -r requirements.txt -sed -i -e 's/if not os.path.exists(args.model)/if 1 == 0/g' benchmark_serving_multi_turn.py - -#Download the following text file (used for generation of synthetic conversations) -wget https://www.gutenberg.org/ebooks/1184.txt.utf-8 -mv 1184.txt.utf-8 pg1184.txt - -# Testing single client scenario, for example with GPU execution -docker run -d --name ovms --user $(id -u):$(id -g) --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly \ ---rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --enable_prefix_caching true --task text_generation --target_device GPU - -python benchmark_serving_multi_turn.py -m Qwen/Qwen3-8B --url http://localhost:8000/v3 -i generate_multi_turn.json --served-model-name OpenVINO/Qwen3-8B-int4-ov --num-clients 1 -n 50 - -# Testing high concurrency, for example on Xeon CPU with constrained resources (in case of memory constrains, reduce cache_size) -docker run -d --name ovms --cpuset-cpus 0-15 --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --enable_prefix_caching true --cache_size 20 --task text_generation - -python benchmark_serving_multi_turn.py -m Qwen/Qwen3-8B --url http://localhost:8000/v3 -i generate_multi_turn.json --served-model-name OpenVINO/Qwen3-8B-int4-ov --num-clients 24 -``` -Below is an example of the output captured on iGPU: -``` -Parameters: 
-model=OpenVINO/Qwen3-8B-int4-ov -num_clients=1 -num_conversations=100 -active_conversations=None -seed=0 -Conversations Generation Parameters: -text_files=pg1184.txt -input_num_turns=UniformDistribution[12, 18] -input_common_prefix_num_tokens=Constant[500] -input_prefix_num_tokens=LognormalDistribution[6, 4] -input_num_tokens=UniformDistribution[120, 160] -output_num_tokens=UniformDistribution[80, 120] ----------------------------------------------------------------------------------------------------- -Statistics summary: -runtime_sec = 307.569 -requests_per_sec = 0.163 ----------------------------------------------------------------------------------------------------- - count mean std min 25% 50% 75% 90% max -ttft_ms 50.0 1052.97 987.30 200.61 595.29 852.08 1038.50 1193.38 4265.27 -tpot_ms 50.0 51.37 2.37 47.03 49.67 51.45 53.16 54.42 55.23 -latency_ms 50.0 6128.26 1093.40 4603.86 5330.43 5995.30 6485.20 7333.73 9505.51 -input_num_turns 50.0 7.64 4.72 1.00 3.00 7.00 11.00 15.00 17.00 -input_num_tokens 50.0 2298.92 973.02 520.00 1556.50 2367.00 3100.75 3477.70 3867.00 +pip install llama-index-llms-openai-like==0.5.3 llama-index-core==0.14.5 llama-index-tools-mcp==0.4.2 +curl https://raw.githubusercontent.com/openvinotoolkit/model_server/main/demos/continuous_batching/agentic_ai/llama_index_agent.py -o llama_index_agent.py +python llama_index_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather --stream --enable-thinking ``` - ## Testing accuracy Testing model accuracy is critical for a successful adoption in AI application. The recommended methodology is to use BFCL tool like describe in the [testing guide](../accuracy/README.md#running-the-tests-for-agentic-models-with-function-calls). 
@@ -735,3 +506,28 @@ Here is example of the response from the OpenVINO/Qwen3-8B-int4-ov model: ``` Models can be also compared using the [leaderboard reports](https://gorilla.cs.berkeley.edu/leaderboard.html#leaderboard). + +### Export and quantize model + +Use those steps to convert the model from HuggingFace Hub to OpenVINO format and export it to a local storage. + +```text +# Download export script, install its dependencies and create directory for the models +curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py +pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt +mkdir models +``` +Run `export_model.py` script to download and quantize the model: + +> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" or "https://www.modelscope.cn/models" before running the export script to connect to the HF Hub. + +```text +python export_model.py text_generation --source_model meta-llama/Llama-3.2-3B-Instruct --weight-format int4 --config_file_path models/config.json --model_repository_path models --tool_parser llama3 +curl -L -o models/meta-llama/Llama-3.2-3B-Instruct/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_llama3.2_json.jinja +``` + +> **Note:** To use these models on NPU, set `--weight-format` to either **int4** or **nf4**. When specifying `--extra_quantization_params`, ensure that `ratio` is set to **1.0** and `group_size` is set to **-1** or **128**. 
For example: +```text +python export_model.py text_generation --source_model meta-llama/Llama-3.2-3B-Instruct --weight-format nf4 --config_file_path models/config.json --model_repository_path models --tool_parser llama3 --extra_quantization_params "--library transformers --sym group_size -1" +``` +For more details, see [OpenVINO GenAI on NPU](https://docs.openvino.ai/nightly/openvino-workflow-generative/inference-with-genai/inference-with-genai-on-npu.html). \ No newline at end of file diff --git a/demos/continuous_batching/agentic_ai/openai_agent.py b/demos/continuous_batching/agentic_ai/openai_agent.py index 859d154b28..25db857cf1 100644 --- a/demos/continuous_batching/agentic_ai/openai_agent.py +++ b/demos/continuous_batching/agentic_ai/openai_agent.py @@ -19,6 +19,7 @@ import asyncio import os import platform +import sys from openai import AsyncOpenAI from agents import Agent, Runner, RunConfig @@ -37,6 +38,7 @@ ) API_KEY = "not_used" +os.environ["PYTHONUTF8"] = "1" env_proxy = {} http_proxy = os.environ.get("http_proxy") https_proxy = os.environ.get("https_proxy") @@ -45,7 +47,14 @@ if https_proxy: env_proxy["https_proxy"] = https_proxy -RunConfig.tracing_disabled = False # Disable tracing for this example +RunConfig.tracing_disabled = False # Enable tracing for this example + +def check_if_tool_calls_present(result) -> bool: + if hasattr(result, 'new_items') and result.new_items: + for item in result.new_items: + if hasattr(item, 'type') and item.type == "tool_call_item": + return True + return False async def run(query, agent, OVMS_MODEL_PROVIDER, stream: bool = False): for server in agent.mcp_servers: @@ -76,6 +85,8 @@ async def run(query, agent, OVMS_MODEL_PROVIDER, stream: bool = False): else: result = await Runner.run(starting_agent=agent, input=query, run_config=RunConfig(model_provider=OVMS_MODEL_PROVIDER, tracing_disabled=True)) print(result.final_output) + + return check_if_tool_calls_present(result) if __name__ == "__main__": @@ -126,4 +137,11 @@ 
def get_model(self, _) -> Model: model_settings=ModelSettings(tool_choice=args.tool_choice, temperature=0.0, max_tokens=1000, extra_body={"chat_template_kwargs": {"enable_thinking": args.enable_thinking}}), ) loop = asyncio.new_event_loop() - loop.run_until_complete(run(args.query, agent, OVMS_MODEL_PROVIDER, args.stream)) + + is_tool_call_present = loop.run_until_complete(run(args.query, agent, OVMS_MODEL_PROVIDER, args.stream)) + + # for testing purposes, exit codes are dependent on whether a tool call was present in the agent's reasoning process + if is_tool_call_present: + sys.exit(0) + else: + sys.exit(1) \ No newline at end of file diff --git a/demos/continuous_batching/agentic_ai/requirements.txt b/demos/continuous_batching/agentic_ai/requirements.txt deleted file mode 100644 index 5147552b66..0000000000 --- a/demos/continuous_batching/agentic_ai/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -openai-agents==0.2.11 -openai==1.107.0 -python-dateutil -mcp_weather_server \ No newline at end of file