From 7523765cef7094f523fb7fd00bad0ee2f0bdedfb Mon Sep 17 00:00:00 2001 From: Teo Gonzalez Collazo Date: Tue, 14 Apr 2026 08:59:22 -0700 Subject: [PATCH] feat: add Exa AI-powered search tool Add ExaSearchTool as a new search provider alongside the existing DuckDuckGo and Google Scholar tools. Exa provides AI-powered web search with built-in content retrieval (highlights, full text, summaries), category filtering, domain filtering, and date ranges. Changes: - New tool_server_lite/tools/exa_tools.py with ExaSearchTool - Registered in tool registry and __init__.py exports - Tool YAML configs added for Researcher, OpenCowork, and CheapClaw agents - exa-py>=2.0.0 added to requirements.txt - 19 unit tests covering response parsing, content fallbacks, content mode routing, integration header, and filter pass-through --- .../CheapClawSupervisor/level_0_tools.yaml | 57 +++ .../CheapClawWorkerGeneral/level_0_tools.yaml | 57 +++ .../OpenCowork/level_0_tools.yaml | 57 +++ .../Researcher/level_0_tools.yaml | 57 +++ requirements.txt | 1 + tests/test_exa_tools.py | 399 ++++++++++++++++++ tool_server_lite/registry.py | 2 + tool_server_lite/requirements.txt | 1 + tool_server_lite/tools/__init__.py | 3 + tool_server_lite/tools/exa_tools.py | 200 +++++++++ 10 files changed, 834 insertions(+) create mode 100644 tests/test_exa_tools.py create mode 100644 tool_server_lite/tools/exa_tools.py diff --git a/apps/cheapclaw/assets/agent_library/CheapClawSupervisor/level_0_tools.yaml b/apps/cheapclaw/assets/agent_library/CheapClawSupervisor/level_0_tools.yaml index 4764ecd..0aff85e 100644 --- a/apps/cheapclaw/assets/agent_library/CheapClawSupervisor/level_0_tools.yaml +++ b/apps/cheapclaw/assets/agent_library/CheapClawSupervisor/level_0_tools.yaml @@ -147,6 +147,63 @@ tools: description: "保存搜索结果的相对路径(.md 文件),请保存在 temp/web_search目录中。" required: ["query","save_path"] + exa_search: + level: 0 + type: tool_call_agent + name: "exa_search" + description: "Use Exa AI-powered search for high-quality 
web results. Supports content retrieval (highlights, full text, summaries), category filtering (company, research paper, news, etc.), domain filtering, and date ranges. Requires EXA_API_KEY." + parameters: + type: "object" + properties: + query: + type: "string" + description: "Search query." + max_results: + type: "integer" + default: 10 + description: "Maximum number of results, default 10." + search_type: + type: "string" + default: "auto" + description: "Search type: 'auto' (default), 'neural', 'fast', or 'instant'." + content_mode: + type: "string" + default: "highlights" + description: "Content retrieval mode: 'highlights' (default), 'text', 'summary', or 'none'." + category: + type: "string" + description: "Filter by category: 'company', 'research paper', 'news', 'personal site', 'financial report', 'people'." + include_domains: + type: "array" + items: + type: "string" + description: "Only include results from these domains." + exclude_domains: + type: "array" + items: + type: "string" + description: "Exclude results from these domains." + include_text: + type: "array" + items: + type: "string" + description: "Strings that must appear in page text." + exclude_text: + type: "array" + items: + type: "string" + description: "Strings to exclude from results." + start_published_date: + type: "string" + description: "ISO 8601 date; only results published after this date." + end_published_date: + type: "string" + description: "ISO 8601 date; only results published before this date." + save_path: + type: "string" + description: "Relative path to save results (.md file), save in temp/exa_search directory." 
+ required: ["query"] + google_scholar_search: level: 0 type: tool_call_agent diff --git a/apps/cheapclaw/assets/agent_library/CheapClawWorkerGeneral/level_0_tools.yaml b/apps/cheapclaw/assets/agent_library/CheapClawWorkerGeneral/level_0_tools.yaml index aef0c10..2ba1adb 100644 --- a/apps/cheapclaw/assets/agent_library/CheapClawWorkerGeneral/level_0_tools.yaml +++ b/apps/cheapclaw/assets/agent_library/CheapClawWorkerGeneral/level_0_tools.yaml @@ -147,6 +147,63 @@ tools: description: "保存搜索结果的相对路径(.md 文件),请保存在 temp/web_search目录中。" required: ["query","save_path"] + exa_search: + level: 0 + type: tool_call_agent + name: "exa_search" + description: "Use Exa AI-powered search for high-quality web results. Supports content retrieval (highlights, full text, summaries), category filtering (company, research paper, news, etc.), domain filtering, and date ranges. Requires EXA_API_KEY." + parameters: + type: "object" + properties: + query: + type: "string" + description: "Search query." + max_results: + type: "integer" + default: 10 + description: "Maximum number of results, default 10." + search_type: + type: "string" + default: "auto" + description: "Search type: 'auto' (default), 'neural', 'fast', or 'instant'." + content_mode: + type: "string" + default: "highlights" + description: "Content retrieval mode: 'highlights' (default), 'text', 'summary', or 'none'." + category: + type: "string" + description: "Filter by category: 'company', 'research paper', 'news', 'personal site', 'financial report', 'people'." + include_domains: + type: "array" + items: + type: "string" + description: "Only include results from these domains." + exclude_domains: + type: "array" + items: + type: "string" + description: "Exclude results from these domains." + include_text: + type: "array" + items: + type: "string" + description: "Strings that must appear in page text." + exclude_text: + type: "array" + items: + type: "string" + description: "Strings to exclude from results." 
+ start_published_date: + type: "string" + description: "ISO 8601 date; only results published after this date." + end_published_date: + type: "string" + description: "ISO 8601 date; only results published before this date." + save_path: + type: "string" + description: "Relative path to save results (.md file), save in temp/exa_search directory." + required: ["query"] + google_scholar_search: level: 0 type: tool_call_agent diff --git a/config/agent_library/OpenCowork/level_0_tools.yaml b/config/agent_library/OpenCowork/level_0_tools.yaml index 4eeeaad..defb123 100644 --- a/config/agent_library/OpenCowork/level_0_tools.yaml +++ b/config/agent_library/OpenCowork/level_0_tools.yaml @@ -147,6 +147,63 @@ tools: description: "保存搜索结果的相对路径(.md 文件),请保存在 temp/web_search目录中。" required: ["query","save_path"] + exa_search: + level: 0 + type: tool_call_agent + name: "exa_search" + description: "Use Exa AI-powered search for high-quality web results. Supports content retrieval (highlights, full text, summaries), category filtering (company, research paper, news, etc.), domain filtering, and date ranges. Requires EXA_API_KEY." + parameters: + type: "object" + properties: + query: + type: "string" + description: "Search query." + max_results: + type: "integer" + default: 10 + description: "Maximum number of results, default 10." + search_type: + type: "string" + default: "auto" + description: "Search type: 'auto' (default), 'neural', 'fast', or 'instant'." + content_mode: + type: "string" + default: "highlights" + description: "Content retrieval mode: 'highlights' (default), 'text', 'summary', or 'none'." + category: + type: "string" + description: "Filter by category: 'company', 'research paper', 'news', 'personal site', 'financial report', 'people'." + include_domains: + type: "array" + items: + type: "string" + description: "Only include results from these domains." + exclude_domains: + type: "array" + items: + type: "string" + description: "Exclude results from these domains." 
+ include_text: + type: "array" + items: + type: "string" + description: "Strings that must appear in page text." + exclude_text: + type: "array" + items: + type: "string" + description: "Strings to exclude from results." + start_published_date: + type: "string" + description: "ISO 8601 date; only results published after this date." + end_published_date: + type: "string" + description: "ISO 8601 date; only results published before this date." + save_path: + type: "string" + description: "Relative path to save results (.md file), save in temp/exa_search directory." + required: ["query"] + google_scholar_search: level: 0 type: tool_call_agent diff --git a/config/agent_library/Researcher/level_0_tools.yaml b/config/agent_library/Researcher/level_0_tools.yaml index b209a5f..d2cb2c7 100644 --- a/config/agent_library/Researcher/level_0_tools.yaml +++ b/config/agent_library/Researcher/level_0_tools.yaml @@ -146,6 +146,63 @@ tools: description: "保存搜索结果的相对路径(.md 文件),请保存在 temp/web_search目录中。" required: ["query","save_path"] + exa_search: + level: 0 + type: tool_call_agent + name: "exa_search" + description: "Use Exa AI-powered search for high-quality web results. Supports content retrieval (highlights, full text, summaries), category filtering (company, research paper, news, etc.), domain filtering, and date ranges. Requires EXA_API_KEY." + parameters: + type: "object" + properties: + query: + type: "string" + description: "Search query." + max_results: + type: "integer" + default: 10 + description: "Maximum number of results, default 10." + search_type: + type: "string" + default: "auto" + description: "Search type: 'auto' (default), 'neural', 'fast', or 'instant'." + content_mode: + type: "string" + default: "highlights" + description: "Content retrieval mode: 'highlights' (default), 'text', 'summary', or 'none'." + category: + type: "string" + description: "Filter by category: 'company', 'research paper', 'news', 'personal site', 'financial report', 'people'." 
+ include_domains: + type: "array" + items: + type: "string" + description: "Only include results from these domains." + exclude_domains: + type: "array" + items: + type: "string" + description: "Exclude results from these domains." + include_text: + type: "array" + items: + type: "string" + description: "Strings that must appear in page text." + exclude_text: + type: "array" + items: + type: "string" + description: "Strings to exclude from results." + start_published_date: + type: "string" + description: "ISO 8601 date; only results published after this date." + end_published_date: + type: "string" + description: "ISO 8601 date; only results published before this date." + save_path: + type: "string" + description: "Relative path to save results (.md file), save in temp/exa_search directory." + required: ["query"] + google_scholar_search: level: 0 type: tool_call_agent diff --git a/requirements.txt b/requirements.txt index 969e8ec..7f58976 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ pdfplumber>=0.10.0 python-docx>=1.1.0 crawl4ai>=0.3.0 # 网页爬取(会自动安装 playwright) ddgs>=1.0.0 # DuckDuckGo 搜索 +exa-py>=2.0.0 # Exa AI-powered search arxiv>=2.0.0 # arXiv API prompt_toolkit>=3.0.0 # CLI 交互 rich>=13.0.0 # 终端美化 diff --git a/tests/test_exa_tools.py b/tests/test_exa_tools.py new file mode 100644 index 0000000..5fd5211 --- /dev/null +++ b/tests/test_exa_tools.py @@ -0,0 +1,399 @@ +""" +Tests for the Exa AI-powered search tool. 
+ +These tests mock the exa-py SDK and verify: +- API response parsing and content fallback logic +- Content mode routing (highlights/text/summary/none) +- Integration header is set +- Disabled state (SDK missing, API key missing) +- File saving behavior +- Optional filter pass-through +""" + +import importlib +import importlib.util +import os +import sys +import types +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + +pytestmark = pytest.mark.unit + + +# --------------------------------------------------------------------------- +# Module-level setup: load exa_tools without triggering the heavy __init__.py +# --------------------------------------------------------------------------- + +def _load_exa_tools(): + """ + Import exa_tools.py directly, bypassing tool_server_lite/tools/__init__.py + which pulls in litellm, PIL, etc. We stub only the lightweight file_tools + dependency that exa_tools actually needs. + """ + # Ensure the package hierarchy exists in sys.modules + if "tool_server_lite" not in sys.modules: + pkg = types.ModuleType("tool_server_lite") + pkg.__path__ = [str(Path(__file__).resolve().parent.parent / "tool_server_lite")] + sys.modules["tool_server_lite"] = pkg + + if "tool_server_lite.tools" not in sys.modules: + tools_pkg = types.ModuleType("tool_server_lite.tools") + tools_pkg.__path__ = [str(Path(__file__).resolve().parent.parent / "tool_server_lite" / "tools")] + sys.modules["tool_server_lite.tools"] = tools_pkg + + # Load file_tools (the only real dependency of exa_tools) + ft_path = Path(__file__).resolve().parent.parent / "tool_server_lite" / "tools" / "file_tools.py" + ft_spec = importlib.util.spec_from_file_location("tool_server_lite.tools.file_tools", str(ft_path)) + ft_mod = importlib.util.module_from_spec(ft_spec) + sys.modules["tool_server_lite.tools.file_tools"] = ft_mod + ft_spec.loader.exec_module(ft_mod) + + # Load exa_tools + exa_path = Path(__file__).resolve().parent.parent / 
"tool_server_lite" / "tools" / "exa_tools.py" + exa_spec = importlib.util.spec_from_file_location("tool_server_lite.tools.exa_tools", str(exa_path)) + exa_mod = importlib.util.module_from_spec(exa_spec) + sys.modules["tool_server_lite.tools.exa_tools"] = exa_mod + exa_spec.loader.exec_module(exa_mod) + return exa_mod + + +_exa_mod = _load_exa_tools() +ExaSearchTool = _exa_mod.ExaSearchTool +_extract_snippet = _exa_mod._extract_snippet + + +# --------------------------------------------------------------------------- +# Fixtures & helpers +# --------------------------------------------------------------------------- + +@pytest.fixture +def workspace(tmp_path): + return str(tmp_path) + + +def _make_result( + title="Example Title", + url="https://example.com", + highlights=None, + summary=None, + text=None, + published_date=None, + author=None, +): + result = MagicMock() + result.title = title + result.url = url + result.highlights = highlights + result.summary = summary + result.text = text + result.publishedDate = published_date + result.published_date = published_date + result.author = author + return result + + +def _make_response(results): + resp = MagicMock() + resp.results = results + return resp + + +EXA_RESPONSE_FIXTURE = _make_response([ + _make_result( + title="Intro to LLMs", + url="https://example.com/llms", + highlights=["Large language models are transformers trained on vast text corpora."], + published_date="2025-01-15", + author="Jane Doe", + ), + _make_result( + title="RAG Explained", + url="https://example.com/rag", + highlights=None, + summary="Retrieval-augmented generation combines search with LLMs.", + ), + _make_result( + title="Vector Databases", + url="https://example.com/vectordb", + highlights=None, + summary=None, + text="Vector databases store embeddings for similarity search. 
" * 20, + ), +]) + + +# --------------------------------------------------------------------------- +# Import guard / disabled state +# --------------------------------------------------------------------------- + +class TestExaToolDisabled: + def test_returns_error_when_sdk_missing(self, workspace): + with patch.object(_exa_mod, "EXA_AVAILABLE", False), \ + patch.dict(os.environ, {}, clear=True): + tool = ExaSearchTool() + result = tool.execute(workspace, {"query": "test"}) + assert result["status"] == "error" + assert "exa-py" in result["error"] + + def test_returns_error_when_api_key_missing(self, workspace): + with patch.object(_exa_mod, "EXA_AVAILABLE", True), \ + patch.dict(os.environ, {}, clear=True): + tool = ExaSearchTool() + result = tool.execute(workspace, {"query": "test"}) + assert result["status"] == "error" + assert "EXA_API_KEY" in result["error"] + + +# --------------------------------------------------------------------------- +# Parameter validation +# --------------------------------------------------------------------------- + +class TestExaToolValidation: + def test_returns_error_when_query_missing(self, workspace): + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}): + tool = ExaSearchTool() + result = tool.execute(workspace, {}) + assert result["status"] == "error" + assert "query" in result["error"] + + +# --------------------------------------------------------------------------- +# Response parsing & content fallback +# --------------------------------------------------------------------------- + +class TestExaToolResponseParsing: + def test_parse_highlights_response(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = EXA_RESPONSE_FIXTURE + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + result = tool.execute(workspace, {"query": "LLMs", "content_mode": 
"highlights"}) + + assert result["status"] == "success" + assert "Intro to LLMs" in result["output"] + assert "https://example.com/llms" in result["output"] + assert "Jane Doe" in result["output"] + assert "2025-01-15" in result["output"] + assert "transformers" in result["output"] + + def test_parse_summary_fallback(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = EXA_RESPONSE_FIXTURE + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + result = tool.execute(workspace, {"query": "RAG"}) + + assert result["status"] == "success" + assert "RAG Explained" in result["output"] + assert "Retrieval-augmented generation" in result["output"] + + def test_parse_text_fallback(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = EXA_RESPONSE_FIXTURE + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + result = tool.execute(workspace, {"query": "vector databases"}) + + assert result["status"] == "success" + assert "Vector Databases" in result["output"] + assert "embeddings" in result["output"] + + def test_empty_content_fields(self, workspace): + empty_response = _make_response([ + _make_result(title="Empty Result", url="https://example.com/empty"), + ]) + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = empty_response + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + result = tool.execute(workspace, {"query": "nothing"}) + + assert result["status"] == "success" + assert "Empty Result" in result["output"] + + +# --------------------------------------------------------------------------- +# Snippet 
extraction unit tests +# --------------------------------------------------------------------------- + +class TestExtractSnippet: + def test_prefers_highlights(self): + r = _make_result(highlights=["h1", "h2"], summary="s", text="t") + assert "h1" in _extract_snippet(r) + assert "h2" in _extract_snippet(r) + + def test_falls_back_to_summary(self): + r = _make_result(highlights=None, summary="my summary", text="t") + assert _extract_snippet(r) == "my summary" + + def test_falls_back_to_text(self): + r = _make_result(highlights=None, summary=None, text="some text") + assert _extract_snippet(r) == "some text" + + def test_truncates_long_text(self): + long_text = "a" * 5000 + r = _make_result(highlights=None, summary=None, text=long_text) + snippet = _extract_snippet(r) + assert len(snippet) <= 2004 # 2000 + "..." + assert snippet.endswith("...") + + def test_returns_empty_for_no_content(self): + r = _make_result(highlights=None, summary=None, text=None) + assert _extract_snippet(r) == "" + + +# --------------------------------------------------------------------------- +# Content mode routing +# --------------------------------------------------------------------------- + +class TestExaToolContentModes: + def test_none_mode_calls_search(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search.return_value = _make_response([]) + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + tool.execute(workspace, {"query": "test", "content_mode": "none"}) + + mock_client.search.assert_called_once() + mock_client.search_and_contents.assert_not_called() + + def test_text_mode_passes_text_param(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = _make_response([]) + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", 
return_value=mock_client): + tool = ExaSearchTool() + tool.execute(workspace, {"query": "test", "content_mode": "text"}) + + call_kwargs = mock_client.search_and_contents.call_args[1] + assert "text" in call_kwargs + assert call_kwargs["text"] == {"max_characters": 10000} + + def test_summary_mode_passes_summary_param(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = _make_response([]) + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + tool.execute(workspace, {"query": "test", "content_mode": "summary"}) + + call_kwargs = mock_client.search_and_contents.call_args[1] + assert "summary" in call_kwargs + assert call_kwargs["summary"] is True + + def test_highlights_mode_passes_highlights_param(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = _make_response([]) + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + tool.execute(workspace, {"query": "test", "content_mode": "highlights"}) + + call_kwargs = mock_client.search_and_contents.call_args[1] + assert "highlights" in call_kwargs + assert call_kwargs["highlights"] == {"max_characters": 4000} + + +# --------------------------------------------------------------------------- +# Integration header +# --------------------------------------------------------------------------- + +class TestExaToolIntegrationHeader: + def test_sets_integration_header(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = _make_response([]) + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + tool.execute(workspace, {"query": "test"}) + + 
assert mock_client.headers["x-exa-integration"] == "infiagent" + + +# --------------------------------------------------------------------------- +# Save to file +# --------------------------------------------------------------------------- + +class TestExaToolSaveToFile: + def test_saves_results_to_file(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = EXA_RESPONSE_FIXTURE + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + result = tool.execute(workspace, { + "query": "LLMs", + "save_path": "temp/exa_search/results.md", + }) + + assert result["status"] == "success" + assert "Results saved to" in result["output"] + + saved_files = list(Path(workspace).rglob("*.md")) + assert len(saved_files) == 1 + content = saved_files[0].read_text(encoding="utf-8") + assert "Intro to LLMs" in content + + +# --------------------------------------------------------------------------- +# Optional filters +# --------------------------------------------------------------------------- + +class TestExaToolFilters: + def test_passes_optional_filters(self, workspace): + mock_client = MagicMock() + mock_client.headers = {} + mock_client.search_and_contents.return_value = _make_response([]) + + with patch.dict(os.environ, {"EXA_API_KEY": "test-key"}), \ + patch.object(_exa_mod, "Exa", return_value=mock_client): + tool = ExaSearchTool() + tool.execute(workspace, { + "query": "AI news", + "category": "news", + "include_domains": ["techcrunch.com", "theverge.com"], + "exclude_domains": ["reddit.com"], + "include_text": ["artificial intelligence"], + "exclude_text": ["crypto"], + "start_published_date": "2025-01-01T00:00:00Z", + "end_published_date": "2025-12-31T23:59:59Z", + "search_type": "neural", + }) + + call_kwargs = mock_client.search_and_contents.call_args[1] + assert call_kwargs["category"] == "news" + assert 
call_kwargs["include_domains"] == ["techcrunch.com", "theverge.com"] + assert call_kwargs["exclude_domains"] == ["reddit.com"] + assert call_kwargs["include_text"] == ["artificial intelligence"] + assert call_kwargs["exclude_text"] == ["crypto"] + assert call_kwargs["start_published_date"] == "2025-01-01T00:00:00Z" + assert call_kwargs["end_published_date"] == "2025-12-31T23:59:59Z" + assert call_kwargs["type"] == "neural" diff --git a/tool_server_lite/registry.py b/tool_server_lite/registry.py index 5f1b5ad..57d8685 100644 --- a/tool_server_lite/registry.py +++ b/tool_server_lite/registry.py @@ -37,6 +37,7 @@ def _build_builtin_factories() -> Dict[str, ToolFactory]: WebSearchTool, GoogleScholarSearchTool, ArxivSearchTool, + ExaSearchTool, CrawlPageTool, FileDownloadTool, ParseDocumentTool, @@ -75,6 +76,7 @@ def _build_builtin_factories() -> Dict[str, ToolFactory]: "web_search": WebSearchTool, "google_scholar_search": GoogleScholarSearchTool, "arxiv_search": ArxivSearchTool, + "exa_search": ExaSearchTool, "crawl_page": CrawlPageTool, "file_download": FileDownloadTool, "parse_document": ParseDocumentTool, diff --git a/tool_server_lite/requirements.txt b/tool_server_lite/requirements.txt index 4c155b0..deda2a0 100644 --- a/tool_server_lite/requirements.txt +++ b/tool_server_lite/requirements.txt @@ -15,6 +15,7 @@ python-docx>=1.1.0 python-pptx>=0.6.21 crawl4ai>=0.3.0 ddgs>=1.0.0 +exa-py>=2.0.0 pyyaml>=6.0.0 arxiv>=2.0.0 prompt_toolkit>=3.0.0 diff --git a/tool_server_lite/tools/__init__.py b/tool_server_lite/tools/__init__.py index f285302..7f6bfac 100644 --- a/tool_server_lite/tools/__init__.py +++ b/tool_server_lite/tools/__init__.py @@ -16,6 +16,8 @@ FileDownloadTool ) +from .exa_tools import ExaSearchTool + from .arxiv_tools import ArxivSearchTool from .document_tools import ParseDocumentTool @@ -129,4 +131,5 @@ "TaskShareContextPathTool", "ListTaskIdsTool", "TaskHistorySearchTool", + "ExaSearchTool", ] diff --git a/tool_server_lite/tools/exa_tools.py 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Exa AI-powered search tool.

Defines ``ExaSearchTool``, which queries Exa through the ``exa-py`` SDK and
renders the results as Markdown, plus ``_extract_snippet``, which picks the
best available content field of a single result.
"""

import os
import re
from pathlib import Path
from typing import Dict, Any, List, Optional
from .file_tools import BaseTool, get_abs_path

# Exa SDK import
try:
    from exa_py import Exa
    EXA_AVAILABLE = True
except ImportError:
    # Keep the name bound so the module never hits a NameError on ``Exa``;
    # ``execute`` checks EXA_AVAILABLE before touching it.
    Exa = None
    EXA_AVAILABLE = False


# Optional parameters forwarded verbatim to the SDK when they are truthy.
_OPTIONAL_FILTERS = (
    "category",
    "include_domains",
    "exclude_domains",
    "include_text",
    "exclude_text",
    "start_published_date",
    "end_published_date",
)

# Upper bound on the length of a snippet taken from a result's full text.
_MAX_TEXT_SNIPPET_CHARS = 2000


class ExaSearchTool(BaseTool):
    """Exa AI-powered web search tool"""

    def execute(self, task_id: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """
        Search the web using Exa AI-powered search.

        Parameters:
            query (str): Search query
            max_results (int, optional): Maximum number of results, default 10
            search_type (str, optional): Search type - 'auto' (default), 'neural', 'fast', 'instant'
            content_mode (str, optional): Content retrieval mode - 'highlights' (default), 'text', 'summary', 'none'
            category (str, optional): Filter by category - 'company', 'research paper', 'news', 'personal site', 'financial report', 'people'
            include_domains (list, optional): Only include results from these domains
            exclude_domains (list, optional): Exclude results from these domains
            include_text (list, optional): Strings that must appear in page text
            exclude_text (list, optional): Strings to exclude from results
            start_published_date (str, optional): ISO 8601 date; only results published after this
            end_published_date (str, optional): ISO 8601 date; only results published before this
            save_path (str, optional): Relative path to save results as a .md file

        Returns:
            A dict with keys ``status`` ("success" or "error"), ``output`` and
            ``error``. On success ``output`` is the Markdown report, or a
            "Results saved to ..." message when ``save_path`` was given.
            Every exception raised while searching or saving is caught and
            reported through the ``error`` field instead of propagating.
        """
        try:
            if not EXA_AVAILABLE:
                return _error_result("exa-py not installed. Run: pip install exa-py")

            api_key = os.environ.get("EXA_API_KEY", "")
            if not api_key:
                return _error_result(
                    "EXA_API_KEY environment variable is not set. Get your key from: https://exa.ai"
                )

            query = parameters.get("query")
            if not query:
                return _error_result("query is required")

            max_results = parameters.get("max_results", 10)
            content_mode = parameters.get("content_mode", "highlights")
            save_path = parameters.get("save_path")

            # Create Exa client with integration tracking header
            client = Exa(api_key=api_key)
            client.headers["x-exa-integration"] = "infiagent"

            search_kwargs = _build_search_kwargs(parameters, query, max_results, content_mode)

            # 'none' (or an empty mode) performs a plain search; every other
            # mode goes through search_and_contents, even an unrecognised one,
            # in which case no content option is passed.
            if content_mode and content_mode != "none":
                response = client.search_and_contents(**search_kwargs)
            else:
                response = client.search(**search_kwargs)

            results_text = _format_results_markdown(query, response.results)

            if save_path:
                final_save_path = _save_results(
                    task_id, save_path, query, max_results, results_text
                )
                output = f"Results saved to {final_save_path}"
            else:
                output = results_text

            return {
                "status": "success",
                "output": output,
                "error": ""
            }

        except Exception as e:
            # Tool boundary: report any failure to the caller as an error result.
            return _error_result(str(e))


def _error_result(message: str) -> Dict[str, Any]:
    """Build the error-shaped result dict returned by the tool."""
    return {
        "status": "error",
        "output": "",
        "error": message
    }


def _build_search_kwargs(
    parameters: Dict[str, Any],
    query: str,
    max_results: Any,
    content_mode: Optional[str],
) -> Dict[str, Any]:
    """Translate tool parameters into keyword arguments for the Exa client."""
    search_kwargs: Dict[str, Any] = {
        "query": query,
        "num_results": max_results,
        "type": parameters.get("search_type", "auto"),
    }

    # Add content retrieval parameters
    if content_mode == "highlights":
        search_kwargs["highlights"] = {"max_characters": 4000}
    elif content_mode == "text":
        search_kwargs["text"] = {"max_characters": 10000}
    elif content_mode == "summary":
        search_kwargs["summary"] = True

    # Add optional filters; falsy values (None, "", []) are left out entirely.
    for name in _OPTIONAL_FILTERS:
        value = parameters.get(name)
        if value:
            search_kwargs[name] = value

    return search_kwargs


def _format_results_markdown(query: str, results: List[Any]) -> str:
    """Render the search results as a Markdown document."""
    results_md = [
        f"# Exa Search Results: {query}\n",
        f"Total: {len(results)} results\n",
    ]

    for i, result in enumerate(results, 1):
        title = getattr(result, "title", "No title") or "No title"
        url = getattr(result, "url", "") or ""
        # Accept either attribute spelling for the publication date.
        published_date = (
            getattr(result, "publishedDate", None)
            or getattr(result, "published_date", None)
        )
        author = getattr(result, "author", None)

        results_md.append(f"## {i}. {title}\n")
        results_md.append(f"**URL**: {url}\n")
        if published_date:
            results_md.append(f"**Published**: {published_date}\n")
        if author:
            results_md.append(f"**Author**: {author}\n")

        # Extract content with fallback cascade
        snippet = _extract_snippet(result)
        if snippet:
            results_md.append(f"**Snippet**: {snippet}\n")

    return '\n'.join(results_md)


def _save_results(
    task_id: str,
    save_path: str,
    query: str,
    max_results: Any,
    results_text: str,
) -> str:
    """
    Write the Markdown report to disk and return the relative path used.

    The file name is the requested stem extended with a sanitised form of the
    query (at most 50 characters) and ``_n<max_results>``; the directory and
    suffix of ``save_path`` are kept.
    """
    save_path_obj = Path(save_path)
    safe_query = re.sub(r'[^\w\s-]', '', query).strip()
    safe_query = re.sub(r'[-\s]+', '_', safe_query)[:50]

    new_filename = f"{save_path_obj.stem}_{safe_query}_n{max_results}{save_path_obj.suffix}"
    final_save_path = str(save_path_obj.parent / new_filename)

    abs_save_path = get_abs_path(task_id, final_save_path)
    abs_save_path.parent.mkdir(parents=True, exist_ok=True)

    with open(abs_save_path, 'w', encoding='utf-8') as f:
        f.write(results_text)

    return final_save_path


def _extract_snippet(result: Any) -> str:
    """
    Extract the best available content snippet from an Exa result,
    cascading through highlights -> summary -> text.

    Highlights are used only when at least one entry is a non-blank string;
    blank or non-string entries are skipped, so an unusable highlights list
    falls through to the summary instead of yielding an empty snippet or
    raising in ``str.join``. Text is stripped and cut to 2000 characters,
    with "..." appended when it was cut. Returns "" when nothing is usable.
    """
    # Try highlights first
    highlights = getattr(result, "highlights", None)
    if isinstance(highlights, list):
        usable = [h for h in highlights if isinstance(h, str) and h.strip()]
        if usable:
            return "\n".join(usable)

    # Try summary
    summary = getattr(result, "summary", None)
    if isinstance(summary, str) and summary.strip():
        return summary.strip()

    # Try text (truncate to keep output manageable)
    text = getattr(result, "text", None)
    if isinstance(text, str):
        stripped = text.strip()
        if len(stripped) > _MAX_TEXT_SNIPPET_CHARS:
            return stripped[:_MAX_TEXT_SNIPPET_CHARS] + "..."
        return stripped

    return ""