Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[flake8]
max-line-length = 119
ignore = E203, W503
max-line-length = 200
ignore = E203, W503, E501, E402, F401, F541, F811, F841, E704, E713, E712, E231, E731, E226, W291, W293, W292, E302, W504
exclude = vendor
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,15 @@ repos:
rev: 7.3.0
hooks:
- id: flake8
exclude: ^vendor/
args: [--max-line-length=119, --max-complexity=100, "--ignore=E402,F401,F541,W503,E203,F811,E226,F841,E704,E713,E712,E231,E731,E501"]
# additional_dependencies: [flake8-docstrings, flake8-import-order] # Optional: add flake8 plugins

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.17.0
hooks:
- id: mypy
exclude: ^vendor/
args: [--ignore-missing-imports, --install-types, --non-interactive]
additional_dependencies:
- types-requests
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
SOFTWARE.
35 changes: 24 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,32 +1,45 @@
PYTHON_DIRS = tests examples scripts eval_protocol

.PHONY: clean build dist upload test lint typecheck format release sync-docs version tag-version show-version bump-major bump-minor bump-patch full-release quick-release
# Tool resolution: each tool variable below points at the executable inside
# the project virtualenv when that file exists, and otherwise falls back to
# the bare command name. VENV uses ?= so it can be overridden from the
# environment or the command line (e.g. `make VENV=env lint`).
VENV ?= .venv
VENV_BIN := $(VENV)/bin

# $(call venv_tool,NAME) expands to $(VENV_BIN)/NAME if that path exists,
# else to NAME unchanged.
venv_tool = $(if $(wildcard $(VENV_BIN)/$(1)),$(VENV_BIN)/$(1),$(1))

PYTHON := $(call venv_tool,python)
FLAKE8 := $(call venv_tool,flake8)
MYPY := $(call venv_tool,mypy)
BLACK := $(call venv_tool,black)
PRE_COMMIT := $(call venv_tool,pre-commit)
PYTEST := $(call venv_tool,pytest)
TWINE := $(call venv_tool,twine)

# Command-style targets: declaring them phony keeps a same-named file or
# directory in the repository from masking the target. validate-docs is
# included because it is defined below and advertised by `make help`.
.PHONY: clean build dist upload test lint typecheck format validate-docs release sync-docs version tag-version show-version bump-major bump-minor bump-patch full-release quick-release pre-commit help

clean:
rm -rf build/ dist/ *.egg-info/

# Run all pre-commit hooks (if installed)
pre-commit:
pre-commit run --all-files
$(PRE_COMMIT) run --all-files

build: clean
python -m build
$(PYTHON) -m build

dist: build

upload:
twine upload dist/*
$(TWINE) upload dist/*

test:
pytest
$(PYTEST)

lint:
flake8 $(PYTHON_DIRS)
$(PRE_COMMIT) run flake8 --all-files

typecheck:
mypy $(PYTHON_DIRS)
$(PRE_COMMIT) run mypy --all-files

format:
black $(PYTHON_DIRS)
$(PRE_COMMIT) run black --all-files && \
$(PRE_COMMIT) run isort --all-files

validate-docs:
@echo "Validating documentation links..."
Expand Down Expand Up @@ -140,9 +153,9 @@ help:
@echo " dist - Alias for build"
@echo " upload - Upload to PyPI (make sure to bump version first)"
@echo " test - Run tests"
@echo " lint - Run flake8 linter"
@echo " typecheck - Run mypy type checker"
@echo " format - Run black code formatter"
@echo " lint - Run flake8 via pre-commit"
@echo " typecheck - Run mypy via pre-commit"
@echo " format - Run black + isort via pre-commit"
@echo " validate-docs - Validate all documentation links in docs.json"
@echo " sync-docs - Sync docs to ~/home/docs with links under 'evaluators'"
@echo " release - Run lint, typecheck, test, build, then upload"
Expand Down
6 changes: 3 additions & 3 deletions development/notes/pytest_integration_proposal.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def tau2_rollout_processor(row: EvaluationRow, model: str, input_params: Dict, *
# from the dataset and provide a simulated tool response.
# 4. Call the model again with the tool response.
# 5. Construct a final EvaluationRow with the full transcript.

# The logic is encapsulated here, away from the test definition.
processed_row = ep.default_rollout_processor(row, model, input_params)[0] # Simplified for example
return [processed_row]
Expand Down Expand Up @@ -186,11 +186,11 @@ def best_of_n_processor(row: EvaluationRow, model: str, input_params: Dict, **kw

# Then, apply a reward function to score each candidate.
scored_rows = ep.evaluate(candidate_rows, score_politeness)

# Finally, select the best row.
# This logic could be encapsulated in a helper, e.g., ep.select_best().
best_row = select_best_by_group(scored_rows, score_key='politeness')

return [best_row]

@evaluation_test(
Expand Down
82 changes: 41 additions & 41 deletions eval_protocol/adapters/CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,36 +37,36 @@ except ImportError:

class YourCustomAdapter:
"""Adapter for integrating with Your Custom Data Source.

This adapter loads data from Your Custom Data Source and converts it
to EvaluationRow format for use in evaluation pipelines.

Examples:
Basic usage:
>>> adapter = YourCustomAdapter(api_key="your_key")
>>> rows = list(adapter.get_evaluation_rows(limit=10))
"""

def __init__(self, **config):
"""Initialize the adapter with configuration."""
if not DEPENDENCY_AVAILABLE:
raise ImportError("your_external_library not installed")

# Initialize your client/connection here
self.client = your_external_library.Client(**config)

def get_evaluation_rows(self, **kwargs) -> Iterator[EvaluationRow]:
"""Main method to fetch and convert data to EvaluationRow format.

Args:
**kwargs: Adapter-specific parameters

Yields:
EvaluationRow: Converted evaluation rows
"""
# Implement your data fetching logic
raw_data = self.client.fetch_data(**kwargs)

for item in raw_data:
try:
eval_row = self._convert_to_evaluation_row(item)
Expand All @@ -75,51 +75,51 @@ class YourCustomAdapter:
except Exception as e:
logger.warning(f"Failed to convert item: {e}")
continue

def _convert_to_evaluation_row(self, raw_item: Any) -> Optional[EvaluationRow]:
"""Convert a raw data item to EvaluationRow format.

Args:
raw_item: Raw data item from your source

Returns:
EvaluationRow or None if conversion fails
"""
# Extract messages from your data format
messages = self._extract_messages(raw_item)

# Extract metadata
input_metadata = self._create_input_metadata(raw_item)

# Extract ground truth if available
ground_truth = self._extract_ground_truth(raw_item)

# Extract tools if available (for tool calling scenarios)
tools = self._extract_tools(raw_item)

return EvaluationRow(
messages=messages,
tools=tools,
input_metadata=input_metadata,
ground_truth=ground_truth,
)

def _extract_messages(self, raw_item: Any) -> List[Message]:
"""Extract conversation messages from raw data."""
# Implement message extraction logic
# Convert your data format to List[Message]
pass

def _create_input_metadata(self, raw_item: Any) -> InputMetadata:
"""Create InputMetadata from raw data."""
# Implement metadata extraction
pass

def _extract_ground_truth(self, raw_item: Any) -> Optional[str]:
"""Extract ground truth if available."""
# Implement ground truth extraction
pass

def _extract_tools(self, raw_item: Any) -> Optional[List[Dict[str, Any]]]:
"""Extract tool definitions if available."""
# Implement tool extraction for tool calling scenarios
Expand Down Expand Up @@ -149,7 +149,7 @@ message = Message(
content="I'll help you with that calculation.",
tool_calls=[{
"id": "call_123",
"type": "function",
"type": "function",
"function": {
"name": "calculate",
"arguments": '{"x": 5, "y": 3}'
Expand Down Expand Up @@ -185,7 +185,7 @@ input_metadata = InputMetadata(
},
session_data={
"user_id": "user123",
"session_id": "session456",
"session_id": "session456",
"timestamp": "2024-01-01T00:00:00Z",
}
)
Expand Down Expand Up @@ -259,7 +259,7 @@ def get_evaluation_rows(self, **kwargs) -> Iterator[EvaluationRow]:
except Exception as e:
logger.error(f"Failed to fetch data: {e}")
return

for item in data:
try:
row = self._convert_to_evaluation_row(item)
Expand Down Expand Up @@ -298,36 +298,36 @@ from eval_protocol.models import EvaluationRow

class TestYourCustomAdapter:
    """Test suite for YourCustomAdapter.

    Each test builds its own adapter and, where data is needed, replaces
    ``adapter.client.fetch_data`` with a mock so no external service is called.
    """

    def test_initialization(self):
        """Test adapter initialization."""
        adapter = YourCustomAdapter(api_key="test_key")
        assert adapter.client is not None

    def test_get_evaluation_rows(self):
        """Test conversion to EvaluationRow format."""
        adapter = YourCustomAdapter(api_key="test_key")

        # Mock the external API response
        with patch.object(adapter.client, 'fetch_data') as mock_fetch:
            mock_fetch.return_value = [
                # Mock data in your format
                {"id": "1", "question": "Test?", "answer": "Yes"}
            ]

            # Consume the generator while the mock is still active.
            rows = list(adapter.get_evaluation_rows(limit=1))

            assert len(rows) == 1
            assert isinstance(rows[0], EvaluationRow)
            assert len(rows[0].messages) > 0

    def test_error_handling(self):
        """Test error handling."""
        adapter = YourCustomAdapter(api_key="test_key")

        with patch.object(adapter.client, 'fetch_data') as mock_fetch:
            # Simulate the data source failing on fetch.
            mock_fetch.side_effect = Exception("API Error")

            rows = list(adapter.get_evaluation_rows())
            assert len(rows) == 0  # Should handle error gracefully
```
Expand All @@ -341,18 +341,18 @@ For simple chat data:
```python
def _extract_messages(self, conversation: Dict) -> List[Message]:
    """Convert a simple chat conversation into a list of ``Message`` objects.

    Args:
        conversation: Mapping read for an optional ``'system_prompt'`` entry
            and a ``'turns'`` sequence whose items carry ``'role'`` and
            ``'content'``.

    Returns:
        The system message (when a non-empty prompt is present) followed by
        one message per turn, in the order the turns appear.
    """
    messages = []

    # Add system prompt if available
    if conversation.get('system_prompt'):
        messages.append(Message(role="system", content=conversation['system_prompt']))

    # Add conversation turns
    for turn in conversation['turns']:
        messages.append(Message(
            role=turn['role'],
            content=turn['content']
        ))

    return messages
```

Expand All @@ -363,27 +363,27 @@ For tool calling scenarios:
```python
def _extract_messages(self, trace: Dict) -> List[Message]:
    """Convert a tool-calling trace into a list of ``Message`` objects.

    Args:
        trace: Mapping with a ``'steps'`` sequence. Each step has a ``'type'``
            of ``'user_message'``, ``'assistant_message'`` or
            ``'tool_response'``; steps of any other type are skipped.

    Returns:
        One message per recognised step, in trace order.
    """
    messages = []

    for step in trace['steps']:
        if step['type'] == 'user_message':
            messages.append(Message(role="user", content=step['content']))

        elif step['type'] == 'assistant_message':
            # Content is read with .get(), so an assistant step without a
            # 'content' key yields content=None.
            message = Message(role="assistant", content=step.get('content'))

            # Add tool calls if present
            if step.get('tool_calls'):
                message.tool_calls = step['tool_calls']

            messages.append(message)

        elif step['type'] == 'tool_response':
            messages.append(Message(
                role="tool",
                content=step['content'],
                tool_call_id=step['tool_call_id']
            ))

    return messages
```

Expand Down Expand Up @@ -515,10 +515,10 @@ Here are some potential adapters that would be valuable:

- **OpenAI Evals**: Load data from OpenAI's evals repository
- **LLM Evaluation Datasets**: MMLU, HellaSwag, etc.
- **Chat Platforms**: Discord, Slack conversation exports
- **Chat Platforms**: Discord, Slack conversation exports
- **Monitoring Tools**: Other observability platforms
- **Custom APIs**: Company-specific data sources
- **File Formats**: Parquet, Excel, database exports
- **Research Datasets**: Academic benchmarks and competitions

We welcome contributions for any of these or other creative integrations!
We welcome contributions for any of these or other creative integrations!
Loading
Loading