2 changes: 1 addition & 1 deletion .gitignore
@@ -18,7 +18,7 @@ data/
!src/data/

blob/
models/
./models/
outputs/
unsloth_compiled_cache/
grpo_trainer_lora_model/
92 changes: 92 additions & 0 deletions README.md
@@ -194,6 +194,98 @@ uv run pytest tests/test_search_replace_diff.py
uv run pytest tests/test_search_replace_diff.py::test_specific_function
```

## Trajectory Analyzer

The trajectory analyzer toolkit loads agent trajectories from multiple formats, extracts metrics, and generates comparative visualizations. It's useful for analyzing tool usage patterns, comparing models across scaffolds, and studying transfer learning effects.

### Supported Formats

- **nano-agent**: `detailed_predictions.jsonl` with OpenAI chat format
- **R2E-Gym**: JSONL with `trajectory_steps` using XML function calls
- **SWE-agent**: Log files with `🎬 ACTION` markers
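
The exact record schemas differ per scaffold and are not pinned down here. As a rough, purely illustrative sketch (the `messages` key is an assumption rather than a documented field), one way to peek at a nano-agent record:

```python
# Illustrative only: inspect one record from a nano-agent trajectory file.
# Field names (e.g. "messages") are assumptions, not a documented schema.
import json

with open("detailed_predictions.jsonl") as f:
    record = json.loads(f.readline())

print(sorted(record.keys()))
for message in record.get("messages", []):  # OpenAI-style chat turns, if present
    print(message.get("role"), str(message.get("content", ""))[:80])
```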

### Quick Start

```bash
# Run with the example config (adjust paths in the config file first)
uv run python -m src.trajectory_analyzer --config-name example_config

# Generate specific plots
uv run python -m src.trajectory_analyzer --config-name example_config \
'plots=[tool_distribution,comparison,transfer_analysis]' \
output_dir=./my_plots
```

### Configuration

Create a YAML config file (see `src/trajectory_analyzer/conf/example_config.yaml`):

```yaml
output_dir: "./plots"

plots:
  - tool_distribution           # Bar chart of tool usage per run
  - shell_command_distribution  # Bar chart of shell commands
  - success_rates               # Resolution rates and tool success
  - token_analysis              # Token usage by success
  - comparison                  # Multi-run comparison grid
  - transfer_analysis           # Cross-scaffold transfer analysis

# For transfer learning analysis: map models to their training scaffold
model_to_trained_scaffold:
  "agentica-org/DeepSWE-Preview": "r2e-gym"

runs:
  - name: "deepswe-nano-agent"
    format: "nano_agent"
    trajectories: "/path/to/detailed_predictions.jsonl"
    results: "/path/to/swebench_results.json"  # optional
    base_model: "agentica-org/DeepSWE-Preview"
    scaffold: "nano-agent"
    lora_adapter: null

  - name: "deepswe-r2e-gym"
    format: "r2e_gym"
    trajectories: "/path/to/trajectories.jsonl"
    results: null  # uses reward field from trajectory
    base_model: "agentica-org/DeepSWE-Preview"
    scaffold: "r2e-gym"
    lora_adapter: null
```

### Programmatic Usage

```python
from src.trajectory_analyzer.loaders import NanoAgentLoader, R2EGymLoader
from src.trajectory_analyzer.analysis import MetricsExtractor, RunComparator
from src.trajectory_analyzer.plotting import TrajectoryPlotter

# Load trajectories
loader = NanoAgentLoader()
run = loader.load_run(
    name="my-run",
    scaffold="nano-agent",
    base_model="Qwen/Qwen3-32B",
    trajectories_path="path/to/detailed_predictions.jsonl",
    results_path="path/to/results.json",  # optional
)

# Extract metrics
extractor = MetricsExtractor()
metrics = extractor.extract_run_metrics(run)
print(f"Resolved: {metrics.resolved_instances}/{metrics.total_instances}")
print(f"Avg tool calls: {metrics.avg_tool_calls:.1f}")

# Compare runs
comparator = RunComparator()
transfer = comparator.analyze_transfer(source_run, target_run, "r2e-gym")
print(f"Transfer delta: {transfer.transfer_delta:+.1%}")

# Generate plots
plotter = TrajectoryPlotter(output_dir="./plots")
plotter.plot_all([run1, run2], plots=["comparison", "tool_distribution"])
```
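
The comparator and plotter operate on fully loaded runs. A minimal sketch of loading a second run to compare against, assuming `R2EGymLoader` exposes the same `load_run` signature as `NanoAgentLoader` (paths are placeholders):

```python
# Sketch only: load an R2E-Gym run to serve as the other side of the comparison.
r2e_loader = R2EGymLoader()
r2e_run = r2e_loader.load_run(
    name="deepswe-r2e-gym",
    scaffold="r2e-gym",
    base_model="agentica-org/DeepSWE-Preview",
    trajectories_path="path/to/trajectories.jsonl",
    # no results_path: resolution is taken from the trajectory's reward field
)

# e.g. compare the nano-agent run loaded above against the R2E-Gym run:
# transfer = comparator.analyze_transfer(run, r2e_run, "r2e-gym")
```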

## Documentation Structure

This repository uses several Markdown files to organize information:
31 changes: 0 additions & 31 deletions benchmarks/benchmark_container.def

This file was deleted.

14 changes: 13 additions & 1 deletion benchmarks/swe_bench/run_harness_eval.sh
@@ -9,14 +9,16 @@ set -euo pipefail
# --run-id my_run \
# [--max-workers 8]
#
# Note: --run-id is required
#
# Requirements (on this CPU server):
# pip install swebench
# Docker installed and running

subset="verified"
split="test"
preds=""
run_id="swebench_local_run"
run_id=""
max_workers="8"

while [[ $# -gt 0 ]]; do
@@ -41,6 +43,11 @@ if [[ -z "$preds" ]]; then
exit 1
fi

if [[ -z "$run_id" ]]; then
echo "ERROR: --run-id is required" >&2
exit 1
fi

case "$subset" in
verified|Verified)
dataset_name="princeton-nlp/SWE-bench_Verified";;
@@ -69,4 +76,9 @@ python -m swebench.harness.run_evaluation \
--cache_level "instance" \
--timeout 3600

# Clean up logs directory
if [[ -d "logs/run_evaluation/$run_id" ]]; then
echo "Cleaning up logs directory: logs/run_evaluation/$run_id"
rm -rf "logs/run_evaluation/$run_id"
fi

17 changes: 10 additions & 7 deletions benchmarks/swe_bench/run_nano_eval.py
@@ -19,7 +19,7 @@
from datasets import load_dataset


def run_evaluation(endpoint: str, model_name: str, subset: str, split: str, slice_spec: str, output_dir: Path):
def run_evaluation(endpoint: str, model_name: str, subset: str, split: str, slice_spec: str, output_dir: Path, backend: str = "local"):
"""Run nano_agent on SWE-bench tasks and save predictions using a process pool."""

# Load SWE-bench dataset
@@ -49,10 +49,11 @@ def run_evaluation(endpoint: str, model_name: str, subset: str, split: str, slic
config = NanoConfig(
api_base=endpoint,
model=model_name, # e.g., "nano" for LoRA
token_limit=16384,
time_limit=40,
tool_limit=30,
temperature=0.2,
token_limit=65536,
time_limit=600,
tool_limit=500,
temperature=1.0,
backend=backend,
)

# Prepare inputs for workers
@@ -70,7 +71,7 @@ def run_evaluation(endpoint: str, model_name: str, subset: str, split: str, slic
detailed_predictions: dict[str, dict] = {}

# Run with a process pool of up to 8 workers
max_workers = min(8, len(inputs)) if inputs else 0
max_workers = min(48, len(inputs)) if inputs else 0
if max_workers == 0:
print("No instances to process.")
return
@@ -156,11 +157,13 @@ def main():
help="Dataset split")
parser.add_argument("--slice", default=":25",
help="Slice to run. Forms: :N (first N) or start:end (half-open)")
parser.add_argument("--backend", choices=["local", "apptainer"], default="local",
help="Execution backend (local or apptainer)")

args = parser.parse_args()

output_dir = Path(args.output_dir)
run_evaluation(args.endpoint, args.model_name, args.subset, args.split, args.slice, output_dir)
run_evaluation(args.endpoint, args.model_name, args.subset, args.split, args.slice, output_dir, args.backend)


if __name__ == "__main__":