-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_framework.py
More file actions
341 lines (277 loc) · 12.8 KB
/
test_framework.py
File metadata and controls
341 lines (277 loc) · 12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#!/usr/bin/env python3
"""
================================================================================
ARC Prize 2025 - Testing Framework
================================================================================
Test our solution locally before submitting to Kaggle
This is OPEN SOURCE software - no commercial license restrictions
Released under MIT License for the ARC Prize 2025 competition
Author: Andrew Jewell Sr.
Company: AutomataNexus, LLC
Date: September 26, 2024
Version: 1.0.0
Description:
This framework allows us to:
1. Test on evaluation data (if available)
2. Validate our solver accuracy
3. Measure performance and timing
4. Identify weak patterns
5. Improve before submitting
================================================================================
"""
import json
import numpy as np
from pathlib import Path
import time
import logging
from typing import Dict, List, Tuple, Any
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
# Import our components
from arc_solver import ARCSolver
from pattern_detectors import analyze_task_with_all_detectors
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class ARCTestFramework:
"""Framework for testing our ARC solution"""
def __init__(self, pattern_library_path: str = None):
self.solver = ARCSolver(pattern_library_path)
self.results = {
'correct': 0,
'total': 0,
'accuracy_by_pattern': defaultdict(lambda: {'correct': 0, 'total': 0}),
'timing': [],
'failures': []
}
def load_evaluation_data(self) -> Tuple[Dict, Dict]:
"""Load evaluation challenges and solutions"""
data_dir = Path('/mnt/d/opt/ARCPrize2025/data')
# Try to load evaluation data
eval_challenges_path = data_dir / 'arc-agi_evaluation_challenges.json'
eval_solutions_path = data_dir / 'arc-agi_evaluation_solutions.json'
if not eval_challenges_path.exists():
logger.warning("Evaluation data not found, using subset of training data")
# Use last 100 training tasks as mock evaluation
train_path = data_dir / 'arc-agi_training_challenges.json'
solutions_path = data_dir / 'arc-agi_training_solutions.json'
with open(train_path, 'r') as f:
all_challenges = json.load(f)
with open(solutions_path, 'r') as f:
all_solutions = json.load(f)
# Take last 100 as evaluation set
eval_tasks = list(all_challenges.keys())[-100:]
challenges = {k: all_challenges[k] for k in eval_tasks}
solutions = {k: all_solutions[k] for k in eval_tasks}
else:
with open(eval_challenges_path, 'r') as f:
challenges = json.load(f)
with open(eval_solutions_path, 'r') as f:
solutions = json.load(f)
logger.info(f"Loaded {len(challenges)} evaluation tasks")
return challenges, solutions
def evaluate_task(self, task_id: str, task: Dict, solution: List) -> Dict[str, Any]:
"""Evaluate a single task"""
start_time = time.time()
# Get predictions from solver
predictions = self.solver.solve(task)
solve_time = time.time() - start_time
# Check accuracy
correct = False
attempt_correct = [False, False]
for i, (pred, actual) in enumerate(zip(predictions, solution)):
# Check attempt 1
if np.array_equal(np.array(pred['attempt_1']), np.array(actual)):
attempt_correct[0] = True
correct = True
# Check attempt 2
elif np.array_equal(np.array(pred['attempt_2']), np.array(actual)):
attempt_correct[1] = True
correct = True
# Analyze what patterns were in this task
train_examples = task.get('train', [])
detected_patterns = analyze_task_with_all_detectors(train_examples)
# Find which pattern had highest confidence
best_pattern = None
best_confidence = 0
for pattern_name, result in detected_patterns.items():
if result.get('confidence', 0) > best_confidence:
best_confidence = result['confidence']
best_pattern = pattern_name
return {
'task_id': task_id,
'correct': correct,
'attempt_correct': attempt_correct,
'solve_time': solve_time,
'best_pattern': best_pattern,
'pattern_confidence': best_confidence,
'predictions': predictions,
'actual': solution
}
def run_evaluation(self, max_tasks: int = None):
"""Run full evaluation"""
logger.info("Starting evaluation...")
# Load evaluation data
challenges, solutions = self.load_evaluation_data()
# Limit tasks if requested
if max_tasks:
task_ids = list(challenges.keys())[:max_tasks]
else:
task_ids = list(challenges.keys())
# Evaluate each task
for i, task_id in enumerate(task_ids):
logger.info(f"Evaluating task {i+1}/{len(task_ids)}: {task_id}")
task = challenges[task_id]
solution = solutions[task_id]
result = self.evaluate_task(task_id, task, solution)
# Update statistics
self.results['total'] += 1
if result['correct']:
self.results['correct'] += 1
else:
self.results['failures'].append(result)
# Update pattern-specific accuracy
pattern = result['best_pattern']
if pattern:
self.results['accuracy_by_pattern'][pattern]['total'] += 1
if result['correct']:
self.results['accuracy_by_pattern'][pattern]['correct'] += 1
self.results['timing'].append(result['solve_time'])
# Log progress every 10 tasks
if (i + 1) % 10 == 0:
current_acc = self.results['correct'] / self.results['total']
logger.info(f"Progress: {i+1}/{len(task_ids)}, Accuracy: {current_acc:.2%}")
def analyze_results(self):
"""Analyze and display results"""
logger.info("\n" + "="*60)
logger.info("EVALUATION RESULTS")
logger.info("="*60)
# Overall accuracy
accuracy = self.results['correct'] / self.results['total']
logger.info(f"\nOverall Accuracy: {accuracy:.2%} ({self.results['correct']}/{self.results['total']})")
# Need 85% to win
if accuracy >= 0.85:
logger.info("✓ MEETS 85% THRESHOLD FOR GRAND PRIZE!")
else:
logger.info(f"✗ Need {0.85 - accuracy:.2%} more to reach 85% threshold")
# Timing statistics
avg_time = np.mean(self.results['timing'])
max_time = np.max(self.results['timing'])
total_time = np.sum(self.results['timing'])
logger.info(f"\nTiming Statistics:")
logger.info(f" Average time per task: {avg_time:.2f}s")
logger.info(f" Max time: {max_time:.2f}s")
logger.info(f" Total time: {total_time/60:.1f} minutes")
# Extrapolate to full test set (240 tasks)
estimated_full_time = (total_time / self.results['total']) * 240
logger.info(f" Estimated time for 240 tasks: {estimated_full_time/3600:.1f} hours")
# Pattern-specific accuracy
logger.info(f"\nAccuracy by Pattern Type:")
pattern_stats = []
for pattern, stats in self.results['accuracy_by_pattern'].items():
if stats['total'] > 0:
acc = stats['correct'] / stats['total']
pattern_stats.append((pattern, acc, stats['total']))
# Sort by accuracy
pattern_stats.sort(key=lambda x: x[1], reverse=True)
for pattern, acc, total in pattern_stats:
logger.info(f" {pattern}: {acc:.2%} ({total} tasks)")
# Analyze failures
if self.results['failures']:
logger.info(f"\nFailure Analysis:")
logger.info(f" Total failures: {len(self.results['failures'])}")
# Group failures by pattern
failure_patterns = defaultdict(int)
for failure in self.results['failures']:
pattern = failure.get('best_pattern', 'unknown')
failure_patterns[pattern] += 1
logger.info(" Failures by pattern:")
for pattern, count in sorted(failure_patterns.items(), key=lambda x: x[1], reverse=True):
logger.info(f" {pattern}: {count} failures")
def visualize_results(self):
"""Create visualization of results"""
# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# 1. Pattern accuracy bar chart
ax = axes[0, 0]
patterns = []
accuracies = []
for pattern, stats in self.results['accuracy_by_pattern'].items():
if stats['total'] > 0:
patterns.append(pattern)
accuracies.append(stats['correct'] / stats['total'])
ax.bar(patterns, accuracies)
ax.set_title('Accuracy by Pattern Type')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
ax.axhline(y=0.85, color='r', linestyle='--', label='85% threshold')
ax.legend()
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
# 2. Timing distribution
ax = axes[0, 1]
ax.hist(self.results['timing'], bins=30)
ax.set_title('Task Solving Time Distribution')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Count')
# 3. Cumulative accuracy over time
ax = axes[1, 0]
cumulative_acc = []
for i in range(1, self.results['total'] + 1):
acc = sum(1 for f in self.results['failures'][:i] if not f['correct']) / i
cumulative_acc.append(1 - acc)
ax.plot(cumulative_acc)
ax.set_title('Cumulative Accuracy')
ax.set_xlabel('Tasks Evaluated')
ax.set_ylabel('Accuracy')
ax.axhline(y=0.85, color='r', linestyle='--')
# 4. Summary text
ax = axes[1, 1]
ax.text(0.1, 0.9, f"Overall Accuracy: {self.results['correct']/self.results['total']:.2%}",
transform=ax.transAxes, fontsize=14, weight='bold')
ax.text(0.1, 0.7, f"Total Tasks: {self.results['total']}", transform=ax.transAxes, fontsize=12)
ax.text(0.1, 0.5, f"Correct: {self.results['correct']}", transform=ax.transAxes, fontsize=12)
ax.text(0.1, 0.3, f"Failed: {len(self.results['failures'])}", transform=ax.transAxes, fontsize=12)
ax.axis('off')
plt.tight_layout()
plt.savefig('/mnt/d/opt/ARCPrize2025/test_results.png')
logger.info("Saved visualization to test_results.png")
def export_failure_analysis(self):
"""Export detailed failure analysis for improvement"""
output = {
'summary': {
'total': self.results['total'],
'correct': self.results['correct'],
'accuracy': self.results['correct'] / self.results['total'],
'failures': len(self.results['failures'])
},
'failures': []
}
# Add detailed failure info
for failure in self.results['failures'][:10]: # First 10 failures
output['failures'].append({
'task_id': failure['task_id'],
'pattern': failure['best_pattern'],
'confidence': failure['pattern_confidence'],
'attempts': failure['attempt_correct']
})
# Save to file
with open('/mnt/d/opt/ARCPrize2025/failure_analysis.json', 'w') as f:
json.dump(output, f, indent=2)
logger.info("Exported failure analysis to failure_analysis.json")
def main():
"""Run testing framework"""
# Initialize framework
tester = ARCTestFramework()
# Run evaluation on subset first
logger.info("Running evaluation on 20 tasks for quick test...")
tester.run_evaluation(max_tasks=20)
# Analyze results
tester.analyze_results()
# Create visualizations
tester.visualize_results()
# Export failure analysis
tester.export_failure_analysis()
logger.info("\nTesting complete! Check results and visualizations.")
if __name__ == "__main__":
main()