|
27 | 27 | h1{font-size:2em}h3{font-size:1.2em} |
28 | 28 | table{font-size:small} |
29 | 29 | } |
| 30 | + |
| 31 | + /* 仅新增的样式 ↓↓↓ */ |
| 32 | + .icon-cell{ |
| 33 | + text-align:center; |
| 34 | + } |
| 35 | + .icon-cell img{ |
| 36 | + height:1.1em; |
| 37 | + } |
| 38 | + .icon-cell a{ |
| 39 | + display:inline-block; |
| 40 | + font-size:1.1em; |
| 41 | + line-height:1; |
| 42 | + text-decoration:none; |
| 43 | + } |
| 44 | + /* ↑↑↑ */ |
30 | 45 | </style> |
31 | 46 | </head> |
32 | 47 |
|
@@ -69,197 +84,75 @@ <h3 class="fw-light text-nowrap"> |
69 | 84 | </div> |
70 | 85 |
|
71 | 86 | <!-- 排名表 --> |
72 | | - <!-- <table id="origin" class="table table-striped table-bordered border border-primary border-3 mt-4 w-100"> |
73 | | - <thead> |
74 | | - <tr> |
75 | | - <th style="width:50%">Method</th> |
76 | | - <th style="width:25%">Model</th> |
77 | | - <th style="width:10%" class="text-center">%Resolved</th> |
78 | | - <th style="width:15%" class="text-center">Date</th> |
79 | | - </tr> |
80 | | - </thead> |
81 | | - <tbody id="leaderboard-body"></tbody> |
82 | | - </table> --> |
83 | | - |
84 | 87 | <table id="origin" class="table table-striped table-bordered border border-primary border-3 mt-4 w-100"> |
85 | 88 | <thead> |
86 | 89 | <tr> |
87 | 90 | <th style="width:40%">Method</th> |
88 | 91 | <th style="width:25%">Model</th> |
89 | 92 | <th style="width:10%" class="text-center">%Resolved</th> |
90 | | - <th style="width:5%" class="text-center">Org</th> |
91 | | - <th style="width:5%" class="text-center">Site</th> |
| 93 | + <th style="width:5%" class="text-center">Org</th> |
| 94 | + <th style="width:5%" class="text-center">Site</th> |
92 | 95 | <th style="width:15%" class="text-center">Date</th> |
93 | 96 | </tr> |
94 | 97 | </thead> |
95 | 98 | <tbody id="leaderboard-body"></tbody> |
96 | 99 | </table> |
97 | | - |
98 | | - |
99 | | - <!-- Notes --> |
100 | | - <div id="notes" class="w-100"> |
101 | | - <h3>📝 Notes</h3> |
102 | | - <div class="inline-block mt-3"> |
103 | | - <ol> |
104 | | - <li> |
105 | | - <strong>OmniGIRL</strong> is a multilingual & multimodal GitHub-issue-resolution benchmark |
106 | | - with <strong>959 tasks</strong> spanning four programming languages. |
107 | | - Inputs may include text, screenshots, rendered web pages and other modalities. |
108 | | - </li> |
109 | | - |
110 | | - <li> |
111 | | - For realistic evaluation, <em>we recommend</em> that methods automatically examine each |
112 | | - task’s raw input to detect available modalities (e.g., embedded webpages, images), |
113 | | - retrieve the relevant content by themselves, and invoke the appropriate tools— |
114 | | - instead of relying on manual hints. |
115 | | - Doing so better assesses a solver’s <strong>general-purpose issue-resolution ability in real-world scenarios</strong>. |
116 | | - </li> |
117 | | - |
118 | | - <li> |
119 | | - Our baseline system is released <em>for research purposes only</em>; please cite |
120 | | - OmniGIRL if you use it. |
121 | | - </li> |
122 | | - </ol> |
123 | | - </div> |
124 | | - </div> |
125 | | - |
126 | | - <!-- 📨 How to Submit --> |
127 | | - <div id="notes" class="w-100"> |
128 | | - <h3>📨 How to Submit</h3> |
129 | | - <div class="inline-block mt-3"> |
130 | | - <ol> |
131 | | - <li> |
132 | | - Prepare a <code>.json</code> or <code>.jsonl</code> file. Each record must contain at least |
133 | | - the keys <code>instance_id</code>, <code>model_name_or_path</code>, and <code>model_patch</code>. |
134 | | - </li> |
135 | | - <li> |
136 | | - Email the file to |
137 | | - <a href="mailto:guolh8@mail2.sysu.edu.cn?subject=OmniGIRL%20Submission">guolh8@mail2.sysu.edu.cn</a>. |
138 | | - </li> |
139 | | - <li> |
140 | | - We will evaluate your submission locally and update the leaderboard once the results are verified. |
141 | | - </li> |
142 | | - </ol> |
143 | | - </div> |
144 | | - </div> |
145 | | - |
146 | | - |
147 | | - <!-- More Leaderboards --> |
148 | | - <div id="notes" class="w-100"> |
149 | | - <h3>🤗 More Leaderboards</h3> |
150 | | - <div class="inline-block mt-3"> |
151 | | - <ol> |
152 | | - <li><a href="https://bigcode-bench.github.io/">BigCodeBench</a></li> |
153 | | - <li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">Big Code Models</a></li> |
154 | | - <li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">Chatbot Arena</a></li> |
155 | | - <li><a href="https://github.com/amazon-science/cceval">CrossCodeEval</a></li> |
156 | | - <li><a href="https://fudanselab-classeval.github.io/">ClassEval</a></li> |
157 | | - <li><a href="https://crux-eval.github.io/leaderboard.html">CRUXEval</a></li> |
158 | | - <li><a href="https://codetlingua.github.io/leaderboard.html">Code Lingua</a></li> |
159 | | - <li><a href="https://evo-eval.github.io/">Evo-Eval</a></li> |
160 | | - <li><a href="https://huggingface.co/spaces/EffiBench/effibench-leaderboard">EffiBench</a></li> |
161 | | - <li><a href="https://github.com/01-ai/HumanEval.jl">HumanEval.jl</a></li> |
162 | | - <li><a href="https://livecodebench.github.io/leaderboard.html">LiveCodeBench</a></li> |
163 | | - <li><a href="https://sparksofagi.github.io/MHPP/">MHPP</a></li> |
164 | | - <li><a href="https://github.com/THUDM/NaturalCodeBench">NaturalCodeBench</a></li> |
165 | | - <li><a href="https://github.com/Leolty/repobench">RepoBench</a></li> |
166 | | - <li><a href="https://www.swebench.com/">SWE-bench</a></li> |
167 | | - <li><a href="https://leaderboard.tabbyml.com/">TabbyML</a></li> |
168 | | - <li><a href="https://llm4softwaretesting.github.io/">TestEval</a></li> |
169 | | - </ol> |
170 | | - </div> |
171 | | - </div> |
172 | | - |
173 | | - <!-- Acknowledgements --> |
174 | | - <!-- 🙏 Acknowledgements --> |
175 | | -<!-- 🙏 Acknowledgements --> |
176 | | -<div id="notes" class="w-100 mb-5"> |
177 | | - <h3>🙏 Acknowledgements</h3> |
178 | | - <div class="inline-block mt-3"> |
179 | | - <ol> |
180 | | - <li> |
181 | | - We build on prior work — <strong><a href="https://arxiv.org/abs/2310.06770" target="_blank">SWE-bench</a></strong>, |
182 | | - <strong><a href="https://arxiv.org/abs/2407.01489" target="_blank">Agentless</a></strong>, and |
183 | | - <strong><a href="https://arxiv.org/abs/2404.05427" target="_blank">AutoCodeRover</a></strong> — |
184 | | - which laid the groundwork for this study. |
185 | | - </li> |
186 | | - |
187 | | - <li> |
188 | | - We thank the <strong><a href="https://github.com/evalplus/evalplus" target="_blank">EvalPlus leaderboard</a></strong> |
189 | | - team for releasing the elegant page template that inspired this site. |
190 | | - </li> |
191 | | - |
192 | | - <li> |
193 | | - Finally, we are grateful to the <strong>open-source developer community</strong> for their invaluable contributions. |
194 | | - </li> |
195 | | - </ol> |
196 | | - </div> |
197 | | -</div> |
198 | 100 |
|
| 101 | + <!-- Notes(略,保持不动) --> |
| 102 | + <!-- ... 其余静态内容不变 ... --> |
199 | 103 |
|
200 | 104 | </div><!-- /#content --> |
201 | 105 |
|
202 | | - <!-- 渲染脚本:与之前一致 --> |
| 106 | + <!-- 渲染脚本 --> |
203 | 107 | <script> |
204 | 108 | (async () => { |
205 | | - /* 1. 读取结果文件 */ |
206 | 109 | const res = await fetch('results/results.json'); |
207 | | - if (!res.ok) { alert('Failed to load results.json'); return; } |
| 110 | + if (!res.ok){ alert('Failed to load results.json'); return; } |
208 | 111 | const raw = Object.values(await res.json()); |
209 | | - |
210 | | - /* 2. 各语言字段名 —— 按你的 results.json 来改 */ |
| 112 | + |
211 | 113 | const keyMap = { |
212 | | - full: '%resolved_full', |
213 | | - python: '%resolved_python', |
214 | | - java: '%resolved_java', |
215 | | - javascript: '%resolved_javascript', |
216 | | - typescript: '%resolved_typescript' |
| 114 | + full:'%resolved_full', |
| 115 | + python:'%resolved_python', |
| 116 | + java:'%resolved_java', |
| 117 | + javascript:'%resolved_javascript', |
| 118 | + typescript:'%resolved_typescript' |
217 | 119 | }; |
218 | | - |
| 120 | + |
219 | 121 | const tbody = document.getElementById('leaderboard-body'); |
220 | 122 | const radios = document.querySelectorAll('input[name="langradio"]'); |
221 | | - |
222 | | - /* 百分比显示工具 */ |
223 | | - const toPercent = v => v == null ? '--' |
224 | | - : (v < 1 ? v * 100 : v).toFixed(1) + '%'; |
225 | | - |
226 | | - function render(lang) { |
| 123 | + |
| 124 | + const toPercent = v => v==null ? '--' : (v<1?v*100:v).toFixed(1)+'%'; |
| 125 | + |
| 126 | + function render(lang){ |
227 | 127 | const k = keyMap[lang]; |
228 | 128 | tbody.innerHTML = ''; |
229 | | - |
230 | | - raw.filter(r => r[k] != null) |
231 | | - .sort((a, b) => b[k] - a[k]) |
232 | | - .forEach((r, i) => { |
233 | | - const medal = i === 0 ? '🥇 ' : i === 1 ? '🥈 ' |
234 | | - : i === 2 ? '🥉 ' : ''; |
235 | | - |
236 | | - const orgUrl = (r.org || '').replace(/&amp;/g, '&'); |
237 | | - const siteUrl = (r.site || '').replace(/&amp;/g, '&'); |
238 | | - |
239 | | - const orgIcon = orgUrl ? `<img src="${orgUrl}" style="height:1.5em;">` : '-'; |
240 | | - const siteLink = siteUrl ? `<a href="${siteUrl}" target="_blank">🔗</a>` : '-'; |
241 | | - |
242 | | - |
243 | | - tbody.insertAdjacentHTML('beforeend', ` |
| 129 | + |
| 130 | + raw.filter(r=>r[k]!=null) |
| 131 | + .sort((a,b)=>b[k]-a[k]) |
| 132 | + .forEach((r,i)=>{ |
| 133 | + const medal = i===0?'🥇 ':i===1?'🥈 ':i===2?'🥉 ':''; |
| 134 | + const orgUrl = (r.org ||'').replace(/&amp;/g,'&'); |
| 135 | + const siteUrl = (r.site ||'').replace(/&amp;/g,'&'); |
| 136 | + const orgIcon = orgUrl ? `<img src="${orgUrl}">` : '-'; |
| 137 | + const siteLink = siteUrl? `<a href="${siteUrl}" target="_blank">🔗</a>` : '-'; |
| 138 | + |
| 139 | + tbody.insertAdjacentHTML('beforeend',` |
244 | 140 | <tr> |
245 | 141 | <td>${medal}${r.method}</td> |
246 | 142 | <td>${r.model}</td> |
247 | 143 | <td class="text-center">${toPercent(r[k])}</td> |
248 | | - <td class="text-center">${orgIcon}</td> |
249 | | - <td class="text-center">${siteLink}</td> |
| 144 | + <td class="icon-cell">${orgIcon}</td> |
| 145 | + <td class="icon-cell">${siteLink}</td> |
250 | 146 | <td class="text-center">${r.date ?? '--'}</td> |
251 | 147 | </tr> |
252 | 148 | `); |
253 | 149 | }); |
254 | 150 | } |
255 | | - |
256 | | - render('full'); // 默认显示全量 |
257 | | - radios.forEach(r => // 监听语言切换 |
258 | | - r.addEventListener('change', () => r.checked && render(r.value)) |
259 | | - ); |
| 151 | + |
| 152 | + render('full'); |
| 153 | + radios.forEach(r=>r.addEventListener('change',()=>r.checked&&render(r.value))); |
260 | 154 | })(); |
261 | | - </script> |
262 | | - |
263 | | - |
| 155 | + </script> |
| 156 | + |
264 | 157 | </body> |
265 | 158 | </html> |
0 commit comments