Update leaderboard and mostly automate with GPTScript
Ray Myers committed Dec 5, 2024
1 parent 71fed95 commit e929d11
Showing 4 changed files with 60 additions and 43 deletions.
7 changes: 7 additions & 0 deletions license-notes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
OpenHands = MIT
Composio = Source Available (ELv2)
Agentless = MIT
Moatless = MIT
AppMap Navie = Source Available
AutoCodeRover = Source Available
SWE-agent = MIT
77 changes: 40 additions & 37 deletions src/components/Leaderboard/index.tsx
@@ -3,56 +3,61 @@ export default function Leaderboard(props): JSX.Element {
const { compact } = props;
const showGroup = true;
const showLicense = !compact;
const showStatus = !compact;
const leaderboardData = [

{
name: "Moatless Tools",
subtitle: "+ Claude 3.5 Sonnet",
url: "https://github.com/aorwall/moatless-tools",
score: "-",
scoreLite: "26.67%",
status: "Verified",
group: "Albert Örwall",
name: "OpenHands + CodeAct v2.1",
subtitle: "(claude-3-5-sonnet-20241022)",
url: "https://docs.all-hands.dev/",
score: "53.00%",
group: "OpenHands",
license: "MIT"
},
{
name: "OpenDevin CodeAct 1.3",
subtitle: "+ GPT-4o",
url: "https://x.com/gneubig/status/1791498953709752405",
score: "-",
scoreLite: "26.67%",
status: "Reported",
group: "OpenDevin",
name: "Agentless-1.5",
subtitle: "+ Claude-3.5 Sonnet (20241022)",
url: "https://github.com/OpenAutoCoder/Agentless",
score: "50.80%",
group: "Agentless",
license: "MIT"
},
{
name: "Aider",
subtitle: "+ GPT-4o, Claude 3 Opus",
url: "https://aider.chat/2024/05/22/swe-bench-lite.html",
score: "-",
scoreLite: "26.33%",
status: "Reported",
group: "Paul Gauthier",
license: "Apache-2"
name: "Composio SWE-Kit",
subtitle: "(2024-10-25)",
url: "https://github.com/ComposioHQ/composio/tree/master/python/swe/agent",
score: "48.60%",
group: "Composio",
license: "Source Available (ELv2)"
},
{
name: "AppMap Navie",
subtitle: "+ GPT-4o",
name: "AppMap Navie v2",
subtitle: "",
url: "https://appmap.io/navie",
score: "-",
scoreLite: "21.67%",
status: "Reported",
score: "47.20%",
group: "AppMap",
license: "Source Available"
},
{
name: "AutoCodeRover-v2.0",
subtitle: "(Claude-3.5-Sonnet-20241022)",
url: "https://www.autocoderover.net/",
score: "46.20%",
group: "AutoCodeRover",
license: "Source Available"
},
{
name: "Moatless Tools",
subtitle: "+ Claude 3.5 Sonnet (20241022)",
url: "https://github.com/aorwall/moatless-tools",
score: "~45%",
group: "Albert Örwall",
license: "MIT"
},
{
name: "SWE-agent",
subtitle: "+ GPT-4",
subtitle: "+ Claude 3.5 Sonnet",
url: "https://swe-agent.com/",
score: "12.47%",
scoreLite: "18%",
status: "Verified",
score: "33.60%",
group: "Princeton NLP",
license: "MIT"
},
@@ -64,8 +69,7 @@ export default function Leaderboard(props): JSX.Element {
<tr>
<th>Rank</th>
<th>Agent</th>
<th>Score (lite)</th>
{ showStatus && <th>Status</th> }
<th>Score (verified)</th>
{ showGroup && <th>Group</th> }
{ showLicense && <th>License</th> }
</tr>
@@ -76,14 +80,13 @@ export default function Leaderboard(props): JSX.Element {
<a href={data.url}>{data.name}</a>
{data.subtitle && <span><br />{data.subtitle}</span>}
</td>
<td>{data.scoreLite}</td>
{ showStatus && <td>{data.status}</td> }
<td>{data.score}</td>
{ showGroup && <td>{data.group}</td> }
{ showLicense && <td>{data.license}</td> }
</tr>
)}
</table>
<p><i>Last updated: 2024-06-25</i></p>
<p><i>Last updated: 2024-11-24</i></p>
</div>
);
}
12 changes: 6 additions & 6 deletions src/pages/leaderboards.mdx
@@ -7,20 +7,20 @@ import Leaderboard from '@site/src/components/Leaderboard';

There are many LLM benchmarks, but for the purposes of evaluating Autonomous DevTools we are most interested in testing an agent's ability to address a realistic task on an existing codebase.

## SWE-bench lite
## SWE-bench verified
**The gold standard**.
Released in September 2023 by Princeton NLP, SWE-bench is the most widely accepted measure of an agent's ability to solve tasks in a realistic codebase.
It was constructed from GitHub Pull Requests from real Open Source repositories, with unit tests verifying the change.
To pass, the agent must effectively recreate that Pull Request. Since the full set is costly to run, the subset `SWE-bench lite` is often used.
To pass, the agent must effectively recreate that Pull Request.

[SWE-bench](https://www.swebench.com) maintains the official leaderboard where results are reported and verified.
The full set is costly to run, so we currently prefer the subset `SWE-bench verified`, whose tasks have been confirmed by humans to be solvable. Paul Gauthier [explains](https://github.com/princeton-nlp/SWE-bench/issues/72) the problem that led to that subset.

[SWE-bench](https://www.swebench.com) maintains the official leaderboard where results are reported.
Nopilot focuses on listing all reported scores by Open Source and Source Available agents.

<Leaderboard />

These are *unassisted* scores. SWE-bench scores come in "assisted" and "unassisted" versions. "Assisted" means the agent is told which files need to be modified by the "oracle". There is usually a large difference between these scores, highlighting that navigating the codebase is a key part of the problem.

Paul Gauthier [points out](https://github.com/princeton-nlp/SWE-bench/issues/72) that some SWE-bench cases appear to be underspecified and effectively impossible to solve because the tests rely on implementation detail. It's unclear what the maximum possible score is.
These are *unassisted* scores. "Assisted" means the "oracle" tells the agent which files need to be modified. There is usually a large difference between these scores, highlighting that navigating the codebase is a key part of the problem.

## Aider Leaderboards

7 changes: 7 additions & 0 deletions update-leaderboard.gpt
@@ -0,0 +1,7 @@
tools: sys.download, sys.read, sys.write

Download https://www.swebench.com to swebench.html
Extract the names and scores from the div with id "leaderboard-Verified" and save them to current-verified.txt
Filter current-verified.txt to only include the agents listed in `license-notes.txt`. Update `src/components/Leaderboard/index.tsx` with those items and licenses.
Update "last updated" date in the `src/components/Leaderboard/index.tsx` file to reflect the current date.
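The extraction step the GPTScript describes can be sketched in plain Python. This is a hypothetical illustration only: the `name`/`score` span classes and the overall markup inside the `leaderboard-Verified` div are assumptions for the sample below, not the real structure of swebench.com, which GPTScript would have to discover from the downloaded HTML.

```python
from html.parser import HTMLParser

class LeaderboardParser(HTMLParser):
    """Collect (name, score) pairs from the div with id "leaderboard-Verified".
    The span class names here are assumed for illustration; the real
    swebench.com markup may differ."""

    def __init__(self):
        super().__init__()
        self.in_target = False   # inside the leaderboard div?
        self.field = None        # which span we are currently reading
        self.current = {}        # partially built entry
        self.entries = []        # finished (name, score) pairs

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "div" and attrs.get("id") == "leaderboard-Verified":
            self.in_target = True
        elif self.in_target and tag == "span":
            cls = attrs.get("class", "")
            if cls in ("name", "score"):
                self.field = cls

    def handle_data(self, data):
        if self.field:
            self.current[self.field] = data.strip()
            if "name" in self.current and "score" in self.current:
                self.entries.append((self.current["name"], self.current["score"]))
                self.current = {}
            self.field = None

# A made-up sample in the assumed markup, mirroring two real entries:
sample = """
<div id="leaderboard-Verified">
  <span class="name">OpenHands + CodeAct v2.1</span><span class="score">53.00%</span>
  <span class="name">Agentless-1.5</span><span class="score">50.80%</span>
</div>
"""

parser = LeaderboardParser()
parser.feed(sample)
for name, score in parser.entries:
    print(f"{name}\t{score}")
```

Filtering against `license-notes.txt` is then a matter of keeping only entries whose name matches a line of that file before rewriting the TSX data array.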
