From 4fee0c3f47be82065e6026d808f68be4e065c7bf Mon Sep 17 00:00:00 2001
From: Boyu Gou
Date: Tue, 17 Dec 2024 15:51:23 -0500
Subject: [PATCH] update

---
 .github/workflows/main.yml                    |   2 +-
 README.md                                     |   6 +
 .../author_Boyuan_Zheng.md                    |   0
 .../author_Caiming_Xiong.md                   |   0
 .../author_Difei_Gao.md                       |   0
 .../author_Graham_Neubig.md                   |   0
 .../author_Hanyu_Lai.md                       |   0
 .../author_Huan_Sun.md                        |   0
 .../author_Jie_Tang.md                        |   0
 .../author_Shuyan_Zhou.md                     |   0
 .../author_Tao_Yu.md                          |   0
 .../author_Tianbao_Xie.md                     |   0
 .../author_Wei_Chen.md                        |   0
 .../author_Xiao_Liu.md                        |   0
 .../author_Yu_Su.md                           |   0
 .../author_Yuxiao_Dong.md                     |   0
 .../author_Zhiyong_Wu.md                      |   0
 .../env_desktop.md                            |   0
 .../env_general.md                            |   0
 {grouped_by_env => paper_by_env}/env_gui.md   |   0
 .../env_mobile.md                             |  18 +-
 {grouped_by_env => paper_by_env}/env_web.md   |   0
 update_template_or_data/update_paper_list.md  |  90 ++++++------
 .../update_readme_template.md                 |   8 +-
 .../utils/scripts/sort_by_date.py             | 135 +++++++++---------
 25 files changed, 133 insertions(+), 126 deletions(-)
 rename {grouped_by_authors => paper_by_author}/author_Boyuan_Zheng.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Caiming_Xiong.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Difei_Gao.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Graham_Neubig.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Hanyu_Lai.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Huan_Sun.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Jie_Tang.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Shuyan_Zhou.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Tao_Yu.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Tianbao_Xie.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Wei_Chen.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Xiao_Liu.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Yu_Su.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Yuxiao_Dong.md (100%)
 rename {grouped_by_authors => paper_by_author}/author_Zhiyong_Wu.md (100%)
 rename {grouped_by_env => paper_by_env}/env_desktop.md (100%)
 rename {grouped_by_env => paper_by_env}/env_general.md (100%)
 rename {grouped_by_env => paper_by_env}/env_gui.md (100%)
 rename {grouped_by_env => paper_by_env}/env_mobile.md (100%)
 rename {grouped_by_env => paper_by_env}/env_web.md (100%)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index da3247e..c56ce2e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -53,7 +53,7 @@ jobs:
       run: |
         git config --global user.name "github-actions"
         git config --global user.email "github-actions@github.com"
-        git add README.md grouped_by_env grouped_by_authors
+        git add README.md paper_by_env paper_by_author
        git commit -m "Update README with sorted content from update_template_or_data/update_paper_list.md"
         git push
       env:
diff --git a/README.md b/README.md
index 11a58b0..c9160c6 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,12 @@ This paper list covers a variety of papers related to GUI Agents, including but
 - Works in general Domains extensively used by GUI Agents (e.g., SoM prompting)
 
+<div style="display: flex; justify-content: space-around;">
+  <img src="update_template_or_data/statistics/top_15_authors.png" alt="Image 1" width="48%">
+  <img src="update_template_or_data/statistics/keyword_wordcloud.png" alt="Image 2" width="48%">
+</div>
+
+
 ## Papers Grouped by Envs
 | [Web](grouped_by_env/env_web.md) | [Mobile](grouped_by_env/env_mobile.md) | [Desktop](grouped_by_env/env_desktop.md) | [GUI](grouped_by_env/env_gui.md) | [General](grouped_by_env/env_general.md) |
diff --git a/grouped_by_authors/author_Boyuan_Zheng.md b/paper_by_author/author_Boyuan_Zheng.md
similarity index 100%
rename from grouped_by_authors/author_Boyuan_Zheng.md
rename to paper_by_author/author_Boyuan_Zheng.md
diff --git a/grouped_by_authors/author_Caiming_Xiong.md b/paper_by_author/author_Caiming_Xiong.md
similarity index 100%
rename from grouped_by_authors/author_Caiming_Xiong.md
rename to paper_by_author/author_Caiming_Xiong.md
diff --git a/grouped_by_authors/author_Difei_Gao.md b/paper_by_author/author_Difei_Gao.md
similarity index 100%
rename from grouped_by_authors/author_Difei_Gao.md
rename to paper_by_author/author_Difei_Gao.md
diff --git a/grouped_by_authors/author_Graham_Neubig.md b/paper_by_author/author_Graham_Neubig.md
similarity index 100%
rename from grouped_by_authors/author_Graham_Neubig.md
rename to paper_by_author/author_Graham_Neubig.md
diff --git a/grouped_by_authors/author_Hanyu_Lai.md b/paper_by_author/author_Hanyu_Lai.md
similarity index 100%
rename from grouped_by_authors/author_Hanyu_Lai.md
rename to paper_by_author/author_Hanyu_Lai.md
diff --git a/grouped_by_authors/author_Huan_Sun.md b/paper_by_author/author_Huan_Sun.md
similarity index 100%
rename from grouped_by_authors/author_Huan_Sun.md
rename to paper_by_author/author_Huan_Sun.md
diff --git a/grouped_by_authors/author_Jie_Tang.md b/paper_by_author/author_Jie_Tang.md
similarity index 100%
rename from grouped_by_authors/author_Jie_Tang.md
rename to paper_by_author/author_Jie_Tang.md
diff --git a/grouped_by_authors/author_Shuyan_Zhou.md b/paper_by_author/author_Shuyan_Zhou.md
similarity index 100%
rename from grouped_by_authors/author_Shuyan_Zhou.md
rename to paper_by_author/author_Shuyan_Zhou.md
diff --git a/grouped_by_authors/author_Tao_Yu.md b/paper_by_author/author_Tao_Yu.md
similarity index 100%
rename from grouped_by_authors/author_Tao_Yu.md
rename to paper_by_author/author_Tao_Yu.md
diff --git a/grouped_by_authors/author_Tianbao_Xie.md b/paper_by_author/author_Tianbao_Xie.md
similarity index 100%
rename from grouped_by_authors/author_Tianbao_Xie.md
rename to paper_by_author/author_Tianbao_Xie.md
diff --git a/grouped_by_authors/author_Wei_Chen.md b/paper_by_author/author_Wei_Chen.md
similarity index 100%
rename from grouped_by_authors/author_Wei_Chen.md
rename to paper_by_author/author_Wei_Chen.md
diff --git a/grouped_by_authors/author_Xiao_Liu.md b/paper_by_author/author_Xiao_Liu.md
similarity index 100%
rename from grouped_by_authors/author_Xiao_Liu.md
rename to paper_by_author/author_Xiao_Liu.md
diff --git a/grouped_by_authors/author_Yu_Su.md b/paper_by_author/author_Yu_Su.md
similarity index 100%
rename from grouped_by_authors/author_Yu_Su.md
rename to paper_by_author/author_Yu_Su.md
diff --git a/grouped_by_authors/author_Yuxiao_Dong.md b/paper_by_author/author_Yuxiao_Dong.md
similarity index 100%
rename from grouped_by_authors/author_Yuxiao_Dong.md
rename to paper_by_author/author_Yuxiao_Dong.md
diff --git a/grouped_by_authors/author_Zhiyong_Wu.md b/paper_by_author/author_Zhiyong_Wu.md
similarity index 100%
rename from grouped_by_authors/author_Zhiyong_Wu.md
rename to paper_by_author/author_Zhiyong_Wu.md
diff --git a/grouped_by_env/env_desktop.md b/paper_by_env/env_desktop.md
similarity index 100%
rename from grouped_by_env/env_desktop.md
rename to paper_by_env/env_desktop.md
diff --git a/grouped_by_env/env_general.md b/paper_by_env/env_general.md
similarity index 100%
rename from grouped_by_env/env_general.md
rename to paper_by_env/env_general.md
diff --git a/grouped_by_env/env_gui.md b/paper_by_env/env_gui.md
similarity index 100%
rename from grouped_by_env/env_gui.md
rename to paper_by_env/env_gui.md
diff --git a/grouped_by_env/env_mobile.md b/paper_by_env/env_mobile.md
similarity index 100%
rename from grouped_by_env/env_mobile.md
rename to paper_by_env/env_mobile.md
index b1d674f..565bbbb 100644
--- a/grouped_by_env/env_mobile.md
+++ b/paper_by_env/env_mobile.md
@@ -7,15 +7,6 @@
   - 🔑 Key: [framework], [dataset], [benchmark], [AndroidLab]
   - 📖 TLDR: This paper introduces **AndroidLab**, a comprehensive framework for training and systematically benchmarking Android autonomous agents. It provides an operational environment with diverse modalities and action spaces, supporting both large language models (LLMs) and multimodal models (LMMs). The benchmark includes 138 tasks across nine apps on predefined Android virtual devices. Utilizing AndroidLab, the authors developed an Android Instruction dataset and trained six open-source LLMs and LMMs, significantly improving their average success rates.
 
-- [MobileSafetyBench: Evaluating Safety of Autonomous Agents in Mobile Device Control](https://arxiv.org/abs/2410.17520)
-  - Juyong Lee, Dongyoon Hahm, June Suk Choi, W. Bradley Knox, Kimin Lee
-  - 🏛️ Institutions: KAIST, UT at Austin
-  - 📅 Date: October 23, 2024
-  - 📑 Publisher: arXiv
-  - 💻 Env: [Mobile]
-  - 🔑 Key: [benchmark], [safety], [evaluation], [Android emulator]
-  - 📖 TLDR: *MobileSafetyBench* introduces a benchmark for evaluating the safety of large language model (LLM)-based autonomous agents in mobile device control. Using Android emulators, the benchmark simulates real-world tasks in apps such as messaging and banking to assess agents' safety and helpfulness. The safety-focused tasks test for privacy risk management and robustness against adversarial prompt injections. Experiments show agents perform well in helpful tasks but struggle with safety-related challenges, underscoring the need for continued advancements in mobile safety mechanisms for autonomous agents.
-
 - [Lightweight Neural App Control](https://arxiv.org/abs/2410.17883)
   - Filippos Christianos, Georgios Papoudakis, Thomas Coste, Jianye Hao, Jun Wang, Kun Shao
   - 🏛️ Institutions: Huawei Noah's Ark Lab, UCL
@@ -25,6 +16,15 @@
   - 🔑 Key: [framework], [vision-language model], [Action Transformer], [app agent], [Android control], [multi-modal]
   - 📖 TLDR: This paper introduces LiMAC, a mobile control framework for Android that integrates an Action Transformer and fine-tuned vision-language models to execute precise actions in mobile apps. Tested on open-source datasets, LiMAC improves action accuracy by up to 42% over traditional prompt engineering baselines, demonstrating enhanced efficiency and accuracy in mobile app control tasks.
 
+- [MobileSafetyBench: Evaluating Safety of Autonomous Agents in Mobile Device Control](https://arxiv.org/abs/2410.17520)
+  - Juyong Lee, Dongyoon Hahm, June Suk Choi, W. Bradley Knox, Kimin Lee
+  - 🏛️ Institutions: KAIST, UT at Austin
+  - 📅 Date: October 23, 2024
+  - 📑 Publisher: arXiv
+  - 💻 Env: [Mobile]
+  - 🔑 Key: [benchmark], [safety], [evaluation], [Android emulator]
+  - 📖 TLDR: *MobileSafetyBench* introduces a benchmark for evaluating the safety of large language model (LLM)-based autonomous agents in mobile device control. Using Android emulators, the benchmark simulates real-world tasks in apps such as messaging and banking to assess agents' safety and helpfulness. The safety-focused tasks test for privacy risk management and robustness against adversarial prompt injections. Experiments show agents perform well in helpful tasks but struggle with safety-related challenges, underscoring the need for continued advancements in mobile safety mechanisms for autonomous agents.
+
 - [SPA-Bench: A Comprehensive Benchmark for SmartPhone Agent Evaluation](https://ar5iv.org/abs/2410.15164)
   - Jingxuan Chen, Derek Yuen, Bin Xie, Yuhao Yang, Gongwei Chen, Zhihao Wu, Li Yixing, Xurui Zhou, Weiwen Liu, Shuai Wang, Rui Shao, Liqiang Nie, Yasheng Wang, Jianye Hao, Jun Wang, Kun Shao
   - 🏛️ Institutions: Huawei Noah's Ark Lab, Harbin Institute of Technology, Shenzhen, UCL
diff --git a/grouped_by_env/env_web.md b/paper_by_env/env_web.md
similarity index 100%
rename from grouped_by_env/env_web.md
rename to paper_by_env/env_web.md
diff --git a/update_template_or_data/update_paper_list.md b/update_template_or_data/update_paper_list.md
index 0527b5a..64d9186 100644
--- a/update_template_or_data/update_paper_list.md
+++ b/update_template_or_data/update_paper_list.md
@@ -169,15 +169,6 @@
   - 🔑 Key: [dataset], [framework], [synthetic data]
   - 📖 TLDR: The *EDGE* framework proposes an innovative approach to improve GUI understanding and interaction capabilities in vision-language models through large-scale, multi-granularity synthetic data generation. By leveraging webpage data, EDGE minimizes the need for manual annotations and enhances the adaptability of models across desktop and mobile GUI environments. Evaluations show its effectiveness in diverse GUI-related tasks, contributing significantly to autonomous agent development in GUI navigation and interaction.
 
-- [VideoWebArena: Evaluating Long Context Multimodal Agents with Video Understanding Web Tasks](https://doi.org/10.48550/arXiv.2410.19100)
-  - Lawrence Jang, Yinheng Li, Charles Ding, Justin Lin, Paul Pu Liang, Dan Zhao, Rogerio Bonatti, Kazuhito Koishida
-  - 🏛️ Institutions: CMU, MIT, NYU, Microsoft
-  - 📅 Date: October 24, 2024
-  - 📑 Publisher: arXiv
-  - 💻 Env: [Web]
-  - 🔑 Key: [benchmark], [dataset], [video understanding], [long-context], [VideoWA]
-  - 📖 TLDR: This paper introduces **VideoWebArena (VideoWA)**, a benchmark assessing multimodal agents in video-based tasks. It features over 2,000 tasks focused on skill and factual retention, using video tutorials to simulate long-context environments. Results highlight current challenges in agentic abilities, providing a critical testbed for long-context video understanding improvements.
-
 - [AgentStore: Scalable Integration of Heterogeneous Agents As Specialized Generalist Computer Assistant](https://arxiv.org/abs/2410.18603)
   - Chengyou Jia, Minnan Luo, Zhuohang Dang, Qiushi Sun, Fangzhi Xu, Junlin Hu, Tianbao Xie, Zhiyong Wu
   - 🏛️ Institutions: XJTU, Shanghai AI Lab, HKU
@@ -196,6 +187,15 @@
   - 🔑 Key: [API-based agent], [hybrid agent], [benchmark], [WebArena], [SOTA performance]
   - 📖 TLDR: This paper introduces API-based and hybrid agents designed to execute online tasks by accessing both APIs and traditional web browsing interfaces. In evaluations using WebArena, a benchmark for web navigation, the API-based agent achieves higher performance than browser-based agents, and the hybrid model achieves a success rate of 35.8%, setting a new state-of-the-art (SOTA) in task-agnostic web navigation. The findings highlight the efficiency and reliability gains of API interactions for web agents.
 
+- [VideoWebArena: Evaluating Long Context Multimodal Agents with Video Understanding Web Tasks](https://doi.org/10.48550/arXiv.2410.19100)
+  - Lawrence Jang, Yinheng Li, Charles Ding, Justin Lin, Paul Pu Liang, Dan Zhao, Rogerio Bonatti, Kazuhito Koishida
+  - 🏛️ Institutions: CMU, MIT, NYU, Microsoft
+  - 📅 Date: October 24, 2024
+  - 📑 Publisher: arXiv
+  - 💻 Env: [Web]
+  - 🔑 Key: [benchmark], [dataset], [video understanding], [long-context], [VideoWA]
+  - 📖 TLDR: This paper introduces **VideoWebArena (VideoWA)**, a benchmark assessing multimodal agents in video-based tasks. It features over 2,000 tasks focused on skill and factual retention, using video tutorials to simulate long-context environments. Results highlight current challenges in agentic abilities, providing a critical testbed for long-context video understanding improvements.
+
 - [Lightweight Neural App Control](https://arxiv.org/abs/2410.17883)
   - Filippos Christianos, Georgios Papoudakis, Thomas Coste, Jianye Hao, Jun Wang, Kun Shao
   - 🏛️ Institutions: Huawei Noah's Ark Lab, UCL
@@ -268,6 +268,15 @@
   - 🔑 Key: [framework], [autonomous GUI interaction], [experience-augmented hierarchical planning]
   - 📖 TLDR: This paper introduces Agent S, an open agentic framework that enables autonomous interaction with computers through a Graphical User Interface (GUI). The system addresses key challenges in automating computer tasks through experience-augmented hierarchical planning and an Agent-Computer Interface (ACI). Agent S demonstrates significant improvements over baselines on the OSWorld benchmark, achieving a 20.58% success rate (83.6% relative improvement). The framework shows generalizability across different operating systems and provides insights for developing more effective GUI agents.
 
+- [ST-WebAgentBench: A Benchmark for Evaluating Safety and Trustworthiness in Web Agents](https://sites.google.com/view/st-webagentbench/home)
+  - Ido Levy, Ben Wiesel, Sami Marreed, Alon Oved, Avi Yaeli, Segev Shlomov
+  - 🏛️ Institutions: IBM Research
+  - 📅 Date: October 9, 2024
+  - 📑 Publisher: arXiv
+  - 💻 Env: [Web]
+  - 🔑 Key: [benchmark], [safety], [trustworthiness], [ST-WebAgentBench]
+  - 📖 TLDR: This paper introduces **ST-WebAgentBench**, a benchmark designed to evaluate the safety and trustworthiness of web agents in enterprise contexts. It defines safe and trustworthy agent behavior, outlines the structure of safety policies, and introduces the "Completion under Policies" metric to assess agent performance. The study reveals that current state-of-the-art agents struggle with policy adherence, highlighting the need for improved policy awareness and compliance in web agents.
+
 - [TinyClick: Single-Turn Agent for Empowering GUI Automation](https://arxiv.org/abs/2410.11871)
   - Pawel Pawlowski, Krystian Zawistowski, Wojciech Lapacz, Marcin Skorupa, Adam Wiacek, Sebastien Postansque, Jakub Hoscilowicz
   - 🏛️ Institutions: Samsung R&D Poland, Warsaw University of Technology
@@ -286,15 +295,6 @@
   - 🔑 Key: [framework], [model], [SeeClick], [AITW benchmark]
   - 📖 TLDR: The paper introduces *ClickAgent*, a framework that enhances autonomous agents' interaction with mobile UIs by improving their ability to locate interface elements accurately. This is achieved through a dual-component system where an MLLM performs reasoning and action planning, while a dedicated UI location model (e.g., SeeClick) handles element identification. ClickAgent, evaluated on the AITW benchmark and tested on both emulators and real Android devices, surpasses other agents like CogAgent and AppAgent in task success rate, advancing automation reliability on mobile platforms.
 
-- [ST-WebAgentBench: A Benchmark for Evaluating Safety and Trustworthiness in Web Agents](https://sites.google.com/view/st-webagentbench/home)
-  - Ido Levy, Ben Wiesel, Sami Marreed, Alon Oved, Avi Yaeli, Segev Shlomov
-  - 🏛️ Institutions: IBM Research
-  - 📅 Date: October 9, 2024
-  - 📑 Publisher: arXiv
-  - 💻 Env: [Web]
-  - 🔑 Key: [benchmark], [safety], [trustworthiness], [ST-WebAgentBench]
-  - 📖 TLDR: This paper introduces **ST-WebAgentBench**, a benchmark designed to evaluate the safety and trustworthiness of web agents in enterprise contexts. It defines safe and trustworthy agent behavior, outlines the structure of safety policies, and introduces the "Completion under Policies" metric to assess agent performance. The study reveals that current state-of-the-art agents struggle with policy adherence, highlighting the need for improved policy awareness and compliance in web agents.
-
 - [Navigating the Digital World as Humans Do: Universal Visual Grounding for GUI Agents](https://osu-nlp-group.github.io/UGround/)
   - Boyu Gou, Ruochen Wang, Boyuan Zheng, Yucheng Xie, Cheng Chang, Yiheng Shu, Haotian Sun, Yu Su
   - 🏛️ Institutions: OSU, Orby AI
@@ -448,6 +448,15 @@
   - 🔑 Key: [framework], [AppAgent v2]
   - 📖 TLDR: This work presents *AppAgent v2*, a novel LLM-based multimodal agent framework for mobile devices capable of navigating applications by emulating human-like interactions such as tapping and swiping. The agent constructs a flexible action space that enhances adaptability across various applications, including parsing text and vision descriptions. It operates through two main phases: exploration and deployment, utilizing retrieval-augmented generation (RAG) technology to efficiently retrieve and update information from a knowledge base, thereby empowering the agent to perform tasks effectively and accurately.
+
+- [OmniParser for Pure Vision Based GUI Agent](https://microsoft.github.io/OmniParser/)
+  - Yadong Lu, Jianwei Yang, Yelong Shen, Ahmed Awadallah
+  - 🏛️ Institutions: Microsoft Research, Microsoft Gen AI
+  - 📅 Date: August 1, 2024
+  - 📑 Publisher: arXiv
+  - 💻 Env: [GUI]
+  - 🔑 Key: [framework], [dataset], [OmniParser]
+  - 📖 TLDR: This paper introduces **OmniParser**, a method for parsing user interface screenshots into structured elements, enhancing the ability of models like GPT-4V to generate actions accurately grounded in corresponding UI regions. The authors curated datasets for interactable icon detection and icon description, fine-tuning models to parse interactable regions and extract functional semantics of UI elements.
 
 - [CoCo-Agent: A Comprehensive Cognitive MLLM Agent for Smartphone GUI Automation](https://aclanthology.org/2024.findings-acl.539)
   - Xinbei Ma, Zhuosheng Zhang, Hai Zhao
   - 🏛️ Institutions: SJTU
@@ -466,15 +475,6 @@
   - 🔑 Key: [multimodal agents], [environmental distractions], [robustness]
   - 📖 TLDR: This paper highlights the vulnerability of multimodal agents to environmental distractions. The researchers demonstrate that these agents, which process multiple types of input (e.g., text, images, audio), can be significantly impacted by irrelevant or misleading environmental cues. The study provides insights into the limitations of current multimodal systems and emphasizes the need for more robust architectures that can filter out distractions and maintain focus on relevant information in complex, real-world environments.
 
-- [OmniParser for Pure Vision Based GUI Agent](https://microsoft.github.io/OmniParser/)
-  - Yadong Lu, Jianwei Yang, Yelong Shen, Ahmed Awadallah
-  - 🏛️ Institutions: Microsoft Research, Microsoft Gen AI
-  - 📅 Date: August 1, 2024
-  - 📑 Publisher: arXiv
-  - 💻 Env: [GUI]
-  - 🔑 Key: [framework], [dataset], [OmniParser]
-  - 📖 TLDR: This paper introduces **OmniParser**, a method for parsing user interface screenshots into structured elements, enhancing the ability of models like GPT-4V to generate actions accurately grounded in corresponding UI regions. The authors curated datasets for interactable icon detection and icon description, fine-tuning models to parse interactable regions and extract functional semantics of UI elements.
-
 - [OfficeBench: Benchmarking Language Agents across Multiple Applications for Office Automation](https://arxiv.org/abs/2407.19056)
   - Zilong Wang, Yuedong Cui, Li Zhong, Zimin Zhang, Da Yin, Bill Yuchen Lin, Jingbo Shang
   - 🏛️ Institutions: UCSD, UCLA, AI2
@@ -547,6 +547,15 @@
   - 🔑 Key: [framework], [tool formulation], [multi-agent collaboration], [MobileExperts]
   - 📖 TLDR: This paper introduces *MobileExperts*, a framework that enhances autonomous operations on mobile devices by dynamically assembling agent teams based on user requirements. Each agent independently explores and formulates tools to evolve into an expert, improving efficiency and reducing reasoning costs.
+
+- [AMEX: Android Multi-annotation Expo Dataset for Mobile GUI Agents](https://yuxiangchai.github.io/AMEX/)
+  - Yuxiang Chai, Siyuan Huang, Yazhe Niu, Han Xiao, Liang Liu, Dingyu Zhang, Peng Gao, Shuai Ren, Hongsheng Li
+  - 🏛️ Institutions: CUHK, SJTU, Shanghai AI Lab, vivo AI Lab
+  - 📅 Date: July 3, 2024
+  - 📑 Publisher: arXiv
+  - 💻 Env: [Mobile]
+  - 🔑 Key: [dataset], [benchmark], [AMEX]
+  - 📖 TLDR: This paper introduces the **Android Multi-annotation EXpo (AMEX)**, a comprehensive dataset designed for training and evaluating mobile GUI-control agents. AMEX comprises over 104K high-resolution screenshots from 110 popular mobile applications, annotated at multiple levels, including GUI interactive element grounding, functionality descriptions, and complex natural language instructions. The dataset aims to advance research on AI agents capable of completing complex tasks by interacting directly with mobile device GUIs.
 
 - [CRAB: Cross-environment Agent Benchmark for Multimodal Language Model Agents](https://arxiv.org/abs/2407.01511)
   - Tianqi Xu, Linyao Chen, Dai-Jie Wu, Yanjun Chen, Zecheng Zhang, Xiang Yao, Zhiqiang Xie, Yongchao Chen, Shilong Liu, Bochen Qian, Philip Torr, Bernard Ghanem, Guohao Li
   - 🏛️ Institutions: KAUST, UTokyo, CMU, Stanford, Harvard, Tsinghua, SUSTech, Oxford
@@ -565,15 +574,6 @@
   - 🔑 Key: [framework], [VisionDroid]
   - 📖 TLDR: The paper presents **VisionDroid**, a vision-driven automated GUI testing approach utilizing Multimodal Large Language Models (MLLM) to detect non-crash functional bugs in mobile applications. By extracting GUI text information and aligning it with screenshots, VisionDroid enables MLLM to understand GUI context, facilitating deeper and function-oriented exploration. The approach segments exploration history into logically cohesive parts, prompting MLLM for bug detection, demonstrating superior performance over existing methods.
 
-- [AMEX: Android Multi-annotation Expo Dataset for Mobile GUI Agents](https://yuxiangchai.github.io/AMEX/)
-  - Yuxiang Chai, Siyuan Huang, Yazhe Niu, Han Xiao, Liang Liu, Dingyu Zhang, Peng Gao, Shuai Ren, Hongsheng Li
-  - 🏛️ Institutions: CUHK, SJTU, Shanghai AI Lab, vivo AI Lab
-  - 📅 Date: July 3, 2024
-  - 📑 Publisher: arXiv
-  - 💻 Env: [Mobile]
-  - 🔑 Key: [dataset], [benchmark], [AMEX]
-  - 📖 TLDR: This paper introduces the **Android Multi-annotation EXpo (AMEX)**, a comprehensive dataset designed for training and evaluating mobile GUI-control agents. AMEX comprises over 104K high-resolution screenshots from 110 popular mobile applications, annotated at multiple levels, including GUI interactive element grounding, functionality descriptions, and complex natural language instructions. The dataset aims to advance research on AI agents capable of completing complex tasks by interacting directly with mobile device GUIs.
-
 - [Read Anywhere Pointed: Layout-aware GUI Screen Reading with Tree-of-Lens Grounding](https://screen-point-and-read.github.io/)
   - Yue Fan, Lei Ding, Ching-Chen Kuo, Shan Jiang, Yang Zhao, Xinze Guan, Jie Yang, Yi Zhang, Xin Eric Wang
   - 🏛️ Institutions: UCSC, Microsoft Research
@@ -718,15 +718,6 @@
   - 🔑 Key: [framework], [multi-agent], [planning], [decision-making], [reflection]
   - 📖 TLDR: The paper presents **Mobile-Agent-v2**, a multi-agent architecture designed to assist with mobile device operations. It comprises three agents: a planning agent that generates task progress, a decision agent that navigates tasks using a memory unit, and a reflection agent that corrects erroneous operations. This collaborative approach addresses challenges in navigation and long-context input scenarios, achieving over a 30% improvement in task completion compared to single-agent architectures.
 
-- [Visual Grounding for User Interfaces](https://aclanthology.org/2024.naacl-industry.9/)
-  - Yijun Qian, Yujie Lu, Alexander Hauptmann, Oriana Riva
-  - 🏛️ Institutions: CMU, UCSB
-  - 📅 Date: June 2024
-  - 📑 Publisher: NAACL 2024
-  - 💻 Env: [GUI]
-  - 🔑 Key: [framework], [visual grounding], [UI element localization], [LVG]
-  - 📖 TLDR: This work introduces the task of visual UI grounding, which unifies detection and grounding by enabling models to identify UI elements referenced by natural language commands solely from visual input. The authors propose **LVG**, a model that outperforms baselines pre-trained on larger datasets by over 4.9 points in top-1 accuracy, demonstrating its effectiveness in localizing referenced UI elements without relying on UI metadata.
-
 - [WebSuite: Systematically Evaluating Why Web Agents Fail](https://arxiv.org/abs/2406.01623)
   - Eric Li, Jim Waldo
   - 🏛️ Institutions: Harvard
@@ -745,6 +736,15 @@
   - 🔑 Key: [benchmark], [instructional videos], [visual planning], [hierarchical task decomposition], [complex software interaction]
   - 📖 TLDR: VideoGUI presents a benchmark for evaluating GUI automation on tasks derived from instructional videos, focusing on visually intensive applications like Adobe Photoshop and video editing software. The benchmark includes 178 tasks, with a hierarchical evaluation method distinguishing high-level planning, mid-level procedural steps, and precise action execution. VideoGUI reveals current model limitations in complex visual tasks, marking a significant step toward improved visual planning in GUI automation.
 
+- [Visual Grounding for User Interfaces](https://aclanthology.org/2024.naacl-industry.9/)
+  - Yijun Qian, Yujie Lu, Alexander Hauptmann, Oriana Riva
+  - 🏛️ Institutions: CMU, UCSB
+  - 📅 Date: June 2024
+  - 📑 Publisher: NAACL 2024
+  - 💻 Env: [GUI]
+  - 🔑 Key: [framework], [visual grounding], [UI element localization], [LVG]
+  - 📖 TLDR: This work introduces the task of visual UI grounding, which unifies detection and grounding by enabling models to identify UI elements referenced by natural language commands solely from visual input. The authors propose **LVG**, a model that outperforms baselines pre-trained on larger datasets by over 4.9 points in top-1 accuracy, demonstrating its effectiveness in localizing referenced UI elements without relying on UI metadata.
+
 - [Large Language Models Can Self-Improve At Web Agent Tasks](https://arxiv.org/abs/2405.20309)
   - Ajay Patel, Markus Hofmarcher, Claudiu Leoveanu-Condrei, Marius-Constantin Dinu, Chris Callison-Burch, Sepp Hochreiter
   - 🏛️ Institutions: University of Pennsylvania, ExtensityAI, Johannes Kepler University Linz, NXAI
diff --git a/update_template_or_data/update_readme_template.md b/update_template_or_data/update_readme_template.md
index a9f64ce..1b27775 100644
--- a/update_template_or_data/update_readme_template.md
+++ b/update_template_or_data/update_readme_template.md
@@ -10,9 +10,15 @@ This paper list covers a variety of papers related to GUI Agents, including but
 - Works in general Domains extensively used by GUI Agents (e.g., SoM prompting)
 
+<div style="display: flex; justify-content: space-around;">
+  <img src="update_template_or_data/statistics/top_15_authors.png" alt="Image 1" width="48%">
+  <img src="update_template_or_data/statistics/keyword_wordcloud.png" alt="Image 2" width="48%">
+</div>
+
+
 ## Papers Grouped by Envs
-| [Web](grouped_by_env/env_web.md) | [Mobile](grouped_by_env/env_mobile.md) | [Desktop](grouped_by_env/env_desktop.md) | [GUI](grouped_by_env/env_gui.md) | [General](grouped_by_env/env_general.md) |
+| [Web](paper_by_env/env_web.md) | [Mobile](paper_by_env/env_mobile.md) | [Desktop](paper_by_env/env_desktop.md) | [GUI](paper_by_env/env_gui.md) | [General](paper_by_env/env_general.md) |
 |---------|---------------------------------------|-----------------------------------------|---------------------------------|-----------------------------------------|
 
 [//]: # (## Papers Grouped by Keywords)
diff --git a/update_template_or_data/utils/scripts/sort_by_date.py b/update_template_or_data/utils/scripts/sort_by_date.py
index d5ccf1c..a354a5a 100644
--- a/update_template_or_data/utils/scripts/sort_by_date.py
+++ b/update_template_or_data/utils/scripts/sort_by_date.py
@@ -82,7 +82,7 @@ def parse_date_with_defaults(date_str):
 import os
 
 # 1. Create the subgroup folder if it does not already exist
-subgroup_dir = "grouped_by_env"
+subgroup_dir = "paper_by_env"
 if not os.path.exists(subgroup_dir):
     os.makedirs(subgroup_dir)
 
@@ -135,7 +135,7 @@ def parse_date_with_defaults(date_str):
 top_15_authors = [author for author, _ in author_counter.most_common(15)]
 
 # 4. Generate one file per author, containing only that author's papers
-subgroup_dir = "grouped_by_authors"
+subgroup_dir = "paper_by_author"
 if not os.path.exists(subgroup_dir):
     os.makedirs(subgroup_dir)
 
@@ -166,44 +166,44 @@ def parse_date_with_defaults(date_str):
     print(f"Generated file: {author_file_path}")
 
 
-#
-# import matplotlib.pyplot as plt
-# import seaborn as sns
-# from collections import Counter
-# #
-# # 1. Initialize a Counter to tally each author's paper count
-# author_counter = Counter()
-#
-# # 2. Iterate over the paper entries and count papers per author
-# for _, row in papers_df.iterrows():
-#     authors = row['Authors']
-#     author_list = [author.strip() for author in authors.split(',')]  # assume authors are comma-separated
-#     author_counter.update(author_list)
-#
-# # 3. Get the top 15 authors by paper count
-# top_15_authors = [author for author, _ in author_counter.most_common(15)]
-#
-# # 4. Get the paper counts of the top 15 authors
-# top_15_counts = [author_counter[author] for author in top_15_authors]
-#
-# # 5. Create the bar chart
-# plt.figure(figsize=(10, 6))  # set figure size
-# sns.barplot(x=top_15_counts, y=top_15_authors, palette="viridis")
-#
-# # 6. Set the chart title and labels
-# plt.title("Top 15 Authors by Number of Papers", fontsize=16)
-# plt.xlabel("Number of Papers", fontsize=14)
-# plt.ylabel("Authors", fontsize=14)
-#
-# # 7. Polish the chart (optional)
-# plt.xticks(fontsize=12)
-# plt.yticks(fontsize=12)
-# plt.tight_layout()
-#
-# # 8. Save the chart to a file
-# plt.savefig("update_template_or_data/top_15_authors.png")
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+from collections import Counter
 #
-# # 9. Show the chart
+# 1. Initialize a Counter to tally each author's paper count
+author_counter = Counter()
+
+# 2. Iterate over the paper entries and count papers per author
+for _, row in papers_df.iterrows():
+    authors = row['Authors']
+    author_list = [author.strip() for author in authors.split(',')]  # assume authors are comma-separated
+    author_counter.update(author_list)
+
+# 3. Get the top 15 authors by paper count
+top_15_authors = [author for author, _ in author_counter.most_common(15)]
+
+# 4. Get the paper counts of the top 15 authors
+top_15_counts = [author_counter[author] for author in top_15_authors]
+
+# 5. Create the bar chart
+plt.figure(figsize=(10, 6))  # set figure size
+sns.barplot(x=top_15_counts, y=top_15_authors, palette="viridis")
+
+# 6. Set the chart title and labels
+plt.title("Top 15 Authors by Number of Papers", fontsize=16)
+plt.xlabel("Number of Papers", fontsize=14)
+plt.ylabel("Authors", fontsize=14)
+
+# 7. Polish the chart (optional)
+plt.xticks(fontsize=12)
+plt.yticks(fontsize=12)
+plt.tight_layout()
+
+# 8. Save the chart to a file
+plt.savefig("update_template_or_data/statistics/top_15_authors.png")
+
+# 9. Show the chart
 # plt.show()
 
 from wordcloud import WordCloud
@@ -222,42 +222,37 @@ def remove_square_brackets(s):
 from collections import Counter
 
 # 1. Define keywords to exclude
-excluded_keywords = {"model", "framework", "benchmark", "dataset"}
+# excluded_keywords = {"model", "framework", "benchmark", "dataset"}
 excluded_keywords = {}
 
 # 2. Extract all papers' keywords and compute their frequencies (excluding specific keywords)
+# Remove any leading or trailing spaces, and split by commas if necessary
 all_keywords = []
 for _, row in papers_df.iterrows():
     keywords = row['Keywords']
-    # Extract the keywords and drop the excluded ones
-    filtered_keywords = [kw.strip() for kw in keywords.split(", ") if kw.strip().lower() not in excluded_keywords]
+    # Ensure keywords are split correctly and clean them
+    filtered_keywords = [kw.strip() for kw in keywords.split(",") if kw.strip()]
     all_keywords.extend(filtered_keywords)
 
-# # 3. Count the frequency of each keyword
-# keyword_counts = Counter(all_keywords)
-#
-# # 4. Create the word cloud, tuning resolution and font sizes
-# wordcloud = WordCloud(
-#     width=1200,  # increase image width
-#     height=500,  # increase image height
-#     background_color="white",  # background color
-#     max_words=200,  # maximum number of words shown
-#     colormap="viridis",  # word cloud colormap
-#     contour_width=0,  # contour width (0 means no border)
-#     contour_color='black',  # contour color
-#     max_font_size=50,  # maximum font size
-#     min_font_size=10,  # minimum font size
-# ).generate_from_frequencies(keyword_counts)
-#
-# # 5. Display and save the word cloud, raising DPI for sharpness
-# plt.figure(figsize=(12, 6))  # set figure size
-# plt.imshow(wordcloud, interpolation='bilinear')  # render the word cloud with interpolation
-# plt.axis('off')  # hide the axes
-# plt.tight_layout(pad=0)
-#
-# # 6. Save the image file at high DPI
-# wordcloud_img_path = "update_template_or_data/keyword_wordcloud.png"
-# plt.savefig(wordcloud_img_path, format='png', dpi=460)  # raise DPI for a high-resolution image
-# plt.close()  # close the figure to free resources
-#
-# print(f"Keyword word cloud saved to: {wordcloud_img_path}")
+# Calculate the frequency of each keyword
+keyword_counts = Counter(all_keywords)
+
+print(keyword_counts)  # debug output
+wordcloud = WordCloud(
+    width=1000,
+    height=600,
+    background_color="white",
+    colormap="viridis",
+    contour_width=0,
+    contour_color='black',
+).generate_from_frequencies(keyword_counts)
+
+# Render the word cloud without axes
+plt.figure(figsize=(10, 6))
+plt.imshow(wordcloud, interpolation='bilinear')
+plt.axis('off')
+plt.tight_layout(pad=0)
+
+# Save to the statistics folder; dpi=460 keeps the image sharp at README scale
+plt.savefig("update_template_or_data/statistics/keyword_wordcloud.png", format='png', dpi=460)
+plt.close()