diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..5a0f925 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-10-30T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2410.23289v1","updated":"2024-10-30T17:59:41Z","published":"2024-10-30T17:59:41Z","title":"Bridging the Human to Robot Dexterity Gap through Object-Oriented\n Rewards","summary":" Training robots directly from human videos is an emerging area in robotics\nand computer vision. While there has been notable progress with two-fingered\ngrippers, learning autonomous tasks for multi-fingered robot hands in this way\nremains challenging. A key reason for this difficulty is that a policy trained\non human hands may not directly transfer to a robot hand due to morphology\ndifferences. In this work, we present HuDOR, a technique that enables online\nfine-tuning of policies by directly computing rewards from human videos.\nImportantly, this reward function is built using object-oriented trajectories\nderived from off-the-shelf point trackers, providing meaningful learning\nsignals despite the morphology gap and visual differences between human and\nrobot hands. Given a single video of a human solving a task, such as gently\nopening a music box, HuDOR enables our four-fingered Allegro hand to learn the\ntask with just an hour of online interaction. Our experiments across four tasks\nshow that HuDOR achieves a 4x improvement over baselines. Code and videos are\navailable on our website, https://object-rewards.github.io.\n","authors":["Irmak Guzey","Yinlong Dai","Georgy Savva","Raunaq Bhirangi","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2410.23289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23283v1","updated":"2024-10-30T17:58:26Z","published":"2024-10-30T17:58:26Z","title":"DisCo: Distributed Contact-Rich Trajectory Optimization for Forceful\n Multi-Robot Collaboration","summary":" We present DisCo, a distributed algorithm for contact-rich, multi-robot\ntasks. DisCo is a distributed contact-implicit trajectory optimization\nalgorithm, which allows a group of robots to optimize a time sequence of forces\nto objects and to their environment to accomplish tasks such as collaborative\nmanipulation, robot team sports, and modular robot locomotion. We build our\nalgorithm on a variant of the Alternating Direction Method of Multipliers\n(ADMM), where each robot computes its own contact forces and contact-switching\nevents from a smaller single-robot, contact-implicit trajectory optimization\nproblem, while cooperating with other robots through dual variables, enforcing\nconstraints between robots. Each robot iterates between solving its local\nproblem, and communicating over a wireless mesh network to enforce these\nconsistency constraints with its neighbors, ultimately converging to a\ncoordinated plan for the group. The local problems solved by each robot are\nsignificantly less challenging than a centralized problem with all robots'\ncontact forces and switching events, improving the computational efficiency,\nwhile also preserving the privacy of some aspects of each robot's operation. We\ndemonstrate the effectiveness of our algorithm in simulations of collaborative\nmanipulation, multi-robot team sports scenarios, and in modular robot\nlocomotion, where DisCo achieves $3$x higher success rates with a 2.5x to 5x\nfaster computation time. 
Further, we provide results of hardware experiments on\na modular truss robot, with three collaborating truss nodes planning\nindividually while working together to produce a punctuated rolling-gate motion\nof the composite structure. Videos are available on the project page:\nhttps://disco-opt.github.io.\n","authors":["Ola Shorinwa","Matthew Devlin","Elliot W. Hawkes","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2410.23283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23277v1","updated":"2024-10-30T17:55:52Z","published":"2024-10-30T17:55:52Z","title":"SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video\n Generation","summary":" Human beings are endowed with a complementary learning system, which bridges\nthe slow learning of general world dynamics with fast storage of episodic\nmemory from a new experience. Previous video generation models, however,\nprimarily focus on slow learning by pre-training on vast amounts of data,\noverlooking the fast learning phase crucial for episodic memory storage. This\noversight leads to inconsistencies across temporally distant frames when\ngenerating longer videos, as these frames fall beyond the model's context\nwindow. To this end, we introduce SlowFast-VGen, a novel dual-speed learning\nsystem for action-driven long video generation. Our approach incorporates a\nmasked conditional video diffusion model for the slow learning of world\ndynamics, alongside an inference-time fast learning strategy based on a\ntemporal LoRA module. Specifically, the fast learning process updates its\ntemporal LoRA parameters based on local inputs and outputs, thereby efficiently\nstoring episodic memory in its parameters. We further propose a slow-fast\nlearning loop algorithm that seamlessly integrates the inner fast learning loop\ninto the outer slow learning loop, enabling the recall of prior multi-episode\nexperiences for context-aware skill learning. To facilitate the slow learning\nof an approximate world model, we collect a large-scale dataset of 200k videos\nwith language action annotations, covering a wide range of scenarios. Extensive\nexperiments show that SlowFast-VGen outperforms baselines across various\nmetrics for action-driven video generation, achieving an FVD score of 514\ncompared to 782, and maintaining consistency in longer videos, with an average\nof 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm\nsignificantly enhances performances on long-horizon planning tasks as well.\nProject Website: https://slowfast-vgen.github.io\n","authors":["Yining Hong","Beide Liu","Maxine Wu","Yuanhao Zhai","Kai-Wei Chang","Lingjie Li","Kevin Lin","Chung-Ching Lin","Jianfeng Wang","Zhengyuan Yang","Yingnian Wu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23262v1","updated":"2024-10-30T17:46:31Z","published":"2024-10-30T17:46:31Z","title":"EMMA: End-to-End Multimodal Model for Autonomous Driving","summary":" We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving.\nBuilt on a multi-modal large language model foundation, EMMA directly maps raw\ncamera sensor data into various driving-specific outputs, including planner\ntrajectories, perception objects, and road graph elements. EMMA maximizes the\nutility of world knowledge from the pre-trained large language models, by\nrepresenting all non-sensor inputs (e.g. navigation instructions and ego\nvehicle status) and outputs (e.g. trajectories and 3D locations) as natural\nlanguage text. 
This approach allows EMMA to jointly process various driving\ntasks in a unified language space, and generate the outputs for each task using\ntask-specific prompts. Empirically, we demonstrate EMMA's effectiveness by\nachieving state-of-the-art performance in motion planning on nuScenes as well\nas competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also\nyields competitive results for camera-primary 3D object detection on the Waymo\nOpen Dataset (WOD). We show that co-training EMMA with planner trajectories,\nobject detection, and road graph tasks yields improvements across all three\ndomains, highlighting EMMA's potential as a generalist model for autonomous\ndriving applications. However, EMMA also exhibits certain limitations: it can\nprocess only a small amount of image frames, does not incorporate accurate 3D\nsensing modalities like LiDAR or radar and is computationally expensive. We\nhope that our results will inspire further research to mitigate these issues\nand to further evolve the state of the art in autonomous driving model\narchitectures.\n","authors":["Jyh-Jing Hwang","Runsheng Xu","Hubert Lin","Wei-Chih Hung","Jingwei Ji","Kristy Choi","Di Huang","Tong He","Paul Covington","Benjamin Sapp","James Guo","Dragomir Anguelov","Mingxing Tan"],"pdf_url":"https://arxiv.org/pdf/2410.23262v1.pdf","comment":"Blog post: https://waymo.com/blog/2024/10/introducing-emma/"},{"id":"http://arxiv.org/abs/2410.23254v1","updated":"2024-10-30T17:37:31Z","published":"2024-10-30T17:37:31Z","title":"Keypoint Abstraction using Large Models for Object-Relative Imitation\n Learning","summary":" Generalization to novel object configurations and instances across diverse\ntasks and environments is a critical challenge in robotics. Keypoint-based\nrepresentations have been proven effective as a succinct representation for\ncapturing essential object features, and for establishing a reference frame in\naction prediction, enabling data-efficient learning of robot skills. However,\ntheir manual design nature and reliance on additional human labels limit their\nscalability. In this paper, we propose KALM, a framework that leverages large\npre-trained vision-language models (LMs) to automatically generate\ntask-relevant and cross-instance consistent keypoints. KALM distills robust and\nconsistent keypoints across views and objects by generating proposals using LMs\nand verifies them against a small set of robot demonstration data. Based on the\ngenerated keypoints, we can train keypoint-conditioned policy models that\npredict actions in keypoint-centric frames, enabling robots to generalize\neffectively across varying object poses, camera views, and object instances\nwith similar functional shapes. Our method demonstrates strong performance in\nthe real world, adapting to different tasks and environments from only a\nhandful of demonstrations while requiring no additional labels. Website:\nhttps://kalm-il.github.io/\n","authors":["Xiaolin Fang","Bo-Ruei Huang","Jiayuan Mao","Jasmine Shone","Joshua B. Tenenbaum","Tomás Lozano-Pérez","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2410.23254v1.pdf","comment":"CoRL LangRob Workshop, 2024"},{"id":"http://arxiv.org/abs/2403.17009v2","updated":"2024-10-30T17:35:06Z","published":"2024-03-25T17:59:58Z","title":"Is Your LiDAR Placement Optimized for 3D Scene Understanding?","summary":" The reliability of driving perception systems under unprecedented conditions\nis crucial for practical usage. 
Latest advancements have prompted increasing\ninterest in multi-LiDAR perception. However, prevailing driving datasets\npredominantly utilize single-LiDAR systems and collect data devoid of adverse\nconditions, failing to capture the complexities of real-world environments\naccurately. Addressing these gaps, we proposed Place3D, a full-cycle pipeline\nthat encompasses LiDAR placement optimization, data generation, and downstream\nevaluations. Our framework makes three appealing contributions. 1) To identify\nthe most effective configurations for multi-LiDAR systems, we introduce the\nSurrogate Metric of the Semantic Occupancy Grids (M-SOG) to evaluate LiDAR\nplacement quality. 2) Leveraging the M-SOG metric, we propose a novel\noptimization strategy to refine multi-LiDAR placements. 3) Centered around the\ntheme of multi-condition multi-LiDAR perception, we collect a 280,000-frame\ndataset from both clean and adverse conditions. Extensive experiments\ndemonstrate that LiDAR placements optimized using our approach outperform\nvarious baselines. We showcase exceptional results in both LiDAR semantic\nsegmentation and 3D object detection tasks, under diverse weather and sensor\nfailure conditions.\n","authors":["Ye Li","Lingdong Kong","Hanjiang Hu","Xiaohao Xu","Xiaonan Huang"],"pdf_url":"https://arxiv.org/pdf/2403.17009v2.pdf","comment":"NeurIPS 2024 (Spotlight); 36 pages, 16 figures, 14 tables; Code at\n https://github.com/ywyeli/Place3D"},{"id":"http://arxiv.org/abs/2410.23234v1","updated":"2024-10-30T17:22:45Z","published":"2024-10-30T17:22:45Z","title":"EMOTION: Expressive Motion Sequence Generation for Humanoid Robots with\n In-Context Learning","summary":" This paper introduces a framework, called EMOTION, for generating expressive\nmotion sequences in humanoid robots, enhancing their ability to engage in\nhumanlike non-verbal communication. Non-verbal cues such as facial expressions,\ngestures, and body movements play a crucial role in effective interpersonal\ninteractions. Despite the advancements in robotic behaviors, existing methods\noften fall short in mimicking the diversity and subtlety of human non-verbal\ncommunication. To address this gap, our approach leverages the in-context\nlearning capability of large language models (LLMs) to dynamically generate\nsocially appropriate gesture motion sequences for human-robot interaction. We\nuse this framework to generate 10 different expressive gestures and conduct\nonline user studies comparing the naturalness and understandability of the\nmotions generated by EMOTION and its human-feedback version, EMOTION++, against\nthose by human operators. The results demonstrate that our approach either\nmatches or surpasses human performance in generating understandable and natural\nrobot motions under certain scenarios. We also provide design implications for\nfuture research to consider a set of variables when generating expressive\nrobotic gestures.\n","authors":["Peide Huang","Yuhan Hu","Nataliya Nechyporenko","Daehwa Kim","Walter Talbott","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.23234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23215v1","updated":"2024-10-30T17:03:27Z","published":"2024-10-30T17:03:27Z","title":"Levels of explanation -- implementation and evaluation of what and when\n for different time-sensitive tasks","summary":" In this work, we focused on constructing and evaluating levels of\nexplanation(LOE) that address two basic aspect of HRI: 1. 
What information\nshould be communicated to the user by the robot? 2. When should the robot\ncommunicate this information? For constructing the LOE, we defined two terms,\nverbosity and explanation patterns, each with two levels (verbosity -- high and\nlow, explanation patterns -- dynamic and static). Based on these parameters,\nthree different LOE (high, medium, and low) were constructed and evaluated in a\nuser study with a telepresence robot. The user study was conducted for a\nsimulated telerobotic healthcare task with two different conditions related to\ntime sensitivity, as evaluated by two different user groups -- one that\nperformed the task within a time limit and the other with no time limit. We\nfound that the high LOE was preferred in terms of adequacy of explanation,\nnumber of collisions, number of incorrect movements, and number of\nclarifications when users performed the experiment in the without time limit\ncondition. We also found that both high and medium LOE did not have significant\ndifferences in completion time, the fluency of HRI, and trust in the robot.\nWhen users performed the experiment in the with time limit condition, high and\nmedium LOE had better task performances and were preferred to the low LOE in\nterms of completion time, fluency, adequacy of explanation, trust, number of\ncollisions, number of incorrect movements and number of clarifications. Future\ndirections for advancing LOE are discussed.\n","authors":["Shikhar Kumar","Omer Keidar","Yael Edan"],"pdf_url":"https://arxiv.org/pdf/2410.23215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23156v1","updated":"2024-10-30T16:11:05Z","published":"2024-10-30T16:11:05Z","title":"VisualPredicator: Learning Abstract World Models with Neuro-Symbolic\n Predicates for Robot Planning","summary":" Broadly intelligent agents should form task-specific abstractions that\nselectively expose the essential elements of a task, while abstracting away the\ncomplexity of the raw sensorimotor space. In this work, we present\nNeuro-Symbolic Predicates, a first-order abstraction language that combines the\nstrengths of symbolic and neural knowledge representations. We outline an\nonline algorithm for inventing such predicates and learning abstract world\nmodels. We compare our approach to hierarchical reinforcement learning,\nvision-language model planning, and symbolic predicate invention approaches, on\nboth in- and out-of-distribution tasks across five simulated robotic domains.\nResults show that our approach offers better sample complexity, stronger\nout-of-distribution generalization, and improved interpretability.\n","authors":["Yichao Liang","Nishanth Kumar","Hao Tang","Adrian Weller","Joshua B. Tenenbaum","Tom Silver","João F. Henriques","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2410.23156v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2410.23128v1","updated":"2024-10-30T15:40:06Z","published":"2024-10-30T15:40:06Z","title":"Leader-Follower 3D Formation for Underwater Robots","summary":" The schooling behavior of fish is hypothesized to confer many survival\nbenefits, including foraging success, safety from predators, and energy savings\nthrough hydrodynamic interactions when swimming in formation. Underwater robot\ncollectives may be able to achieve similar benefits in future applications,\ne.g. using formation control to achieve efficient spatial sampling for\nenvironmental monitoring. 
Although many theoretical algorithms exist for\nmulti-robot formation control, they have not been tested in the underwater\ndomain due to the fundamental challenges in underwater communication. Here we\nintroduce a leader-follower strategy for underwater formation control that\nallows us to realize complex 3D formations, using purely vision-based\nperception and a reactive control algorithm that is low computation. We use a\nphysical platform, BlueSwarm, to demonstrate for the first time an experimental\nrealization of inline, side-by-side, and staggered swimming 3D formations. More\ncomplex formations are studied in a physics-based simulator, providing new\ninsights into the convergence and stability of formations given underwater\ninertial/drag conditions. Our findings lay the groundwork for future\napplications of underwater robot swarms in aquatic environments with minimal\ncommunication.\n","authors":["Di Ni","Hungtang Ko","Radhika Nagpal"],"pdf_url":"https://arxiv.org/pdf/2410.23128v1.pdf","comment":"Accepted at DARS 2024 (The 17th International Symposium on\n Distributed Autonomous Robotic Systems)"},{"id":"http://arxiv.org/abs/2410.23085v1","updated":"2024-10-30T15:00:06Z","published":"2024-10-30T15:00:06Z","title":"S3PT: Scene Semantics and Structure Guided Clustering to Boost\n Self-Supervised Pre-Training for Autonomous Driving","summary":" Recent self-supervised clustering-based pre-training techniques like DINO and\nCribo have shown impressive results for downstream detection and segmentation\ntasks. However, real-world applications such as autonomous driving face\nchallenges with imbalanced object class and size distributions and complex\nscene geometries. In this paper, we propose S3PT a novel scene semantics and\nstructure guided clustering to provide more scene-consistent objectives for\nself-supervised training. Specifically, our contributions are threefold: First,\nwe incorporate semantic distribution consistent clustering to encourage better\nrepresentation of rare classes such as motorcycles or animals. Second, we\nintroduce object diversity consistent spatial clustering, to handle imbalanced\nand diverse object sizes, ranging from large background areas to small objects\nsuch as pedestrians and traffic signs. Third, we propose a depth-guided spatial\nclustering to regularize learning based on geometric information of the scene,\nthus further refining region separation on the feature level. Our learned\nrepresentations significantly improve performance in downstream semantic\nsegmentation and 3D object detection tasks on the nuScenes, nuImages, and\nCityscapes datasets and show promising domain translation properties.\n","authors":["Maciej K. Wozniak","Hariprasath Govindarajan","Marvin Klingner","Camille Maurice","Ravi Kiran","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2410.23085v1.pdf","comment":"Accepted for WACV 2025"},{"id":"http://arxiv.org/abs/2303.10465v2","updated":"2024-10-30T14:55:39Z","published":"2023-03-18T18:01:39Z","title":"Cognitive Load-based Affective Workload Allocation for Multi-human\n Multi-robot Teams","summary":" The interaction and collaboration between humans and multiple robots\nrepresent a novel field of research known as human multi-robot systems.\nAdequately designed systems within this field allow teams composed of both\nhumans and robots to work together effectively on tasks such as monitoring,\nexploration, and search and rescue operations. 
This paper presents a deep\nreinforcement learning-based affective workload allocation controller\nspecifically for multi-human multi-robot teams. The proposed controller can\ndynamically reallocate workloads based on the performance of the operators\nduring collaborative missions with multi-robot systems. The operators'\nperformances are evaluated through the scores of a self-reported questionnaire\n(i.e., subjective measurement) and the results of a deep learning-based\ncognitive workload prediction algorithm that uses physiological and behavioral\ndata (i.e., objective measurement). To evaluate the effectiveness of the\nproposed controller, we use a multi-human multi-robot CCTV monitoring task as\nan example and carry out comprehensive real-world experiments with 32 human\nsubjects for both quantitative measurement and qualitative analysis. Our\nresults demonstrate the performance and effectiveness of the proposed\ncontroller and highlight the importance of incorporating both subjective and\nobjective measurements of the operators' cognitive workload as well as seeking\nconsent for workload transitions, to enhance the performance of multi-human\nmulti-robot teams.\n","authors":["Wonse Jo","Ruiqi Wang","Baijian Yang","Dan Foti","Mo Rastgaar","Byung-Cheol Min"],"pdf_url":"https://arxiv.org/pdf/2303.10465v2.pdf","comment":"This paper is submitted and accepted to IEEE Transactions on\n Human-Machine Systems"},{"id":"http://arxiv.org/abs/2404.08563v2","updated":"2024-10-30T14:48:14Z","published":"2024-04-12T16:01:02Z","title":"FusionPortableV2: A Unified Multi-Sensor Dataset for Generalized SLAM\n Across Diverse Platforms and Scalable Environments","summary":" Simultaneous Localization and Mapping (SLAM) technology has been widely\napplied in various robotic scenarios, from rescue operations to autonomous\ndriving. However, the generalization of SLAM algorithms remains a significant\nchallenge, as current datasets often lack scalability in terms of platforms and\nenvironments. To address this limitation, we present FusionPortableV2, a\nmulti-sensor SLAM dataset featuring sensor diversity, varied motion patterns,\nand a wide range of environmental scenarios. Our dataset comprises $27$\nsequences, spanning over $2.5$ hours and collected from four distinct\nplatforms: a handheld suite, a legged robots, a unmanned ground vehicle (UGV),\nand a vehicle. These sequences cover diverse settings, including buildings,\ncampuses, and urban areas, with a total length of $38.7km$. Additionally, the\ndataset includes ground-truth (GT) trajectories and RGB point cloud maps\ncovering approximately $0.3km^2$. To validate the utility of our dataset in\nadvancing SLAM research, we assess several state-of-the-art (SOTA) SLAM\nalgorithms. Furthermore, we demonstrate the dataset's broad application beyond\ntraditional SLAM tasks by investigating its potential for monocular depth\nestimation. The complete dataset, including sensor data, GT, and calibration\ndetails, is accessible at\nhttps://fusionportable.github.io/dataset/fusionportable_v2.\n","authors":["Hexiang Wei","Jianhao Jiao","Xiangcheng Hu","Jingwen Yu","Xupeng Xie","Jin Wu","Yilong Zhu","Yuxuan Liu","Lujia Wang","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2404.08563v2.pdf","comment":"21 pages, 17 figures, 7 tables. 
Accepted by International Journal of\n Robotics Research (IJRR)"},{"id":"http://arxiv.org/abs/2410.23059v1","updated":"2024-10-30T14:33:22Z","published":"2024-10-30T14:33:22Z","title":"FilMBot: A High-Speed Soft Parallel Robotic Micromanipulator","summary":" Soft robotic manipulators are generally slow despite their great\nadaptability, resilience, and compliance. This limitation also extends to\ncurrent soft robotic micromanipulators. Here, we introduce FilMBot, a 3-DOF\nfilm-based, electromagnetically actuated, soft kinematic robotic\nmicromanipulator achieving speeds up to 2117 $\\deg$/s and 2456 $\\deg$/s in\n$\\alpha$ and $\\beta$ angular motions, with corresponding linear velocities of\n1.61 m/s and 1.92 m/s using a 4-cm needle end-effector, and 1.57 m/s along the\nZ axis. The robot can reach ~1.50 m/s in path-following tasks, operates at\nfrequencies up to 30 Hz, and remains functional up to 50 Hz. It demonstrates\nhigh precision (~6.3 $\\mu$m, or ~0.05% of its workspace) in small\npath-following tasks. The novel combination of the low-stiffness soft kinematic\nfilm structure and strong electromagnetic actuation in FilMBot opens new\navenues for soft robotics. Furthermore, its simple construction and\ninexpensive, readily accessible components could broaden the application of\nmicromanipulators beyond current academic and professional users.\n","authors":["Jiangkun Yu","Houari Bettahar","Hakan Kandemir","Quan Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.23059v1.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2410.11387v3","updated":"2024-10-30T14:31:25Z","published":"2024-10-15T08:24:05Z","title":"LLM2Swarm: Robot Swarms that Responsively Reason, Plan, and Collaborate\n through LLMs","summary":" Robot swarms are composed of many simple robots that communicate and\ncollaborate to fulfill complex tasks. Robot controllers usually need to be\nspecified by experts on a case-by-case basis via programming code. This process\nis time-consuming, prone to errors, and unable to take into account all\nsituations that may be encountered during deployment. On the other hand, recent\nLarge Language Models (LLMs) have demonstrated reasoning and planning\ncapabilities, introduced new ways to interact with and program machines, and\nincorporate both domain-specific and commonsense knowledge. Hence, we propose\nto address the aforementioned challenges by integrating LLMs with robot swarms\nand show the potential in proofs of concept (showcases). For this integration,\nwe explore two approaches. The first approach is 'indirect integration,' where\nLLMs are used to synthesize and validate the robot controllers. This approach\nmay reduce development time and human error before deployment. Moreover, during\ndeployment, it could be used for on-the-fly creation of new robot behaviors.\nThe second approach is 'direct integration,' where each robot locally executes\na separate LLM instance during deployment for robot-robot collaboration and\nhuman-swarm interaction. These local LLM instances enable each robot to reason,\nplan, and collaborate using natural language, as demonstrated in our showcases\nwhere the robots are able to detect a variety of anomalies, without prior\ninformation about the nature of these anomalies. 
To enable further research on\nour mainly conceptual contribution, we release the software and videos for our\nLLM2Swarm system: https://github.com/Pold87/LLM2Swarm.\n","authors":["Volker Strobel","Marco Dorigo","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2410.11387v3.pdf","comment":"Accepted at NeurIPS 2024 Workshop on Open-World Agents. Code:\n https://github.com/Pold87/LLM2Swarm/"},{"id":"http://arxiv.org/abs/2410.23049v1","updated":"2024-10-30T14:16:59Z","published":"2024-10-30T14:16:59Z","title":"TumblerBots: Tumbling Robotic sensors for Minimally-invasive Benthic\n Monitoring","summary":" Robotic systems show significant promise for water environmental sensing\napplications such as water quality monitoring, pollution mapping and\nbiodiversity data collection.\n Conventional deployment methods often disrupt fragile ecosystems, preventing\ndepiction of the undisturbed environmental condition. In response to this\nchallenge, we propose a novel framework utilizing a lightweight tumbler system\nequipped with a sensing unit, deployed via a drone. This design minimizes\ndisruption to the water habitat by maintaining a slow descent. The sensing unit\nis detached once on the water surface, enabling precise and non-invasive data\ncollection from the benthic zone.\n The tumbler is designed to be lightweight and compact, enabling deployment\nvia a drone. The sensing pod, which detaches from the tumbler and descends to\nthe bottom of the water body, is equipped with temperature and pressure\nsensors, as well as a buoyancy system. The later, activated upon task\ncompletion, utilizes a silicon membrane inflated via a chemical reaction. The\nreaction generates a pressure of 70 kPa, causing the silicon membrane to expand\nby 30\\%, which exceeds the 5.7\\% volume increase required for positive\nbuoyancy. The tumblers, made from ecofriendly materials to minimize\nenvironmental impact when lost during the mission, were tested for their\ngliding ratio and descent rate. They exhibit a low descent rate, in the range\nof 0.8 to 2.5 meters per seconds, which minimizes disturbance to the ecosystem\nupon water landing. Additionally, the system demonstrated robustness in\nmoderate to strong wind conditions during outdoor tests, validating the overall\nframework.\n","authors":["L. Romanello","A. Teboul","F. Wiesemuller","P. H. Nguyen","M. Kovac","S. F. Armanini"],"pdf_url":"https://arxiv.org/pdf/2410.23049v1.pdf","comment":"Submitted to IEEE Robosoft 2025"},{"id":"http://arxiv.org/abs/2304.00910v4","updated":"2024-10-30T14:08:46Z","published":"2023-04-03T11:57:10Z","title":"Integrating One-Shot View Planning with a Single Next-Best View via\n Long-Tail Multiview Sampling","summary":" Existing view planning systems either adopt an iterative paradigm using\nnext-best views (NBV) or a one-shot pipeline relying on the set-covering\nview-planning (SCVP) network. However, neither of these methods can\nconcurrently guarantee both high-quality and high-efficiency reconstruction of\n3D unknown objects. To tackle this challenge, we introduce a crucial\nhypothesis: with the availability of more information about the unknown object,\nthe prediction quality of the SCVP network improves. There are two ways to\nprovide extra information: (1) leveraging perception data obtained from NBVs,\nand (2) training on an expanded dataset of multiview inputs. In this work, we\nintroduce a novel combined pipeline that incorporates a single NBV before\nactivating the proposed multiview-activated (MA-)SCVP network. 
The MA-SCVP is\ntrained on a multiview dataset generated by our long-tail sampling method,\nwhich addresses the issue of unbalanced multiview inputs and enhances the\nnetwork performance. Extensive simulated experiments substantiate that our\nsystem demonstrates a significant surface coverage increase and a substantial\n45% reduction in movement cost compared to state-of-the-art systems. Real-world\nexperiments justify the capability of our system for generalization and\ndeployment.\n","authors":["Sicong Pan","Hao Hu","Hui Wei","Nils Dengler","Tobias Zaenker","Murad Dawood","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2304.00910v4.pdf","comment":"Accepted to IEEE Transactions on Robotics. Full appendices version"},{"id":"http://arxiv.org/abs/2410.23039v1","updated":"2024-10-30T14:06:51Z","published":"2024-10-30T14:06:51Z","title":"Neural Attention Field: Emerging Point Relevance in 3D Scenes for\n One-Shot Dexterous Grasping","summary":" One-shot transfer of dexterous grasps to novel scenes with object and context\nvariations has been a challenging problem. While distilled feature fields from\nlarge vision models have enabled semantic correspondences across 3D scenes,\ntheir features are point-based and restricted to object surfaces, limiting\ntheir capability of modeling complex semantic feature distributions for\nhand-object interactions. In this work, we propose the \\textit{neural attention\nfield} for representing semantic-aware dense feature fields in the 3D space by\nmodeling inter-point relevance instead of individual point features. Core to it\nis a transformer decoder that computes the cross-attention between any 3D query\npoint with all the scene points, and provides the query point feature with an\nattention-based aggregation. We further propose a self-supervised framework for\ntraining the transformer decoder from only a few 3D pointclouds without hand\ndemonstrations. Post-training, the attention field can be applied to novel\nscenes for semantics-aware dexterous grasping from one-shot demonstration.\nExperiments show that our method provides better optimization landscapes by\nencouraging the end-effector to focus on task-relevant scene regions, resulting\nin significant improvements in success rates on real robots compared with the\nfeature-field-based methods.\n","authors":["Qianxu Wang","Congyue Deng","Tyler Ga Wei Lum","Yuanpei Chen","Yaodong Yang","Jeannette Bohg","Yixin Zhu","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2410.23039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23033v1","updated":"2024-10-30T14:02:23Z","published":"2024-10-30T14:02:23Z","title":"Exploring the Potential of Multi-modal Sensing Framework for Forest\n Ecology","summary":" Forests offer essential resources and services to humanity, yet preserving\nand restoring them presents challenges, particularly due to the limited\navailability of actionable data, especially in hard-to-reach areas like forest\ncanopies. Accessibility continues to pose a challenge for biologists collecting\ndata in forest environments, often requiring them to invest significant time\nand energy in climbing trees to place sensors. This operation not only consumes\nresources but also exposes them to danger. Efforts in robotics have been\ndirected towards accessing the tree canopy using robots. 
A swarm of drones has\nshowcased autonomous navigation through the canopy, maneuvering with agility\nand evading tree collisions, all aimed at mapping the area and collecting data.\nHowever, relying solely on free-flying drones has proven insufficient for data\ncollection. Flying drones within the canopy generates loud noise, disturbing\nanimals and potentially corrupting the data. Additionally, commercial drones\noften have limited autonomy for dexterous tasks where aerial physical\ninteraction could be required, further complicating data acquisition efforts.\nAerial deployed sensor placement methods such as bio-gliders and sensor\nshooting have proven effective for data collection within the lower canopy.\nHowever, these methods face challenges related to retrieving the data and\nsensors, often necessitating human intervention.\n","authors":["Luca Romanello","Tian Lan","Mirko Kovac","Sophie F. Armanini","Basaran Bahadir Kocer"],"pdf_url":"https://arxiv.org/pdf/2410.23033v1.pdf","comment":"Peer-reviewed and accepted in IEEE ICRA 2024 Workshop RUNE"},{"id":"http://arxiv.org/abs/2410.23032v1","updated":"2024-10-30T14:02:15Z","published":"2024-10-30T14:02:15Z","title":"Camber-changing flapping hydrofoils for efficient and environmental-safe\n water propulsion system","summary":" This research introduces a novel hydrofoil-based propulsion framework for\nunmanned aquatic robots, inspired by the undulating locomotion observed in\nselect aquatic species. The proposed system incorporates a camber-modulating\nmechanism to enhance hydrofoil propulsive force generation and eventually\nefficiency. Through dynamic simulations, we validate the effectiveness of the\ncamber-adjusting hydrofoil compared to a symmetric counterpart. The results\ndemonstrate a significant improvement in horizontal thrust, emphasizing the\npotential of the cambering approach to enhance propulsive performance.\nAdditionally, a prototype flipper design is presented, featuring individual\ncontrol of heave and pitch motions, as well as a camber-adjustment mechanism.\nThe integrated system not only provides efficient water-based propulsion but\nalso offers the capacity for generating vertical forces during take-off\nmaneuvers for seaplanes. The design is tailored to harness wave energy,\ncontributing to the exploration of alternative energy resources. This work\nadvances the understanding of bionic oscillatory principles for aquatic robots\nand provides a foundation for future developments in environmentally safe and\nagile underwater exploration.\n","authors":["Luca Romanello","Leonard Hohaus","David-Marian Schmitt","Mirko Kovac","Sophie F. Armanini"],"pdf_url":"https://arxiv.org/pdf/2410.23032v1.pdf","comment":"Peer-reviewed and accepted in Ubiquitous Robots 2024, New York City"},{"id":"http://arxiv.org/abs/2410.23022v1","updated":"2024-10-30T13:52:43Z","published":"2024-10-30T13:52:43Z","title":"Online Intrinsic Rewards for Decision Making Agents from Large Language\n Model Feedback","summary":" Automatically synthesizing dense rewards from natural language descriptions\nis a promising paradigm in reinforcement learning (RL), with applications to\nsparse reward problems, open-ended exploration, and hierarchical skill design.\nRecent works have made promising steps by exploiting the prior knowledge of\nlarge language models (LLMs). 
However, these approaches suffer from important\nlimitations: they are either not scalable to problems requiring billions of\nenvironment samples; or are limited to reward functions expressible by compact\ncode, which may require source code and have difficulty capturing nuanced\nsemantics; or require a diverse offline dataset, which may not exist or be\nimpossible to collect. In this work, we address these limitations through a\ncombination of algorithmic and systems-level contributions. We propose ONI, a\ndistributed architecture that simultaneously learns an RL policy and an\nintrinsic reward function using LLM feedback. Our approach annotates the\nagent's collected experience via an asynchronous LLM server, which is then\ndistilled into an intrinsic reward model. We explore a range of algorithmic\nchoices for reward modeling with varying complexity, including hashing,\nclassification, and ranking models. By studying their relative tradeoffs, we\nshed light on questions regarding intrinsic reward design for sparse reward\nproblems. Our approach achieves state-of-the-art performance across a range of\nchallenging, sparse reward tasks from the NetHack Learning Environment in a\nsimple unified process, solely using the agent's gathered experience, without\nrequiring external datasets nor source code. We make our code available at\n\\url{URL} (coming soon).\n","authors":["Qinqing Zheng","Mikael Henaff","Amy Zhang","Aditya Grover","Brandon Amos"],"pdf_url":"https://arxiv.org/pdf/2410.23022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00552v4","updated":"2024-10-30T13:49:11Z","published":"2024-05-01T14:50:58Z","title":"Long-Term Human Trajectory Prediction using 3D Dynamic Scene Graphs","summary":" We present a novel approach for long-term human trajectory prediction in\nindoor human-centric environments, which is essential for long-horizon robot\nplanning in these environments. State-of-the-art human trajectory prediction\nmethods are limited by their focus on collision avoidance and short-term\nplanning, and their inability to model complex interactions of humans with the\nenvironment. In contrast, our approach overcomes these limitations by\npredicting sequences of human interactions with the environment and using this\ninformation to guide trajectory predictions over a horizon of up to 60s. We\nleverage Large Language Models (LLMs) to predict interactions with the\nenvironment by conditioning the LLM prediction on rich contextual information\nabout the scene. This information is given as a 3D Dynamic Scene Graph that\nencodes the geometry, semantics, and traversability of the environment into a\nhierarchical representation. We then ground these interaction sequences into\nmulti-modal spatio-temporal distributions over human positions using a\nprobabilistic approach based on continuous-time Markov Chains. To evaluate our\napproach, we introduce a new semi-synthetic dataset of long-term human\ntrajectories in complex indoor environments, which also includes annotations of\nhuman-object interactions. We show in thorough experimental evaluations that\nour approach achieves a 54% lower average negative log-likelihood and a 26.5%\nlower Best-of-20 displacement error compared to the best non-privileged (i.e.,\nevaluated in a zero-shot fashion on the dataset) baselines for a time horizon\nof 60s.\n","authors":["Nicolas Gorlo","Lukas Schmid","Luca Carlone"],"pdf_url":"https://arxiv.org/pdf/2405.00552v4.pdf","comment":"8 pages, 6 figures. 
Accepted at IEEE Robotics and Automation Letters\n (RA-L). Code released at: https://github.com/MIT-SPARK/LP2"},{"id":"http://arxiv.org/abs/2410.23004v1","updated":"2024-10-30T13:30:39Z","published":"2024-10-30T13:30:39Z","title":"DexGraspNet 2.0: Learning Generative Dexterous Grasping in Large-scale\n Synthetic Cluttered Scenes","summary":" Grasping in cluttered scenes remains highly challenging for dexterous hands\ndue to the scarcity of data. To address this problem, we present a large-scale\nsynthetic benchmark, encompassing 1319 objects, 8270 scenes, and 427 million\ngrasps. Beyond benchmarking, we also propose a novel two-stage grasping method\nthat learns efficiently from data by using a diffusion model that conditions on\nlocal geometry. Our proposed generative method outperforms all baselines in\nsimulation experiments. Furthermore, with the aid of test-time-depth\nrestoration, our method demonstrates zero-shot sim-to-real transfer, attaining\n90.7% real-world dexterous grasping success rate in cluttered scenes.\n","authors":["Jialiang Zhang","Haoran Liu","Danshi Li","Xinqiang Yu","Haoran Geng","Yufei Ding","Jiayi Chen","He Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22997v1","updated":"2024-10-30T13:22:55Z","published":"2024-10-30T13:22:55Z","title":"A Comparison of Prompt Engineering Techniques for Task Planning and\n Execution in Service Robotics","summary":" Recent advances in LLM have been instrumental in autonomous robot control and\nhuman-robot interaction by leveraging their vast general knowledge and\ncapabilities to understand and reason across a wide range of tasks and\nscenarios. Previous works have investigated various prompt engineering\ntechniques for improving the performance of \\glspl{LLM} to accomplish tasks,\nwhile others have proposed methods that utilize LLMs to plan and execute tasks\nbased on the available functionalities of a given robot platform. In this work,\nwe consider both lines of research by comparing prompt engineering techniques\nand combinations thereof within the application of high-level task planning and\nexecution in service robotics. We define a diverse set of tasks and a simple\nset of functionalities in simulation, and measure task completion accuracy and\nexecution time for several state-of-the-art models.\n","authors":["Jonas Bode","Bastian Pätzold","Raphael Memmesheimer","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2410.22997v1.pdf","comment":"6 pages, 3 figures, 2 tables, to be published in the 2024 IEEE-RAS\n International Conference on Humanoid Robots, We make our code, including all\n prompts, available at https://github.com/AIS-Bonn/Prompt_Engineering"},{"id":"http://arxiv.org/abs/2410.22982v1","updated":"2024-10-30T12:46:15Z","published":"2024-10-30T12:46:15Z","title":"PDSR: Efficient UAV Deployment for Swift and Accurate Post-Disaster\n Search and Rescue","summary":" This paper introduces a comprehensive framework for Post-Disaster Search and\nRescue (PDSR), aiming to optimize search and rescue operations leveraging\nUnmanned Aerial Vehicles (UAVs). The primary goal is to improve the precision\nand availability of sensing capabilities, particularly in various catastrophic\nscenarios. 
Central to this concept is the rapid deployment of UAV swarms\nequipped with diverse sensing, communication, and intelligence capabilities,\nfunctioning as an integrated system that incorporates multiple technologies and\napproaches for efficient detection of individuals buried beneath rubble or\ndebris following a disaster. Within this framework, we propose architectural\nsolution and address associated challenges to ensure optimal performance in\nreal-world disaster scenarios. The proposed framework aims to achieve complete\ncoverage of damaged areas significantly faster than traditional methods using a\nmulti-tier swarm architecture. Furthermore, integrating multi-modal sensing\ndata with machine learning for data fusion could enhance detection accuracy,\nensuring precise identification of survivors.\n","authors":["Alaa Awad Abdellatif","Ali Elmancy","Amr Mohamed","Ahmed Massoud","Wadha Lebda","Khalid K. Naji"],"pdf_url":"https://arxiv.org/pdf/2410.22982v1.pdf","comment":"This paper is currently under review at IEEE IoT Magazine"},{"id":"http://arxiv.org/abs/2410.22980v1","updated":"2024-10-30T12:45:12Z","published":"2024-10-30T12:45:12Z","title":"Efficient End-to-End 6-Dof Grasp Detection Framework for Edge Devices\n with Hierarchical Heatmaps and Feature Propagation","summary":" 6-DoF grasp detection is critically important for the advancement of\nintelligent embodied systems, as it provides feasible robot poses for object\ngrasping. Various methods have been proposed to detect 6-DoF grasps through the\nextraction of 3D geometric features from RGBD or point cloud data. However,\nmost of these approaches encounter challenges during real robot deployment due\nto their significant computational demands, which can be particularly\nproblematic for mobile robot platforms, especially those reliant on edge\ncomputing devices. This paper presents an Efficient End-to-End Grasp Detection\nNetwork (E3GNet) for 6-DoF grasp detection utilizing hierarchical heatmap\nrepresentations. E3GNet effectively identifies high-quality and diverse grasps\nin cluttered real-world environments. Benefiting from our end-to-end\nmethodology and efficient network design, our approach surpasses previous\nmethods in model inference efficiency and achieves real-time 6-Dof grasp\ndetection on edge devices. Furthermore, real-world experiments validate the\neffectiveness of our method, achieving a satisfactory 94% object grasping\nsuccess rate.\n","authors":["Kaiqin Yang. Yixiang Dai","Guijin Wang","Siang Chen"],"pdf_url":"https://arxiv.org/pdf/2410.22980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22931v1","updated":"2024-10-30T11:37:47Z","published":"2024-10-30T11:37:47Z","title":"GPTR: Gaussian Process Trajectory Representation for Continuous-Time\n Motion Estimation","summary":" Continuous-time trajectory representation has gained significant popularity\nin recent years, as it offers an elegant formulation that allows the fusion of\na larger number of sensors and sensing modalities, overcoming limitations of\ntraditional discrete-time frameworks. To bolster the adoption of the\ncontinuous-time paradigm, we propose a so-called Gaussian Process Trajectory\nRepresentation (GPTR) framework for continuous-time motion estimation (CTME)\ntasks. Our approach stands out by employing a third-order random jerk model,\nfeaturing closed-form expressions for both rotational and translational state\nderivatives. 
This model provides smooth, continuous trajectory representations\nthat are crucial for precise estimation of complex motion. To support the wider\nrobotics and computer vision communities, we have made the source code for GPTR\navailable as a light-weight header-only library. This format was chosen for its\nease of integration, allowing developers to incorporate GPTR into existing\nsystems without needing extensive code modifications. Moreover, we also provide\na set of optimization examples with LiDAR, camera, IMU, UWB factors, and\nclosed-form analytical Jacobians under the proposed GP framework. Our\nexperiments demonstrate the efficacy and efficiency of GP-based trajectory\nrepresentation in various motion estimation tasks, and the examples can serve\nas the prototype to help researchers quickly develop future applications such\nas batch optimization, calibration, sensor fusion, trajectory planning, etc.,\nwith continuous-time trajectory representation. Our project is accessible at\nhttps://github.com/brytsknguyen/gptr .\n","authors":["Thien-Minh Nguyen","Ziyu Cao","Kailai Li","Shenghai Yuan","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2410.22931v1.pdf","comment":"The source code has been released. All feedbacks are welcome"},{"id":"http://arxiv.org/abs/2410.22910v1","updated":"2024-10-30T11:06:43Z","published":"2024-10-30T11:06:43Z","title":"An Efficient Representation of Whole-body Model Predictive Control for\n Online Compliant Dual-arm Mobile Manipulation","summary":" Dual-arm mobile manipulators can transport and manipulate large-size objects\nwith simple end-effectors. To interact with dynamic environments with strict\nsafety and compliance requirements, achieving whole-body motion planning online\nwhile meeting various hard constraints for such highly redundant mobile\nmanipulators poses a significant challenge. We tackle this challenge by\npresenting an efficient representation of whole-body motion trajectories within\nour bilevel model-based predictive control (MPC) framework. We utilize\nB\\'ezier-curve parameterization to represent the optimized collision-free\ntrajectories of two collaborating end-effectors in the first MPC, facilitating\nfast long-horizon object-oriented motion planning in SE(3) while considering\napproximated feasibility constraints. This approach is further applied to\nparameterize whole-body trajectories in the second MPC for whole-body motion\ngeneration with predictive admittance control in a relatively short horizon\nwhile satisfying whole-body hard constraints. This representation enables two\nMPCs with continuous properties, thereby avoiding inaccurate model-state\ntransition and dense decision-variable settings in existing MPCs using the\ndiscretization method. It strengthens the online execution of the bilevel MPC\nframework in high-dimensional space and facilitates the generation of\nconsistent commands for our hybrid position/velocity-controlled robot. 
The\nsimulation comparisons and real-world experiments demonstrate the efficiency\nand robustness of this approach in various scenarios for static and dynamic\nobstacle avoidance, and compliant interaction control with the manipulated\nobject and external disturbances.\n","authors":["Wenqian Du","Ran Long","João Moura","Jiayi Wang","Saeid Samadi","Sethu Vijayakumar"],"pdf_url":"https://arxiv.org/pdf/2410.22910v1.pdf","comment":"Under Review for IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2410.22893v1","updated":"2024-10-30T10:40:57Z","published":"2024-10-30T10:40:57Z","title":"Human-inspired Grasping Strategies of Fresh Fruits and Vegetables\n Applied to Robotic Manipulation","summary":" Robotic manipulation of fresh fruits and vegetables, including the grasping\nof multiple loose items, has a strong industrial need but it still is a\nchallenging task for robotic manipulation. This paper outlines the distinctive\nmanipulation strategies used by humans to pick loose fruits and vegetables with\nthe aim to better adopt them for robotic manipulation of diverse items. In this\nwork we present a first version of a robotic setup designed to pick different\nsingle or multiple fresh items, featuring multi-fingered compliant robotic\ngripper. We analyse human grasping strategies from the perspective of\nindustrial Key Performance Indicators (KPIs) used in the logistic sector. The\nrobotic system was validated using the same KPIs, as well as taking into\naccount human performance and strategies. This paper lays the foundation for\nfuture development of the robotic demonstrator for fresh fruit and vegetable\nintelligent manipulation, and outlines the need for generic approaches to\nhandle the complexity of the task.\n","authors":["Romeo Orsolino","Mykhaylo Marfeychuk","Mariana de Paula Assis Fonseca","Mario Baggetta","Wesley Wimshurst","Francesco Porta","Morgan Clarke","Giovanni Berselli","Jelizaveta Konstantinova"],"pdf_url":"https://arxiv.org/pdf/2410.22893v1.pdf","comment":"*Authors contributed equally"},{"id":"http://arxiv.org/abs/2410.06613v2","updated":"2024-10-30T10:21:13Z","published":"2024-10-09T07:09:29Z","title":"ES-Gaussian: Gaussian Splatting Mapping via Error Space-Based Gaussian\n Completion","summary":" Accurate and affordable indoor 3D reconstruction is critical for effective\nrobot navigation and interaction. Traditional LiDAR-based mapping provides high\nprecision but is costly, heavy, and power-intensive, with limited ability for\nnovel view rendering. Vision-based mapping, while cost-effective and capable of\ncapturing visual data, often struggles with high-quality 3D reconstruction due\nto sparse point clouds. We propose ES-Gaussian, an end-to-end system using a\nlow-altitude camera and single-line LiDAR for high-quality 3D indoor\nreconstruction. Our system features Visual Error Construction (VEC) to enhance\nsparse point clouds by identifying and correcting areas with insufficient\ngeometric detail from 2D error maps. Additionally, we introduce a novel 3DGS\ninitialization method guided by single-line LiDAR, overcoming the limitations\nof traditional multi-view setups and enabling effective reconstruction in\nresource-constrained environments. Extensive experimental results on our new\nDreame-SR dataset and a publicly available dataset demonstrate that ES-Gaussian\noutperforms existing methods, particularly in challenging scenarios. 
The\nproject page is available at https://chenlu-china.github.io/ES-Gaussian/.\n","authors":["Lu Chen","Yingfu Zeng","Haoang Li","Zhitao Deng","Jiafu Yan","Zhenjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.06613v2.pdf","comment":"This preprint has been withdrawn due to concerns regarding the\n originality of certain technical elements, as well as its basis in a company\n project report that was intended solely for internal discussions. To avoid\n any potential misunderstandings, we have decided to withdraw this submission\n from public access. We apologize for any confusion this may have caused"},{"id":"http://arxiv.org/abs/2410.22848v1","updated":"2024-10-30T09:29:37Z","published":"2024-10-30T09:29:37Z","title":"Non-contact Dexterous Micromanipulation with Multiple Optoelectronic\n Robots","summary":" Micromanipulation systems leverage automation and robotic technologies to\nimprove the precision, repeatability, and efficiency of various tasks at the\nmicroscale. However, current approaches are typically limited to specific\nobjects or tasks, which necessitates the use of custom tools and specialized\ngrasping methods. This paper proposes a novel non-contact micromanipulation\nmethod based on optoelectronic technologies. The proposed method utilizes\nrepulsive dielectrophoretic forces generated in the optoelectronic field to\ndrive a microrobot, enabling the microrobot to push the target object in a\ncluttered environment without physical contact. The non-contact feature can\nminimize the risks of potential damage, contamination, or adhesion while\nlargely improving the flexibility of manipulation. The feature enables the use\nof a general tool for indirect object manipulation, eliminating the need for\nspecialized tools. A series of simulation studies and real-world experiments --\nincluding non-contact trajectory tracking, obstacle avoidance, and reciprocal\navoidance between multiple microrobots -- are conducted to validate the\nperformance of the proposed method. The proposed formulation provides a general\nand dexterous solution for a range of objects and tasks at the micro scale.\n","authors":["Yongyi Jia","Shu Miao","Ao Wang","Caiding Ni","Lin Feng","Xiaowo Wang","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2410.22848v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2410.22825v1","updated":"2024-10-30T09:04:45Z","published":"2024-10-30T09:04:45Z","title":"Grasping Force Estimation for Markerless Visuotactile Sensors","summary":" Tactile sensors have been used for force estimation in the past, especially\nVision-Based Tactile Sensors (VBTS) have recently become a new trend due to\ntheir high spatial resolution and low cost. In this work, we have designed and\nimplemented several approaches to estimate the normal grasping force using\ndifferent types of markerless visuotactile representations obtained from VBTS.\nOur main goal is to determine the most appropriate visuotactile representation,\nbased on a performance analysis during robotic grasping tasks. Our proposal has\nbeen tested on the dataset generated with our DIGIT sensors and another one\nobtained using GelSight Mini sensors from another state-of-the-art work. We\nhave also tested the generalization capabilities of our best approach, called\nRGBmod. The results led to two main conclusions. First, the RGB visuotactile\nrepresentation is a better input option than the depth image or a combination\nof the two for estimating normal grasping forces. 
Second, RGBmod achieved a\ngood performance when tested on 10 unseen everyday objects in real-world\nscenarios, achieving an average relative error of 0.125 +- 0.153. Furthermore,\nwe show that our proposal outperforms other works in the literature that use\nRGB and depth information for the same task.\n","authors":["Julio Castaño-Amoros","Pablo Gil"],"pdf_url":"https://arxiv.org/pdf/2410.22825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22816v1","updated":"2024-10-30T08:50:14Z","published":"2024-10-30T08:50:14Z","title":"Enhancing Tool Manipulation of An Aerial Vehicle with A Dynamically\n Displacing Center-of-Mass","summary":" As aerial robots gain traction in industrial applications, there is growing\ninterest in enhancing their physical interaction capabilities. Pushing tasks\nperformed by aerial manipulators have been successfully demonstrated in\ncontact-based inspections. However, more complex industrial applications\nrequire these systems to support higher-DoF (Degree of Freedom) manipulators\nand generate larger forces while pushing (e.g., drilling, grinding). This paper\nbuilds on our previous work, where we introduced an aerial vehicle with a\ndynamically displacing CoM (Center of Mass) to improve force exertion during\ninteractions. We propose a novel approach to further enhance this system's\nforce generation by optimizing its CoM location during interactions.\nAdditionally, we study the case of this aerial vehicle equipped with a 2-DoF\nmanipulation arm to extend the system's functionality in tool-based tasks. The\neffectiveness of the proposed methods is validated through simulations,\ndemonstrating the potential of this system for advanced aerial manipulation in\npractical settings.\n","authors":["Tong Hui","Matteo Fumagalli"],"pdf_url":"https://arxiv.org/pdf/2410.22816v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.01110"},{"id":"http://arxiv.org/abs/2208.02439v2","updated":"2024-10-30T07:34:58Z","published":"2022-08-04T04:11:36Z","title":"MPPI-IPDDP: Hybrid Method of Collision-Free Smooth Trajectory Generation\n for Autonomous Robots","summary":" This paper presents a hybrid trajectory optimization method designed to\ngenerate collision-free, smooth trajectories for autonomous mobile robots. By\ncombining sampling-based Model Predictive Path Integral (MPPI) control with\ngradient-based Interior-Point Differential Dynamic Programming (IPDDP), we\nleverage their respective strengths in exploration and smoothing. The proposed\nmethod, MPPI-IPDDP, involves three steps: First, MPPI control is used to\ngenerate a coarse trajectory. Second, a collision-free convex corridor is\nconstructed. Third, IPDDP is applied to smooth the coarse trajectory, utilizing\nthe collision-free corridor from the second step. To demonstrate the\neffectiveness of our approach, we apply the proposed algorithm to trajectory\noptimization for differential-drive wheeled mobile robots and point-mass\nquadrotors. In comparisons with other MPPI variants and continuous\noptimization-based solvers, our method shows superior performance in terms of\ncomputational robustness and trajectory smoothness.\n Code: https://github.com/i-ASL/mppi-ipddp Video: https://youtu.be/-oUAt5sd9Bk\n","authors":["Min-Gyeom Kim","Minchan Jung","JunGee Hong","Kwang-Ki K. 
Kim"],"pdf_url":"https://arxiv.org/pdf/2208.02439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22752v1","updated":"2024-10-30T07:18:00Z","published":"2024-10-30T07:18:00Z","title":"SoftCTRL: Soft conservative KL-control of Transformer Reinforcement\n Learning for Autonomous Driving","summary":" In recent years, motion planning for urban self-driving cars (SDV) has become\na popular problem due to its complex interaction of road components. To tackle\nthis, many methods have relied on large-scale, human-sampled data processed\nthrough Imitation learning (IL). Although effective, IL alone cannot adequately\nhandle safety and reliability concerns. Combining IL with Reinforcement\nlearning (RL) by adding KL divergence between RL and IL policy to the RL loss\ncan alleviate IL's weakness but suffer from over-conservation caused by\ncovariate shift of IL. To address this limitation, we introduce a method that\ncombines IL with RL using an implicit entropy-KL control that offers a simple\nway to reduce the over-conservation characteristic. In particular, we validate\ndifferent challenging simulated urban scenarios from the unseen dataset,\nindicating that although IL can perform well in imitation tasks, our proposed\nmethod significantly improves robustness (over 17\\% reduction in failures) and\ngenerates human-like driving behavior.\n","authors":["Minh Tri Huynh","Duc Dung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2410.22752v1.pdf","comment":"submitted to IEEE Open Journal of Intelligent Transportation Systems"},{"id":"http://arxiv.org/abs/2401.01881v2","updated":"2024-10-30T05:47:26Z","published":"2024-01-03T18:42:22Z","title":"Robust Control Barrier Functions using Uncertainty Estimation with\n Application to Mobile Robots","summary":" This paper proposes a safety-critical control design approach for nonlinear\ncontrol affine systems in the presence of matched and unmatched uncertainties.\nOur constructive framework couples control barrier function (CBF) theory with a\nnew uncertainty estimator to ensure robust safety. The estimated uncertainty\nwith a derived upper bound on the estimation error is used for synthesizing\nCBFs and safety-critical controllers via a quadratic program-based feedback\ncontrol law that rigorously ensures robust safety while improving disturbance\nrejection performance. The method is extended to higher-order CBFs (HOCBFs) to\nachieve safety under unmatched uncertainty, which may cause relative degree\ndifferences with respect to control input and disturbances. We assume the\nrelative degree difference is at most one, resulting in a second-order cone\nconstraint. The proposed robust HOCBF method is demonstrated via a simulation\nof an uncertain elastic actuator control problem. Finally, we experimentally\ndemonstrated the efficacy of our robust CBF framework on a tracked robot with\nslope-induced matched and unmatched perturbations.\n","authors":["Ersin Das","Joel W. Burdick"],"pdf_url":"https://arxiv.org/pdf/2401.01881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22707v1","updated":"2024-10-30T05:34:52Z","published":"2024-10-30T05:34:52Z","title":"Robotic State Recognition with Image-to-Text Retrieval Task of\n Pre-Trained Vision-Language Model and Black-Box Optimization","summary":" State recognition of the environment and objects, such as the open/closed\nstate of doors and the on/off of lights, is indispensable for robots that\nperform daily life support and security tasks. 
Until now, state recognition\nmethods have been based on training neural networks from manual annotations,\npreparing special sensors for the recognition, or manually programming to\nextract features from point clouds or raw images. In contrast, we propose a\nrobotic state recognition method using a pre-trained vision-language model,\nwhich is capable of Image-to-Text Retrieval (ITR) tasks. We prepare several\nkinds of language prompts in advance, calculate the similarity between these\nprompts and the current image by ITR, and perform state recognition. By\napplying the optimal weighting to each prompt using black-box optimization,\nstate recognition can be performed with higher accuracy. Experiments show that\nthis theory enables a variety of state recognitions by simply preparing\nmultiple prompts without retraining neural networks or manual programming. In\naddition, since only prompts and their weights need to be prepared for each\nrecognizer, there is no need to prepare multiple models, which facilitates\nresource management. It is possible to recognize the open/closed state of\ntransparent doors, the state of whether water is running or not from a faucet,\nand even the qualitative state of whether a kitchen is clean or not, which have\nbeen challenging so far, through language.\n","authors":["Kento Kawaharazuka","Yoshiki Obinata","Naoaki Kanazawa","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2410.22707v1.pdf","comment":"Accepted at Humanoids2024"},{"id":"http://arxiv.org/abs/2410.22691v1","updated":"2024-10-30T04:54:14Z","published":"2024-10-30T04:54:14Z","title":"MiniTac: An Ultra-Compact 8 mm Vision-Based Tactile Sensor for Enhanced\n Palpation in Robot-Assisted Minimally Invasive Surgery","summary":" Robot-assisted minimally invasive surgery (RAMIS) provides substantial\nbenefits over traditional open and laparoscopic methods. However, a significant\nlimitation of RAMIS is the surgeon's inability to palpate tissues, a crucial\ntechnique for examining tissue properties and detecting abnormalities,\nrestricting the widespread adoption of RAMIS. To overcome this obstacle, we\nintroduce MiniTac, a novel vision-based tactile sensor with an ultra-compact\ncross-sectional diameter of 8 mm, designed for seamless integration into\nmainstream RAMIS devices, particularly the Da Vinci surgical systems. MiniTac\nfeatures a novel mechanoresponsive photonic elastomer membrane that changes\ncolor distribution under varying contact pressures. This color change is\ncaptured by an embedded miniature camera, allowing MiniTac to detect tumors\nboth on the tissue surface and in deeper layers typically obscured from\nendoscopic view. MiniTac's efficacy has been rigorously tested on both phantoms\nand ex-vivo tissues. 
By leveraging advanced mechanoresponsive photonic\nmaterials, MiniTac represents a significant advancement in integrating tactile\nsensing into RAMIS, potentially expanding its applicability to a wider array of\nclinical scenarios that currently rely on traditional surgical approaches.\n","authors":["Wanlin Li","Zihang Zhao","Leiyao Cui","Weiyi Zhang","Hangxin Liu","Li-An Li","Yixin Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.22691v1.pdf","comment":"accepted for publication in the IEEE Robotics and Automation Letters\n (RA-L)"},{"id":"http://arxiv.org/abs/2410.22689v1","updated":"2024-10-30T04:49:39Z","published":"2024-10-30T04:49:39Z","title":"Multi-Task Interactive Robot Fleet Learning with Visual World Models","summary":" Recent advancements in large-scale multi-task robot learning offer the\npotential for deploying robot fleets in household and industrial settings,\nenabling them to perform diverse tasks across various environments. However,\nAI-enabled robots often face challenges with generalization and robustness when\nexposed to real-world variability and uncertainty. We introduce Sirius-Fleet, a\nmulti-task interactive robot fleet learning framework to address these\nchallenges. Sirius-Fleet monitors robot performance during deployment and\ninvolves humans to correct the robot's actions when necessary. We employ a\nvisual world model to predict the outcomes of future actions and build anomaly\npredictors to predict whether they will likely result in anomalies. As the\nrobot autonomy improves, the anomaly predictors automatically adapt their\nprediction criteria, leading to fewer requests for human intervention and\ngradually reducing human workload over time. Evaluations on large-scale\nbenchmarks demonstrate Sirius-Fleet's effectiveness in improving multi-task\npolicy performance and monitoring accuracy. We demonstrate Sirius-Fleet's\nperformance in both RoboCasa in simulation and Mutex in the real world, two\ndiverse, large-scale multi-task benchmarks. More information is available on\nthe project website: https://ut-austin-rpl.github.io/sirius-fleet\n","authors":["Huihan Liu","Yu Zhang","Vaarij Betala","Evan Zhang","James Liu","Crystal Ding","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.22689v1.pdf","comment":"In Proceedings of CoRL 2024"},{"id":"http://arxiv.org/abs/2410.22672v1","updated":"2024-10-30T03:49:53Z","published":"2024-10-30T03:49:53Z","title":"IM-GIV: an effective integrity monitoring scheme for tightly-coupled\n GNSS/INS/Vision integration based on factor graph optimization","summary":" Global Navigation Satellite System/Inertial Navigation System\n(GNSS/INS)/Vision integration based on factor graph optimization (FGO) has\nrecently attracted extensive attention in navigation and robotics community.\nIntegrity monitoring (IM) capability is required when FGO-based integrated\nnavigation system is used for safety-critical applications. However,\ntraditional researches on IM of integrated navigation system are mostly based\non Kalman filter. It is urgent to develop effective IM scheme for FGO-based\nGNSS/INS/Vision integration. In this contribution, the position error bounding\nformula to ensure the integrity of the GNSS/INS/Vision integration based on FGO\nis designed and validated for the first time. It can be calculated by the\nlinearized equations from the residuals of GNSS pseudo-range, IMU\npre-integration and visual measurements. The specific position error bounding\nis given in the case of GNSS, INS and visual measurement faults. 
Field\nexperiments were conducted to evaluate and validate the performance of the\nproposed position error bounding. Experimental results demonstrate that the\nproposed position error bounding for the GNSS/INS/Vision integration based on\nFGO can correctly fit the position error against different fault modes, and the\navailability of integrity in six fault modes is 100% after correct and timely\nfault exclusion.\n","authors":["Yunong Tian","Tuan Li","Haitao Jiang","Zhipeng Wang","Chuang Shi"],"pdf_url":"https://arxiv.org/pdf/2410.22672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22325v2","updated":"2024-10-30T03:33:08Z","published":"2024-10-29T17:58:13Z","title":"Robots Pre-train Robots: Manipulation-Centric Robotic Representation\n from Large-Scale Robot Datasets","summary":" The pre-training of visual representations has enhanced the efficiency of\nrobot learning. Due to the lack of large-scale in-domain robotic datasets,\nprior works utilize in-the-wild human videos to pre-train robotic visual\nrepresentation. Despite their promising results, representations from human\nvideos are inevitably subject to distribution shifts and lack the dynamics\ninformation crucial for task completion. We first evaluate various pre-trained\nrepresentations in terms of their correlation to the downstream robotic\nmanipulation tasks (i.e., manipulation centricity). Interestingly, we find that\nthe \"manipulation centricity\" is a strong indicator of success rates when\napplied to downstream tasks. Drawing from these findings, we propose\nManipulation Centric Representation (MCR), a foundation representation learning\nframework capturing both visual features and the dynamics information such as\nactions and proprioceptions of manipulation tasks to improve manipulation\ncentricity. Specifically, we pre-train a visual encoder on the DROID robotic\ndataset and leverage motion-relevant data such as robot proprioceptive states\nand actions. We introduce a novel contrastive loss that aligns visual\nobservations with the robot's proprioceptive state-action dynamics, combined\nwith a behavior cloning (BC)-like actor loss to predict actions during\npre-training, along with a time contrastive loss. Empirical results across 4\nsimulation domains with 20 tasks verify that MCR outperforms the strongest\nbaseline method by 14.8%. Moreover, MCR boosts the performance of\ndata-efficient learning with a UR5e arm on 3 real-world tasks by 76.9%. Project\nwebsite: https://robots-pretrain-robots.github.io/.\n","authors":["Guangqi Jiang","Yifei Sun","Tao Huang","Huanyu Li","Yongyuan Liang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2410.22325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22662v1","updated":"2024-10-30T03:20:01Z","published":"2024-10-30T03:20:01Z","title":"$\\textbf{EMOS}$: $\\textbf{E}$mbodiment-aware Heterogeneous\n $\\textbf{M}$ulti-robot $\\textbf{O}$perating $\\textbf{S}$ystem with LLM Agents","summary":" Heterogeneous multi-robot systems (HMRS) have emerged as a powerful approach\nfor tackling complex tasks that single robots cannot manage alone. Current\nlarge-language-model-based multi-agent systems (LLM-based MAS) have shown\nsuccess in areas like software development and operating systems, but applying\nthese systems to robot control presents unique challenges. In particular, the\ncapabilities of each agent in a multi-robot system are inherently tied to the\nphysical composition of the robots, rather than predefined roles. 
To address\nthis issue, we introduce a novel multi-agent framework designed to enable\neffective collaboration among heterogeneous robots with varying embodiments and\ncapabilities, along with a new benchmark named Habitat-MAS. One of our key\ndesigns is $\\textit{Robot Resume}$: Instead of adopting human-designed role\nplay, we propose a self-prompted approach, where agents comprehend robot URDF\nfiles and call robot kinematics tools to generate descriptions of their physics\ncapabilities to guide their behavior in task planning and action execution. The\nHabitat-MAS benchmark is designed to assess how a multi-agent framework handles\ntasks that require embodiment-aware reasoning, which includes 1) manipulation,\n2) perception, 3) navigation, and 4) comprehensive multi-floor object\nrearrangement. The experimental results indicate that the robot's resume and\nthe hierarchical design of our multi-agent system are essential for the\neffective operation of the heterogeneous multi-robot system within this\nintricate problem context.\n","authors":["Junting Chen","Checheng Yu","Xunzhe Zhou","Tianqi Xu","Yao Mu","Mengkang Hu","Wenqi Shao","Yikai Wang","Guohao Li","Lin Shao"],"pdf_url":"https://arxiv.org/pdf/2410.22662v1.pdf","comment":"10 pages of main content, 3 pages of references, 5 pages of appendix,\n 7 figures in total"},{"id":"http://arxiv.org/abs/2406.14558v3","updated":"2024-10-30T02:58:10Z","published":"2024-06-20T17:59:22Z","title":"CooHOI: Learning Cooperative Human-Object Interaction with Manipulated\n Object Dynamics","summary":" Enabling humanoid robots to clean rooms has long been a pursued dream within\nhumanoid research communities. However, many tasks require multi-humanoid\ncollaboration, such as carrying large and heavy furniture together. Given the\nscarcity of motion capture data on multi-humanoid collaboration and the\nefficiency challenges associated with multi-agent learning, these tasks cannot\nbe straightforwardly addressed using training paradigms designed for\nsingle-agent scenarios. In this paper, we introduce Cooperative Human-Object\nInteraction (CooHOI), a framework designed to tackle the challenge of\nmulti-humanoid object transportation problem through a two-phase learning\nparadigm: individual skill learning and subsequent policy transfer. First, a\nsingle humanoid character learns to interact with objects through imitation\nlearning from human motion priors. Then, the humanoid learns to collaborate\nwith others by considering the shared dynamics of the manipulated object using\ncentralized training and decentralized execution (CTDE) multi-agent RL\nalgorithms. When one agent interacts with the object, resulting in specific\nobject dynamics changes, the other agents learn to respond appropriately,\nthereby achieving implicit communication and coordination between teammates.\nUnlike previous approaches that relied on tracking-based methods for\nmulti-humanoid HOI, CooHOI is inherently efficient, does not depend on motion\ncapture data of multi-humanoid interactions, and can be seamlessly extended to\ninclude more participants and a wide range of object types.\n","authors":["Jiawei Gao","Ziqin Wang","Zeqi Xiao","Jingbo Wang","Tai Wang","Jinkun Cao","Xiaolin Hu","Si Liu","Jifeng Dai","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2406.14558v3.pdf","comment":"Project website: https://gao-jiawei.com/Research/CooHOI/. 
NeurIPS\n 2024 Spotlight"},{"id":"http://arxiv.org/abs/2404.00282v3","updated":"2024-10-30T02:22:46Z","published":"2024-03-30T08:28:08Z","title":"Survey on Large Language Model-Enhanced Reinforcement Learning: Concept,\n Taxonomy, and Methods","summary":" With extensive pre-trained knowledge and high-level general capabilities,\nlarge language models (LLMs) emerge as a promising avenue to augment\nreinforcement learning (RL) in aspects such as multi-task learning, sample\nefficiency, and high-level task planning. In this survey, we provide a\ncomprehensive review of the existing literature in LLM-enhanced RL and\nsummarize its characteristics compared to conventional RL methods, aiming to\nclarify the research scope and directions for future studies. Utilizing the\nclassical agent-environment interaction paradigm, we propose a structured\ntaxonomy to systematically categorize LLMs' functionalities in RL, including\nfour roles: information processor, reward designer, decision-maker, and\ngenerator. For each role, we summarize the methodologies, analyze the specific\nRL challenges that are mitigated, and provide insights into future directions.\nLastly, a comparative analysis of each role, potential applications,\nprospective opportunities, and challenges of the LLM-enhanced RL are discussed.\nBy proposing this taxonomy, we aim to provide a framework for researchers to\neffectively leverage LLMs in the RL field, potentially accelerating RL\napplications in complex applications such as robotics, autonomous driving, and\nenergy systems.\n","authors":["Yuji Cao","Huan Zhao","Yuheng Cheng","Ting Shu","Yue Chen","Guolong Liu","Gaoqi Liang","Junhua Zhao","Jinyue Yan","Yun Li"],"pdf_url":"https://arxiv.org/pdf/2404.00282v3.pdf","comment":"22 pages (including bibliography), 6 figures"},{"id":"http://arxiv.org/abs/2410.22643v1","updated":"2024-10-30T02:15:37Z","published":"2024-10-30T02:15:37Z","title":"An Overtaking Trajectory Planning Framework Based on Spatio-temporal\n Topology and Reachable Set Analysis Ensuring Time Efficiency","summary":" Generating overtaking trajectories in high-speed scenarios presents\nsignificant challenges and is typically addressed through hierarchical planning\nmethods. However, this method has two primary drawbacks. First, heuristic\nalgorithms can only provide a single initial solution, which may lead to local\noptima and consequently diminish the quality of the solution. Second, the time\nefficiency of trajectory refinement based on numerical optimization is\ninsufficient. To overcome these limitations, this paper proposes an overtaking\ntrajectory planning framework based on spatio-temporal topology and reachable\nset analysis (SROP), to improve trajectory quality and time efficiency.\nSpecifically, this paper introduces topological classes to describe\ntrajectories representing different overtaking behaviors, which support the\nspatio-temporal topological search method employed by the upper-layer planner\nto identify diverse initial paths. This approach helps prevent getting stuck in\nlocal optima, enhancing the overall solution quality by considering multiple\ninitial solutions from distinct topologies. Moreover, the reachable set method\nis integrated into the lower-layer planner for parallel trajectory evaluation.\nThis method enhances planning efficiency by decoupling vehicle model\nconstraints from the optimization process, enabling parallel computation while\nensuring control feasibility. 
Simulation results show that the proposed method\nimproves the smoothness of generated trajectories by 66.8% compared to\nstate-of-the-art methods, highlighting its effectiveness in enhancing\ntrajectory quality. Additionally, this method reduces computation time by\n62.9%, demonstrating its efficiency.\n","authors":["Wule Mao","Zhouheng Li","Lei Xie","Hongye Su"],"pdf_url":"https://arxiv.org/pdf/2410.22643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21736v2","updated":"2024-10-30T01:35:32Z","published":"2024-10-29T04:50:34Z","title":"Enhancing Safety and Robustness of Vision-Based Controllers via\n Reachability Analysis","summary":" Autonomous systems, such as self-driving cars and drones, have made\nsignificant strides in recent years by leveraging visual inputs and machine\nlearning for decision-making and control. Despite their impressive performance,\nthese vision-based controllers can make erroneous predictions when faced with\nnovel or out-of-distribution inputs. Such errors can cascade into catastrophic\nsystem failures and compromise system safety. In this work, we compute Neural\nReachable Tubes, which act as parameterized approximations of Backward\nReachable Tubes to stress-test the vision-based controllers and mine their\nfailure modes. The identified failures are then used to enhance the system\nsafety through both offline and online methods. The online approach involves\ntraining a classifier as a run-time failure monitor to detect closed-loop,\nsystem-level failures, subsequently triggering a fallback controller that\nrobustly handles these detected failures to preserve system safety. For the\noffline approach, we improve the original controller via incremental training\nusing a carefully augmented failure dataset, resulting in a more robust\ncontroller that is resistant to the known failure modes. In either approach,\nthe system is safeguarded against shortcomings that transcend the vision-based\ncontroller and pertain to the closed-loop safety of the overall system. We\nvalidate the proposed approaches on an autonomous aircraft taxiing task that\ninvolves using a vision-based controller to guide the aircraft towards the\ncenterline of the runway. Our results show the efficacy of the proposed\nalgorithms in identifying and handling system-level failures, outperforming\nmethods that rely on controller prediction error or uncertainty quantification\nfor identifying system failures.\n","authors":["Kaustav Chakraborty","Aryaman Gupta","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2410.21736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23516v1","updated":"2024-10-30T23:57:49Z","published":"2024-10-30T23:57:49Z","title":"NUSense: Robust Soft Optical Tactile Sensor","summary":" While most tactile sensors rely on measuring pressure, insights from\ncontinuum mechanics suggest that measuring shear strain provides critical\ninformation for tactile sensing. In this work, we introduce an optical tactile\nsensing principle based on shear strain detection. A silicone rubber layer,\ndyed with color inks, is used to quantify the shear magnitude of the sensing\nlayer. This principle was validated using the NUSense camera-based tactile\nsensor. The wide-angle camera captures the elongation of the soft pad under\nmechanical load, a phenomenon attributed to the Poisson effect. The physical\nand optical properties of the inked pad are essential and should ideally remain\nstable over time. 
We tested the robustness of the sensor by subjecting the\noutermost layer to multiple load cycles using a robot arm. Additionally, we\ndiscussed potential applications of this sensor in force sensing and contact\nlocalization.\n","authors":["Madina Yergibay","Tleukhan Mussin","Saltanat Seitzhan","Daryn Kenzhebek","Zhanat Kappassov","Harold Soh","Tasbolat Taunyazov"],"pdf_url":"https://arxiv.org/pdf/2410.23516v1.pdf","comment":"Madina Yergibay and Tleukhan Mussin contributed equally. 6 pages, 6\n figures"},{"id":"http://arxiv.org/abs/2309.04459v2","updated":"2024-10-30T23:45:17Z","published":"2023-09-08T17:37:05Z","title":"Subwords as Skills: Tokenization for Sparse-Reward Reinforcement\n Learning","summary":" Exploration in sparse-reward reinforcement learning is difficult due to the\nrequirement of long, coordinated sequences of actions in order to achieve any\nreward. Moreover, in continuous action spaces there are an infinite number of\npossible actions, which only increases the difficulty of exploration. One class\nof methods designed to address these issues forms temporally extended actions,\noften called skills, from interaction data collected in the same domain, and\noptimizes a policy on top of this new action space. Typically such methods\nrequire a lengthy pretraining phase, especially in continuous action spaces, in\norder to form the skills before reinforcement learning can begin. Given prior\nevidence that the full range of the continuous action space is not required in\nsuch tasks, we propose a novel approach to skill-generation with two\ncomponents. First we discretize the action space through clustering, and second\nwe leverage a tokenization technique borrowed from natural language processing\nto generate temporally extended actions. Such a method outperforms baselines\nfor skill-generation in several challenging sparse-reward domains, and requires\norders-of-magnitude less computation in skill-generation and online rollouts.\nOur code is available at \\url{https://github.com/dyunis/subwords_as_skills}.\n","authors":["David Yunis","Justin Jung","Falcon Dai","Matthew Walter"],"pdf_url":"https://arxiv.org/pdf/2309.04459v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23488v1","updated":"2024-10-30T22:43:47Z","published":"2024-10-30T22:43:47Z","title":"PACER: Preference-conditioned All-terrain Costmap Generation","summary":" In autonomous robot navigation, terrain cost assignment is typically\nperformed using a semantics-based paradigm in which terrain is first labeled\nusing a pre-trained semantic classifier and costs are then assigned according\nto a user-defined mapping between label and cost. While this approach is\nrapidly adaptable to changing user preferences, only preferences over the types\nof terrain that are already known by the semantic classifier can be expressed.\nIn this paper, we hypothesize that a machine-learning-based alternative to the\nsemantics-based paradigm above will allow for rapid cost assignment adaptation\nto preferences expressed over new terrains at deployment time without the need\nfor additional training. To investigate this hypothesis, we introduce and study\nPACER, a novel approach to costmap generation that accepts as input a single\nbirds-eye view (BEV) image of the surrounding area along with a user-specified\npreference context and generates a corresponding BEV costmap that aligns with\nthe preference context. 
Using both real and synthetic data along with a\ncombination of proposed training tasks, we find that PACER is able to adapt\nquickly to new user preferences while also exhibiting better generalization to\nnovel terrains compared to both semantics-based and representation-learning\napproaches.\n","authors":["Luisa Mao","Garrett Warnell","Peter Stone","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2410.23488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23464v1","updated":"2024-10-30T21:09:58Z","published":"2024-10-30T21:09:58Z","title":"Design and Motion Analysis of a Reconfigurable Pendulum-Based Rolling\n Disk Robot with Magnetic Coupling","summary":" Reconfigurable robots are at the forefront of robotics innovation due to\ntheir unmatched versatility and adaptability in addressing various tasks\nthrough collaborative operations. This paper explores the design and\nimplementation of a novel pendulum-based magnetic coupling system within a\nreconfigurable disk robot. Diverging from traditional designs, this system\nemphasizes enhancing coupling strength while maintaining the compactness of the\nouter shell. We employ parametric optimization techniques, including magnetic\narray simulations, to improve coupling performance. Additionally, we conduct a\ncomprehensive analysis of the rolling robot's motion to assess its operational\neffectiveness in the coupling mechanism. This examination reveals intriguing\nnew motion patterns driven by frictional and sliding effects between the\nrolling disk modules and the ground. Furthermore, the new setup introduces a\nnovel problem in the area of nonprehensile manipulation.\n","authors":["Ollie Wiltshire","Seyed Amir Tafrishi"],"pdf_url":"https://arxiv.org/pdf/2410.23464v1.pdf","comment":"Accepted to TAROS 2024"},{"id":"http://arxiv.org/abs/2410.23450v1","updated":"2024-10-30T20:46:26Z","published":"2024-10-30T20:46:26Z","title":"Return Augmented Decision Transformer for Off-Dynamics Reinforcement\n Learning","summary":" We study offline off-dynamics reinforcement learning (RL) to utilize data\nfrom an easily accessible source domain to enhance policy learning in a target\ndomain with limited data. Our approach centers on return-conditioned supervised\nlearning (RCSL), particularly focusing on the decision transformer (DT), which\ncan predict actions conditioned on desired return guidance and complete\ntrajectory history. Previous works tackle the dynamics shift problem by\naugmenting the reward in the trajectory from the source domain to match the\noptimal trajectory in the target domain. However, this strategy can not be\ndirectly applicable in RCSL owing to (1) the unique form of the RCSL policy\nclass, which explicitly depends on the return, and (2) the absence of a\nstraightforward representation of the optimal trajectory distribution. We\npropose the Return Augmented Decision Transformer (RADT) method, where we\naugment the return in the source domain by aligning its distribution with that\nin the target domain. We provide the theoretical analysis demonstrating that\nthe RCSL policy learned from RADT achieves the same level of suboptimality as\nwould be obtained without a dynamics shift. We introduce two practical\nimplementations RADT-DARA and RADT-MV respectively. 
Extensive experiments\nconducted on D4RL datasets reveal that our methods generally outperform dynamic\nprogramming based methods in off-dynamics RL scenarios.\n","authors":["Ruhan Wang","Yu Yang","Zhishuai Liu","Dongruo Zhou","Pan Xu"],"pdf_url":"https://arxiv.org/pdf/2410.23450v1.pdf","comment":"26 pages, 10 tables, 10 figures"},{"id":"http://arxiv.org/abs/2406.18043v2","updated":"2024-10-30T20:16:18Z","published":"2024-06-26T03:41:48Z","title":"GenRL: Multimodal-foundation world models for generalization in embodied\n agents","summary":" Learning generalist embodied agents, able to solve multitudes of tasks in\ndifferent domains is a long-standing problem. Reinforcement learning (RL) is\nhard to scale up as it requires a complex reward design for each task. In\ncontrast, language can specify tasks in a more natural way. Current foundation\nvision-language models (VLMs) generally require fine-tuning or other\nadaptations to be adopted in embodied contexts, due to the significant domain\ngap. However, the lack of multimodal data in such domains represents an\nobstacle to developing foundation models for embodied applications. In this\nwork, we overcome these problems by presenting multimodal-foundation world\nmodels, able to connect and align the representation of foundation VLMs with\nthe latent space of generative world models for RL, without any language\nannotations. The resulting agent learning framework, GenRL, allows one to\nspecify tasks through vision and/or language prompts, ground them in the\nembodied domain's dynamics, and learn the corresponding behaviors in\nimagination. As assessed through large-scale multi-task benchmarking in\nlocomotion and manipulation domains, GenRL enables multi-task generalization\nfrom language and visual prompts. Furthermore, by introducing a data-free\npolicy learning strategy, our approach lays the groundwork for foundational\npolicy learning using generative world models. Website, code and data:\nhttps://mazpie.github.io/genrl/\n","authors":["Pietro Mazzaglia","Tim Verbelen","Bart Dhoedt","Aaron Courville","Sai Rajeswar"],"pdf_url":"https://arxiv.org/pdf/2406.18043v2.pdf","comment":"Presented at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23428v1","updated":"2024-10-30T20:13:40Z","published":"2024-10-30T20:13:40Z","title":"Learning for Deformable Linear Object Insertion Leveraging Flexibility\n Estimation from Visual Cues","summary":" Manipulation of deformable Linear objects (DLOs), including iron wire,\nrubber, silk, and nylon rope, is ubiquitous in daily life. These objects\nexhibit diverse physical properties, such as Young$'$s modulus and bending\nstiffness.Such diversity poses challenges for developing generalized\nmanipulation policies. However, previous research limited their scope to\nsingle-material DLOs and engaged in time-consuming data collection for the\nstate estimation. In this paper, we propose a two-stage manipulation approach\nconsisting of a material property (e.g., flexibility) estimation and policy\nlearning for DLO insertion with reinforcement learning. Firstly, we design a\nflexibility estimation scheme that characterizes the properties of different\ntypes of DLOs. The ground truth flexibility data is collected in simulation to\ntrain our flexibility estimation module. During the manipulation, the robot\ninteracts with the DLOs to estimate flexibility by analyzing their visual\nconfigurations. Secondly, we train a policy conditioned on the estimated\nflexibility to perform challenging DLO insertion tasks. 
Our pipeline trained\nwith diverse insertion scenarios achieves an 85.6% success rate in simulation\nand 66.67% in real robot experiments. Please refer to our project page:\nhttps://lmeee.github.io/DLOInsert/\n","authors":["Mingen Li","Changhyun Choi"],"pdf_url":"https://arxiv.org/pdf/2410.23428v1.pdf","comment":"7 pages, 9 figures, 3 tables. 2024 IEEE International Conference on\n Robotics and Automation (ICRA)"},{"id":"http://arxiv.org/abs/2410.04680v2","updated":"2024-10-30T20:04:12Z","published":"2024-10-07T01:24:39Z","title":"Next Best Sense: Guiding Vision and Touch with FisherRF for 3D Gaussian\n Splatting","summary":" We propose a framework for active next best view and touch selection for\nrobotic manipulators using 3D Gaussian Splatting (3DGS). 3DGS is emerging as a\nuseful explicit 3D scene representation for robotics, as it has the ability to\nrepresent scenes in a both photorealistic and geometrically accurate manner.\nHowever, in real-world, online robotic scenes where the number of views is\nlimited given efficiency requirements, random view selection for 3DGS becomes\nimpractical as views are often overlapping and redundant. We address this issue\nby proposing an end-to-end online training and active view selection pipeline,\nwhich enhances the performance of 3DGS in few-view robotics settings. We first\nelevate the performance of few-shot 3DGS with a novel semantic depth alignment\nmethod using Segment Anything Model 2 (SAM2) that we supplement with Pearson\ndepth and surface normal loss to improve color and depth reconstruction of\nreal-world scenes. We then extend FisherRF, a next-best-view selection method\nfor 3DGS, to select views and touch poses based on depth uncertainty. We\nperform online view selection on a real robot system during live 3DGS training.\nWe motivate our improvements to few-shot GS scenes, and extend depth-based\nFisherRF to them, where we demonstrate both qualitative and quantitative\nimprovements on challenging robot scenes. For more information, please see our\nproject page at https://arm.stanford.edu/next-best-sense.\n","authors":["Matthew Strong","Boshu Lei","Aiden Swann","Wen Jiang","Kostas Daniilidis","Monroe Kennedy III"],"pdf_url":"https://arxiv.org/pdf/2410.04680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04856v2","updated":"2024-10-30T19:53:16Z","published":"2023-12-08T06:24:32Z","title":"SCALER: Versatile Multi-Limbed Robot for Free-Climbing in Extreme\n Terrains","summary":" This paper presents SCALER, a versatile free-climbing multi-limbed robot that\nis designed to achieve tightly coupled simultaneous locomotion and dexterous\ngrasping. Although existing quadruped-limbed robots have shown impressive\ndexterous skills such as object manipulation, it is essential to balance\npower-intensive locomotion and dexterous grasping capabilities. We design a\ntorso linkage and a parallel-serial limb to meet such conflicting skills that\npose unique challenges in the hardware designs. SCALER employs underactuated\ntwo-fingered GOAT grippers that can mechanically adapt and offer 7 modes of\ngrasping, enabling SCALER to traverse extreme terrains with multi-modal\ngrasping strategies. We study the whole-body approach, where SCALER uses its\nbody and limbs to generate additional forces for stable grasping with\nenvironments, further enhancing versatility. Furthermore, we improve the GOAT\ngripper actuation speed to realize more dynamic climbing in a closed-loop\ncontrol fashion. 
With these proposed technologies, SCALER can traverse\nvertical, overhang, upside-down, slippery terrains, and bouldering walls with\nnon-convex-shaped climbing holds under the Earth's gravity.\n","authors":["Yusuke Tanaka","Yuki Shirai","Alexander Schperberg","Xuan Lin","Dennis Hong"],"pdf_url":"https://arxiv.org/pdf/2312.04856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12192v2","updated":"2024-10-30T18:48:00Z","published":"2024-09-18T17:59:43Z","title":"DynaMo: In-Domain Dynamics Pretraining for Visuo-Motor Control","summary":" Imitation learning has proven to be a powerful tool for training complex\nvisuomotor policies. However, current methods often require hundreds to\nthousands of expert demonstrations to handle high-dimensional visual\nobservations. A key reason for this poor data efficiency is that visual\nrepresentations are predominantly either pretrained on out-of-domain data or\ntrained directly through a behavior cloning objective. In this work, we present\nDynaMo, a new in-domain, self-supervised method for learning visual\nrepresentations. Given a set of expert demonstrations, we jointly learn a\nlatent inverse dynamics model and a forward dynamics model over a sequence of\nimage embeddings, predicting the next frame in latent space, without\naugmentations, contrastive sampling, or access to ground truth actions.\nImportantly, DynaMo does not require any out-of-domain data such as Internet\ndatasets or cross-embodied datasets. On a suite of six simulated and real\nenvironments, we show that representations learned with DynaMo significantly\nimprove downstream imitation learning performance over prior self-supervised\nlearning objectives, and pretrained representations. Gains from using DynaMo\nhold across policy classes such as Behavior Transformer, Diffusion Policy, MLP,\nand nearest neighbors. Finally, we ablate over key components of DynaMo and\nmeasure its impact on downstream policy performance. Robot videos are best\nviewed at https://dynamo-ssl.github.io\n","authors":["Zichen Jeff Cui","Hengkai Pan","Aadhithya Iyer","Siddhant Haldar","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2409.12192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23382v1","updated":"2024-10-30T18:38:42Z","published":"2024-10-30T18:38:42Z","title":"Estimating Neural Network Robustness via Lipschitz Constant and\n Architecture Sensitivity","summary":" Ensuring neural network robustness is essential for the safe and reliable\noperation of robotic learning systems, especially in perception and\ndecision-making tasks within real-world environments. This paper investigates\nthe robustness of neural networks in perception systems, specifically examining\ntheir sensitivity to targeted, small-scale perturbations. We identify the\nLipschitz constant as a key metric for quantifying and enhancing network\nrobustness. We derive an analytical expression to compute the Lipschitz\nconstant based on neural network architecture, providing a theoretical basis\nfor estimating and improving robustness. 
Several experiments reveal the\nrelationship between network design, the Lipschitz constant, and robustness,\noffering practical insights for developing safer, more robust robot learning\nsystems.\n","authors":["Abulikemu Abuduweili","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23382v1.pdf","comment":"SAFE-ROL at CoRL 2024"},{"id":"http://arxiv.org/abs/2410.23377v1","updated":"2024-10-30T18:27:55Z","published":"2024-10-30T18:27:55Z","title":"A Cost-Effective Thermal Imaging Safety Sensor for Industry 5.0 and\n Collaborative Robotics","summary":" The Industry 5.0 paradigm focuses on industrial operator well-being and\nsustainable manufacturing practices, where humans play a central role, not only\nduring the repetitive and collaborative tasks of the manufacturing process, but\nalso in the management of the factory floor assets. Human factors, such as\nergonomics, safety, and well-being, push the human-centric smart factory to\nefficiently adopt novel technologies while minimizing environmental and social\nimpact. As operations at the factory floor increasingly rely on collaborative\nrobots (CoBots) and flexible manufacturing systems, there is a growing demand\nfor redundant safety mechanisms (i.e., automatic human detection in the\nproximity of machinery that is under operation). Fostering enhanced process\nsafety for human proximity detection allows for the protection against possible\nincidents or accidents with the deployed industrial devices and machinery. This\npaper introduces the design and implementation of a cost-effective thermal\nimaging Safety Sensor that can be used in the scope of Industry 5.0 to trigger\ndistinct safe mode states in manufacturing processes that rely on collaborative\nrobotics. The proposed Safety Sensor uses a hybrid detection approach and has\nbeen evaluated under controlled environmental conditions. The obtained results\nshow a 97% accuracy at low computational cost when using the developed hybrid\nmethod to detect the presence of humans in thermal images.\n","authors":["Daniel Barros","Paula Fraga-Lamas","Tiago M. Fernandez-Carames","Sergio Ivan Lopes"],"pdf_url":"https://arxiv.org/pdf/2410.23377v1.pdf","comment":"Paper accepted in Edge-IoT 2022"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2410.23287v1","updated":"2024-10-30T17:59:26Z","published":"2024-10-30T17:59:26Z","title":"ReferEverything: Towards Segmenting Everything We Can Speak of in Videos","summary":" We present REM, a framework for segmenting a wide range of concepts in video\nthat can be described through natural language. Our method capitalizes on\nvisual-language representations learned by video diffusion models on\nInternet-scale datasets. A key insight of our approach is preserving as much of\nthe generative model's original representation as possible, while fine-tuning\nit on narrow-domain Referral Object Segmentation datasets. As a result, our\nframework can accurately segment and track rare and unseen objects, despite\nbeing trained on object masks from a limited set of categories. Additionally,\nit can generalize to non-object dynamic concepts, such as waves crashing in the\nocean, as demonstrated in our newly introduced benchmark for Referral Video\nProcess Segmentation (Ref-VPS). 
Our experiments show that REM performs on par\nwith state-of-the-art approaches on in-domain datasets, like Ref-DAVIS, while\noutperforming them by up to twelve points in terms of region similarity on\nout-of-domain data, leveraging the power of Internet-scale pre-training.\n","authors":["Anurag Bagchi","Zhipeng Bao","Yu-Xiong Wang","Pavel Tokmakov","Martial Hebert"],"pdf_url":"https://arxiv.org/pdf/2410.23287v1.pdf","comment":"Project page at\n https://miccooper9.github.io/projects/ReferEverything/"},{"id":"http://arxiv.org/abs/2410.23280v1","updated":"2024-10-30T17:57:21Z","published":"2024-10-30T17:57:21Z","title":"RelationBooth: Towards Relation-Aware Customized Object Generation","summary":" Customized image generation is crucial for delivering personalized content\nbased on user-provided image prompts, aligning large-scale text-to-image\ndiffusion models with individual needs. However, existing models often overlook\nthe relationships between customized objects in generated images. Instead, this\nwork addresses that gap by focusing on relation-aware customized image\ngeneration, which aims to preserve the identities from image prompts while\nmaintaining the predicate relations described in text prompts. Specifically, we\nintroduce RelationBooth, a framework that disentangles identity and relation\nlearning through a well-curated dataset. Our training data consists of\nrelation-specific images, independent object images containing identity\ninformation, and text prompts to guide relation generation. Then, we propose\ntwo key modules to tackle the two main challenges: generating accurate and\nnatural relations, especially when significant pose adjustments are required,\nand avoiding object confusion in cases of overlap. First, we introduce a\nkeypoint matching loss that effectively guides the model in adjusting object\nposes closely tied to their relationships. Second, we incorporate local\nfeatures from the image prompts to better distinguish between objects,\npreventing confusion in overlapping cases. Extensive results on three\nbenchmarks demonstrate the superiority of RelationBooth in generating precise\nrelations while preserving object identities across a diverse set of objects\nand relations. The source code and trained models will be made available to the\npublic.\n","authors":["Qingyu Shi","Lu Qi","Jianzong Wu","Jinbin Bai","Jingbo Wang","Yunhai Tong","Xiangtai Li","Ming-Husang Yang"],"pdf_url":"https://arxiv.org/pdf/2410.23280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23278v1","updated":"2024-10-30T17:56:02Z","published":"2024-10-30T17:56:02Z","title":"OpenSatMap: A Fine-grained High-resolution Satellite Dataset for\n Large-scale Map Construction","summary":" In this paper, we propose OpenSatMap, a fine-grained, high-resolution\nsatellite dataset for large-scale map construction. Map construction is one of\nthe foundations of the transportation industry, such as navigation and\nautonomous driving. Extracting road structures from satellite images is an\nefficient way to construct large-scale maps. However, existing satellite\ndatasets provide only coarse semantic-level labels with a relatively low\nresolution (up to level 19), impeding the advancement of this field. 
In\ncontrast, the proposed OpenSatMap (1) has fine-grained instance-level\nannotations; (2) consists of high-resolution images (level 20); (3) is\ncurrently the largest one of its kind; (4) collects data with high diversity.\nMoreover, OpenSatMap covers and aligns with the popular nuScenes dataset and\nArgoverse 2 dataset to potentially advance autonomous driving technologies. By\npublishing and maintaining the dataset, we provide a high-quality benchmark for\nsatellite-based map construction and downstream tasks like autonomous driving.\n","authors":["Hongbo Zhao","Lue Fan","Yuntao Chen","Haochen Wang","yuran Yang","Xiaojuan Jin","Yixin Zhang","Gaofeng Meng","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.23278v1.pdf","comment":"NeurIPS 2024 D&B Track. Project Page:https://opensatmap.github.io/"},{"id":"http://arxiv.org/abs/2410.23277v1","updated":"2024-10-30T17:55:52Z","published":"2024-10-30T17:55:52Z","title":"SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video\n Generation","summary":" Human beings are endowed with a complementary learning system, which bridges\nthe slow learning of general world dynamics with fast storage of episodic\nmemory from a new experience. Previous video generation models, however,\nprimarily focus on slow learning by pre-training on vast amounts of data,\noverlooking the fast learning phase crucial for episodic memory storage. This\noversight leads to inconsistencies across temporally distant frames when\ngenerating longer videos, as these frames fall beyond the model's context\nwindow. To this end, we introduce SlowFast-VGen, a novel dual-speed learning\nsystem for action-driven long video generation. Our approach incorporates a\nmasked conditional video diffusion model for the slow learning of world\ndynamics, alongside an inference-time fast learning strategy based on a\ntemporal LoRA module. Specifically, the fast learning process updates its\ntemporal LoRA parameters based on local inputs and outputs, thereby efficiently\nstoring episodic memory in its parameters. We further propose a slow-fast\nlearning loop algorithm that seamlessly integrates the inner fast learning loop\ninto the outer slow learning loop, enabling the recall of prior multi-episode\nexperiences for context-aware skill learning. To facilitate the slow learning\nof an approximate world model, we collect a large-scale dataset of 200k videos\nwith language action annotations, covering a wide range of scenarios. Extensive\nexperiments show that SlowFast-VGen outperforms baselines across various\nmetrics for action-driven video generation, achieving an FVD score of 514\ncompared to 782, and maintaining consistency in longer videos, with an average\nof 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm\nsignificantly enhances performances on long-horizon planning tasks as well.\nProject Website: https://slowfast-vgen.github.io\n","authors":["Yining Hong","Beide Liu","Maxine Wu","Yuanhao Zhai","Kai-Wei Chang","Lingjie Li","Kevin Lin","Chung-Ching Lin","Jianfeng Wang","Zhengyuan Yang","Yingnian Wu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23274v1","updated":"2024-10-30T17:54:56Z","published":"2024-10-30T17:54:56Z","title":"Multi-student Diffusion Distillation for Better One-step Generators","summary":" Diffusion models achieve high-quality sample generation at the cost of a\nlengthy multistep inference procedure. 
To overcome this, diffusion distillation\ntechniques produce student generators capable of matching or surpassing the\nteacher in a single step. However, the student model's inference speed is\nlimited by the size of the teacher architecture, preventing real-time\ngeneration for computationally heavy applications. In this work, we introduce\nMulti-Student Distillation (MSD), a framework to distill a conditional teacher\ndiffusion model into multiple single-step generators. Each student generator is\nresponsible for a subset of the conditioning data, thereby obtaining higher\ngeneration quality for the same capacity. MSD trains multiple distilled\nstudents, allowing smaller sizes and, therefore, faster inference. Also, MSD\noffers a lightweight quality boost over single-student distillation with the\nsame architecture. We demonstrate MSD is effective by training multiple\nsame-sized or smaller students on single-step distillation using distribution\nmatching and adversarial distillation techniques. With smaller students, MSD\ngets competitive results with faster inference for single-step generation.\nUsing 4 same-sized students, MSD sets a new state-of-the-art for one-step image\ngeneration: FID 1.20 on ImageNet-64x64 and 8.20 on zero-shot COCO2014.\n","authors":["Yanke Song","Jonathan Lorraine","Weili Nie","Karsten Kreis","James Lucas"],"pdf_url":"https://arxiv.org/pdf/2410.23274v1.pdf","comment":"Project page: https://research.nvidia.com/labs/toronto-ai/MSD/"},{"id":"http://arxiv.org/abs/2410.22217v2","updated":"2024-10-30T17:51:26Z","published":"2024-10-29T16:48:22Z","title":"Towards Unifying Understanding and Generation in the Era of Vision\n Foundation Models: A Survey from the Autoregression Perspective","summary":" Autoregression in large language models (LLMs) has shown impressive\nscalability by unifying all language tasks into the next token prediction\nparadigm. Recently, there is a growing interest in extending this success to\nvision foundation models. In this survey, we review the recent advances and\ndiscuss future directions for autoregressive vision foundation models. First,\nwe present the trend for next generation of vision foundation models, i.e.,\nunifying both understanding and generation in vision tasks. We then analyze the\nlimitations of existing vision foundation models, and present a formal\ndefinition of autoregression with its advantages. Later, we categorize\nautoregressive vision foundation models from their vision tokenizers and\nautoregression backbones. Finally, we discuss several promising research\nchallenges and directions. To the best of our knowledge, this is the first\nsurvey to comprehensively summarize autoregressive vision foundation models\nunder the trend of unifying understanding and generation. A collection of\nrelated resources is available at https://github.com/EmmaSRH/ARVFM.\n","authors":["Shenghao Xie","Wenqiang Zu","Mingyang Zhao","Duo Su","Shilong Liu","Ruohua Shi","Guoqi Li","Shanghang Zhang","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2410.22217v2.pdf","comment":"17 pages, 1 table, 2 figures"},{"id":"http://arxiv.org/abs/2410.23266v1","updated":"2024-10-30T17:50:23Z","published":"2024-10-30T17:50:23Z","title":"TOMATO: Assessing Visual Temporal Reasoning Capabilities in Multimodal\n Foundation Models","summary":" Existing benchmarks often highlight the remarkable performance achieved by\nstate-of-the-art Multimodal Foundation Models (MFMs) in leveraging temporal\ncontext for video understanding. 
However, how well do the models truly perform\nvisual temporal reasoning? Our study of existing benchmarks shows that this\ncapability of MFMs is likely overestimated as many questions can be solved by\nusing a single, few, or out-of-order frames. To systematically examine current\nvisual temporal reasoning tasks, we propose three principles with corresponding\nmetrics: (1) Multi-Frame Gain, (2) Frame Order Sensitivity, and (3) Frame\nInformation Disparity. Following these principles, we introduce TOMATO,\nTemporal Reasoning Multimodal Evaluation, a novel benchmark crafted to\nrigorously assess MFMs' temporal reasoning capabilities in video understanding.\nTOMATO comprises 1,484 carefully curated, human-annotated questions spanning\nsix tasks (i.e., action count, direction, rotation, shape & trend, velocity &\nfrequency, and visual cues), applied to 1,417 videos, including 805\nself-recorded and -generated videos, that encompass human-centric, real-world,\nand simulated scenarios. Our comprehensive evaluation reveals a human-model\nperformance gap of 57.3% with the best-performing model. Moreover, our in-depth\nanalysis uncovers more fundamental limitations beyond this gap in current MFMs.\nWhile they can accurately recognize events in isolated frames, they fail to\ninterpret these frames as a continuous sequence. We believe TOMATO will serve\nas a crucial testbed for evaluating the next-generation MFMs and as a call to\nthe community to develop AI systems capable of comprehending human world\ndynamics through the video modality.\n","authors":["Ziyao Shangguan","Chuhan Li","Yuxuan Ding","Yanan Zheng","Yilun Zhao","Tesca Fitzgerald","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2410.23266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05670v2","updated":"2024-10-30T17:47:56Z","published":"2024-06-09T06:59:46Z","title":"Certified Robustness to Data Poisoning in Gradient-Based Training","summary":" Modern machine learning pipelines leverage large amounts of public data,\nmaking it infeasible to guarantee data quality and leaving models open to\npoisoning and backdoor attacks. Provably bounding model behavior under such\nattacks remains an open problem. In this work, we address this challenge by\ndeveloping the first framework providing provable guarantees on the behavior of\nmodels trained with potentially manipulated data without modifying the model or\nlearning algorithm. In particular, our framework certifies robustness against\nuntargeted and targeted poisoning, as well as backdoor attacks, for bounded and\nunbounded manipulations of the training inputs and labels. Our method leverages\nconvex relaxations to over-approximate the set of all possible parameter\nupdates for a given poisoning threat model, allowing us to bound the set of all\nreachable parameters for any gradient-based learning algorithm. Given this set\nof parameters, we provide bounds on worst-case behavior, including model\nperformance and backdoor success rate. We demonstrate our approach on multiple\nreal-world datasets from applications including energy consumption, medical\nimaging, and autonomous driving.\n","authors":["Philip Sosnin","Mark N. 
Müller","Maximilian Baader","Calvin Tsay","Matthew Wicker"],"pdf_url":"https://arxiv.org/pdf/2406.05670v2.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.23262v1","updated":"2024-10-30T17:46:31Z","published":"2024-10-30T17:46:31Z","title":"EMMA: End-to-End Multimodal Model for Autonomous Driving","summary":" We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving.\nBuilt on a multi-modal large language model foundation, EMMA directly maps raw\ncamera sensor data into various driving-specific outputs, including planner\ntrajectories, perception objects, and road graph elements. EMMA maximizes the\nutility of world knowledge from the pre-trained large language models, by\nrepresenting all non-sensor inputs (e.g. navigation instructions and ego\nvehicle status) and outputs (e.g. trajectories and 3D locations) as natural\nlanguage text. This approach allows EMMA to jointly process various driving\ntasks in a unified language space, and generate the outputs for each task using\ntask-specific prompts. Empirically, we demonstrate EMMA's effectiveness by\nachieving state-of-the-art performance in motion planning on nuScenes as well\nas competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also\nyields competitive results for camera-primary 3D object detection on the Waymo\nOpen Dataset (WOD). We show that co-training EMMA with planner trajectories,\nobject detection, and road graph tasks yields improvements across all three\ndomains, highlighting EMMA's potential as a generalist model for autonomous\ndriving applications. However, EMMA also exhibits certain limitations: it can\nprocess only a small amount of image frames, does not incorporate accurate 3D\nsensing modalities like LiDAR or radar and is computationally expensive. We\nhope that our results will inspire further research to mitigate these issues\nand to further evolve the state of the art in autonomous driving model\narchitectures.\n","authors":["Jyh-Jing Hwang","Runsheng Xu","Hubert Lin","Wei-Chih Hung","Jingwei Ji","Kristy Choi","Di Huang","Tong He","Paul Covington","Benjamin Sapp","James Guo","Dragomir Anguelov","Mingxing Tan"],"pdf_url":"https://arxiv.org/pdf/2410.23262v1.pdf","comment":"Blog post: https://waymo.com/blog/2024/10/introducing-emma/"},{"id":"http://arxiv.org/abs/2405.15196v2","updated":"2024-10-30T17:38:55Z","published":"2024-05-24T03:58:20Z","title":"DisC-GS: Discontinuity-aware Gaussian Splatting","summary":" Recently, Gaussian Splatting, a method that represents a 3D scene as a\ncollection of Gaussian distributions, has gained significant attention in\naddressing the task of novel view synthesis. In this paper, we highlight a\nfundamental limitation of Gaussian Splatting: its inability to accurately\nrender discontinuities and boundaries in images due to the continuous nature of\nGaussian distributions. To address this issue, we propose a novel framework\nenabling Gaussian Splatting to perform discontinuity-aware image rendering.\nAdditionally, we introduce a B\\'ezier-boundary gradient approximation strategy\nwithin our framework to keep the \"differentiability\" of the proposed\ndiscontinuity-aware rendering process. 
Extensive experiments demonstrate the\nefficacy of our framework.\n","authors":["Haoxuan Qu","Zhuoling Li","Hossein Rahmani","Yujun Cai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2405.15196v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23254v1","updated":"2024-10-30T17:37:31Z","published":"2024-10-30T17:37:31Z","title":"Keypoint Abstraction using Large Models for Object-Relative Imitation\n Learning","summary":" Generalization to novel object configurations and instances across diverse\ntasks and environments is a critical challenge in robotics. Keypoint-based\nrepresentations have been proven effective as a succinct representation for\ncapturing essential object features, and for establishing a reference frame in\naction prediction, enabling data-efficient learning of robot skills. However,\ntheir manual design nature and reliance on additional human labels limit their\nscalability. In this paper, we propose KALM, a framework that leverages large\npre-trained vision-language models (LMs) to automatically generate\ntask-relevant and cross-instance consistent keypoints. KALM distills robust and\nconsistent keypoints across views and objects by generating proposals using LMs\nand verifies them against a small set of robot demonstration data. Based on the\ngenerated keypoints, we can train keypoint-conditioned policy models that\npredict actions in keypoint-centric frames, enabling robots to generalize\neffectively across varying object poses, camera views, and object instances\nwith similar functional shapes. Our method demonstrates strong performance in\nthe real world, adapting to different tasks and environments from only a\nhandful of demonstrations while requiring no additional labels. Website:\nhttps://kalm-il.github.io/\n","authors":["Xiaolin Fang","Bo-Ruei Huang","Jiayuan Mao","Jasmine Shone","Joshua B. Tenenbaum","Tomás Lozano-Pérez","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2410.23254v1.pdf","comment":"CoRL LangRob Workshop, 2024"},{"id":"http://arxiv.org/abs/2403.17009v2","updated":"2024-10-30T17:35:06Z","published":"2024-03-25T17:59:58Z","title":"Is Your LiDAR Placement Optimized for 3D Scene Understanding?","summary":" The reliability of driving perception systems under unprecedented conditions\nis crucial for practical usage. Latest advancements have prompted increasing\ninterest in multi-LiDAR perception. However, prevailing driving datasets\npredominantly utilize single-LiDAR systems and collect data devoid of adverse\nconditions, failing to capture the complexities of real-world environments\naccurately. Addressing these gaps, we proposed Place3D, a full-cycle pipeline\nthat encompasses LiDAR placement optimization, data generation, and downstream\nevaluations. Our framework makes three appealing contributions. 1) To identify\nthe most effective configurations for multi-LiDAR systems, we introduce the\nSurrogate Metric of the Semantic Occupancy Grids (M-SOG) to evaluate LiDAR\nplacement quality. 2) Leveraging the M-SOG metric, we propose a novel\noptimization strategy to refine multi-LiDAR placements. 3) Centered around the\ntheme of multi-condition multi-LiDAR perception, we collect a 280,000-frame\ndataset from both clean and adverse conditions. Extensive experiments\ndemonstrate that LiDAR placements optimized using our approach outperform\nvarious baselines. 
We showcase exceptional results in both LiDAR semantic\nsegmentation and 3D object detection tasks, under diverse weather and sensor\nfailure conditions.\n","authors":["Ye Li","Lingdong Kong","Hanjiang Hu","Xiaohao Xu","Xiaonan Huang"],"pdf_url":"https://arxiv.org/pdf/2403.17009v2.pdf","comment":"NeurIPS 2024 (Spotlight); 36 pages, 16 figures, 14 tables; Code at\n https://github.com/ywyeli/Place3D"},{"id":"http://arxiv.org/abs/2410.23247v1","updated":"2024-10-30T17:30:35Z","published":"2024-10-30T17:30:35Z","title":"bit2bit: 1-bit quanta video reconstruction via self-supervised photon\n prediction","summary":" Quanta image sensors, such as SPAD arrays, are an emerging sensor technology,\nproducing 1-bit arrays representing photon detection events over exposures as\nshort as a few nanoseconds. In practice, raw data are post-processed using\nheavy spatiotemporal binning to create more useful and interpretable images at\nthe cost of degrading spatiotemporal resolution. In this work, we propose\nbit2bit, a new method for reconstructing high-quality image stacks at the\noriginal spatiotemporal resolution from sparse binary quanta image data.\nInspired by recent work on Poisson denoising, we developed an algorithm that\ncreates a dense image sequence from sparse binary photon data by predicting the\nphoton arrival location probability distribution. However, due to the binary\nnature of the data, we show that the assumption of a Poisson distribution is\ninadequate. Instead, we model the process with a Bernoulli lattice process from\nthe truncated Poisson. This leads to the proposal of a novel self-supervised\nsolution based on a masked loss function. We evaluate our method using both\nsimulated and real data. On simulated data from a conventional video, we\nachieve 34.35 mean PSNR with extremely photon-sparse binary input (<0.06\nphotons per pixel per frame). We also present a novel dataset containing a wide\nrange of real SPAD high-speed videos under various challenging imaging\nconditions. The scenes cover strong/weak ambient light, strong motion,\nultra-fast events, etc., which will be made available to the community, on\nwhich we demonstrate the promise of our approach. Both reconstruction quality\nand throughput substantially surpass the state-of-the-art methods (e.g., Quanta\nBurst Photography (QBP)). Our approach significantly enhances the visualization\nand usability of the data, enabling the application of existing analysis\ntechniques.\n","authors":["Yehe Liu","Alexander Krull","Hector Basevi","Ales Leonardis","Michael W. Jenkins"],"pdf_url":"https://arxiv.org/pdf/2410.23247v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23245v1","updated":"2024-10-30T17:29:25Z","published":"2024-10-30T17:29:25Z","title":"PointRecon: Online Point-based 3D Reconstruction via Ray-based 2D-3D\n Matching","summary":" We propose a novel online, point-based 3D reconstruction method from posed\nmonocular RGB videos. Our model maintains a global point cloud representation\nof the scene, continuously updating the features and 3D locations of points as\nnew images are observed. It expands the point cloud with newly detected points\nwhile carefully removing redundancies. The point cloud updates and depth\npredictions for new points are achieved through a novel ray-based 2D-3D feature\nmatching technique, which is robust against errors in previous point position\npredictions. In contrast to offline methods, our approach processes\ninfinite-length sequences and provides real-time updates. 
Additionally, the\npoint cloud imposes no pre-defined resolution or scene size constraints, and\nits unified global representation ensures view consistency across perspectives.\nExperiments on the ScanNet dataset show that our method achieves\nstate-of-the-art quality among online MVS approaches. Project page:\nhttps://arthurhero.github.io/projects/pointrecon\n","authors":["Chen Ziwen","Zexiang Xu","Li Fuxin"],"pdf_url":"https://arxiv.org/pdf/2410.23245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21556v2","updated":"2024-10-30T17:27:58Z","published":"2024-10-28T21:35:08Z","title":"Super-resolution in disordered media using neural networks","summary":" We propose a methodology that exploits large and diverse data sets to\naccurately estimate the ambient medium's Green's functions in strongly\nscattering media. Given these estimates, obtained with and without the use of\nneural networks, excellent imaging results are achieved, with a resolution that\nis better than that of a homogeneous medium. This phenomenon, also known as\nsuper-resolution, occurs because the ambient scattering medium effectively\nenhances the physical imaging aperture.\n","authors":["Alexander Christie","Matan Leibovich","Miguel Moscoso","Alexei Novikov","George Papanicolaou","Chrysoula Tsogka"],"pdf_url":"https://arxiv.org/pdf/2410.21556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23231v1","updated":"2024-10-30T17:20:08Z","published":"2024-10-30T17:20:08Z","title":"LGU-SLAM: Learnable Gaussian Uncertainty Matching with Deformable\n Correlation Sampling for Deep Visual SLAM","summary":" Deep visual Simultaneous Localization and Mapping (SLAM) techniques, e.g.,\nDROID, have made significant advancements by leveraging deep visual odometry on\ndense flow fields. In general, they heavily rely on global visual similarity\nmatching. However, the ambiguous similarity interference in uncertain regions\ncould often lead to excessive noise in correspondences, ultimately misleading\nSLAM in geometric modeling. To address this issue, we propose a Learnable\nGaussian Uncertainty (LGU) matching. It mainly focuses on precise\ncorrespondence construction. In our scheme, a learnable 2D Gaussian uncertainty\nmodel is designed to associate matching-frame pairs. It could generate\ninput-dependent Gaussian distributions for each correspondence map.\nAdditionally, a multi-scale deformable correlation sampling strategy is devised\nto adaptively fine-tune the sampling of each direction by a priori look-up\nranges, enabling reliable correlation construction. Furthermore, a KAN-bias GRU\ncomponent is adopted to improve a temporal iterative enhancement for\naccomplishing sophisticated spatio-temporal modeling with limited parameters.\nThe extensive experiments on real-world and synthetic datasets are conducted to\nvalidate the effectiveness and superiority of our method.\n","authors":["Yucheng Huang","Luping Ji","Hudong Liu","Mao Ye"],"pdf_url":"https://arxiv.org/pdf/2410.23231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23230v1","updated":"2024-10-30T17:18:53Z","published":"2024-10-30T17:18:53Z","title":"Aligning Audio-Visual Joint Representations with an Agentic Workflow","summary":" Visual content and accompanied audio signals naturally formulate a joint\nrepresentation to improve audio-visual (AV) related applications. 
While studies\ndevelop various AV representation learning frameworks, the importance of AV\ndata alignment is usually undermined for achieving high-quality representation.\nWe observe that an audio signal may contain background noise interference.\nAlso, non-synchronization may appear between audio and video streams. These\nnon-strict data alignment limits representation quality and downgrade\napplication performance. In this paper, we propose to improve AV joint\nrepresentations from a data-centric perspective by aligning audio signals to\nvisual data. Our alignment is conducted in an agentic workflow controlled by an\nLLM-based assistant named AVAgent. For each input AV data pair, our AVAgent\nuses a multi-modal LLM to convert audio and visual data into language\ndescriptions separately (i.e., tool use). Then, AVAgent reasons whether this\npaired data is aligned well and plans to edit the audio signal if needed (i.e.,\nplanning). The audio editing is executed by predefined actions that filter\nnoise or augment data. Moreover, we use a VLM to evaluate how modified audio\nsignals match the visual content and provide feedback to AVAgent (i.e.,\nreflection). The tool use, planning, and reflection steps operate cyclically to\nbecome an agentic workflow where audio signals are gradually aligned to visual\ncontent. To this end, existing methods can directly leverage the aligned AV\ndata via our agentic workflow to improve AV joint representations. The\nexperimental results comprehensively demonstrate the state-of-the-art\nperformance of the proposed approach against previous baselines in diverse\ndownstream tasks.\n","authors":["Shentong Mo","Yibing Song"],"pdf_url":"https://arxiv.org/pdf/2410.23230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23219v1","updated":"2024-10-30T17:11:00Z","published":"2024-10-30T17:11:00Z","title":"DiaMond: Dementia Diagnosis with Multi-Modal Vision Transformers Using\n MRI and PET","summary":" Diagnosing dementia, particularly for Alzheimer's Disease (AD) and\nfrontotemporal dementia (FTD), is complex due to overlapping symptoms. While\nmagnetic resonance imaging (MRI) and positron emission tomography (PET) data\nare critical for the diagnosis, integrating these modalities in deep learning\nfaces challenges, often resulting in suboptimal performance compared to using\nsingle modalities. Moreover, the potential of multi-modal approaches in\ndifferential diagnosis, which holds significant clinical importance, remains\nlargely unexplored. We propose a novel framework, DiaMond, to address these\nissues with vision Transformers to effectively integrate MRI and PET. DiaMond\nis equipped with self-attention and a novel bi-attention mechanism that\nsynergistically combine MRI and PET, alongside a multi-modal normalization to\nreduce redundant dependency, thereby boosting the performance. DiaMond\nsignificantly outperforms existing multi-modal methods across various datasets,\nachieving a balanced accuracy of 92.4% in AD diagnosis, 65.2% for AD-MCI-CN\nclassification, and 76.5% in differential diagnosis of AD and FTD. We also\nvalidated the robustness of DiaMond in a comprehensive ablation study. 
The code\nis available at https://github.com/ai-med/DiaMond.\n","authors":["Yitong Li","Morteza Ghahremani","Youssef Wally","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2410.23219v1.pdf","comment":"Accepted by IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2410.23218v1","updated":"2024-10-30T17:10:19Z","published":"2024-10-30T17:10:19Z","title":"OS-ATLAS: A Foundation Action Model for Generalist GUI Agents","summary":" Existing efforts in building GUI agents heavily rely on the availability of\nrobust commercial Vision-Language Models (VLMs) such as GPT-4o and\nGeminiProVision. Practitioners are often reluctant to use open-source VLMs due\nto their significant performance lag compared to their closed-source\ncounterparts, particularly in GUI grounding and Out-Of-Distribution (OOD)\nscenarios. To facilitate future research in this area, we developed OS-Atlas -\na foundational GUI action model that excels at GUI grounding and OOD agentic\ntasks through innovations in both data and modeling. We have invested\nsignificant engineering effort in developing an open-source toolkit for\nsynthesizing GUI grounding data across multiple platforms, including Windows,\nLinux, MacOS, Android, and the web. Leveraging this toolkit, we are releasing\nthe largest open-source cross-platform GUI grounding corpus to date, which\ncontains over 13 million GUI elements. This dataset, combined with innovations\nin model training, provides a solid foundation for OS-Atlas to understand GUI\nscreenshots and generalize to unseen interfaces. Through extensive evaluation\nacross six benchmarks spanning three different platforms (mobile, desktop, and\nweb), OS-Atlas demonstrates significant performance improvements over previous\nstate-of-the-art models. Our evaluation also uncovers valuable insights into\ncontinuously improving and scaling the agentic capabilities of open-source\nVLMs.\n","authors":["Zhiyong Wu","Zhenyu Wu","Fangzhi Xu","Yian Wang","Qiushi Sun","Chengyou Jia","Kanzhi Cheng","Zichen Ding","Liheng Chen","Paul Pu Liang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2410.23218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06007v2","updated":"2024-10-30T17:08:16Z","published":"2024-06-10T04:07:09Z","title":"CARES: A Comprehensive Benchmark of Trustworthiness in Medical Vision\n Language Models","summary":" Artificial intelligence has significantly impacted medical applications,\nparticularly with the advent of Medical Large Vision Language Models\n(Med-LVLMs), sparking optimism for the future of automated and personalized\nhealthcare. However, the trustworthiness of Med-LVLMs remains unverified,\nposing significant risks for future model deployment. In this paper, we\nintroduce CARES and aim to comprehensively evaluate the Trustworthiness of\nMed-LVLMs across the medical domain. We assess the trustworthiness of Med-LVLMs\nacross five dimensions, including trustfulness, fairness, safety, privacy, and\nrobustness. CARES comprises about 41K question-answer pairs in both closed and\nopen-ended formats, covering 16 medical image modalities and 27 anatomical\nregions. Our analysis reveals that the models consistently exhibit concerns\nregarding trustworthiness, often displaying factual inaccuracies and failing to\nmaintain fairness across different demographic groups. Furthermore, they are\nvulnerable to attacks and demonstrate a lack of privacy awareness. 
We publicly\nrelease our benchmark and code in https://cares-ai.github.io/.\n","authors":["Peng Xia","Ze Chen","Juanxi Tian","Yangrui Gong","Ruibo Hou","Yue Xu","Zhenbang Wu","Zhiyuan Fan","Yiyang Zhou","Kangyu Zhu","Wenhao Zheng","Zhaoyang Wang","Xiao Wang","Xuchao Zhang","Chetan Bansal","Marc Niethammer","Junzhou Huang","Hongtu Zhu","Yun Li","Jimeng Sun","Zongyuan Ge","Gang Li","James Zou","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2406.06007v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2309.01770v2","updated":"2024-10-30T17:05:17Z","published":"2023-09-04T19:16:46Z","title":"StyleAdapter: A Unified Stylized Image Generation Model","summary":" This work focuses on generating high-quality images with specific style of\nreference images and content of provided textual descriptions. Current leading\nalgorithms, i.e., DreamBooth and LoRA, require fine-tuning for each style,\nleading to time-consuming and computationally expensive processes. In this\nwork, we propose StyleAdapter, a unified stylized image generation model\ncapable of producing a variety of stylized images that match both the content\nof a given prompt and the style of reference images, without the need for\nper-style fine-tuning. It introduces a two-path cross-attention (TPCA) module\nto separately process style information and textual prompt, which cooperate\nwith a semantic suppressing vision model (SSVM) to suppress the semantic\ncontent of style images. In this way, it can ensure that the prompt maintains\ncontrol over the content of the generated images, while also mitigating the\nnegative impact of semantic information in style references. This results in\nthe content of the generated image adhering to the prompt, and its style\naligning with the style references. Besides, our StyleAdapter can be integrated\nwith existing controllable synthesis methods, such as T2I-adapter and\nControlNet, to attain a more controllable and stable generation process.\nExtensive experiments demonstrate the superiority of our method over previous\nworks.\n","authors":["Zhouxia Wang","Xintao Wang","Liangbin Xie","Zhongang Qi","Ying Shan","Wenping Wang","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2309.01770v2.pdf","comment":"Accepted by IJCV24"},{"id":"http://arxiv.org/abs/2410.23213v1","updated":"2024-10-30T17:01:28Z","published":"2024-10-30T17:01:28Z","title":"ELMGS: Enhancing memory and computation scaLability through coMpression\n for 3D Gaussian Splatting","summary":" 3D models have recently been popularized by the potentiality of end-to-end\ntraining offered first by Neural Radiance Fields and most recently by 3D\nGaussian Splatting models. The latter has the big advantage of naturally\nproviding fast training convergence and high editability. However, as the\nresearch around these is still in its infancy, there is still a gap in the\nliterature regarding the model's scalability. In this work, we propose an\napproach enabling both memory and computation scalability of such models. More\nspecifically, we propose an iterative pruning strategy that removes redundant\ninformation encoded in the model. We also enhance compressibility for the model\nby including in the optimization strategy a differentiable quantization and\nentropy coding estimator. 
Our results on popular benchmarks showcase the\neffectiveness of the proposed approach and open the road to the broad\ndeployability of such a solution even on resource-constrained devices.\n","authors":["Muhammad Salman Ali","Sung-Ho Bae","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2410.23213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16493v2","updated":"2024-10-30T16:58:25Z","published":"2024-05-26T09:11:46Z","title":"Flow Snapshot Neurons in Action: Deep Neural Networks Generalize to\n Biological Motion Perception","summary":" Biological motion perception (BMP) refers to humans' ability to perceive and\nrecognize the actions of living beings solely from their motion patterns,\nsometimes as minimal as those depicted on point-light displays. While humans\nexcel at these tasks without any prior training, current AI models struggle\nwith poor generalization performance. To close this research gap, we propose\nthe Motion Perceiver (MP). MP solely relies on patch-level optical flows from\nvideo clips as inputs. During training, it learns prototypical flow snapshots\nthrough a competitive binding mechanism and integrates invariant motion\nrepresentations to predict action labels for the given video. During inference,\nwe evaluate the generalization ability of all AI models and humans on 62,656\nvideo stimuli spanning 24 BMP conditions using point-light displays in\nneuroscience. Remarkably, MP outperforms all existing AI models with a maximum\nimprovement of 29% in top-1 action recognition accuracy on these conditions.\nMoreover, we benchmark all AI models in point-light displays of two standard\nvideo datasets in computer vision. MP also demonstrates superior performance in\nthese cases. More interestingly, via psychophysics experiments, we found that\nMP recognizes biological movements in a way that aligns with human behaviors.\nOur data and code are available at\nhttps://github.com/ZhangLab-DeepNeuroCogLab/MotionPerceiver.\n","authors":["Shuangpeng Han","Ziyu Wang","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.16493v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23200v1","updated":"2024-10-30T16:49:59Z","published":"2024-10-30T16:49:59Z","title":"HEX: Hierarchical Emergence Exploitation in Self-Supervised Algorithms","summary":" In this paper, we propose an algorithm that can be used on top of a wide\nvariety of self-supervised (SSL) approaches to take advantage of hierarchical\nstructures that emerge during training. SSL approaches typically work through\nsome invariance term to ensure consistency between similar samples and a\nregularization term to prevent global dimensional collapse. Dimensional\ncollapse refers to data representations spanning a lower-dimensional subspace.\nRecent work has demonstrated that the representation space of these algorithms\ngradually reflects a semantic hierarchical structure as training progresses.\nData samples of the same hierarchical grouping tend to exhibit greater\ndimensional collapse locally compared to the dataset as a whole due to sharing\nfeatures in common with each other. Ideally, SSL algorithms would take\nadvantage of this hierarchical emergence to have an additional regularization\nterm to account for this local dimensional collapse effect. However, the\nconstruction of existing SSL algorithms does not account for this property. 
To\naddress this, we propose an adaptive algorithm that performs a weighted\ndecomposition of the denominator of the InfoNCE loss into two terms: local\nhierarchical and global collapse regularization respectively. This\ndecomposition is based on an adaptive threshold that gradually lowers to\nreflect the emerging hierarchical structure of the representation space\nthroughout training. It is based on an analysis of the cosine similarity\ndistribution of samples in a batch. We demonstrate that this hierarchical\nemergence exploitation (HEX) approach can be integrated across a wide variety\nof SSL algorithms. Empirically, we show performance improvements of up to 5.6%\nrelative improvement over baseline SSL approaches on classification accuracy on\nImagenet with 100 epochs of training.\n","authors":["Kiran Kokilepersaud","Seulgi Kim","Mohit Prabhushankar","Ghassan AlRegib"],"pdf_url":"https://arxiv.org/pdf/2410.23200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23191v1","updated":"2024-10-30T16:45:59Z","published":"2024-10-30T16:45:59Z","title":"Continuous Spatio-Temporal Memory Networks for 4D Cardiac Cine MRI\n Segmentation","summary":" Current cardiac cine magnetic resonance image (cMR) studies focus on the end\ndiastole (ED) and end systole (ES) phases, while ignoring the abundant temporal\ninformation in the whole image sequence. This is because whole sequence\nsegmentation is currently a tedious process and inaccurate. Conventional whole\nsequence segmentation approaches first estimate the motion field between\nframes, which is then used to propagate the mask along the temporal axis.\nHowever, the mask propagation results could be prone to error, especially for\nthe basal and apex slices, where through-plane motion leads to significant\nmorphology and structural change during the cardiac cycle. Inspired by recent\nadvances in video object segmentation (VOS), based on spatio-temporal memory\n(STM) networks, we propose a continuous STM (CSTM) network for semi-supervised\nwhole heart and whole sequence cMR segmentation. Our CSTM network takes full\nadvantage of the spatial, scale, temporal and through-plane continuity prior of\nthe underlying heart anatomy structures, to achieve accurate and fast 4D\nsegmentation. Results of extensive experiments across multiple cMR datasets\nshow that our method can improve the 4D cMR segmentation performance,\nespecially for the hard-to-segment regions.\n","authors":["Meng Ye","Bingyu Xin","Leon Axel","Dimitris Metaxas"],"pdf_url":"https://arxiv.org/pdf/2410.23191v1.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2410.00485v2","updated":"2024-10-30T16:43:53Z","published":"2024-10-01T08:16:40Z","title":"A Hitchhikers Guide to Fine-Grained Face Forgery Detection Using Common\n Sense Reasoning","summary":" Explainability in artificial intelligence is crucial for restoring trust,\nparticularly in areas like face forgery detection, where viewers often struggle\nto distinguish between real and fabricated content. Vision and Large Language\nModels (VLLM) bridge computer vision and natural language, offering numerous\napplications driven by strong common-sense reasoning. Despite their success in\nvarious tasks, the potential of vision and language remains underexplored in\nface forgery detection, where they hold promise for enhancing explainability by\nleveraging the intrinsic reasoning capabilities of language to analyse\nfine-grained manipulation areas. 
As such, there is a need for a methodology\nthat converts face forgery detection to a Visual Question Answering (VQA) task\nto systematically and fairly evaluate these capabilities. Previous efforts for\nunified benchmarks in deepfake detection have focused on the simpler binary\ntask, overlooking evaluation protocols for fine-grained detection and\ntext-generative models. We propose a multi-staged approach that diverges from\nthe traditional binary decision paradigm to address this gap. In the first\nstage, we assess the models' performance on the binary task and their\nsensitivity to given instructions using several prompts. In the second stage,\nwe delve deeper into fine-grained detection by identifying areas of\nmanipulation in a multiple-choice VQA setting. In the third stage, we convert\nthe fine-grained detection to an open-ended question and compare several\nmatching strategies for the multi-label classification task. Finally, we\nqualitatively evaluate the fine-grained responses of the VLLMs included in the\nbenchmark. We apply our benchmark to several popular models, providing a\ndetailed comparison of binary, multiple-choice, and open-ended VQA evaluation\nacross seven datasets.\n\\url{https://nickyfot.github.io/hitchhickersguide.github.io/}\n","authors":["Niki Maria Foteinopoulou","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2410.00485v2.pdf","comment":"Accepted at NeurIPS'2024 (D&B)"},{"id":"http://arxiv.org/abs/2406.12849v2","updated":"2024-10-30T16:37:01Z","published":"2024-06-18T17:59:31Z","title":"Depth Anywhere: Enhancing 360 Monocular Depth Estimation via Perspective\n Distillation and Unlabeled Data Augmentation","summary":" Accurately estimating depth in 360-degree imagery is crucial for virtual\nreality, autonomous navigation, and immersive media applications. Existing\ndepth estimation methods designed for perspective-view imagery fail when\napplied to 360-degree images due to different camera projections and\ndistortions, whereas 360-degree methods perform inferior due to the lack of\nlabeled data pairs. We propose a new depth estimation framework that utilizes\nunlabeled 360-degree data effectively. Our approach uses state-of-the-art\nperspective depth estimation models as teacher models to generate pseudo labels\nthrough a six-face cube projection technique, enabling efficient labeling of\ndepth in 360-degree images. This method leverages the increasing availability\nof large datasets. Our approach includes two main stages: offline mask\ngeneration for invalid regions and an online semi-supervised joint training\nregime. We tested our approach on benchmark datasets such as Matterport3D and\nStanford2D3D, showing significant improvements in depth estimation accuracy,\nparticularly in zero-shot scenarios. Our proposed training pipeline can enhance\nany 360 monocular depth estimator and demonstrates effective knowledge transfer\nacross different camera projections and data types. See our project page for\nresults: https://albert100121.github.io/Depth-Anywhere/\n","authors":["Ning-Hsu Wang","Yu-Lun Liu"],"pdf_url":"https://arxiv.org/pdf/2406.12849v2.pdf","comment":"NeurIPS 2024. 
Project page:\n https://albert100121.github.io/Depth-Anywhere/"},{"id":"http://arxiv.org/abs/2404.09326v3","updated":"2024-10-30T16:27:20Z","published":"2024-04-14T18:57:38Z","title":"Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision\n Transformers","summary":" Few-shot knowledge distillation recently emerged as a viable approach to\nharness the knowledge of large-scale pre-trained models, using limited data and\ncomputational resources. In this paper, we propose a novel few-shot feature\ndistillation approach for vision transformers. Our approach is based on two key\nsteps. Leveraging the fact that vision transformers have a consistent\ndepth-wise structure, we first copy the weights from intermittent layers of\nexisting pre-trained vision transformers (teachers) into shallower\narchitectures (students), where the intermittence factor controls the\ncomplexity of the student transformer with respect to its teacher. Next, we\nemploy an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge\ninto the student in a few-shot scenario, aiming to recover the information\nprocessing carried out by the skipped teacher layers. We present comprehensive\nexperiments with supervised and self-supervised transformers as teachers, on\nsix data sets from various domains (natural, medical and satellite images) and\ntasks (classification and segmentation). The empirical results confirm the\nsuperiority of our approach over state-of-the-art competitors. Moreover, the\nablation results demonstrate the usefulness of each component of the proposed\npipeline. We release our code at https://github.com/dianagrigore/WeCoLoRA.\n","authors":["Diana-Nicoleta Grigore","Mariana-Iuliana Georgescu","Jon Alvarez Justo","Tor Johansen","Andreea Iuliana Ionescu","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2404.09326v3.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2405.02730v3","updated":"2024-10-30T16:13:42Z","published":"2024-05-04T18:27:29Z","title":"U-DiTs: Downsample Tokens in U-Shaped Diffusion Transformers","summary":" Diffusion Transformers (DiTs) introduce the transformer architecture to\ndiffusion tasks for latent-space image generation. With an isotropic\narchitecture that chains a series of transformer blocks, DiTs demonstrate\ncompetitive performance and good scalability; but meanwhile, the abandonment of\nU-Net by DiTs and their following improvements is worth rethinking. To this\nend, we conduct a simple toy experiment by comparing a U-Net architectured DiT\nwith an isotropic one. It turns out that the U-Net architecture only gain a\nslight advantage amid the U-Net inductive bias, indicating potential\nredundancies within the U-Net-style DiT. Inspired by the discovery that U-Net\nbackbone features are low-frequency-dominated, we perform token downsampling on\nthe query-key-value tuple for self-attention that bring further improvements\ndespite a considerable amount of reduction in computation. Based on\nself-attention with downsampled tokens, we propose a series of U-shaped DiTs\n(U-DiTs) in the paper and conduct extensive experiments to demonstrate the\nextraordinary performance of U-DiT models. The proposed U-DiT could outperform\nDiT-XL/2 with only 1/6 of its computation cost. 
Codes are available at\nhttps://github.com/YuchuanTian/U-DiT.\n","authors":["Yuchuan Tian","Zhijun Tu","Hanting Chen","Jie Hu","Chao Xu","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2405.02730v3.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.23159v1","updated":"2024-10-30T16:12:56Z","published":"2024-10-30T16:12:56Z","title":"Fourier Amplitude and Correlation Loss: Beyond Using L2 Loss for\n Skillful Precipitation Nowcasting","summary":" Deep learning approaches have been widely adopted for precipitation\nnowcasting in recent years. Previous studies mainly focus on proposing new\nmodel architectures to improve pixel-wise metrics. However, they frequently\nresult in blurry predictions which provide limited utility to forecasting\noperations. In this work, we propose a new Fourier Amplitude and Correlation\nLoss (FACL) which consists of two novel loss terms: Fourier Amplitude Loss\n(FAL) and Fourier Correlation Loss (FCL). FAL regularizes the Fourier amplitude\nof the model prediction and FCL complements the missing phase information. The\ntwo loss terms work together to replace the traditional $L_2$ losses such as\nMSE and weighted MSE for the spatiotemporal prediction problem on signal-based\ndata. Our method is generic, parameter-free and efficient. Extensive\nexperiments using one synthetic dataset and three radar echo datasets\ndemonstrate that our method improves perceptual metrics and meteorology skill\nscores, with a small trade-off to pixel-wise accuracy and structural\nsimilarity. Moreover, to improve the error margin in meteorological skill\nscores such as Critical Success Index (CSI) and Fractions Skill Score (FSS), we\npropose and adopt the Regional Histogram Divergence (RHD), a distance metric\nthat considers the patch-wise similarity between signal-based imagery patterns\nwith tolerance to local transforms. Code is available at\nhttps://github.com/argenycw/FACL\n","authors":["Chiu-Wai Yan","Shi Quan Foo","Van Hoan Trinh","Dit-Yan Yeung","Ka-Hing Wong","Wai-Kin Wong"],"pdf_url":"https://arxiv.org/pdf/2410.23159v1.pdf","comment":"Accepted by NeurIPS 2024. Camera-ready submission"},{"id":"http://arxiv.org/abs/2410.23156v1","updated":"2024-10-30T16:11:05Z","published":"2024-10-30T16:11:05Z","title":"VisualPredicator: Learning Abstract World Models with Neuro-Symbolic\n Predicates for Robot Planning","summary":" Broadly intelligent agents should form task-specific abstractions that\nselectively expose the essential elements of a task, while abstracting away the\ncomplexity of the raw sensorimotor space. In this work, we present\nNeuro-Symbolic Predicates, a first-order abstraction language that combines the\nstrengths of symbolic and neural knowledge representations. We outline an\nonline algorithm for inventing such predicates and learning abstract world\nmodels. We compare our approach to hierarchical reinforcement learning,\nvision-language model planning, and symbolic predicate invention approaches, on\nboth in- and out-of-distribution tasks across five simulated robotic domains.\nResults show that our approach offers better sample complexity, stronger\nout-of-distribution generalization, and improved interpretability.\n","authors":["Yichao Liang","Nishanth Kumar","Hao Tang","Adrian Weller","Joshua B. Tenenbaum","Tom Silver","João F. 
Henriques","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2410.23156v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2410.18975v2","updated":"2024-10-30T16:10:33Z","published":"2024-10-24T17:59:31Z","title":"Unbounded: A Generative Infinite Game of Character Life Simulation","summary":" We introduce the concept of a generative infinite game, a video game that\ntranscends the traditional boundaries of finite, hard-coded systems by using\ngenerative models. Inspired by James P. Carse's distinction between finite and\ninfinite games, we leverage recent advances in generative AI to create\nUnbounded: a game of character life simulation that is fully encapsulated in\ngenerative models. Specifically, Unbounded draws inspiration from sandbox life\nsimulations and allows you to interact with your autonomous virtual character\nin a virtual world by feeding, playing with and guiding it - with open-ended\nmechanics generated by an LLM, some of which can be emergent. In order to\ndevelop Unbounded, we propose technical innovations in both the LLM and visual\ngeneration domains. Specifically, we present: (1) a specialized, distilled\nlarge language model (LLM) that dynamically generates game mechanics,\nnarratives, and character interactions in real-time, and (2) a new dynamic\nregional image prompt Adapter (IP-Adapter) for vision models that ensures\nconsistent yet flexible visual generation of a character across multiple\nenvironments. We evaluate our system through both qualitative and quantitative\nanalysis, showing significant improvements in character life simulation, user\ninstruction following, narrative coherence, and visual consistency for both\ncharacters and the environments compared to traditional related approaches.\n","authors":["Jialu Li","Yuanzhen Li","Neal Wadhwa","Yael Pritch","David E. Jacobs","Michael Rubinstein","Mohit Bansal","Nataniel Ruiz"],"pdf_url":"https://arxiv.org/pdf/2410.18975v2.pdf","comment":"Project page: https://generative-infinite-game.github.io/"},{"id":"http://arxiv.org/abs/2410.23154v1","updated":"2024-10-30T16:08:43Z","published":"2024-10-30T16:08:43Z","title":"Nested ResNet: A Vision-Based Method for Detecting the Sensing Area of a\n Drop-in Gamma Probe","summary":" Purpose: Drop-in gamma probes are widely used in robotic-assisted minimally\ninvasive surgery (RAMIS) for lymph node detection. However, these devices only\nprovide audio feedback on signal intensity, lacking the visual feedback\nnecessary for precise localisation. Previous work attempted to predict the\nsensing area location using laparoscopic images, but the prediction accuracy\nwas unsatisfactory. Improvements are needed in the deep learning-based\nregression approach.\n Methods: We introduce a three-branch deep learning framework to predict the\nsensing area of the probe. Specifically, we utilise the stereo laparoscopic\nimages as input for the main branch and develop a Nested ResNet architecture.\nThe framework also incorporates depth estimation via transfer learning and\norientation guidance through probe axis sampling. The combined features from\neach branch enhanced the accuracy of the prediction.\n Results: Our approach has been evaluated on a publicly available dataset,\ndemonstrating superior performance over previous methods. In particular, our\nmethod resulted in a 22.10\\% decrease in 2D mean error and a 41.67\\% reduction\nin 3D mean error. 
Additionally, qualitative comparisons further demonstrated\nthe improved precision of our approach.\n Conclusion: With extensive evaluation, our solution significantly enhances\nthe accuracy and reliability of sensing area predictions. This advancement\nenables visual feedback during the use of the drop-in gamma probe in surgery,\nproviding surgeons with more accurate and reliable localisation.}\n","authors":["Songyu Xu","Yicheng Hu","Jionglong Su","Daniel Elson","Baoru Huang"],"pdf_url":"https://arxiv.org/pdf/2410.23154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23142v1","updated":"2024-10-30T15:58:03Z","published":"2024-10-30T15:58:03Z","title":"FAIR-TAT: Improving Model Fairness Using Targeted Adversarial Training","summary":" Deep neural networks are susceptible to adversarial attacks and common\ncorruptions, which undermine their robustness. In order to enhance model\nresilience against such challenges, Adversarial Training (AT) has emerged as a\nprominent solution. Nevertheless, adversarial robustness is often attained at\nthe expense of model fairness during AT, i.e., disparity in class-wise\nrobustness of the model. While distinctive classes become more robust towards\nsuch adversaries, hard to detect classes suffer. Recently, research has focused\non improving model fairness specifically for perturbed images, overlooking the\naccuracy of the most likely non-perturbed data. Additionally, despite their\nrobustness against the adversaries encountered during model training,\nstate-of-the-art adversarial trained models have difficulty maintaining\nrobustness and fairness when confronted with diverse adversarial threats or\ncommon corruptions. In this work, we address the above concerns by introducing\na novel approach called Fair Targeted Adversarial Training (FAIR-TAT). We show\nthat using targeted adversarial attacks for adversarial training (instead of\nuntargeted attacks) can allow for more favorable trade-offs with respect to\nadversarial fairness. Empirical results validate the efficacy of our approach.\n","authors":["Tejaswini Medi","Steffen Jung","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2410.23142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23132v1","updated":"2024-10-30T15:42:59Z","published":"2024-10-30T15:42:59Z","title":"Revisiting MAE pre-training for 3D medical image segmentation","summary":" Self-Supervised Learning (SSL) presents an exciting opportunity to unlock the\npotential of vast, untapped clinical datasets, for various downstream\napplications that suffer from the scarcity of labeled data. While SSL has\nrevolutionized fields like natural language processing and computer vision,\ntheir adoption in 3D medical image computing has been limited by three key\npitfalls: Small pre-training dataset sizes, architectures inadequate for 3D\nmedical image analysis, and insufficient evaluation practices. We address these\nissues by i) leveraging a large-scale dataset of 44k 3D brain MRI volumes and\nii) using a Residual Encoder U-Net architecture within the state-of-the-art\nnnU-Net framework. iii) A robust development framework, incorporating 5\ndevelopment and 8 testing brain MRI segmentation datasets, allowed\nperformance-driven design decisions to optimize the simple concept of Masked\nAuto Encoders (MAEs) for 3D CNNs. The resulting model not only surpasses\nprevious SSL methods but also outperforms the strong nnU-Net baseline by an\naverage of approximately 3 Dice points. 
Furthermore, our model demonstrates\nexceptional stability, achieving the highest average rank of 2 out of 7\nmethods, compared to the second-best method's mean rank of 3.\n","authors":["Tassilo Wald","Constantin Ulrich","Stanislav Lukyanenko","Andrei Goncharov","Alberto Paderno","Leander Maerkisch","Paul F. Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2410.23132v1.pdf","comment":"Arxiv Preprint. Currently under Review"},{"id":"http://arxiv.org/abs/2410.23130v1","updated":"2024-10-30T15:41:35Z","published":"2024-10-30T15:41:35Z","title":"Compositional Segmentation of Cardiac Images Leveraging Metadata","summary":" Cardiac image segmentation is essential for automated cardiac function\nassessment and monitoring of changes in cardiac structures over time. Inspired\nby coarse-to-fine approaches in image analysis, we propose a novel multitask\ncompositional segmentation approach that can simultaneously localize the heart\nin a cardiac image and perform part-based segmentation of different regions of\ninterest. We demonstrate that this compositional approach achieves better\nresults than direct segmentation of the anatomies. Further, we propose a novel\nCross-Modal Feature Integration (CMFI) module to leverage the metadata related\nto cardiac imaging collected during image acquisition. We perform experiments\non two different modalities, MRI and ultrasound, using public datasets,\nMulti-disease, Multi-View, and Multi-Centre (M&Ms-2) and Multi-structure\nUltrasound Segmentation (CAMUS) data, to showcase the efficiency of the\nproposed compositional segmentation method and Cross-Modal Feature Integration\nmodule incorporating metadata within the proposed compositional segmentation\nnetwork. The source code is available:\nhttps://github.com/kabbas570/CompSeg-MetaData.\n","authors":["Abbas Khan","Muhammad Asad","Martin Benning","Caroline Roney","Gregory Slabaugh"],"pdf_url":"https://arxiv.org/pdf/2410.23130v1.pdf","comment":"IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)\n 2025"},{"id":"http://arxiv.org/abs/2410.23129v1","updated":"2024-10-30T15:41:30Z","published":"2024-10-30T15:41:30Z","title":"Why Fine-grained Labels in Pretraining Benefit Generalization?","summary":" Recent studies show that pretraining a deep neural network with fine-grained\nlabeled data, followed by fine-tuning on coarse-labeled data for downstream\ntasks, often yields better generalization than pretraining with coarse-labeled\ndata. While there is ample empirical evidence supporting this, the theoretical\njustification remains an open problem. This paper addresses this gap by\nintroducing a \"hierarchical multi-view\" structure to confine the input data\ndistribution. Under this framework, we prove that: 1) coarse-grained\npretraining only allows a neural network to learn the common features well,\nwhile 2) fine-grained pretraining helps the network learn the rare features in\naddition to the common ones, leading to improved accuracy on hard downstream\ntest samples.\n","authors":["Guan Zhe Hong","Yin Cui","Ariel Fuxman","Stanely Chan","Enming Luo"],"pdf_url":"https://arxiv.org/pdf/2410.23129v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.16887"},{"id":"http://arxiv.org/abs/2406.08773v2","updated":"2024-10-30T15:40:28Z","published":"2024-06-13T03:05:36Z","title":"DenoiseRep: Denoising Model for Representation Learning","summary":" The denoising model has been proven a powerful generative model but has\nlittle exploration of discriminative tasks. 
Representation learning is\nimportant in discriminative tasks, which is defined as \"learning\nrepresentations (or features) of the data that make it easier to extract useful\ninformation when building classifiers or other predictors\". In this paper, we\npropose a novel Denoising Model for Representation Learning (DenoiseRep) to\nimprove feature discrimination with joint feature extraction and denoising.\nDenoiseRep views each embedding layer in a backbone as a denoising layer,\nprocessing the cascaded embedding layers as if we are recursively denoise\nfeatures step-by-step. This unifies the frameworks of feature extraction and\ndenoising, where the former progressively embeds features from low-level to\nhigh-level, and the latter recursively denoises features step-by-step. After\nthat, DenoiseRep fuses the parameters of feature extraction and denoising\nlayers, and theoretically demonstrates its equivalence before and after the\nfusion, thus making feature denoising computation-free. DenoiseRep is a\nlabel-free algorithm that incrementally improves features but also\ncomplementary to the label if available. Experimental results on various\ndiscriminative vision tasks, including re-identification (Market-1501,\nDukeMTMC-reID, MSMT17, CUHK-03, vehicleID), image classification (ImageNet,\nUB200, Oxford-Pet, Flowers), object detection (COCO), image segmentation\n(ADE20K) show stability and impressive improvements. We also validate its\neffectiveness on the CNN (ResNet) and Transformer (ViT, Swin, Vmamda)\narchitectures.\n","authors":["Zhengrui Xu","Guan'an Wang","Xiaowen Huang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2406.08773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06353v3","updated":"2024-10-30T15:40:11Z","published":"2024-02-09T12:01:22Z","title":"Copycats: the many lives of a publicly available medical imaging dataset","summary":" Medical Imaging (MI) datasets are fundamental to artificial intelligence in\nhealthcare. The accuracy, robustness, and fairness of diagnostic algorithms\ndepend on the data (and its quality) used to train and evaluate the models. MI\ndatasets used to be proprietary, but have become increasingly available to the\npublic, including on community-contributed platforms (CCPs) like Kaggle or\nHuggingFace. While open data is important to enhance the redistribution of\ndata's public value, we find that the current CCP governance model fails to\nuphold the quality needed and recommended practices for sharing, documenting,\nand evaluating datasets. In this paper, we conduct an analysis of publicly\navailable machine learning datasets on CCPs, discussing datasets' context, and\nidentifying limitations and gaps in the current CCP landscape. We highlight\ndifferences between MI and computer vision datasets, particularly in the\npotentially harmful downstream effects from poor adoption of recommended\ndataset management practices. We compare the analyzed datasets across several\ndimensions, including data sharing, data documentation, and maintenance. We\nfind vague licenses, lack of persistent identifiers and storage, duplicates,\nand missing metadata, with differences between the platforms. 
Our research\ncontributes to efforts in responsible data curation and AI algorithms for\nhealthcare.\n","authors":["Amelia Jiménez-Sánchez","Natalia-Rozalia Avlona","Dovile Juodelyte","Théo Sourget","Caroline Vang-Larsen","Anna Rogers","Hubert Dariusz Zając","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2402.06353v3.pdf","comment":"NeurIPS 2024 Track on Datasets and Benchmarks. Please note that v1\n has a different title"},{"id":"http://arxiv.org/abs/2410.23114v1","updated":"2024-10-30T15:25:06Z","published":"2024-10-30T15:25:06Z","title":"Unified Triplet-Level Hallucination Evaluation for Large Vision-Language\n Models","summary":" Despite the outstanding performance in vision-language reasoning, Large\nVision-Language Models (LVLMs) might generate hallucinated contents that do not\nexist in the given image. Most existing LVLM hallucination benchmarks are\nconstrained to evaluate the object-related hallucinations. However, the\npotential hallucination on the relations between two objects, i.e., relation\nhallucination, still lacks investigation. To remedy that, in this paper we\ndesign a unified framework to measure object and relation hallucination in\nLVLMs simultaneously. The core idea of our framework is to conduct\nhallucination evaluation on (object, relation, object) triplets extracted from\nLVLMs' responses, and thus, could be easily generalized to different\nvision-language tasks. Based on our framework, we further introduce Tri-HE, a\nnovel Triplet-level Hallucination Evaluation benchmark which can be used to\nstudy both object and relation hallucination at the same time. We conduct\ncomprehensive evaluations on Tri-HE and observe that the relation hallucination\nissue is even more serious than object hallucination among existing LVLMs,\nhighlighting a previously neglected problem towards reliable LVLMs. Moreover,\nbased on our findings, we design a simple yet effective training-free approach\nto mitigate hallucinations for LVLMs, with which, we exceed all open-sourced\ncounterparts on Tri-HE, achieving comparable performance with the powerful\nGPT-4V. Our dataset and code for the reproduction of our experiments are\navailable publicly at https://github.com/wujunjie1998/Tri-HE.\n","authors":["Junjie Wu","Tsz Ting Chung","Kai Chen","Dit-Yan Yeung"],"pdf_url":"https://arxiv.org/pdf/2410.23114v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.23109v1","updated":"2024-10-30T15:20:10Z","published":"2024-10-30T15:20:10Z","title":"NASM: Neural Anisotropic Surface Meshing","summary":" This paper introduces a new learning-based method, NASM, for anisotropic\nsurface meshing. Our key idea is to propose a graph neural network to embed an\ninput mesh into a high-dimensional (high-d) Euclidean embedding space to\npreserve curvature-based anisotropic metric by using a dot product loss between\nhigh-d edge vectors. This can dramatically reduce the computational time and\nincrease the scalability. Then, we propose a novel feature-sensitive remeshing\non the generated high-d embedding to automatically capture sharp geometric\nfeatures. We define a high-d normal metric, and then derive an automatic\ndifferentiation on a high-d centroidal Voronoi tessellation (CVT) optimization\nwith the normal metric to simultaneously preserve geometric features and\ncurvature anisotropy that exhibit in the original 3D shapes. 
To our knowledge,\nthis is the first time that a deep learning framework and a large dataset are\nproposed to construct a high-d Euclidean embedding space for 3D anisotropic\nsurface meshing. Experimental results are evaluated and compared with the\nstate-of-the-art in anisotropic surface meshing on a large number of surface\nmodels from Thingi10K dataset as well as tested on extensive unseen 3D shapes\nfrom Multi-Garment Network dataset and FAUST human dataset.\n","authors":["Hongbo Li","Haikuan Zhu","Sikai Zhong","Ningna Wang","Cheng Lin","Xiaohu Guo","Shiqing Xin","Wenping Wang","Jing Hua","Zichun Zhong"],"pdf_url":"https://arxiv.org/pdf/2410.23109v1.pdf","comment":"SIGGRAPH Asia 2024 (Conference Track)"},{"id":"http://arxiv.org/abs/2410.23107v1","updated":"2024-10-30T15:17:58Z","published":"2024-10-30T15:17:58Z","title":"Decoupling Semantic Similarity from Spatial Alignment for Neural\n Networks","summary":" What representation do deep neural networks learn? How similar are images to\neach other for neural networks? Despite the overwhelming success of deep\nlearning methods key questions about their internal workings still remain\nlargely unanswered, due to their internal high dimensionality and complexity.\nTo address this, one approach is to measure the similarity of activation\nresponses to various inputs. Representational Similarity Matrices (RSMs)\ndistill this similarity into scalar values for each input pair. These matrices\nencapsulate the entire similarity structure of a system, indicating which input\nleads to similar responses. While the similarity between images is ambiguous,\nwe argue that the spatial location of semantic objects does neither influence\nhuman perception nor deep learning classifiers. Thus this should be reflected\nin the definition of similarity between image responses for computer vision\nsystems. Revisiting the established similarity calculations for RSMs we expose\ntheir sensitivity to spatial alignment. In this paper, we propose to solve this\nthrough semantic RSMs, which are invariant to spatial permutation. We measure\nsemantic similarity between input responses by formulating it as a set-matching\nproblem. Further, we quantify the superiority of semantic RSMs over\nspatio-semantic RSMs through image retrieval and by comparing the similarity\nbetween representations to the similarity between predicted class\nprobabilities.\n","authors":["Tassilo Wald","Constantin Ulrich","Gregor Köhler","David Zimmerer","Stefan Denner","Michael Baumgartner","Fabian Isensee","Priyank Jaini","Klaus H. Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2410.23107v1.pdf","comment":"Accepted at NeurIPS2024"},{"id":"http://arxiv.org/abs/2402.18503v2","updated":"2024-10-30T15:16:52Z","published":"2024-02-28T17:31:39Z","title":"Detection of Micromobility Vehicles in Urban Traffic Videos","summary":" Urban traffic environments present unique challenges for object detection,\nparticularly with the increasing presence of micromobility vehicles like\ne-scooters and bikes. To address this object detection problem, this work\nintroduces an adapted detection model that combines the accuracy and speed of\nsingle-frame object detection with the richer features offered by video object\ndetection frameworks. 
This is done by applying aggregated feature maps from\nconsecutive frames processed through motion flow to the YOLOX architecture.\nThis fusion brings a temporal perspective to YOLOX detection abilities,\nallowing for a better understanding of urban mobility patterns and\nsubstantially improving detection reliability. Tested on a custom dataset\ncurated for urban micromobility scenarios, our model showcases substantial\nimprovement over existing state-of-the-art methods, demonstrating the need to\nconsider spatio-temporal information for detecting such small and thin objects.\nOur approach enhances detection in challenging conditions, including\nocclusions, ensuring temporal consistency, and effectively mitigating motion\nblur.\n","authors":["Khalil Sabri","Célia Djilali","Guillaume-Alexandre Bilodeau","Nicolas Saunier","Wassim Bouachir"],"pdf_url":"https://arxiv.org/pdf/2402.18503v2.pdf","comment":"Accepted at the 21st Conference on Robots and Vision (CRV), 2024"},{"id":"http://arxiv.org/abs/2409.10582v3","updated":"2024-10-30T15:16:43Z","published":"2024-09-16T04:16:52Z","title":"WaveMixSR-V2: Enhancing Super-resolution with Higher Efficiency","summary":" Recent advancements in single image super-resolution have been predominantly\ndriven by token mixers and transformer architectures. WaveMixSR utilized the\nWaveMix architecture, employing a two-dimensional discrete wavelet transform\nfor spatial token mixing, achieving superior performance in super-resolution\ntasks with remarkable resource efficiency. In this work, we present an enhanced\nversion of the WaveMixSR architecture by (1) replacing the traditional\ntranspose convolution layer with a pixel shuffle operation and (2) implementing\na multistage design for higher resolution tasks ($4\\times$). Our experiments\ndemonstrate that our enhanced model -- WaveMixSR-V2 -- outperforms other\narchitectures in multiple super-resolution tasks, achieving state-of-the-art\nfor the BSD100 dataset, while also consuming fewer resources, exhibits higher\nparameter efficiency, lower latency and higher throughput. Our code is\navailable at https://github.com/pranavphoenix/WaveMixSR.\n","authors":["Pranav Jeevan","Neeraj Nixon","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2409.10582v3.pdf","comment":"10 pages. Accepted in AAAI 2025. arXiv admin note: text overlap with\n arXiv:2307.00430"},{"id":"http://arxiv.org/abs/2410.23105v1","updated":"2024-10-30T15:15:41Z","published":"2024-10-30T15:15:41Z","title":"Automated Image-Based Identification and Consistent Classification of\n Fire Patterns with Quantitative Shape Analysis and Spatial Location\n Identification","summary":" Fire patterns, consisting of fire effects that offer insights into fire\nbehavior and origin, are traditionally classified based on investigators'\nvisual observations, leading to subjective interpretations. This study proposes\na framework for quantitative fire pattern classification to support fire\ninvestigators, aiming for consistency and accuracy. The framework integrates\nfour components. First, it leverages human-computer interaction to extract fire\npatterns from surfaces, combining investigator expertise with computational\nanalysis. Second, it employs an aspect ratio-based random forest model to\nclassify fire pattern shapes. Third, fire scene point cloud segmentation\nenables precise identification of fire-affected areas and the mapping of 2D\nfire patterns to 3D scenes. 
Lastly, spatial relationships between fire patterns\nand indoor elements support an interpretation of the fire scene. These\ncomponents provide a method for fire pattern analysis that synthesizes\nqualitative and quantitative data. The framework's classification results\nachieve 93% precision on synthetic data and 83% on real fire patterns.\n","authors":["Pengkun Liu","Shuna Ni","Stanislav I. Stoliarov","Pingbo Tang"],"pdf_url":"https://arxiv.org/pdf/2410.23105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07355v5","updated":"2024-10-30T15:08:37Z","published":"2023-10-11T10:12:43Z","title":"IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training","summary":" In the field of medical Vision-Language Pre-training (VLP), significant\nefforts have been devoted to deriving text and image features from both\nclinical reports and associated medical images. However, most existing methods\nmay have overlooked the opportunity in leveraging the inherent hierarchical\nstructure of clinical reports, which are generally split into `findings' for\ndescriptive content and `impressions' for conclusive observation. Instead of\nutilizing this rich, structured format, current medical VLP approaches often\nsimplify the report into either a unified entity or fragmented tokens. In this\nwork, we propose a novel clinical prior guided VLP framework named IMITATE to\nlearn the structure information from medical reports with hierarchical\nvision-language alignment. The framework derives multi-level visual features\nfrom the chest X-ray (CXR) images and separately aligns these features with the\ndescriptive and the conclusive text encoded in the hierarchical medical report.\nFurthermore, a new clinical-informed contrastive loss is introduced for\ncross-modal learning, which accounts for clinical prior knowledge in\nformulating sample correlations in contrastive learning. The proposed model,\nIMITATE, outperforms baseline VLP methods across six different datasets,\nspanning five medical imaging downstream tasks. Comprehensive experimental\nresults highlight the advantages of integrating the hierarchical structure of\nmedical reports for vision-language alignment. The code related to this paper\nis available at https://github.com/cheliu-computation/IMITATE-TMI2024.\n","authors":["Che Liu","Sibo Cheng","Miaojing Shi","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2310.07355v5.pdf","comment":"Accepted by TMI2024"},{"id":"http://arxiv.org/abs/2410.23092v1","updated":"2024-10-30T15:06:58Z","published":"2024-10-30T15:06:58Z","title":"First Place Solution to the ECCV 2024 ROAD++ Challenge @ ROAD++ Atomic\n Activity Recognition 2024","summary":" This report presents our team's technical solution for participating in Track\n3 of the 2024 ECCV ROAD++ Challenge. The task of Track 3 is atomic activity\nrecognition, which aims to identify 64 types of atomic activities in road\nscenes based on video content. Our approach primarily addresses the challenges\nof small objects, discriminating between single object and a group of objects,\nas well as model overfitting in this task. Firstly, we construct a multi-branch\nactivity recognition framework that not only separates different object\ncategories but also the tasks of single object and object group recognition,\nthereby enhancing recognition accuracy. 
Subsequently, we develop various model\nensembling strategies, including integrations of multiple frame sampling\nsequences, different frame sampling sequence lengths, multiple training epochs,\nand different backbone networks. Furthermore, we propose an atomic activity\nrecognition data augmentation method, which greatly expands the sample space by\nflipping video frames and road topology, effectively mitigating model\noverfitting. Our methods rank first in the test set of Track 3 for the ROAD++\nChallenge 2024, and achieve 69% mAP.\n","authors":["Ruyang Li","Tengfei Zhang","Heng Zhang","Tiejun Liu","Yanwei Wang","Xuelei Li"],"pdf_url":"https://arxiv.org/pdf/2410.23092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23091v1","updated":"2024-10-30T15:06:44Z","published":"2024-10-30T15:06:44Z","title":"CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for\n Adversarial Defense","summary":" Despite ongoing efforts to defend neural classifiers from adversarial\nattacks, they remain vulnerable, especially to unseen attacks. In contrast,\nhumans are difficult to be cheated by subtle manipulations, since we make\njudgments only based on essential factors. Inspired by this observation, we\nattempt to model label generation with essential label-causative factors and\nincorporate label-non-causative factors to assist data generation. For an\nadversarial example, we aim to discriminate the perturbations as non-causative\nfactors and make predictions only based on the label-causative factors.\nConcretely, we propose a casual diffusion model (CausalDiff) that adapts\ndiffusion models for conditional data generation and disentangles the two types\nof casual factors by learning towards a novel casual information bottleneck\nobjective. Empirically, CausalDiff has significantly outperformed\nstate-of-the-art defense methods on various unseen attacks, achieving an\naverage robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on\nCIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition\nBenchmark).\n","authors":["Mingkun Zhang","Keping Bi","Wei Chen","Quanrun Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.23091v1.pdf","comment":"accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23089v1","updated":"2024-10-30T15:05:17Z","published":"2024-10-30T15:05:17Z","title":"PIP-MM: Pre-Integrating Prompt Information into Visual Encoding via\n Existing MLLM Structures","summary":" The Multimodal Large Language Models (MLLMs) have activated the\ncapabilitiesof Large Language Models (LLMs) in solving visual-language tasks by\nintegratingvisual information. The prevailing approach in existing MLLMs\ninvolvesemploying an image encoder to extract visual features, converting\nthesefeatures into visual tokens via an adapter, and then integrating them with\ntheprompt into the LLM. However, because the process of image encoding\nisprompt-agnostic, the extracted visual features only provide a\ncoarsedescription of the image, impossible to focus on the requirements of\ntheprompt. On one hand, it is easy for image features to lack information\naboutthe prompt-specified objects, resulting in unsatisfactory responses. On\ntheother hand, the visual features contain a large amount of\nirrelevantinformation, which not only increases the burden on memory but also\nworsens thegeneration effectiveness. 
To address the aforementioned issues, we\npropose\\textbf{PIP-MM}, a framework that\n\\textbf{P}re-\\textbf{I}ntegrates\\textbf{P}rompt information into the visual\nencoding process using existingmodules of MLLMs. Specifically, We utilize the\nfrozen LLM in the MLLM tovectorize the input prompt, which summarizes the\nrequirements of the prompt.Then, we input the prompt vector into our trained\nMulti-Layer Perceptron (MLP)to align with the visual input requirements, and\nsubsequently replace the classembedding in the image encoder. Since our model\nonly requires adding atrainable MLP, it can be applied to any MLLM. To validate\nthe effectiveness ofPIP-MM, we conducted experiments on multiple benchmarks.\nAutomated evaluationmetrics and manual assessments demonstrate the strong\nperformance of PIP-MM.Particularly noteworthy is that our model maintains\nexcellent generationresults even when half of the visual tokens are reduced.\n","authors":["Tianxiang Wu","Minxin Nie","Ziqiang Cao"],"pdf_url":"https://arxiv.org/pdf/2410.23089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19458v2","updated":"2024-10-30T15:02:54Z","published":"2024-05-29T19:12:08Z","title":"MemControl: Mitigating Memorization in Diffusion Models via Automated\n Parameter Selection","summary":" Diffusion models excel in generating images that closely resemble their\ntraining data but are also susceptible to data memorization, raising privacy,\nethical, and legal concerns, particularly in sensitive domains such as medical\nimaging. We hypothesize that this memorization stems from the\noverparameterization of deep models and propose that regularizing model\ncapacity during fine-tuning can mitigate this issue. Firstly, we empirically\nshow that regulating the model capacity via Parameter-efficient fine-tuning\n(PEFT) mitigates memorization to some extent, however, it further requires the\nidentification of the exact parameter subsets to be fine-tuned for high-quality\ngeneration. To identify these subsets, we introduce a bi-level optimization\nframework, MemControl, that automates parameter selection using memorization\nand generation quality metrics as rewards during fine-tuning. The parameter\nsubsets discovered through MemControl achieve a superior tradeoff between\ngeneration quality and memorization. For the task of medical image generation,\nour approach outperforms existing state-of-the-art memorization mitigation\nstrategies by fine-tuning as few as 0.019% of model parameters. Moreover, we\ndemonstrate that the discovered parameter subsets are transferable to\nnon-medical domains. Our framework is scalable to large datasets, agnostic to\nreward functions, and can be integrated with existing approaches for further\nmemorization mitigation. To the best of our knowledge, this is the first study\nto empirically evaluate memorization in medical images and propose a targeted\nyet universal mitigation strategy. The code is available at\nhttps://github.com/Raman1121/Diffusion_Memorization_HPO\n","authors":["Raman Dutt","Ondrej Bohdal","Pedro Sanchez","Sotirios A. 
Tsaftaris","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2405.19458v2.pdf","comment":"Accepted at WACV'25 (Applications Track)"},{"id":"http://arxiv.org/abs/2410.23085v1","updated":"2024-10-30T15:00:06Z","published":"2024-10-30T15:00:06Z","title":"S3PT: Scene Semantics and Structure Guided Clustering to Boost\n Self-Supervised Pre-Training for Autonomous Driving","summary":" Recent self-supervised clustering-based pre-training techniques like DINO and\nCribo have shown impressive results for downstream detection and segmentation\ntasks. However, real-world applications such as autonomous driving face\nchallenges with imbalanced object class and size distributions and complex\nscene geometries. In this paper, we propose S3PT a novel scene semantics and\nstructure guided clustering to provide more scene-consistent objectives for\nself-supervised training. Specifically, our contributions are threefold: First,\nwe incorporate semantic distribution consistent clustering to encourage better\nrepresentation of rare classes such as motorcycles or animals. Second, we\nintroduce object diversity consistent spatial clustering, to handle imbalanced\nand diverse object sizes, ranging from large background areas to small objects\nsuch as pedestrians and traffic signs. Third, we propose a depth-guided spatial\nclustering to regularize learning based on geometric information of the scene,\nthus further refining region separation on the feature level. Our learned\nrepresentations significantly improve performance in downstream semantic\nsegmentation and 3D object detection tasks on the nuScenes, nuImages, and\nCityscapes datasets and show promising domain translation properties.\n","authors":["Maciej K. Wozniak","Hariprasath Govindarajan","Marvin Klingner","Camille Maurice","Ravi Kiran","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2410.23085v1.pdf","comment":"Accepted for WACV 2025"},{"id":"http://arxiv.org/abs/2410.23084v1","updated":"2024-10-30T14:59:57Z","published":"2024-10-30T14:59:57Z","title":"AI-assisted prostate cancer detection and localisation on biparametric\n MR by classifying radiologist-positives","summary":" Prostate cancer diagnosis through MR imaging have currently relied on\nradiologists' interpretation, whilst modern AI-based methods have been\ndeveloped to detect clinically significant cancers independent of radiologists.\nIn this study, we propose to develop deep learning models that improve the\noverall cancer diagnostic accuracy, by classifying radiologist-identified\npatients or lesions (i.e. radiologist-positives), as opposed to the existing\nmodels that are trained to discriminate over all patients. We develop a single\nvoxel-level classification model, with a simple percentage threshold to\ndetermine positive cases, at levels of lesions, Barzell-zones and patients.\nBased on the presented experiments from two clinical data sets, consisting of\nhistopathology-labelled MR images from more than 800 and 500 patients in the\nrespective UCLA and UCL PROMIS studies, we show that the proposed strategy can\nimprove the diagnostic accuracy, by augmenting the radiologist reading of the\nMR imaging. Among varying definition of clinical significance, the proposed\nstrategy, for example, achieved a specificity of 44.1% (with AI assistance)\nfrom 36.3% (by radiologists alone), at a controlled sensitivity of 80.0% on the\npublicly available UCLA data set. 
This provides measurable clinical values in a\nrange of applications such as reducing unnecessary biopsies, lowering cost in\ncancer screening and quantifying risk in therapies.\n","authors":["Xiangcen Wu","Yipei Wang","Qianye Yang","Natasha Thorley","Shonit Punwani","Veeru Kasivisvanathan","Ester Bonmati","Yipeng Hu"],"pdf_url":"https://arxiv.org/pdf/2410.23084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02506v2","updated":"2024-10-30T14:55:58Z","published":"2024-03-04T21:52:25Z","title":"Differentially Private Representation Learning via Image Captioning","summary":" Differentially private (DP) machine learning is considered the gold-standard\nsolution for training a model from sensitive data while still preserving\nprivacy. However, a major barrier to achieving this ideal is its sub-optimal\nprivacy-accuracy trade-off, which is particularly visible in DP representation\nlearning. Specifically, it has been shown that under modest privacy budgets,\nmost models learn representations that are not significantly better than\nhand-crafted features. In this work, we show that effective DP representation\nlearning can be done via image captioning and scaling up to internet-scale\nmultimodal datasets. Through a series of engineering tricks, we successfully\ntrain a DP image captioner (DP-Cap) on a 233M subset of LAION-2B from scratch\nusing a reasonable amount of computation, and obtaining unprecedented\nhigh-quality image features that can be used in a variety of downstream vision\nand vision-language tasks. For example, under a privacy budget of\n$\\varepsilon=8$ for the LAION dataset, a linear classifier trained on top of\nlearned DP-Cap features attains $65.8\\%$ accuracy on ImageNet-1K, considerably\nimproving the previous SOTA of $56.5\\%$.\n","authors":["Tom Sander","Yaodong Yu","Maziar Sanjabi","Alain Durmus","Yi Ma","Kamalika Chaudhuri","Chuan Guo"],"pdf_url":"https://arxiv.org/pdf/2403.02506v2.pdf","comment":"Accepted and presented at ICML 2024"},{"id":"http://arxiv.org/abs/2410.13147v4","updated":"2024-10-30T14:54:25Z","published":"2024-10-17T02:04:57Z","title":"Utilizing Large Language Models in an iterative paradigm with Domain\n feedback for Zero-shot Molecule optimization","summary":" Molecule optimization is a critical task in drug discovery to optimize\ndesired properties of a given molecule through chemical modification. Despite\nLarge Language Models (LLMs) holding the potential to efficiently simulate this\ntask by using natural language to direct the optimization, straightforwardly\nutilizing shows limited performance. In this work, we facilitate utilizing LLMs\nin an iterative paradigm by proposing a simple yet highly effective domain\nfeedback provider, namely $\\text{Re}^3$DF. In detail, $\\text{Re}^3$DF harnesses\nan external toolkit, RDKit, to handle the molecule hallucination, if the\nmodified molecule is chemically invalid. Otherwise, its desired properties are\ncomputed and compared to the original one, establishing reliable domain\nfeedback with correct direction and distance towards the objective, followed by\na retrieved example, to explicitly guide the LLM to refine the modified\nmolecule. We conduct experiments across both single- and multi-property\nobjectives with 2 thresholds, where $\\text{Re}^3$DF shows significant\nimprovements. Particularly, for 20 single-property objectives, $\\text{Re}^3$DF\nenhances Hit ratio by 16.95% and 20.76% under loose and strict thresholds,\nrespectively. 
For 32 multi-property objectives, $\\text{Re}^3$DF enhances Hit\nratio by 6.04% and 5.25%.\n","authors":["Khiem Le","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2410.13147v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23077v1","updated":"2024-10-30T14:52:43Z","published":"2024-10-30T14:52:43Z","title":"First Place Solution to the ECCV 2024 ROAD++ Challenge @ ROAD++\n Spatiotemporal Agent Detection 2024","summary":" This report presents our team's solutions for the Track 1 of the 2024 ECCV\nROAD++ Challenge. The task of Track 1 is spatiotemporal agent detection, which\naims to construct an \"agent tube\" for road agents in consecutive video frames.\nOur solutions focus on the challenges in this task, including extreme-size\nobjects, low-light scenarios, class imbalance, and fine-grained classification.\nFirstly, the extreme-size object detection heads are introduced to improve the\ndetection performance of large and small objects. Secondly, we design a\ndual-stream detection model with a low-light enhancement stream to improve the\nperformance of spatiotemporal agent detection in low-light scenes, and the\nfeature fusion module to integrate features from different branches.\nSubsequently, we develop a multi-branch detection framework to mitigate the\nissues of class imbalance and fine-grained classification, and we design a\npre-training and fine-tuning approach to optimize the above multi-branch\nframework. Besides, we employ some common data augmentation techniques, and\nimprove the loss function and upsampling operation. We rank first in the test\nset of Track 1 for the ROAD++ Challenge 2024, and achieve 30.82% average\nvideo-mAP.\n","authors":["Tengfei Zhang","Heng Zhang","Ruyang Li","Qi Deng","Yaqian Zhao","Rengang Li"],"pdf_url":"https://arxiv.org/pdf/2410.23077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23073v1","updated":"2024-10-30T14:46:35Z","published":"2024-10-30T14:46:35Z","title":"RSNet: A Light Framework for The Detection of Multi-scale Remote Sensing\n Targets","summary":" Recent developments in synthetic aperture radar (SAR) ship detection have\nseen deep learning techniques achieve remarkable progress in accuracy and\nspeed. However, the detection of small targets against complex backgrounds\nremains a significant challenge. To tackle these difficulties, this letter\npresents RSNet, a lightweight framework aimed at enhancing ship detection\ncapabilities in SAR imagery. RSNet features the Waveletpool-ContextGuided (WCG)\nbackbone for enhanced accuracy with fewer parameters, and the\nWaveletpool-StarFusion (WSF) head for efficient parameter reduction.\nAdditionally, a Lightweight-Shared (LS) module minimizes the detection head's\nparameter load. Experiments on the SAR Ship Detection Dataset (SSDD) and\nHigh-Resolution SAR Image Dataset (HRSID) demonstrate that RSNet achieves a\nstrong balance between lightweight design and detection performance, surpassing\nmany state-of-the-art detectors, reaching 72.5\\% and 67.6\\% in\n\\textbf{\\(\\mathbf{mAP_{.50:95}}\\) }respectively with 1.49M parameters. 
Our code\nwill be released soon.\n","authors":["Hongyu Chen","Chengcheng Chen","Fei Wang","Yuhu Shi","Weiming Zeng"],"pdf_url":"https://arxiv.org/pdf/2410.23073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23072v1","updated":"2024-10-30T14:46:34Z","published":"2024-10-30T14:46:34Z","title":"CNN Explainability with Multivector Tucker Saliency Maps for\n Self-Supervised Models","summary":" Interpreting the decisions of Convolutional Neural Networks (CNNs) is\nessential for understanding their behavior, yet explainability remains a\nsignificant challenge, particularly for self-supervised models. Most existing\nmethods for generating saliency maps rely on ground truth labels, restricting\ntheir use to supervised tasks. EigenCAM is the only notable label-independent\nalternative, leveraging Singular Value Decomposition to generate saliency maps\napplicable across CNN models, but it does not fully exploit the tensorial\nstructure of feature maps. In this work, we introduce the Tucker Saliency Map\n(TSM) method, which applies Tucker tensor decomposition to better capture the\ninherent structure of feature maps, producing more accurate singular vectors\nand values. These are used to generate high-fidelity saliency maps, effectively\nhighlighting objects of interest in the input. We further extend EigenCAM and\nTSM into multivector variants -Multivec-EigenCAM and Multivector Tucker\nSaliency Maps (MTSM)- which utilize all singular vectors and values, further\nimproving saliency map quality. Quantitative evaluations on supervised\nclassification models demonstrate that TSM, Multivec-EigenCAM, and MTSM achieve\ncompetitive performance with label-dependent methods. Moreover, TSM enhances\nexplainability by approximately 50% over EigenCAM for both supervised and\nself-supervised models. Multivec-EigenCAM and MTSM further advance\nstate-of-the-art explainability performance on self-supervised models, with\nMTSM achieving the best results.\n","authors":["Aymene Mohammed Bouayed","Samuel Deslauriers-Gauthier","Adrian Iaccovelli","David Naccache"],"pdf_url":"https://arxiv.org/pdf/2410.23072v1.pdf","comment":"29 pages, 20 figures"},{"id":"http://arxiv.org/abs/2405.12399v2","updated":"2024-10-30T14:34:49Z","published":"2024-05-20T22:51:05Z","title":"Diffusion for World Modeling: Visual Details Matter in Atari","summary":" World models constitute a promising approach for training reinforcement\nlearning agents in a safe and sample-efficient manner. Recent world models\npredominantly operate on sequences of discrete latent variables to model\nenvironment dynamics. However, this compression into a compact discrete\nrepresentation may ignore visual details that are important for reinforcement\nlearning. Concurrently, diffusion models have become a dominant approach for\nimage generation, challenging well-established methods modeling discrete\nlatents. Motivated by this paradigm shift, we introduce DIAMOND (DIffusion As a\nModel Of eNvironment Dreams), a reinforcement learning agent trained in a\ndiffusion world model. We analyze the key design choices that are required to\nmake diffusion suitable for world modeling, and demonstrate how improved visual\ndetails can lead to improved agent performance. DIAMOND achieves a mean human\nnormalized score of 1.46 on the competitive Atari 100k benchmark; a new best\nfor agents trained entirely within a world model. 
We further demonstrate that\nDIAMOND's diffusion world model can stand alone as an interactive neural game\nengine by training on static Counter-Strike: Global Offensive gameplay. To\nfoster future research on diffusion for world modeling, we release our code,\nagents, videos and playable world models at https://diamond-wm.github.io.\n","authors":["Eloi Alonso","Adam Jelley","Vincent Micheli","Anssi Kanervisto","Amos Storkey","Tim Pearce","François Fleuret"],"pdf_url":"https://arxiv.org/pdf/2405.12399v2.pdf","comment":"NeurIPS 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2410.23054v1","updated":"2024-10-30T14:21:33Z","published":"2024-10-30T14:21:33Z","title":"Controlling Language and Diffusion Models by Transporting Activations","summary":" The increasing capabilities of large generative models and their ever more\nwidespread deployment have raised concerns about their reliability, safety, and\npotential misuse. To address these issues, recent works have proposed to\ncontrol model generation by steering model activations in order to effectively\ninduce or prevent the emergence of concepts or behaviors in the generated\noutput. In this paper we introduce Activation Transport (AcT), a general\nframework to steer activations guided by optimal transport theory that\ngeneralizes many previous activation-steering works. AcT is modality-agnostic\nand provides fine-grained control over the model behavior with negligible\ncomputational overhead, while minimally impacting model abilities. We\nexperimentally show the effectiveness and versatility of our approach by\naddressing key challenges in large language models (LLMs) and text-to-image\ndiffusion models (T2Is). For LLMs, we show that AcT can effectively mitigate\ntoxicity, induce arbitrary concepts, and increase their truthfulness. In T2Is,\nwe show how AcT enables fine-grained style control and concept negation.\n","authors":["Pau Rodriguez","Arno Blaas","Michal Klein","Luca Zappella","Nicholas Apostoloff","Marco Cuturi","Xavier Suau"],"pdf_url":"https://arxiv.org/pdf/2410.23054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23039v1","updated":"2024-10-30T14:06:51Z","published":"2024-10-30T14:06:51Z","title":"Neural Attention Field: Emerging Point Relevance in 3D Scenes for\n One-Shot Dexterous Grasping","summary":" One-shot transfer of dexterous grasps to novel scenes with object and context\nvariations has been a challenging problem. While distilled feature fields from\nlarge vision models have enabled semantic correspondences across 3D scenes,\ntheir features are point-based and restricted to object surfaces, limiting\ntheir capability of modeling complex semantic feature distributions for\nhand-object interactions. In this work, we propose the \\textit{neural attention\nfield} for representing semantic-aware dense feature fields in the 3D space by\nmodeling inter-point relevance instead of individual point features. Core to it\nis a transformer decoder that computes the cross-attention between any 3D query\npoint with all the scene points, and provides the query point feature with an\nattention-based aggregation. We further propose a self-supervised framework for\ntraining the transformer decoder from only a few 3D pointclouds without hand\ndemonstrations. 
Post-training, the attention field can be applied to novel\nscenes for semantics-aware dexterous grasping from one-shot demonstration.\nExperiments show that our method provides better optimization landscapes by\nencouraging the end-effector to focus on task-relevant scene regions, resulting\nin significant improvements in success rates on real robots compared with the\nfeature-field-based methods.\n","authors":["Qianxu Wang","Congyue Deng","Tyler Ga Wei Lum","Yuanpei Chen","Yaodong Yang","Jeannette Bohg","Yixin Zhu","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2410.23039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12715v4","updated":"2024-10-30T13:58:35Z","published":"2023-05-22T04:50:28Z","title":"Imprecise Label Learning: A Unified Framework for Learning with Various\n Imprecise Label Configurations","summary":" Learning with reduced labeling standards, such as noisy label, partial label,\nand multiple label candidates, which we generically refer to as\n\\textit{imprecise} labels, is a commonplace challenge in machine learning\ntasks. Previous methods tend to propose specific designs for every emerging\nimprecise label configuration, which is usually unsustainable when multiple\nconfigurations of imprecision coexist. In this paper, we introduce imprecise\nlabel learning (ILL), a framework for the unification of learning with various\nimprecise label configurations. ILL leverages expectation-maximization (EM) for\nmodeling the imprecise label information, treating the precise labels as latent\nvariables.Instead of approximating the correct labels for training, it\nconsiders the entire distribution of all possible labeling entailed by the\nimprecise information. We demonstrate that ILL can seamlessly adapt to partial\nlabel learning, semi-supervised learning, noisy label learning, and, more\nimportantly, a mixture of these settings. Notably, ILL surpasses the existing\nspecified techniques for handling imprecise labels, marking the first unified\nframework with robust and effective performance across various challenging\nsettings. We hope our work will inspire further research on this topic,\nunleashing the full potential of ILL in wider scenarios where precise labels\nare expensive and complicated to obtain.\n","authors":["Hao Chen","Ankit Shah","Jindong Wang","Ran Tao","Yidong Wang","Xing Xie","Masashi Sugiyama","Rita Singh","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2305.12715v4.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2310.14692v3","updated":"2024-10-30T13:55:46Z","published":"2023-10-23T08:32:50Z","title":"On Unsupervised Partial Shape Correspondence","summary":" While dealing with matching shapes to their parts, we often apply a tool\nknown as functional maps. The idea is to translate the shape matching problem\ninto \"convenient\" spaces by which matching is performed algebraically by\nsolving a least squares problem. Here, we argue that such formulations, though\npopular in this field, introduce errors in the estimated match when partiality\nis invoked. Such errors are unavoidable even for advanced feature extraction\nnetworks, and they can be shown to escalate with increasing degrees of shape\npartiality, adversely affecting the learning capability of such systems. To\ncircumvent these limitations, we propose a novel approach for partial shape\nmatching. 
Our study of functional maps led us to a novel method that\nestablishes direct correspondence between partial and full shapes through\nfeature matching bypassing the need for functional map intermediate spaces. The\nGromov Distance between metric spaces leads to the construction of the first\npart of our loss functions. For regularization we use two options: a term based\non the area preserving property of the mapping, and a relaxed version that\navoids the need to resort to functional maps. The proposed approach shows\nsuperior performance on the SHREC'16 dataset, outperforming existing\nunsupervised methods for partial shape matching.Notably, it achieves\nstate-of-the-art results on the SHREC'16 HOLES benchmark, superior also\ncompared to supervised methods. We demonstrate the benefits of the proposed\nunsupervised method when applied to a new dataset PFAUST for part-to-full shape\ncorrespondence.\n","authors":["Amit Bracha","Thomas Dagès","Ron Kimmel"],"pdf_url":"https://arxiv.org/pdf/2310.14692v3.pdf","comment":"Updated version, accepted for publication at the Asian Conference on\n Computer Vision (ACCV) 2024"},{"id":"http://arxiv.org/abs/2405.20494v2","updated":"2024-10-30T13:52:56Z","published":"2024-05-30T21:35:48Z","title":"Slight Corruption in Pre-training Data Makes Better Diffusion Models","summary":" Diffusion models (DMs) have shown remarkable capabilities in generating\nrealistic high-quality images, audios, and videos. They benefit significantly\nfrom extensive pre-training on large-scale datasets, including web-crawled data\nwith paired data and conditions, such as image-text and image-class pairs.\nDespite rigorous filtering, these pre-training datasets often inevitably\ncontain corrupted pairs where conditions do not accurately describe the data.\nThis paper presents the first comprehensive study on the impact of such\ncorruption in pre-training data of DMs. We synthetically corrupt ImageNet-1K\nand CC3M to pre-train and evaluate over 50 conditional DMs. Our empirical\nfindings reveal that various types of slight corruption in pre-training can\nsignificantly enhance the quality, diversity, and fidelity of the generated\nimages across different DMs, both during pre-training and downstream adaptation\nstages. Theoretically, we consider a Gaussian mixture model and prove that\nslight corruption in the condition leads to higher entropy and a reduced\n2-Wasserstein distance to the ground truth of the data distribution generated\nby the corruptly trained DMs. Inspired by our analysis, we propose a simple\nmethod to improve the training of DMs on practical datasets by adding condition\nembedding perturbations (CEP). CEP significantly improves the performance of\nvarious DMs in both pre-training and downstream tasks. We hope that our study\nprovides new insights into understanding the data and pre-training processes of\nDMs and all models are released at https://huggingface.co/DiffusionNoise.\n","authors":["Hao Chen","Yujin Han","Diganta Misra","Xiang Li","Kai Hu","Difan Zou","Masashi Sugiyama","Jindong Wang","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2405.20494v2.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2410.18013v2","updated":"2024-10-30T13:40:01Z","published":"2024-10-23T16:42:56Z","title":"Scalable Ranked Preference Optimization for Text-to-Image Generation","summary":" Direct Preference Optimization (DPO) has emerged as a powerful approach to\nalign text-to-image (T2I) models with human feedback. 
Unfortunately, successful\napplication of DPO to T2I models requires a huge amount of resources to collect\nand label large-scale datasets, e.g., millions of generated paired images\nannotated with human preferences. In addition, these human preference datasets\ncan get outdated quickly as the rapid improvements of T2I models lead to higher\nquality images. In this work, we investigate a scalable approach for collecting\nlarge-scale and fully synthetic datasets for DPO training. Specifically, the\npreferences for paired images are generated using a pre-trained reward\nfunction, eliminating the need for involving humans in the annotation process,\ngreatly improving the dataset collection efficiency. Moreover, we demonstrate\nthat such datasets allow averaging predictions across multiple models and\ncollecting ranked preferences as opposed to pairwise preferences. Furthermore,\nwe introduce RankDPO to enhance DPO-based methods using the ranking feedback.\nApplying RankDPO on SDXL and SD3-Medium models with our synthetically generated\npreference dataset \"Syn-Pic\" improves both prompt-following (on benchmarks like\nT2I-Compbench, GenEval, and DPG-Bench) and visual quality (through user\nstudies). This pipeline presents a practical and scalable solution to develop\nbetter preference datasets to enhance the performance of text-to-image models.\n","authors":["Shyamgopal Karthik","Huseyin Coskun","Zeynep Akata","Sergey Tulyakov","Jian Ren","Anil Kag"],"pdf_url":"https://arxiv.org/pdf/2410.18013v2.pdf","comment":"Project Page: https://snap-research.github.io/RankDPO/"},{"id":"http://arxiv.org/abs/2406.14515v3","updated":"2024-10-30T13:38:10Z","published":"2024-06-20T17:26:01Z","title":"MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video\n Understanding","summary":" The advent of large vision-language models (LVLMs) has spurred research into\ntheir applications in multi-modal contexts, particularly in video\nunderstanding. Traditional VideoQA benchmarks, despite providing quantitative\nmetrics, often fail to encompass the full spectrum of video content and\ninadequately assess models' temporal comprehension. To address these\nlimitations, we introduce MMBench-Video, a quantitative benchmark designed to\nrigorously evaluate LVLMs' proficiency in video understanding. MMBench-Video\nincorporates lengthy videos from YouTube and employs free-form questions,\nmirroring practical use cases. The benchmark is meticulously crafted to probe\nthe models' temporal reasoning skills, with all questions human-annotated\naccording to a carefully constructed ability taxonomy. We employ GPT-4 for\nautomated assessment, demonstrating superior accuracy and robustness over\nearlier LLM-based evaluations. Utilizing MMBench-Video, we have conducted\ncomprehensive evaluations that include both proprietary and open-source LVLMs\nfor images and videos. MMBench-Video stands as a valuable resource for the\nresearch community, facilitating improved evaluation of LVLMs and catalyzing\nprogress in the field of video understanding. 
The evalutation code of\nMMBench-Video will be integrated into VLMEvalKit:\nhttps://github.com/open-compass/VLMEvalKit.\n","authors":["Xinyu Fang","Kangrui Mao","Haodong Duan","Xiangyu Zhao","Yining Li","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14515v3.pdf","comment":"Accepted in NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2404.13437v2","updated":"2024-10-30T13:31:07Z","published":"2024-04-20T18:06:26Z","title":"High-fidelity Endoscopic Image Synthesis by Utilizing Depth-guided\n Neural Surfaces","summary":" In surgical oncology, screening colonoscopy plays a pivotal role in providing\ndiagnostic assistance, such as biopsy, and facilitating surgical navigation,\nparticularly in polyp detection. Computer-assisted endoscopic surgery has\nrecently gained attention and amalgamated various 3D computer vision\ntechniques, including camera localization, depth estimation, surface\nreconstruction, etc. Neural Radiance Fields (NeRFs) and Neural Implicit\nSurfaces (NeuS) have emerged as promising methodologies for deriving accurate\n3D surface models from sets of registered images, addressing the limitations of\nexisting colon reconstruction approaches stemming from constrained camera\nmovement.\n However, the inadequate tissue texture representation and confused scale\nproblem in monocular colonoscopic image reconstruction still impede the\nprogress of the final rendering results. In this paper, we introduce a novel\nmethod for colon section reconstruction by leveraging NeuS applied to\nendoscopic images, supplemented by a single frame of depth map. Notably, we\npioneered the exploration of utilizing only one frame depth map in\nphotorealistic reconstruction and neural rendering applications while this\nsingle depth map can be easily obtainable from other monocular depth estimation\nnetworks with an object scale. Through rigorous experimentation and validation\non phantom imagery, our approach demonstrates exceptional accuracy in\ncompletely rendering colon sections, even capturing unseen portions of the\nsurface. This breakthrough opens avenues for achieving stable and consistently\nscaled reconstructions, promising enhanced quality in cancer screening\nprocedures and treatment interventions.\n","authors":["Baoru Huang","Yida Wang","Anh Nguyen","Daniel Elson","Francisco Vasconcelos","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2404.13437v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23004v1","updated":"2024-10-30T13:30:39Z","published":"2024-10-30T13:30:39Z","title":"DexGraspNet 2.0: Learning Generative Dexterous Grasping in Large-scale\n Synthetic Cluttered Scenes","summary":" Grasping in cluttered scenes remains highly challenging for dexterous hands\ndue to the scarcity of data. To address this problem, we present a large-scale\nsynthetic benchmark, encompassing 1319 objects, 8270 scenes, and 427 million\ngrasps. Beyond benchmarking, we also propose a novel two-stage grasping method\nthat learns efficiently from data by using a diffusion model that conditions on\nlocal geometry. Our proposed generative method outperforms all baselines in\nsimulation experiments. 
Furthermore, with the aid of test-time-depth\nrestoration, our method demonstrates zero-shot sim-to-real transfer, attaining\n90.7% real-world dexterous grasping success rate in cluttered scenes.\n","authors":["Jialiang Zhang","Haoran Liu","Danshi Li","Xinqiang Yu","Haoran Geng","Yufei Ding","Jiayi Chen","He Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11965v4","updated":"2024-10-30T13:22:45Z","published":"2024-08-21T19:36:27Z","title":"CT-AGRG: Automated Abnormality-Guided Report Generation from 3D Chest CT\n Volumes","summary":" The rapid increase of computed tomography (CT) scans and their time-consuming\nmanual analysis have created an urgent need for robust automated analysis\ntechniques in clinical settings. These aim to assist radiologists and help them\nmanaging their growing workload. Existing methods typically generate entire\nreports directly from 3D CT images, without explicitly focusing on observed\nabnormalities. This unguided approach often results in repetitive content or\nincomplete reports, failing to prioritize anomaly-specific descriptions. We\npropose a new anomaly-guided report generation model, which first predicts\nabnormalities and then generates targeted descriptions for each. Evaluation on\na public dataset demonstrates significant improvements in report quality and\nclinical relevance. We extend our work by conducting an ablation study to\ndemonstrate its effectiveness.\n","authors":["Theo Di Piazza"],"pdf_url":"https://arxiv.org/pdf/2408.11965v4.pdf","comment":"15 pages, 9 figures, submitted to ISBI 2025"},{"id":"http://arxiv.org/abs/2410.22995v1","updated":"2024-10-30T13:19:44Z","published":"2024-10-30T13:19:44Z","title":"VisAidMath: Benchmarking Visual-Aided Mathematical Reasoning","summary":" Although previous research on large language models (LLMs) and large\nmulti-modal models (LMMs) has systematically explored mathematical\nproblem-solving (MPS) within visual contexts, the analysis of how these models\nprocess visual information during problem-solving remains insufficient. To\naddress this gap, we present VisAidMath, a benchmark for evaluating the MPS\nprocess related to visual information. We follow a rigorous data curation\npipeline involving both automated processes and manual annotations to ensure\ndata quality and reliability. Consequently, this benchmark includes 1,200\nchallenging problems from various mathematical branches, vision-aid\nformulations, and difficulty levels, collected from diverse sources such as\ntextbooks, examination papers, and Olympiad problems. Based on the proposed\nbenchmark, we conduct comprehensive evaluations on ten mainstream LLMs and\nLMMs, highlighting deficiencies in the visual-aided reasoning process. For\nexample, GPT-4V only achieves 45.33% accuracy in the visual-aided reasoning\ntask, even with a drop of 2 points when provided with golden visual aids.\nIn-depth analysis reveals that the main cause of deficiencies lies in\nhallucination regarding the implicit visual reasoning process, shedding light\non future research directions in the visual-aided MPS process.\n","authors":["Jingkun Ma","Runzhe Zhan","Derek F. Wong","Yang Li","Di Sun","Hou Pong Chan","Lidia S. 
Chao"],"pdf_url":"https://arxiv.org/pdf/2410.22995v1.pdf","comment":"58 pages, 28 figures"},{"id":"http://arxiv.org/abs/2406.12459v2","updated":"2024-10-30T12:50:27Z","published":"2024-06-18T10:05:33Z","title":"HumanSplat: Generalizable Single-Image Human Gaussian Splatting with\n Structure Priors","summary":" Despite recent advancements in high-fidelity human reconstruction techniques,\nthe requirements for densely captured images or time-consuming per-instance\noptimization significantly hinder their applications in broader scenarios. To\ntackle these issues, we present HumanSplat which predicts the 3D Gaussian\nSplatting properties of any human from a single input image in a generalizable\nmanner. In particular, HumanSplat comprises a 2D multi-view diffusion model and\na latent reconstruction transformer with human structure priors that adeptly\nintegrate geometric priors and semantic features within a unified framework. A\nhierarchical loss that incorporates human semantic information is further\ndesigned to achieve high-fidelity texture modeling and better constrain the\nestimated multiple views. Comprehensive experiments on standard benchmarks and\nin-the-wild images demonstrate that HumanSplat surpasses existing\nstate-of-the-art methods in achieving photorealistic novel-view synthesis.\n","authors":["Panwang Pan","Zhuo Su","Chenguo Lin","Zhen Fan","Yongjie Zhang","Zeming Li","Tingting Shen","Yadong Mu","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2406.12459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22979v1","updated":"2024-10-30T12:44:08Z","published":"2024-10-30T12:44:08Z","title":"LumiSculpt: A Consistency Lighting Control Network for Video Generation","summary":" Lighting plays a pivotal role in ensuring the naturalness of video\ngeneration, significantly influencing the aesthetic quality of the generated\ncontent. However, due to the deep coupling between lighting and the temporal\nfeatures of videos, it remains challenging to disentangle and model independent\nand coherent lighting attributes, limiting the ability to control lighting in\nvideo generation. In this paper, inspired by the established controllable T2I\nmodels, we propose LumiSculpt, which, for the first time, enables precise and\nconsistent lighting control in T2V generation models.LumiSculpt equips the\nvideo generation with strong interactive capabilities, allowing the input of\ncustom lighting reference image sequences. Furthermore, the core learnable\nplug-and-play module of LumiSculpt facilitates remarkable control over lighting\nintensity, position, and trajectory in latent video diffusion models based on\nthe advanced DiT backbone.Additionally, to effectively train LumiSculpt and\naddress the issue of insufficient lighting data, we construct LumiHuman, a new\nlightweight and flexible dataset for portrait lighting of images and videos.\nExperimental results demonstrate that LumiSculpt achieves precise and\nhigh-quality lighting control in video generation.\n","authors":["Yuxin Zhang","Dandan Zheng","Biao Gong","Jingdong Chen","Ming Yang","Weiming Dong","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2410.22979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20838v2","updated":"2024-10-30T12:35:56Z","published":"2024-05-31T14:25:45Z","title":"einspace: Searching for Neural Architectures from Fundamental Operations","summary":" Neural architecture search (NAS) finds high performing networks for a given\ntask. Yet the results of NAS are fairly prosaic; they did not e.g. 
create a\nshift from convolutional structures to transformers. This is not least because\nthe search spaces in NAS often aren't diverse enough to include such\ntransformations a priori. Instead, for NAS to provide greater potential for\nfundamental design shifts, we need a novel expressive search space design which\nis built from more fundamental operations. To this end, we introduce einspace,\na search space based on a parameterised probabilistic context-free grammar. Our\nspace is versatile, supporting architectures of various sizes and complexities,\nwhile also containing diverse network operations which allow it to model\nconvolutions, attention components and more. It contains many existing\ncompetitive architectures, and provides flexibility for discovering new ones.\nUsing this search space, we perform experiments to find novel architectures as\nwell as improvements on existing ones on the diverse Unseen NAS datasets. We\nshow that competitive architectures can be obtained by searching from scratch,\nand we consistently find large improvements when initialising the search with\nstrong baselines. We believe that this work is an important advancement towards\na transformative NAS paradigm where search space expressivity and strategic\nsearch initialisation play key roles.\n","authors":["Linus Ericsson","Miguel Espinosa","Chenhongyi Yang","Antreas Antoniou","Amos Storkey","Shay B. Cohen","Steven McDonagh","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2405.20838v2.pdf","comment":"NeurIPS 2024. Project page at\n https://linusericsson.github.io/einspace/"},{"id":"http://arxiv.org/abs/2311.08110v3","updated":"2024-10-30T12:34:16Z","published":"2023-11-14T12:14:54Z","title":"Improving Hateful Meme Detection through Retrieval-Guided Contrastive\n Learning","summary":" Hateful memes have emerged as a significant concern on the Internet.\nDetecting hateful memes requires the system to jointly understand the visual\nand textual modalities. Our investigation reveals that the embedding space of\nexisting CLIP-based systems lacks sensitivity to subtle differences in memes\nthat are vital for correct hatefulness classification. We propose constructing\na hatefulness-aware embedding space through retrieval-guided contrastive\ntraining. Our approach achieves state-of-the-art performance on the\nHatefulMemes dataset with an AUROC of 87.0, outperforming much larger\nfine-tuned large multimodal models. We demonstrate a retrieval-based hateful\nmemes detection system, which is capable of identifying hatefulness based on\ndata unseen in training. This allows developers to update the hateful memes\ndetection system by simply adding new examples without retraining, a desirable\nfeature for real services in the constantly evolving landscape of hateful memes\non the Internet.\n","authors":["Jingbiao Mei","Jinghong Chen","Weizhe Lin","Bill Byrne","Marcus Tomalin"],"pdf_url":"https://arxiv.org/pdf/2311.08110v3.pdf","comment":"ACL 2024 Main. The code is available from:\n https://github.com/JingbiaoMei/RGCL"},{"id":"http://arxiv.org/abs/2110.11128v3","updated":"2024-10-30T12:27:48Z","published":"2021-10-21T13:25:52Z","title":"A Strong Baseline for Semi-Supervised Incremental Few-Shot Learning","summary":" Few-shot learning (FSL) aims to learn models that generalize to novel classes\nwith limited training samples. Recent works advance FSL towards a scenario\nwhere unlabeled examples are also available and propose semi-supervised FSL\nmethods. 
Another line of methods also cares about the performance of base\nclasses in addition to the novel ones and thus establishes the incremental FSL\nscenario. In this paper, we generalize the above two under a more realistic yet\ncomplex setting, named by Semi-Supervised Incremental Few-Shot Learning (S2\nI-FSL). To tackle the task, we propose a novel paradigm containing two parts:\n(1) a well-designed meta-training algorithm for mitigating ambiguity between\nbase and novel classes caused by unreliable pseudo labels and (2) a model\nadaptation mechanism to learn discriminative features for novel classes while\npreserving base knowledge using few labeled and all the unlabeled data.\nExtensive experiments on standard FSL, semi-supervised FSL, incremental FSL,\nand the firstly built S2 I-FSL benchmarks demonstrate the effectiveness of our\nproposed method.\n","authors":["Linglan Zhao","Dashan Guo","Yunlu Xu","Liang Qiao","Zhanzhan Cheng","Shiliang Pu","Yi Niu","Xiangzhong Fang"],"pdf_url":"https://arxiv.org/pdf/2110.11128v3.pdf","comment":"Accepted by BMVC2021"},{"id":"http://arxiv.org/abs/2410.22959v1","updated":"2024-10-30T12:16:35Z","published":"2024-10-30T12:16:35Z","title":"EnsIR: An Ensemble Algorithm for Image Restoration via Gaussian Mixture\n Models","summary":" Image restoration has experienced significant advancements due to the\ndevelopment of deep learning. Nevertheless, it encounters challenges related to\nill-posed problems, resulting in deviations between single model predictions\nand ground-truths. Ensemble learning, as a powerful machine learning technique,\naims to address these deviations by combining the predictions of multiple base\nmodels. Most existing works adopt ensemble learning during the design of\nrestoration models, while only limited research focuses on the inference-stage\nensemble of pre-trained restoration models. Regression-based methods fail to\nenable efficient inference, leading researchers in academia and industry to\nprefer averaging as their choice for post-training ensemble. To address this,\nwe reformulate the ensemble problem of image restoration into Gaussian mixture\nmodels (GMMs) and employ an expectation maximization (EM)-based algorithm to\nestimate ensemble weights for aggregating prediction candidates. We estimate\nthe range-wise ensemble weights on a reference set and store them in a lookup\ntable (LUT) for efficient ensemble inference on the test set. Our algorithm is\nmodel-agnostic and training-free, allowing seamless integration and enhancement\nof various pre-trained image restoration models. It consistently outperforms\nregression based methods and averaging ensemble approaches on 14 benchmarks\nacross 3 image restoration tasks, including super-resolution, deblurring and\nderaining. The codes and all estimated weights have been released in Github.\n","authors":["Shangquan Sun","Wenqi Ren","Zikun Liu","Hyunhee Park","Rui Wang","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2410.22959v1.pdf","comment":"10 pages for main manuscript, additional 17 pages for appendix, 18\n figures, 17MB"},{"id":"http://arxiv.org/abs/2405.12895v2","updated":"2024-10-30T12:11:33Z","published":"2024-05-21T16:04:32Z","title":"Implicit-ARAP: Efficient Handle-Guided Deformation of High-Resolution\n Meshes and Neural Fields via Local Patch Meshing","summary":" In this work, we present the local patch mesh representation for neural\nsigned distance fields. 
This technique allows to discretize local regions of\nthe level sets of an input SDF by projecting and deforming flat patch meshes\nonto the level set surface, using exclusively the SDF information and its\ngradient. Our analysis reveals this method to be more accurate than the\nstandard marching cubes algorithm for approximating the implicit surface. Then,\nwe apply this representation in the setting of handle-guided deformation: we\nintroduce two distinct pipelines, which make use of 3D neural fields to compute\nAs-Rigid-As-Possible deformations of both high-resolution meshes and neural\nfields under a given set of constraints. We run a comprehensive evaluation of\nour method and various baselines for neural field and mesh deformation which\nshow both pipelines achieve impressive efficiency and notable improvements in\nterms of quality of results and robustness. With our novel pipeline, we\nintroduce a scalable approach to solve a well-established geometry processing\nproblem on high-resolution meshes, and pave the way for extending other\ngeometric tasks to the domain of implicit surfaces via local patch meshing.\n","authors":["Daniele Baieri","Filippo Maggioli","Zorah Lähner","Simone Melzi","Emanuele Rodolà"],"pdf_url":"https://arxiv.org/pdf/2405.12895v2.pdf","comment":"12 pages, 16 figures"},{"id":"http://arxiv.org/abs/2410.22952v1","updated":"2024-10-30T12:08:30Z","published":"2024-10-30T12:08:30Z","title":"Efficient Adaptation of Pre-trained Vision Transformer via Householder\n Transformation","summary":" A common strategy for Parameter-Efficient Fine-Tuning (PEFT) of pre-trained\nVision Transformers (ViTs) involves adapting the model to downstream tasks by\nlearning a low-rank adaptation matrix. This matrix is decomposed into a product\nof down-projection and up-projection matrices, with the bottleneck\ndimensionality being crucial for reducing the number of learnable parameters,\nas exemplified by prevalent methods like LoRA and Adapter. However, these\nlow-rank strategies typically employ a fixed bottleneck dimensionality, which\nlimits their flexibility in handling layer-wise variations. To address this\nlimitation, we propose a novel PEFT approach inspired by Singular Value\nDecomposition (SVD) for representing the adaptation matrix. SVD decomposes a\nmatrix into the product of a left unitary matrix, a diagonal matrix of scaling\nvalues, and a right unitary matrix. We utilize Householder transformations to\nconstruct orthogonal matrices that efficiently mimic the unitary matrices,\nrequiring only a vector. The diagonal values are learned in a layer-wise\nmanner, allowing them to flexibly capture the unique properties of each layer.\nThis approach enables the generation of adaptation matrices with varying ranks\nacross different layers, providing greater flexibility in adapting pre-trained\nmodels. 
Experiments on standard downstream vision tasks demonstrate that our\nmethod achieves promising fine-tuning performance.\n","authors":["Wei Dong","Yuan Sun","Yiting Yang","Xing Zhang","Zhijun Lin","Qingsen Yan","Haokui Zhang","Peng Wang","Yang Yang","Hengtao Shen"],"pdf_url":"https://arxiv.org/pdf/2410.22952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03761v2","updated":"2024-10-30T12:08:08Z","published":"2024-08-07T13:30:58Z","title":"MMSummary: Multimodal Summary Generation for Fetal Ultrasound Video","summary":" We present the first automated multimodal summary generation system,\nMMSummary, for medical imaging video, particularly with a focus on fetal\nultrasound analysis. Imitating the examination process performed by a human\nsonographer, MMSummary is designed as a three-stage pipeline, progressing from\nkeyframe detection to keyframe captioning and finally anatomy segmentation and\nmeasurement. In the keyframe detection stage, an innovative automated workflow\nis proposed to progressively select a concise set of keyframes, preserving\nsufficient video information without redundancy. Subsequently, we adapt a large\nlanguage model to generate meaningful captions for fetal ultrasound keyframes\nin the keyframe captioning stage. If a keyframe is captioned as fetal biometry,\nthe segmentation and measurement stage estimates biometric parameters by\nsegmenting the region of interest according to the textual prior. The MMSummary\nsystem provides comprehensive summaries for fetal ultrasound examinations and\nbased on reported experiments is estimated to reduce scanning time by\napproximately 31.5%, thereby suggesting the potential to enhance clinical\nworkflow efficiency.\n","authors":["Xiaoqing Guo","Qianhui Men","J. Alison Noble"],"pdf_url":"https://arxiv.org/pdf/2408.03761v2.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.09495v2","updated":"2024-10-30T11:57:22Z","published":"2024-05-31T09:37:54Z","title":"Image captioning in different languages","summary":" This short position paper provides a manually curated list of non-English\nimage captioning datasets (as of May 2024). Through this list, we can observe\nthe dearth of datasets in different languages: only 23 different languages are\nrepresented. With the addition of the Crossmodal-3600 dataset (Thapliyal et\nal., 2022, 36 languages) this number increases somewhat, but still this number\nis small compared to the +/-500 institutional languages that are out there.\nThis paper closes with some open questions for the field of Vision & Language.\n","authors":["Emiel van Miltenburg"],"pdf_url":"https://arxiv.org/pdf/2407.09495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22939v1","updated":"2024-10-30T11:49:06Z","published":"2024-10-30T11:49:06Z","title":"AdaptiveISP: Learning an Adaptive Image Signal Processor for Object\n Detection","summary":" Image Signal Processors (ISPs) convert raw sensor signals into digital\nimages, which significantly influence the image quality and the performance of\ndownstream computer vision tasks. Designing ISP pipeline and tuning ISP\nparameters are two key steps for building an imaging and vision system. To find\noptimal ISP configurations, recent works use deep neural networks as a proxy to\nsearch for ISP parameters or ISP pipelines. However, these methods are\nprimarily designed to maximize the image quality, which are sub-optimal in the\nperformance of high-level computer vision tasks such as detection, recognition,\nand tracking. 
Moreover, after training, the learned ISP pipelines are mostly\nfixed at the inference time, whose performance degrades in dynamic scenes. To\njointly optimize ISP structures and parameters, we propose AdaptiveISP, a\ntask-driven and scene-adaptive ISP. One key observation is that for the\nmajority of input images, only a few processing modules are needed to improve\nthe performance of downstream recognition tasks, and only a few inputs require\nmore processing. Based on this, AdaptiveISP utilizes deep reinforcement\nlearning to automatically generate an optimal ISP pipeline and the associated\nISP parameters to maximize the detection performance. Experimental results show\nthat AdaptiveISP not only surpasses the prior state-of-the-art methods for\nobject detection but also dynamically manages the trade-off between detection\nperformance and computational cost, especially suitable for scenes with large\ndynamic range variations. Project website:\nhttps://openimaginglab.github.io/AdaptiveISP/.\n","authors":["Yujin Wang","Tianyi Xu","Fan Zhang","Tianfan Xue","Jinwei Gu"],"pdf_url":"https://arxiv.org/pdf/2410.22939v1.pdf","comment":"Accepted at NeurIPS2024"},{"id":"http://arxiv.org/abs/2410.22936v1","updated":"2024-10-30T11:43:55Z","published":"2024-10-30T11:43:55Z","title":"Bringing NeRFs to the Latent Space: Inverse Graphics Autoencoder","summary":" While pre-trained image autoencoders are increasingly utilized in computer\nvision, the application of inverse graphics in 2D latent spaces has been\nunder-explored. Yet, besides reducing the training and rendering complexity,\napplying inverse graphics in the latent space enables a valuable\ninteroperability with other latent-based 2D methods. The major challenge is\nthat inverse graphics cannot be directly applied to such image latent spaces\nbecause they lack an underlying 3D geometry. In this paper, we propose an\nInverse Graphics Autoencoder (IG-AE) that specifically addresses this issue. To\nthis end, we regularize an image autoencoder with 3D-geometry by aligning its\nlatent space with jointly trained latent 3D scenes. We utilize the trained\nIG-AE to bring NeRFs to the latent space with a latent NeRF training pipeline,\nwhich we implement in an open-source extension of the Nerfstudio framework,\nthereby unlocking latent scene learning for its supported methods. We\nexperimentally confirm that Latent NeRFs trained with IG-AE present an improved\nquality compared to a standard autoencoder, all while exhibiting training and\nrendering accelerations with respect to NeRFs trained in the image space. Our\nproject page can be found at https://ig-ae.github.io .\n","authors":["Antoine Schnepf","Karim Kassab","Jean-Yves Franceschi","Laurent Caraffa","Flavian Vasile","Jeremie Mary","Andrew Comport","Valerie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2410.22936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22927v1","updated":"2024-10-30T11:34:55Z","published":"2024-10-30T11:34:55Z","title":"An Individual Identity-Driven Framework for Animal Re-Identification","summary":" Reliable re-identification of individuals within large wildlife populations\nis crucial for biological studies, ecological research, and wildlife\nconservation. Classic computer vision techniques offer a promising direction\nfor Animal Re-identification (Animal ReID), but their backbones' close-set\nnature limits their applicability and generalizability. 
Despite the\ndemonstrated effectiveness of vision-language models like CLIP in\nre-identifying persons and vehicles, their application to Animal ReID remains\nlimited due to unique challenges, such as the various visual representations of\nanimals, including variations in poses and forms. To address these limitations,\nwe leverage CLIP's cross-modal capabilities to introduce a two-stage framework,\nthe \\textbf{Indiv}idual \\textbf{A}nimal \\textbf{ID}entity-Driven (IndivAID)\nframework, specifically designed for Animal ReID. In the first stage, IndivAID\ntrains a text description generator by extracting individual semantic\ninformation from each image, generating both image-specific and\nindividual-specific textual descriptions that fully capture the diverse visual\nconcepts of each individual across animal images. In the second stage, IndivAID\nrefines its learning of visual concepts by dynamically incorporating\nindividual-specific textual descriptions with an integrated attention module to\nfurther highlight discriminative features of individuals for Animal ReID.\nEvaluation against state-of-the-art methods across eight benchmark datasets and\na real-world Stoat dataset demonstrates IndivAID's effectiveness and\napplicability. Code is available at \\url{https://github.com/ywu840/IndivAID}.\n","authors":["Yihao Wu","Di Zhao","Jingfeng Zhang","Yun Sing Koh"],"pdf_url":"https://arxiv.org/pdf/2410.22927v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.22922v1","updated":"2024-10-30T11:27:06Z","published":"2024-10-30T11:27:06Z","title":"High-Fidelity Document Stain Removal via A Large-Scale Real-World\n Dataset and A Memory-Augmented Transformer","summary":" Document images are often degraded by various stains, significantly impacting\ntheir readability and hindering downstream applications such as document\ndigitization and analysis. The absence of a comprehensive stained document\ndataset has limited the effectiveness of existing document enhancement methods\nin removing stains while preserving fine-grained details. To address this\nchallenge, we construct StainDoc, the first large-scale, high-resolution\n($2145\\times2245$) dataset specifically designed for document stain removal.\nStainDoc comprises over 5,000 pairs of stained and clean document images across\nmultiple scenes. This dataset encompasses a diverse range of stain types,\nseverities, and document backgrounds, facilitating robust training and\nevaluation of document stain removal algorithms. Furthermore, we propose\nStainRestorer, a Transformer-based document stain removal approach.\nStainRestorer employs a memory-augmented Transformer architecture that captures\nhierarchical stain representations at part, instance, and semantic levels via\nthe DocMemory module. The Stain Removal Transformer (SRTransformer) leverages\nthese feature representations through a dual attention mechanism: an enhanced\nspatial attention with an expanded receptive field, and a channel attention\ncaptures channel-wise feature importance. This combination enables precise\nstain removal while preserving document content integrity. Extensive\nexperiments demonstrate StainRestorer's superior performance over\nstate-of-the-art methods on the StainDoc dataset and its variants\nStainDoc\\_Mark and StainDoc\\_Seal, establishing a new benchmark for document\nstain removal. 
Our work highlights the potential of memory-augmented\nTransformers for this task and contributes a valuable dataset to advance future\nresearch.\n","authors":["Mingxian Li","Hao Sun","Yingtie Lei","Xiaofeng Zhang","Yihang Dong","Yilin Zhou","Zimeng Li","Xuhang Chen"],"pdf_url":"https://arxiv.org/pdf/2410.22922v1.pdf","comment":"Accepted by WACV2025"},{"id":"http://arxiv.org/abs/2410.21759v2","updated":"2024-10-30T11:20:46Z","published":"2024-10-29T05:50:17Z","title":"IntLoRA: Integral Low-rank Adaptation of Quantized Diffusion Models","summary":" Fine-tuning large-scale text-to-image diffusion models for various downstream\ntasks has yielded impressive results. However, the heavy computational burdens\nof tuning large models prevent personal customization. Recent advances have\nattempted to employ parameter-efficient fine-tuning (PEFT) techniques to adapt\nthe floating-point (FP) or quantized pre-trained weights. Nonetheless, the\nadaptation parameters in existing works are still restricted to FP arithmetic,\nhindering hardware-friendly acceleration. In this work, we propose IntLoRA, to\nfurther push the efficiency limits by using integer type (INT) low-rank\nparameters to adapt the quantized diffusion models. By working in the integer\narithmetic, our IntLoRA offers three key advantages: (i) for fine-tuning, the\npre-trained weights are quantized, reducing memory usage; (ii) for storage,\nboth pre-trained and low-rank weights are in INT which consumes less disk\nspace; (iii) for inference, IntLoRA weights can be naturally merged into\nquantized pre-trained weights through efficient integer multiplication or\nbit-shifting, eliminating additional post-training quantization. Extensive\nexperiments demonstrate that IntLoRA can achieve performance on par with or\neven superior to the vanilla LoRA, accompanied by significant efficiency\nimprovements. Code is available at \\url{https://github.com/csguoh/IntLoRA}.\n","authors":["Hang Guo","Yawei Li","Tao Dai","Shu-Tao Xia","Luca Benini"],"pdf_url":"https://arxiv.org/pdf/2410.21759v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2410.22909v1","updated":"2024-10-30T11:06:23Z","published":"2024-10-30T11:06:23Z","title":"UniRiT: Towards Few-Shot Non-Rigid Point Cloud Registration","summary":" Non-rigid point cloud registration is a critical challenge in 3D scene\nunderstanding, particularly in surgical navigation. Although existing methods\nachieve excellent performance when trained on large-scale, high-quality\ndatasets, these datasets are prohibitively expensive to collect and annotate,\ne.g., organ data in authentic medical scenarios. With insufficient training\nsamples and data noise, existing methods degrade significantly since non-rigid\npatterns are more flexible and complicated than rigid ones, and the\ndistributions across samples are more distinct, leading to higher difficulty in\nrepresentation learning with few data. In this work, we aim to deal with this\nchallenging few-shot non-rigid point cloud registration problem. Based on the\nobservation that complex non-rigid transformation patterns can be decomposed\ninto rigid and small non-rigid transformations, we propose a novel and\neffective framework, UniRiT. UniRiT adopts a two-step registration strategy\nthat first aligns the centroids of the source and target point clouds and then\nrefines the registration with non-rigid transformations, thereby significantly\nreducing the problem complexity. 
To validate the performance of UniRiT on\nreal-world datasets, we introduce a new dataset, MedMatch3D, which consists of\nreal human organs and exhibits high variability in sample distribution. We\nfurther establish a new challenging benchmark for few-shot non-rigid\nregistration. Extensive empirical results demonstrate that UniRiT achieves\nstate-of-the-art performance on MedMatch3D, improving the existing best\napproach by 94.22%.\n","authors":["Geng Li","Haozhi Cao","Mingyang Liu","Chenxi Jiang","Jianfei Yang"],"pdf_url":"https://arxiv.org/pdf/2410.22909v1.pdf","comment":"21 pages, 14 figures, under review"},{"id":"http://arxiv.org/abs/2410.22901v1","updated":"2024-10-30T11:00:51Z","published":"2024-10-30T11:00:51Z","title":"HelloMeme: Integrating Spatial Knitting Attentions to Embed High-Level\n and Fidelity-Rich Conditions in Diffusion Models","summary":" We propose an effective method for inserting adapters into text-to-image\nfoundation models, which enables the execution of complex downstream tasks\nwhile preserving the generalization ability of the base model. The core idea of\nthis method is to optimize the attention mechanism related to 2D feature maps,\nwhich enhances the performance of the adapter. This approach was validated on\nthe task of meme video generation and achieved significant results. We hope\nthis work can provide insights for post-training tasks of large text-to-image\nmodels. Additionally, as this method demonstrates good compatibility with SD1.5\nderivative models, it holds certain value for the open-source community.\nTherefore, we will release the related code\n(\\url{https://songkey.github.io/hellomeme}).\n","authors":["Shengkai Zhang","Nianhong Jiao","Tian Li","Chaojie Yang","Chenhui Xue","Boya Niu","Jun Gao"],"pdf_url":"https://arxiv.org/pdf/2410.22901v1.pdf","comment":"11 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2410.22899v1","updated":"2024-10-30T10:58:21Z","published":"2024-10-30T10:58:21Z","title":"Wormhole Loss for Partial Shape Matching","summary":" When matching parts of a surface to its whole, a fundamental question arises:\nWhich points should be included in the matching process? The issue is\nintensified when using isometry to measure similarity, as it requires the\nvalidation of whether distances measured between pairs of surface points should\ninfluence the matching process. The approach we propose treats surfaces as\nmanifolds equipped with geodesic distances, and addresses the partial shape\nmatching challenge by introducing a novel criterion to meticulously search for\nconsistent distances between pairs of points. The new criterion explores the\nrelation between intrinsic geodesic distances between the points, geodesic\ndistances between the points and surface boundaries, and extrinsic distances\nbetween boundary points measured in the embedding space. 
It is shown to be less\nrestrictive compared to previous measures and achieves state-of-the-art results\nwhen used as a loss function in training networks for partial shape matching.\n","authors":["Amit Bracha","Thomas Dagès","Ron Kimmel"],"pdf_url":"https://arxiv.org/pdf/2410.22899v1.pdf","comment":"Accepted for publication at the conference on Neural Information\n Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2410.22898v1","updated":"2024-10-30T10:57:46Z","published":"2024-10-30T10:57:46Z","title":"YOLOv11 for Vehicle Detection: Advancements, Performance, and\n Applications in Intelligent Transportation Systems","summary":" Accurate vehicle detection is essential for the development of intelligent\ntransportation systems, autonomous driving, and traffic monitoring. This paper\npresents a detailed analysis of YOLO11, the latest advancement in the YOLO\nseries of deep learning models, focusing exclusively on vehicle detection\ntasks. Building upon the success of its predecessors, YOLO11 introduces\narchitectural improvements designed to enhance detection speed, accuracy, and\nrobustness in complex environments. Using a comprehensive dataset comprising\nmultiple vehicle types-cars, trucks, buses, motorcycles, and bicycles we\nevaluate YOLO11's performance using metrics such as precision, recall, F1\nscore, and mean average precision (mAP). Our findings demonstrate that YOLO11\nsurpasses previous versions (YOLOv8 and YOLOv10) in detecting smaller and more\noccluded vehicles while maintaining a competitive inference time, making it\nwell-suited for real-time applications. Comparative analysis shows significant\nimprovements in the detection of complex vehicle geometries, further\ncontributing to the development of efficient and scalable vehicle detection\nsystems. This research highlights YOLO11's potential to enhance autonomous\nvehicle performance and traffic monitoring systems, offering insights for\nfuture developments in the field.\n","authors":["Mujadded Al Rabbani Alif"],"pdf_url":"https://arxiv.org/pdf/2410.22898v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2410.22888v1","updated":"2024-10-30T10:33:10Z","published":"2024-10-30T10:33:10Z","title":"Effective and Efficient Adversarial Detection for Vision-Language Models\n via A Single Vector","summary":" Visual Language Models (VLMs) are vulnerable to adversarial attacks,\nespecially those from adversarial images, which is however under-explored in\nliterature. To facilitate research on this critical safety problem, we first\nconstruct a new laRge-scale Adervsarial images dataset with Diverse hArmful\nResponses (RADAR), given that existing datasets are either small-scale or only\ncontain limited types of harmful responses. With the new RADAR dataset, we\nfurther develop a novel and effective iN-time Embedding-based AdveRSarial Image\nDEtection (NEARSIDE) method, which exploits a single vector that distilled from\nthe hidden states of VLMs, which we call the attacking direction, to achieve\nthe detection of adversarial images against benign ones in the input. Extensive\nexperiments with two victim VLMs, LLaVA and MiniGPT-4, well demonstrate the\neffectiveness, efficiency, and cross-model transferrability of our proposed\nmethod. 
Our code is available at https://github.com/mob-scu/RADAR-NEARSIDE\n","authors":["Youcheng Huang","Fengbin Zhu","Jingkun Tang","Pan Zhou","Wenqiang Lei","Jiancheng Lv","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.22888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03492v3","updated":"2024-10-30T10:32:18Z","published":"2024-02-05T20:08:53Z","title":"Beyond Strong labels: Weakly-supervised Learning Based on Gaussian\n Pseudo Labels for The Segmentation of Ellipse-like Vascular Structures in\n Non-contrast CTs","summary":" Deep-learning-based automated segmentation of vascular structures in\npreoperative CT scans contributes to computer-assisted diagnosis and\nintervention procedure in vascular diseases. While CT angiography (CTA) is the\ncommon standard, non-contrast CT imaging is significant as a contrast-risk-free\nalternative, avoiding complications associated with contrast agents. However,\nthe challenges of labor-intensive labeling and high labeling variability due to\nthe ambiguity of vascular boundaries hinder conventional strong-label-based,\nfully-supervised learning in non-contrast CTs. This paper introduces a\nweakly-supervised framework using ellipses' topology in slices, including 1) an\nefficient annotation process based on predefined standards, 2) ellipse-fitting\nprocessing, 3) the generation of 2D Gaussian heatmaps serving as pseudo labels,\n4) a training process through a combination of voxel reconstruction loss and\ndistribution loss with the pseudo labels. We assess the effectiveness of the\nproposed method on one local and two public datasets comprising non-contrast CT\nscans, particularly focusing on the abdominal aorta. On the local dataset, our\nweakly-supervised learning approach based on pseudo labels outperforms\nstrong-label-based fully-supervised learning (1.54\\% of Dice score on average),\nreducing labeling time by around 82.0\\%. The efficiency in generating pseudo\nlabels allows the inclusion of label-agnostic external data in the training\nset, leading to an additional improvement in performance (2.74\\% of Dice score\non average) with a reduction of 66.3\\% labeling time, where the labeling time\nremains considerably less than that of strong labels. On the public dataset,\nthe pseudo labels achieve an overall improvement of 1.95\\% in Dice score for 2D\nmodels while a reduction of 11.65 voxel spacing in Hausdorff distance for 3D\nmodel.\n","authors":["Qixiang Ma","Antoine Łucas","Huazhong Shu","Adrien Kaladji","Pascal Haigron"],"pdf_url":"https://arxiv.org/pdf/2402.03492v3.pdf","comment":"Accepted by journal of Medical Image Analysis"},{"id":"http://arxiv.org/abs/2410.22883v1","updated":"2024-10-30T10:25:22Z","published":"2024-10-30T10:25:22Z","title":"Adaptive Paradigm Synergy: Can a Cross-Paradigm Objective Enhance\n Long-Tailed Learning?","summary":" Self-supervised learning (SSL) has achieved impressive results across several\ncomputer vision tasks, even rivaling supervised methods. However, its\nperformance degrades on real-world datasets with long-tailed distributions due\nto difficulties in capturing inherent class imbalances. Although supervised\nlong-tailed learning offers significant insights, the absence of labels in SSL\nprevents direct transfer of these strategies.To bridge this gap, we introduce\nAdaptive Paradigm Synergy (APS), a cross-paradigm objective that seeks to unify\nthe strengths of both paradigms. 
Our approach reexamines contrastive learning\nfrom a spatial structure perspective, dynamically adjusting the uniformity of\nlatent space structure through adaptive temperature tuning. Furthermore, we\ndraw on a re-weighting strategy from supervised learning to compensate for the\nshortcomings of temperature adjustment in explicit quantity\nperception.Extensive experiments on commonly used long-tailed datasets\ndemonstrate that APS improves performance effectively and efficiently. Our\nfindings reveal the potential for deeper integration between supervised and\nself-supervised learning, paving the way for robust models that handle\nreal-world class imbalance.\n","authors":["Haowen Xiao","Guanghui Liu","Xinyi Gao","Yang Li","Fengmao Lv","Jielei Chu"],"pdf_url":"https://arxiv.org/pdf/2410.22883v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.22881v1","updated":"2024-10-30T10:21:23Z","published":"2024-10-30T10:21:23Z","title":"SFA-UNet: More Attention to Multi-Scale Contrast and Contextual\n Information in Infrared Small Object Segmentation","summary":" Computer vision researchers have extensively worked on fundamental infrared\nvisual recognition for the past few decades. Among various approaches, deep\nlearning has emerged as the most promising candidate. However, Infrared Small\nObject Segmentation (ISOS) remains a major focus due to several challenges\nincluding: 1) the lack of effective utilization of local contrast and global\ncontextual information; 2) the potential loss of small objects in deep models;\nand 3) the struggling to capture fine-grained details and ignore noise. To\naddress these challenges, we propose a modified U-Net architecture, named\nSFA-UNet, by combining Scharr Convolution (SC) and Fast Fourier Convolution\n(FFC) in addition to vertical and horizontal Attention gates (AG) into UNet.\nSFA-UNet utilizes double convolution layers with the addition of SC and FFC in\nits encoder and decoder layers. SC helps to learn the foreground-to-background\ncontrast information whereas FFC provide multi-scale contextual information\nwhile mitigating the small objects vanishing problem. Additionally, the\nintroduction of vertical AGs in encoder layers enhances the model's focus on\nthe targeted object by ignoring irrelevant regions. We evaluated the proposed\napproach on publicly available, SIRST and IRSTD datasets, and achieved superior\nperformance by an average 0.75% with variance of 0.025 of all combined metrics\nin multiple runs as compared to the existing state-of-the-art methods\n","authors":["Imad Ali Shah","Fahad Mumtaz Malik","Muhammad Waqas Ashraf"],"pdf_url":"https://arxiv.org/pdf/2410.22881v1.pdf","comment":"Accepted and Presented at PRIP 2023"},{"id":"http://arxiv.org/abs/2410.06613v2","updated":"2024-10-30T10:21:13Z","published":"2024-10-09T07:09:29Z","title":"ES-Gaussian: Gaussian Splatting Mapping via Error Space-Based Gaussian\n Completion","summary":" Accurate and affordable indoor 3D reconstruction is critical for effective\nrobot navigation and interaction. Traditional LiDAR-based mapping provides high\nprecision but is costly, heavy, and power-intensive, with limited ability for\nnovel view rendering. Vision-based mapping, while cost-effective and capable of\ncapturing visual data, often struggles with high-quality 3D reconstruction due\nto sparse point clouds. We propose ES-Gaussian, an end-to-end system using a\nlow-altitude camera and single-line LiDAR for high-quality 3D indoor\nreconstruction. 
Our system features Visual Error Construction (VEC) to enhance\nsparse point clouds by identifying and correcting areas with insufficient\ngeometric detail from 2D error maps. Additionally, we introduce a novel 3DGS\ninitialization method guided by single-line LiDAR, overcoming the limitations\nof traditional multi-view setups and enabling effective reconstruction in\nresource-constrained environments. Extensive experimental results on our new\nDreame-SR dataset and a publicly available dataset demonstrate that ES-Gaussian\noutperforms existing methods, particularly in challenging scenarios. The\nproject page is available at https://chenlu-china.github.io/ES-Gaussian/.\n","authors":["Lu Chen","Yingfu Zeng","Haoang Li","Zhitao Deng","Jiafu Yan","Zhenjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.06613v2.pdf","comment":"This preprint has been withdrawn due to concerns regarding the\n originality of certain technical elements, as well as its basis in a company\n project report that was intended solely for internal discussions. To avoid\n any potential misunderstandings, we have decided to withdraw this submission\n from public access. We apologize for any confusion this may have caused"},{"id":"http://arxiv.org/abs/2410.17751v2","updated":"2024-10-30T10:13:18Z","published":"2024-10-23T10:28:17Z","title":"VISAGE: Video Synthesis using Action Graphs for Surgery","summary":" Surgical data science (SDS) is a field that analyzes patient data before,\nduring, and after surgery to improve surgical outcomes and skills. However,\nsurgical data is scarce, heterogeneous, and complex, which limits the\napplicability of existing machine learning methods. In this work, we introduce\nthe novel task of future video generation in laparoscopic surgery. This task\ncan augment and enrich the existing surgical data and enable various\napplications, such as simulation, analysis, and robot-aided surgery.\nUltimately, it involves not only understanding the current state of the\noperation but also accurately predicting the dynamic and often unpredictable\nnature of surgical procedures. Our proposed method, VISAGE (VIdeo Synthesis\nusing Action Graphs for Surgery), leverages the power of action scene graphs to\ncapture the sequential nature of laparoscopic procedures and utilizes diffusion\nmodels to synthesize temporally coherent video sequences. VISAGE predicts the\nfuture frames given only a single initial frame, and the action graph triplets.\nBy incorporating domain-specific knowledge through the action graph, VISAGE\nensures the generated videos adhere to the expected visual and motion patterns\nobserved in real laparoscopic procedures. The results of our experiments\ndemonstrate high-fidelity video generation for laparoscopy procedures, which\nenables various applications in SDS.\n","authors":["Yousef Yeganeh","Rachmadio Lazuardi","Amir Shamseddin","Emine Dari","Yash Thirani","Nassir Navab","Azade Farshad"],"pdf_url":"https://arxiv.org/pdf/2410.17751v2.pdf","comment":"Accepted at MICCAI 2024 Embodied AI and Robotics for HealTHcare\n (EARTH) Workshop"},{"id":"http://arxiv.org/abs/2410.22866v1","updated":"2024-10-30T10:03:55Z","published":"2024-10-30T10:03:55Z","title":"Towards Population Scale Testis Volume Segmentation in DIXON MRI","summary":" Testis size is known to be one of the main predictors of male fertility,\nusually assessed in clinical workup via palpation or imaging. Despite its\npotential, population-level evaluation of testicular volume using imaging\nremains underexplored. 
Previous studies, limited by small and biased datasets,\nhave demonstrated the feasibility of machine learning for testis volume\nsegmentation. This paper presents an evaluation of segmentation methods for\ntesticular volume using Magnet Resonance Imaging data from the UKBiobank. The\nbest model achieves a median dice score of $0.87$, compared to median dice\nscore of $0.83$ for human interrater reliability on the same dataset, enabling\nlarge-scale annotation on a population scale for the first time. Our overall\naim is to provide a trained model, comparative baseline methods, and annotated\ntraining data to enhance accessibility and reproducibility in testis MRI\nsegmentation research.\n","authors":["Jan Ernsting","Phillip Nikolas Beeken","Lynn Ogoniak","Jacqueline Kockwelp","Tim Hahn","Alexander Siegfried Busch","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2410.22866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22865v1","updated":"2024-10-30T10:02:42Z","published":"2024-10-30T10:02:42Z","title":"Prune and Repaint: Content-Aware Image Retargeting for any Ratio","summary":" Image retargeting is the task of adjusting the aspect ratio of images to suit\ndifferent display devices or presentation environments. However, existing\nretargeting methods often struggle to balance the preservation of key semantics\nand image quality, resulting in either deformation or loss of important\nobjects, or the introduction of local artifacts such as discontinuous pixels\nand inconsistent regenerated content. To address these issues, we propose a\ncontent-aware retargeting method called PruneRepaint. It incorporates semantic\nimportance for each pixel to guide the identification of regions that need to\nbe pruned or preserved in order to maintain key semantics. Additionally, we\nintroduce an adaptive repainting module that selects image regions for\nrepainting based on the distribution of pruned pixels and the proportion\nbetween foreground size and target aspect ratio, thus achieving local\nsmoothness after pruning. By focusing on the content and structure of the\nforeground, our PruneRepaint approach adaptively avoids key content loss and\ndeformation, while effectively mitigating artifacts with local repainting. We\nconduct experiments on the public RetargetMe benchmark and demonstrate through\nobjective experimental results and subjective user studies that our method\noutperforms previous approaches in terms of preserving semantics and\naesthetics, as well as better generalization across diverse aspect ratios.\nCodes will be available at https://github.com/fhshen2022/PruneRepaint.\n","authors":["Feihong Shen","Chao Li","Yifeng Geng","Yongjian Deng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2410.22865v1.pdf","comment":"NeurIPS24"},{"id":"http://arxiv.org/abs/2410.22862v1","updated":"2024-10-30T09:55:30Z","published":"2024-10-30T09:55:30Z","title":"AtGCN: A Graph Convolutional Network For Ataxic Gait Detection","summary":" Video-based gait analysis can be defined as the task of diagnosing\npathologies, such as ataxia, using videos of patients walking in front of a\ncamera. This paper presents a graph convolution network called AtGCN for\ndetecting ataxic gait and identifying its severity using 2D videos. The problem\nis especially challenging as the deviation of an ataxic gait from a healthy\ngait is very subtle. The datasets for ataxic gait detection are also quite\nsmall, with the largest dataset having only 149 videos. 
The paper addresses the\nfirst problem using special spatiotemporal graph convolution that successfully\ncaptures important gait-related features. To handle the small dataset size, a\ndeep spatiotemporal graph convolution network pre-trained on an action\nrecognition dataset is systematically truncated and then fine-tuned on the\nataxia dataset to obtain the AtGCN model. The paper also presents an\naugmentation strategy that segments a video sequence into multiple gait cycles.\nThe proposed AtGCN model then operates on a graph of body part locations\nbelonging to a single gait cycle. The evaluation results support the strength\nof the proposed AtGCN model, as it outperforms the state-of-the-art in\ndetection and severity prediction with an accuracy of 93.46% and a MAE of\n0.4169, respectively.\n","authors":["Karan Bania","Tanmay Verlekar"],"pdf_url":"https://arxiv.org/pdf/2410.22862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10142v2","updated":"2024-10-30T09:52:38Z","published":"2024-07-14T10:26:38Z","title":"PARE-Net: Position-Aware Rotation-Equivariant Networks for Robust Point\n Cloud Registration","summary":" Learning rotation-invariant distinctive features is a fundamental requirement\nfor point cloud registration. Existing methods often use rotation-sensitive\nnetworks to extract features, while employing rotation augmentation to learn an\napproximate invariant mapping rudely. This makes networks fragile to rotations,\noverweight, and hinders the distinctiveness of features. To tackle these\nproblems, we propose a novel position-aware rotation-equivariant network, for\nefficient, light-weighted, and robust registration. The network can provide a\nstrong model inductive bias to learn rotation-equivariant/invariant features,\nthus addressing the aforementioned limitations. To further improve the\ndistinctiveness of descriptors, we propose a position-aware convolution, which\ncan better learn spatial information of local structures. Moreover, we also\npropose a feature-based hypothesis proposer. It leverages rotation-equivariant\nfeatures that encode fine-grained structure orientations to generate reliable\nmodel hypotheses. Each correspondence can generate a hypothesis, thus it is\nmore efficient than classic estimators that require multiple reliable\ncorrespondences. Accordingly, a contrastive rotation loss is presented to\nenhance the robustness of rotation-equivariant features against data\ndegradation. Extensive experiments on indoor and outdoor datasets demonstrate\nthat our method significantly outperforms the SOTA methods in terms of\nregistration recall while being lightweight and keeping a fast speed. Moreover,\nexperiments on rotated datasets demonstrate its robustness against rotation\nvariations. Code is available at https://github.com/yaorz97/PARENet.\n","authors":["Runzhao Yao","Shaoyi Du","Wenting Cui","Canhui Tang","Chengwu Yang"],"pdf_url":"https://arxiv.org/pdf/2407.10142v2.pdf","comment":"Accepted by ECCV 2025"},{"id":"http://arxiv.org/abs/2410.22857v1","updated":"2024-10-30T09:42:47Z","published":"2024-10-30T09:42:47Z","title":"DAVINCI: A Single-Stage Architecture for Constrained CAD Sketch\n Inference","summary":" This work presents DAVINCI, a unified architecture for single-stage\nComputer-Aided Design (CAD) sketch parameterization and constraint inference\ndirectly from raster sketch images. By jointly learning both outputs, DAVINCI\nminimizes error accumulation and enhances the performance of constrained CAD\nsketch inference. 
Notably, DAVINCI achieves state-of-the-art results on the\nlarge-scale SketchGraphs dataset, demonstrating effectiveness on both precise\nand hand-drawn raster CAD sketches. To reduce DAVINCI's reliance on large-scale\nannotated datasets, we explore the efficacy of CAD sketch augmentations. We\nintroduce Constraint-Preserving Transformations (CPTs), i.e. random\npermutations of the parametric primitives of a CAD sketch that preserve its\nconstraints. This data augmentation strategy allows DAVINCI to achieve\nreasonable performance when trained with only 0.1% of the SketchGraphs dataset.\nFurthermore, this work contributes a new version of SketchGraphs, augmented\nwith CPTs. The newly introduced CPTSketchGraphs dataset includes 80 million\nCPT-augmented sketches, thus providing a rich resource for future research in\nthe CAD sketch domain.\n","authors":["Ahmet Serdar Karadeniz","Dimitrios Mallis","Nesryne Mejri","Kseniya Cherenkova","Anis Kacem","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2410.22857v1.pdf","comment":"Accepted at BMVC 2024"},{"id":"http://arxiv.org/abs/2406.05774v2","updated":"2024-10-30T09:40:39Z","published":"2024-06-09T13:15:43Z","title":"VCR-GauS: View Consistent Depth-Normal Regularizer for Gaussian Surface\n Reconstruction","summary":" Although 3D Gaussian Splatting has been widely studied because of its\nrealistic and efficient novel-view synthesis, it is still challenging to\nextract a high-quality surface from the point-based representation. Previous\nworks improve the surface by incorporating geometric priors from the\noff-the-shelf normal estimator. However, there are two main limitations: 1)\nSupervising normals rendered from 3D Gaussians effectively updates the rotation\nparameter but is less effective for other geometric parameters; 2) The\ninconsistency of predicted normal maps across multiple views may lead to severe\nreconstruction artifacts. In this paper, we propose a Depth-Normal regularizer\nthat directly couples normal with other geometric parameters, leading to full\nupdates of the geometric parameters from normal regularization. We further\npropose a confidence term to mitigate inconsistencies of normal predictions\nacross multiple views. Moreover, we also introduce a densification and\nsplitting strategy to regularize the size and distribution of 3D Gaussians for\nmore accurate surface modeling. Compared with Gaussian-based baselines,\nexperiments show that our approach obtains better reconstruction quality and\nmaintains competitive appearance quality at faster training speed and 100+ FPS\nrendering.\n","authors":["Hanlin Chen","Fangyin Wei","Chen Li","Tianxin Huang","Yunsong Wang","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2406.05774v2.pdf","comment":"Project page: https://hlinchen.github.io/projects/VCR-GauS/"},{"id":"http://arxiv.org/abs/2410.22837v1","updated":"2024-10-30T09:17:23Z","published":"2024-10-30T09:17:23Z","title":"SFDFusion: An Efficient Spatial-Frequency Domain Fusion Network for\n Infrared and Visible Image Fusion","summary":" Infrared and visible image fusion aims to utilize the complementary\ninformation from two modalities to generate fused images with prominent targets\nand rich texture details. Most existing algorithms only perform pixel-level or\nfeature-level fusion from different modalities in the spatial domain. They\nusually overlook the information in the frequency domain, and some of them\nsuffer from inefficiency due to excessively complex structures. 
To tackle these\nchallenges, this paper proposes an efficient Spatial-Frequency Domain Fusion\n(SFDFusion) network for infrared and visible image fusion. First, we propose a\nDual-Modality Refinement Module (DMRM) to extract complementary information.\nThis module extracts useful information from both the infrared and visible\nmodalities in the spatial domain and enhances fine-grained spatial details.\nNext, to introduce frequency domain information, we construct a Frequency\nDomain Fusion Module (FDFM) that transforms the spatial domain to the frequency\ndomain through Fast Fourier Transform (FFT) and then integrates frequency\ndomain information. Additionally, we design a frequency domain fusion loss to\nprovide guidance for the fusion process. Extensive experiments on public\ndatasets demonstrate that our method produces fused images with significant\nadvantages in various fusion metrics and visual effects. Furthermore, our\nmethod demonstrates high efficiency in image fusion and good performance on\ndownstream detection tasks, thereby satisfying the real-time demands of\nadvanced visual tasks.\n","authors":["Kun Hu","Qingle Zhang","Maoxun Yuan","Yitian Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.22837v1.pdf","comment":"accept in ECAI 2024"},{"id":"http://arxiv.org/abs/2410.22830v1","updated":"2024-10-30T09:14:13Z","published":"2024-10-30T09:14:13Z","title":"Latent Diffusion, Implicit Amplification: Efficient Continuous-Scale\n Super-Resolution for Remote Sensing Images","summary":" Recent advancements in diffusion models have significantly improved\nperformance in super-resolution (SR) tasks. However, previous research often\noverlooks the fundamental differences between SR and general image generation.\nGeneral image generation involves creating images from scratch, while SR\nfocuses specifically on enhancing existing low-resolution (LR) images by adding\ntypically missing high-frequency details. This oversight not only increases the\ntraining difficulty but also limits their inference efficiency. Furthermore,\nprevious diffusion-based SR methods are typically trained and inferred at fixed\ninteger scale factors, lacking flexibility to meet the needs of up-sampling\nwith non-integer scale factors. To address these issues, this paper proposes an\nefficient and elastic diffusion-based SR model (E$^2$DiffSR), specially\ndesigned for continuous-scale SR in remote sensing imagery. E$^2$DiffSR employs\na two-stage latent diffusion paradigm. During the first stage, an autoencoder\nis trained to capture the differential priors between high-resolution (HR) and\nLR images. The encoder intentionally ignores the existing LR content to\nalleviate the encoding burden, while the decoder introduces an SR branch\nequipped with a continuous scale upsampling module to accomplish the\nreconstruction under the guidance of the differential prior. In the second\nstage, a conditional diffusion model is learned within the latent space to\npredict the true differential prior encoding. Experimental results demonstrate\nthat E$^2$DiffSR achieves superior objective metrics and visual quality\ncompared to the state-of-the-art SR methods. 
Additionally, it reduces the\ninference time of diffusion-based SR methods to a level comparable to that of\nnon-diffusion methods.\n","authors":["Hanlin Wu","Jiangwei Mo","Xiaohui Sun","Jie Ma"],"pdf_url":"https://arxiv.org/pdf/2410.22830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22829v1","updated":"2024-10-30T09:11:25Z","published":"2024-10-30T09:11:25Z","title":"Situational Scene Graph for Structured Human-centric Situation\n Understanding","summary":" Graph based representation has been widely used in modelling spatio-temporal\nrelationships in video understanding. Although effective, existing graph-based\napproaches focus on capturing the human-object relationships while ignoring\nfine-grained semantic properties of the action components. These semantic\nproperties are crucial for understanding the current situation, such as where\ndoes the action takes place, what tools are used and functional properties of\nthe objects. In this work, we propose a graph-based representation called\nSituational Scene Graph (SSG) to encode both human-object relationships and the\ncorresponding semantic properties. The semantic details are represented as\npredefined roles and values inspired by situation frame, which is originally\ndesigned to represent a single action. Based on our proposed representation, we\nintroduce the task of situational scene graph generation and propose a\nmulti-stage pipeline Interactive and Complementary Network (InComNet) to\naddress the task. Given that the existing datasets are not applicable to the\ntask, we further introduce a SSG dataset whose annotations consist of semantic\nrole-value frames for human, objects and verb predicates of human-object\nrelations. Finally, we demonstrate the effectiveness of our proposed SSG\nrepresentation by testing on different downstream tasks. Experimental results\nshow that the unified representation can not only benefit predicate\nclassification and semantic role-value classification, but also benefit\nreasoning tasks on human-centric situation understanding. We will release the\ncode and the dataset soon.\n","authors":["Chinthani Sugandhika","Chen Li","Deepu Rajan","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2410.22829v1.pdf","comment":"Accepted for WACV 2025"},{"id":"http://arxiv.org/abs/2410.20806v2","updated":"2024-10-30T09:07:27Z","published":"2024-10-28T07:54:07Z","title":"Transformer-Based Tooth Alignment Prediction With Occlusion And\n Collision Constraints","summary":" The planning of digital orthodontic treatment requires providing tooth\nalignment, which not only consumes a lot of time and labor to determine\nmanually but also relays clinical experiences heavily. In this work, we\nproposed a lightweight tooth alignment neural network based on\nSwin-transformer. We first re-organized 3D point clouds based on virtual arch\nlines and converted them into order-sorted multi-channel textures, which\nimproves the accuracy and efficiency simultaneously. We then designed two new\nocclusal loss functions that quantitatively evaluate the occlusal relationship\nbetween the upper and lower jaws. They are important clinical constraints,\nfirst introduced to the best of our knowledge, and lead to cutting-edge\nprediction accuracy. To train our network, we collected a large digital\northodontic dataset that has 591 clinical cases, including various complex\nclinical cases. This dataset will benefit the community after its release since\nthere is no open dataset so far. 
Furthermore, we also proposed two new\northodontic dataset augmentation methods considering tooth spatial distribution\nand occlusion. We evaluated our method with this dataset and extensive\nexperiments, including comparisons with STAT methods and ablation studies, and\ndemonstrate the high prediction accuracy of our method.\n","authors":["ZhenXing Dong","JiaZhou Chen","YangHui Xu"],"pdf_url":"https://arxiv.org/pdf/2410.20806v2.pdf","comment":"add key words and email information"},{"id":"http://arxiv.org/abs/2410.22817v1","updated":"2024-10-30T08:51:29Z","published":"2024-10-30T08:51:29Z","title":"Epipolar-Free 3D Gaussian Splatting for Generalizable Novel View\n Synthesis","summary":" Generalizable 3D Gaussian splitting (3DGS) can reconstruct new scenes from\nsparse-view observations in a feed-forward inference manner, eliminating the\nneed for scene-specific retraining required in conventional 3DGS. However,\nexisting methods rely heavily on epipolar priors, which can be unreliable in\ncomplex realworld scenes, particularly in non-overlapping and occluded regions.\nIn this paper, we propose eFreeSplat, an efficient feed-forward 3DGS-based\nmodel for generalizable novel view synthesis that operates independently of\nepipolar line constraints. To enhance multiview feature extraction with 3D\nperception, we employ a selfsupervised Vision Transformer (ViT) with cross-view\ncompletion pre-training on large-scale datasets. Additionally, we introduce an\nIterative Cross-view Gaussians Alignment method to ensure consistent depth\nscales across different views. Our eFreeSplat represents an innovative approach\nfor generalizable novel view synthesis. Different from the existing pure\ngeometry-free methods, eFreeSplat focuses more on achieving epipolar-free\nfeature matching and encoding by providing 3D priors through cross-view\npretraining. We evaluate eFreeSplat on wide-baseline novel view synthesis tasks\nusing the RealEstate10K and ACID datasets. Extensive experiments demonstrate\nthat eFreeSplat surpasses state-of-the-art baselines that rely on epipolar\npriors, achieving superior geometry reconstruction and novel view synthesis\nquality. Project page: https://tatakai1.github.io/efreesplat/.\n","authors":["Zhiyuan Min","Yawei Luo","Jianwen Sun","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2410.22817v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.22811v1","updated":"2024-10-30T08:43:18Z","published":"2024-10-30T08:43:18Z","title":"Adaptive Multi Scale Document Binarisation Using Vision Mamba","summary":" Enhancing and preserving the readability of document images, particularly\nhistorical ones, is crucial for effective document image analysis. Numerous\nmodels have been proposed for this task, including convolutional-based,\ntransformer-based, and hybrid convolutional-transformer architectures. While\nhybrid models address the limitations of purely convolutional or\ntransformer-based methods, they often suffer from issues like quadratic time\ncomplexity. In this work, we propose a Mamba-based architecture for document\nbinarisation, which efficiently handles long sequences by scaling linearly and\noptimizing memory usage. Additionally, we introduce novel modifications to the\nskip connections by incorporating Difference of Gaussians (DoG) features,\ninspired by conventional signal processing techniques. These multiscale\nhigh-frequency features enable the model to produce high-quality, detailed\noutputs.\n","authors":["Mohd. 
Azfar","Siddhant Bharadwaj","Ashwin Sasikumar"],"pdf_url":"https://arxiv.org/pdf/2410.22811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06645v3","updated":"2024-10-30T08:32:36Z","published":"2024-10-09T07:57:47Z","title":"Continual Learning in the Frequency Domain","summary":" Continual learning (CL) is designed to learn new tasks while preserving\nexisting knowledge. Replaying samples from earlier tasks has proven to be an\neffective method to mitigate the forgetting of previously acquired knowledge.\nHowever, the current research on the training efficiency of rehearsal-based\nmethods is insufficient, which limits the practical application of CL systems\nin resource-limited scenarios. The human visual system (HVS) exhibits varying\nsensitivities to different frequency components, enabling the efficient\nelimination of visually redundant information. Inspired by HVS, we propose a\nnovel framework called Continual Learning in the Frequency Domain (CLFD). To\nour knowledge, this is the first study to utilize frequency domain features to\nenhance the performance and efficiency of CL training on edge devices. For the\ninput features of the feature extractor, CLFD employs wavelet transform to map\nthe original input image into the frequency domain, thereby effectively\nreducing the size of input feature maps. Regarding the output features of the\nfeature extractor, CLFD selectively utilizes output features for distinct\nclasses for classification, thereby balancing the reusability and interference\nof output features based on the frequency domain similarity of the classes\nacross various tasks. Optimizing only the input and output features of the\nfeature extractor allows for seamless integration of CLFD with various\nrehearsal-based methods. Extensive experiments conducted in both cloud and edge\nenvironments demonstrate that CLFD consistently improves the performance of\nstate-of-the-art (SOTA) methods in both precision and training efficiency.\nSpecifically, CLFD can increase the accuracy of the SOTA CL method by up to\n6.83% and reduce the training time by 2.6$\\times$.\n","authors":["Ruiqi Liu","Boyu Diao","Libo Huang","Zijia An","Zhulin An","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2410.06645v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.22802v1","updated":"2024-10-30T08:31:48Z","published":"2024-10-30T08:31:48Z","title":"Wavelet Burst Accumulation for turbulence mitigation","summary":" In this paper, we investigate the extension of the recently proposed weighted\nFourier burst accumulation (FBA) method into the wavelet domain. The purpose of\nFBA is to reconstruct a clean and sharp image from a sequence of blurred\nframes. This concept lies in the construction of weights to amplify dominant\nfrequencies in the Fourier spectrum of each frame. The reconstructed image is\nthen obtained by taking the inverse Fourier transform of the average of all\nprocessed spectra. In this paper, we first suggest to replace the rigid\nregistration step used in the original algorithm by a non-rigid registration in\norder to be able to process sequences acquired through atmospheric turbulence.\nSecond, we propose to work in a wavelet domain instead of the Fourier one. This\nleads us to the construction of two types of algorithms. Finally, we propose an\nalternative approach to replace the weighting idea by an approach promoting the\nsparsity in the used space. 
Several experiments are provided to illustrate the\nefficiency of the proposed methods.\n","authors":["Jerome Gilles","Stanley Osher"],"pdf_url":"https://arxiv.org/pdf/2410.22802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11666v2","updated":"2024-10-30T08:28:42Z","published":"2024-10-15T14:53:07Z","title":"Degradation Oriented and Regularized Network for Real-World Depth\n Super-Resolution","summary":" Recent RGB-guided depth super-resolution methods have achieved impressive\nperformance under the assumption of fixed and known degradation (e.g., bicubic\ndownsampling). However, in real-world scenarios, captured depth data often\nsuffer from unconventional and unknown degradation due to sensor limitations\nand complex imaging environments (e.g., low reflective surfaces, varying\nillumination). Consequently, the performance of these methods significantly\ndeclines when real-world degradation deviate from their assumptions. In this\npaper, we propose the Degradation Oriented and Regularized Network (DORNet), a\nnovel framework designed to adaptively address unknown degradation in\nreal-world scenes through implicit degradation representations. Our approach\nbegins with the development of a self-supervised degradation learning strategy,\nwhich models the degradation representations of low-resolution depth data using\nrouting selection-based degradation regularization. To facilitate effective\nRGB-D fusion, we further introduce a degradation-oriented feature\ntransformation module that selectively propagates RGB content into the depth\ndata based on the learned degradation priors. Extensive experimental results on\nboth real and synthetic datasets demonstrate the superiority of our DORNet. The\ncode is available at https://github.com/yanzq95/DORNet.\n","authors":["Zhengxue Wang","Zhiqiang Yan","Jinshan Pan","Guangwei Gao","Kai Zhang","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2410.11666v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.22791v1","updated":"2024-10-30T08:10:18Z","published":"2024-10-30T08:10:18Z","title":"Open Turbulent Image Set (OTIS)","summary":" Long distance imaging is subject to the impact of the turbulent atmosphere.\nThis results into geometric distortions and some blur effect in the observed\nframes. Despite the existence of several turbulence mitigation algorithms in\nthe literature, no common dataset exists to objectively evaluate their\nefficiency. In this paper, we describe a new dataset called OTIS (Open\nTurbulent Images Set) which contains several sequences (either static or\ndynamic) acquired through the turbulent atmosphere. For almost all sequences,\nwe provide the corresponding groundtruth in order to make the comparison\nbetween algorithms easier. We also discuss possible metrics to perform such\ncomparisons.\n","authors":["Nicholas B. Ferrante","Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2410.22791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13219v2","updated":"2024-10-30T08:04:59Z","published":"2024-06-19T05:15:21Z","title":"MC-MKE: A Fine-Grained Multimodal Knowledge Editing Benchmark\n Emphasizing Modality Consistency","summary":" Multimodal large language models (MLLMs) are prone to non-factual or outdated\nknowledge issues, which can manifest as misreading and misrecognition errors\ndue to the complexity of multimodal knowledge. Previous benchmarks have not\nsystematically analyzed the performance of editing methods in correcting these\ntwo error types. 
To better represent and correct these errors, we decompose\nmultimodal knowledge into its visual and textual components. Different error\ntypes correspond to different editing formats, which edit distinct parts of the\nmultimodal knowledge. We present MC-MKE, a fine-grained Multimodal Knowledge\nEditing benchmark emphasizing Modality Consistency. Our benchmark facilitates\nindependent correction of misreading and misrecognition errors by editing the\ncorresponding knowledge component. We evaluate four multimodal knowledge\nediting methods on MC-MKE, revealing their limitations, particularly in terms\nof modality consistency. Our work highlights the challenges posed by multimodal\nknowledge editing and motivates further research in developing effective\ntechniques for this task.\n","authors":["Junzhe Zhang","Huixuan Zhang","Xunjian Yin","Baizhou Huang","Xu Zhang","Xinyu Hu","Xiaojun Wan"],"pdf_url":"https://arxiv.org/pdf/2406.13219v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22784v1","updated":"2024-10-30T07:59:52Z","published":"2024-10-30T07:59:52Z","title":"Contrastive Learning and Adversarial Disentanglement for\n Privacy-Preserving Task-Oriented Semantic Communications","summary":" Task-oriented semantic communication systems have emerged as a promising\napproach to achieving efficient and intelligent data transmission, where only\ninformation relevant to a specific task is communicated. However, existing\nmethods struggle to fully disentangle task-relevant and task-irrelevant\ninformation, leading to privacy concerns and subpar performance. To address\nthis, we propose an information-bottleneck method, named CLAD (contrastive\nlearning and adversarial disentanglement). CLAD leverages contrastive learning\nto effectively capture task-relevant features while employing adversarial\ndisentanglement to discard task-irrelevant information. Additionally, due to\nthe lack of reliable and reproducible methods to gain insight into the\ninformativeness and minimality of the encoded feature vectors, we introduce a\nnew technique to compute the information retention index (IRI), a comparative\nmetric used as a proxy for the mutual information between the encoded features\nand the input, reflecting the minimality of the encoded features. The IRI\nquantifies the minimality and informativeness of the encoded feature vectors\nacross different task-oriented communication techniques. Our extensive\nexperiments demonstrate that CLAD outperforms state-of-the-art baselines in\nterms of task performance, privacy preservation, and IRI. CLAD achieves a\npredictive performance improvement of around 2.5-3%, along with a 77-90%\nreduction in IRI and a 57-76% decrease in adversarial accuracy.\n","authors":["Omar Erak","Omar Alhussein","Wen Tong"],"pdf_url":"https://arxiv.org/pdf/2410.22784v1.pdf","comment":"Submitted to EEE Journal on Selected Areas in Communications (JSAC):\n Intelligent Communications for Real-Time Computer Vision (Comm4CV)"},{"id":"http://arxiv.org/abs/2409.19608v2","updated":"2024-10-30T07:52:32Z","published":"2024-09-29T08:18:50Z","title":"Causal Deciphering and Inpainting in Spatio-Temporal Dynamics via\n Diffusion Model","summary":" Spatio-temporal (ST) prediction has garnered a De facto attention in earth\nsciences, such as meteorological prediction, human mobility perception.\nHowever, the scarcity of data coupled with the high expenses involved in sensor\ndeployment results in notable data imbalances. 
Furthermore, models that are\nexcessively customized and devoid of causal connections further undermine the\ngeneralizability and interpretability. To this end, we establish a causal\nframework for ST predictions, termed CaPaint, which targets to identify causal\nregions in data and endow model with causal reasoning ability in a two-stage\nprocess. Going beyond this process, we utilize the back-door adjustment to\nspecifically address the sub-regions identified as non-causal in the upstream\nphase. Specifically, we employ a novel image inpainting technique. By using a\nfine-tuned unconditional Diffusion Probabilistic Model (DDPM) as the generative\nprior, we in-fill the masks defined as environmental parts, offering the\npossibility of reliable extrapolation for potential data distributions. CaPaint\novercomes the high complexity dilemma of optimal ST causal discovery models by\nreducing the data generation complexity from exponential to quasi-linear\nlevels. Extensive experiments conducted on five real-world ST benchmarks\ndemonstrate that integrating the CaPaint concept allows models to achieve\nimprovements ranging from 4.3% to 77.3%. Moreover, compared to traditional\nmainstream ST augmenters, CaPaint underscores the potential of diffusion models\nin ST enhancement, offering a novel paradigm for this field. Our project is\navailable at https://anonymous.4open.science/r/12345-DFCC.\n","authors":["Yifan Duan","Jian Zhao"," pengcheng","Junyuan Mao","Hao Wu","Jingyu Xu","Shilong Wang","Caoyuan Ma","Kai Wang","Kun Wang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2409.19608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22777v1","updated":"2024-10-30T07:46:06Z","published":"2024-10-30T07:46:06Z","title":"Bregman implementation of Meyer's $G-$norm for cartoon + textures\n decomposition","summary":" In this paper, we design a very simple algorithm based on Split Bregman\niterations to numerically solve the cartoon + textures decomposition model of\nMeyer. This results in a significant gain in speed compared to Chambolle's\nnonlinear projectors.\n","authors":["Jerome Gilles","Stanley Osher"],"pdf_url":"https://arxiv.org/pdf/2410.22777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22775v1","updated":"2024-10-30T07:43:29Z","published":"2024-10-30T07:43:29Z","title":"Diffusion Beats Autoregressive: An Evaluation of Compositional\n Generation in Text-to-Image Models","summary":" Text-to-image (T2I) generative models, such as Stable Diffusion and DALL-E,\nhave shown remarkable proficiency in producing high-quality, realistic, and\nnatural images from textual descriptions. However, these models sometimes fail\nto accurately capture all the details specified in the input prompts,\nparticularly concerning entities, attributes, and spatial relationships. This\nissue becomes more pronounced when the prompt contains novel or complex\ncompositions, leading to what are known as compositional generation failure\nmodes. Recently, a new open-source diffusion-based T2I model, FLUX, has been\nintroduced, demonstrating strong performance in high-quality image generation.\nAdditionally, autoregressive T2I models like LlamaGen have claimed competitive\nvisual quality performance compared to diffusion-based models. In this study,\nwe evaluate the compositional generation capabilities of these newly introduced\nmodels against established models using the T2I-CompBench benchmark. 
Our\nfindings reveal that LlamaGen, as a vanilla autoregressive model, is not yet on\npar with state-of-the-art diffusion models for compositional generation tasks\nunder the same criteria, such as model size and inference time. On the other\nhand, the open-source diffusion-based model FLUX exhibits compositional\ngeneration capabilities comparable to the state-of-the-art closed-source model\nDALL-E3.\n","authors":["Arash Marioriyad","Parham Rezaei","Mahdieh Soleymani Baghshah","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2410.22775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22771v1","updated":"2024-10-30T07:40:08Z","published":"2024-10-30T07:40:08Z","title":"FuseAnyPart: Diffusion-Driven Facial Parts Swapping via Multiple\n Reference Images","summary":" Facial parts swapping aims to selectively transfer regions of interest from\nthe source image onto the target image while maintaining the rest of the target\nimage unchanged. Most studies on face swapping designed specifically for\nfull-face swapping, are either unable or significantly limited when it comes to\nswapping individual facial parts, which hinders fine-grained and customized\ncharacter designs. However, designing such an approach specifically for facial\nparts swapping is challenged by a reasonable multiple reference feature fusion,\nwhich needs to be both efficient and effective. To overcome this challenge,\nFuseAnyPart is proposed to facilitate the seamless \"fuse-any-part\"\ncustomization of the face. In FuseAnyPart, facial parts from different people\nare assembled into a complete face in latent space within the Mask-based Fusion\nModule. Subsequently, the consolidated feature is dispatched to the\nAddition-based Injection Module for fusion within the UNet of the diffusion\nmodel to create novel characters. Extensive experiments qualitatively and\nquantitatively validate the superiority and robustness of FuseAnyPart. Source\ncodes are available at https://github.com/Thomas-wyh/FuseAnyPart.\n","authors":["Zheng Yu","Yaohua Wang","Siying Cui","Aixi Zhang","Wei-Long Zheng","Senzhang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.22771v1.pdf","comment":"Accepted by the NeurIPS 2024 (Spotlight). Homepage:\n https://thomas-wyh.github.io/"},{"id":"http://arxiv.org/abs/2401.14121v2","updated":"2024-10-30T07:24:42Z","published":"2024-01-25T12:04:53Z","title":"Incorporating Test-Time Optimization into Training with Dual Networks\n for Human Mesh Recovery","summary":" Human Mesh Recovery (HMR) is the task of estimating a parameterized 3D human\nmesh from an image. There is a kind of methods first training a regression\nmodel for this problem, then further optimizing the pretrained regression model\nfor any specific sample individually at test time. However, the pretrained\nmodel may not provide an ideal optimization starting point for the test-time\noptimization. Inspired by meta-learning, we incorporate the test-time\noptimization into training, performing a step of test-time optimization for\neach sample in the training batch before really conducting the training\noptimization over all the training samples. In this way, we obtain a\nmeta-model, the meta-parameter of which is friendly to the test-time\noptimization. At test time, after several test-time optimization steps starting\nfrom the meta-parameter, we obtain much higher HMR accuracy than the test-time\noptimization starting from the simply pretrained regression model. 
Furthermore,\nwe find test-time HMR objectives are different from training-time objectives,\nwhich reduces the effectiveness of learning the meta-model. To solve\nthis problem, we propose a dual-network architecture that unifies the\ntraining-time and test-time objectives. Our method, armed with meta-learning\nand the dual networks, outperforms state-of-the-art regression-based and\noptimization-based HMR approaches, as validated by extensive experiments.\nThe code is available at https://github.com/fmx789/Meta-HMR.\n","authors":["Yongwei Nie","Mingxian Fan","Chengjiang Long","Qing Zhang","Jian Zhu","Xuemiao Xu"],"pdf_url":"https://arxiv.org/pdf/2401.14121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22748v1","updated":"2024-10-30T07:11:41Z","published":"2024-10-30T07:11:41Z","title":"Analysis of Classifier Training on Synthetic Data for Cross-Domain\n Datasets","summary":" A major challenge of deep learning (DL) is the necessity to collect huge\namounts of training data. Often, the lack of a sufficiently large dataset\ndiscourages the use of DL in certain applications. Typically, acquiring the\nrequired amounts of data costs considerable time, material and effort. To\nmitigate this problem, the use of synthetic images combined with real data is a\npopular approach, widely adopted in the scientific community to effectively\ntrain various detectors. In this study, we examined the potential of synthetic\ndata-based training in the field of intelligent transportation systems. Our\nfocus is on camera-based traffic sign recognition applications for advanced\ndriver assistance systems and autonomous driving. The proposed augmentation\npipeline of synthetic datasets includes novel augmentation processes such as\nstructured shadows and Gaussian specular highlights. A well-known DL model was\ntrained with different datasets to compare the performance of synthetic and\nreal image-based trained models. Additionally, a new, detailed method to\nobjectively compare these models is proposed. Synthetic images are generated\nusing a semi-supervised error-guided method which is also described. Our\nexperiments showed that a synthetic image-based approach outperforms real\nimage-based training in most cases when applied to cross-domain test datasets\n(+10% precision for the GTSRB dataset); consequently, the generalization of the\nmodel is improved while decreasing the cost of acquiring images.\n","authors":["Andoni Cortés","Clemente Rodríguez","Gorka Velez","Javier Barandiarán","Marcos Nieto"],"pdf_url":"https://arxiv.org/pdf/2410.22748v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2406.07476v3","updated":"2024-10-30T06:49:54Z","published":"2024-06-11T17:22:23Z","title":"VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio\n Understanding in Video-LLMs","summary":" In this paper, we present VideoLLaMA 2, a set of Video Large Language\nModels (Video-LLMs) designed to enhance spatial-temporal modeling and audio\nunderstanding in video and audio-oriented tasks. Building upon its predecessor,\nVideoLLaMA 2 incorporates a tailor-made Spatial-Temporal Convolution (STC)\nconnector, which effectively captures the intricate spatial and temporal\ndynamics of video data. Additionally, we integrate an Audio Branch into the\nmodel through joint training, thereby enriching the multimodal understanding\ncapabilities of the model by seamlessly incorporating audio cues. 
Comprehensive\nevaluations on multiple-choice video question answering (MC-VQA), open-ended\nvideo question answering (OE-VQA), and video captioning (VC) tasks demonstrate\nthat VideoLLaMA 2 consistently achieves competitive results among open-source\nmodels and even gets close to some proprietary models on several benchmarks.\nFurthermore, VideoLLaMA 2 exhibits reasonable improvements in audio-only and\naudio-video question-answering (AQA & OE-AVQA) benchmarks over existing models.\nThese advancements underline VideoLLaMA 2's superior performance in multimodal\ncomprehension, setting a new standard for intelligent video analysis systems.\nAll models are public to facilitate further research.\n","authors":["Zesen Cheng","Sicong Leng","Hang Zhang","Yifei Xin","Xin Li","Guanzheng Chen","Yongxin Zhu","Wenqi Zhang","Ziyang Luo","Deli Zhao","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2406.07476v3.pdf","comment":"ZC, SL, HZ, YX, and XL contributed equally to this project. Code:\n https://github.com/DAMO-NLP-SG/VideoLLaMA2"},{"id":"http://arxiv.org/abs/2406.05768v4","updated":"2024-10-30T06:49:52Z","published":"2024-06-09T12:55:50Z","title":"TLCM: Training-efficient Latent Consistency Model for Image Generation\n with 2-8 Steps","summary":" Distilling latent diffusion models (LDMs) into ones that are fast to sample\nfrom is attracting growing research interest. However, the majority of existing\nmethods face two critical challenges: (1) They hinge on long training using a\nhuge volume of real data. (2) They routinely lead to quality degradation for\ngeneration, especially in text-image alignment. This paper proposes a novel\ntraining-efficient Latent Consistency Model (TLCM) to overcome these\nchallenges. Our method first accelerates LDMs via data-free multistep latent\nconsistency distillation (MLCD), and then data-free latent consistency\ndistillation is proposed to efficiently guarantee the inter-segment consistency\nin MLCD. Furthermore, we introduce bags of techniques, e.g., distribution\nmatching, adversarial learning, and preference learning, to enhance TLCM's\nperformance at few-step inference without any real data. TLCM demonstrates a\nhigh level of flexibility by enabling adjustment of sampling steps within the\nrange of 2 to 8 while still producing competitive outputs compared to full-step\napproaches. Notably, TLCM enjoys the data-free merit by employing synthetic\ndata from the teacher for distillation. With just 70 training hours on an A100\nGPU, a 3-step TLCM distilled from SDXL achieves an impressive CLIP Score of\n33.68 and an Aesthetic Score of 5.97 on the MSCOCO-2017 5K benchmark,\nsurpassing various accelerated models and even outperforming the teacher model\nin human preference metrics. We also demonstrate the versatility of TLCMs in\napplications including image style transfer, controllable generation, and\nChinese-to-image generation.\n","authors":["Qingsong Xie","Zhenyi Liao","Zhijie Deng","Chen chen","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2406.05768v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22733v1","updated":"2024-10-30T06:39:27Z","published":"2024-10-30T06:39:27Z","title":"ETO:Efficient Transformer-based Local Feature Matching by Organizing\n Multiple Homography Hypotheses","summary":" We tackle the efficiency problem of learning local feature matching.Recent\nadvancements have given rise to purely CNN-based and transformer-based\napproaches, each augmented with deep learning techniques. 
While CNN-based\nmethods often excel in matching speed, transformer-based methods tend to\nprovide more accurate matches. We propose an efficient transformer-based\nnetwork architecture for local feature matching.This technique is built on\nconstructing multiple homography hypotheses to approximate the continuous\ncorrespondence in the real world and uni-directional cross-attention to\naccelerate the refinement. On the YFCC100M dataset, our matching accuracy is\ncompetitive with LoFTR, a state-of-the-art transformer-based architecture,\nwhile the inference speed is boosted to 4 times, even outperforming the\nCNN-based methods.Comprehensive evaluations on other open datasets such as\nMegadepth, ScanNet, and HPatches demonstrate our method's efficacy,\nhighlighting its potential to significantly enhance a wide array of downstream\napplications.\n","authors":["Junjie Ni","Guofeng Zhang","Guanglin Li","Yijin Li","Xinyang Liu","Zhaoyang Huang","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2410.22733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22732v1","updated":"2024-10-30T06:37:55Z","published":"2024-10-30T06:37:55Z","title":"st-DTPM: Spatial-Temporal Guided Diffusion Transformer Probabilistic\n Model for Delayed Scan PET Image Prediction","summary":" PET imaging is widely employed for observing biological metabolic activities\nwithin the human body. However, numerous benign conditions can cause increased\nuptake of radiopharmaceuticals, confounding differentiation from malignant\ntumors. Several studies have indicated that dual-time PET imaging holds promise\nin distinguishing between malignant and benign tumor processes. Nevertheless,\nthe hour-long distribution period of radiopharmaceuticals post-injection\ncomplicates the determination of optimal timing for the second scan, presenting\nchallenges in both practical applications and research. Notably, we have\nidentified that delay time PET imaging can be framed as an image-to-image\nconversion problem. Motivated by this insight, we propose a novel\nspatial-temporal guided diffusion transformer probabilistic model (st-DTPM) to\nsolve dual-time PET imaging prediction problem. Specifically, this architecture\nleverages the U-net framework that integrates patch-wise features of CNN and\npixel-wise relevance of Transformer to obtain local and global information. And\nthen employs a conditional DDPM model for image synthesis. Furthermore, on\nspatial condition, we concatenate early scan PET images and noisy PET images on\nevery denoising step to guide the spatial distribution of denoising sampling.\nOn temporal condition, we convert diffusion time steps and delay time to a\nuniversal time vector, then embed it to each layer of model architecture to\nfurther improve the accuracy of predictions. 
Experimental results demonstrated\nthe superiority of our method over alternative approaches in preserving image\nquality and structural information, thereby affirming its efficacy in\npredictive task.\n","authors":["Ran Hong","Yuxia Huang","Lei Liu","Zhonghui Wu","Bingxuan Li","Xuemei Wang","Qiegen Liu"],"pdf_url":"https://arxiv.org/pdf/2410.22732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22725v1","updated":"2024-10-30T06:17:20Z","published":"2024-10-30T06:17:20Z","title":"One Prompt to Verify Your Models: Black-Box Text-to-Image Models\n Verification via Non-Transferable Adversarial Attacks","summary":" Recently, the success of Text-to-Image (T2I) models has led to the rise of\nnumerous third-party platforms, which claim to provide cheaper API services and\nmore flexibility in model options. However, this also raises a new security\nconcern: Are these third-party services truly offering the models they claim?\nTo address this problem, we propose the first T2I model verification method\nnamed Text-to-Image Model Verification via Non-Transferable Adversarial Attacks\n(TVN). The non-transferability of adversarial examples means that these\nexamples are only effective on a target model and ineffective on other models,\nthereby allowing for the verification of the target model. TVN utilizes the\nNon-dominated Sorting Genetic Algorithm II (NSGA-II) to optimize the cosine\nsimilarity of a prompt's text encoding, generating non-transferable adversarial\nprompts. By calculating the CLIP-text scores between the non-transferable\nadversarial prompts without perturbations and the images, we can verify if the\nmodel matches the claimed target model, based on a 3-sigma threshold. The\nexperiments showed that TVN performed well in both closed-set and open-set\nscenarios, achieving a verification accuracy of over 90\\%. Moreover, the\nadversarial prompts generated by TVN significantly reduced the CLIP-text scores\nof the target model, while having little effect on other models.\n","authors":["Ji Guo","Wenbo Jiang","Rui Zhang","Guoming Lu","Hongwei Li","Weiren Wu"],"pdf_url":"https://arxiv.org/pdf/2410.22725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21946v2","updated":"2024-10-30T06:17:00Z","published":"2024-10-29T11:05:52Z","title":"Analyzing Noise Models and Advanced Filtering Algorithms for Image\n Enhancement","summary":" Noise, an unwanted component in an image, can be the reason for the\ndegradation of Image at the time of transmission or capturing. Noise reduction\nfrom images is still a challenging task. Digital Image Processing is a\ncomponent of Digital signal processing. A wide variety of algorithms can be\nused in image processing to apply to an image or an input dataset and obtain\nimportant outcomes. In image processing research, removing noise from images\nbefore further analysis is essential. Post-noise removal of images improves\nclarity, enabling better interpretation and analysis across medical imaging,\nsatellite imagery, and radar applications. While numerous algorithms exist,\neach comes with its own assumptions, strengths, and limitations. The paper aims\nto evaluate the effectiveness of different filtering techniques on images with\neight types of noise. It evaluates methodologies like Wiener, Median, Gaussian,\nMean, Low pass, High pass, Laplacian and bilateral filtering, using the\nperformance metric Peak signal to noise ratio. It shows us the impact of\ndifferent filters on noise models by applying a variety of filters to various\nkinds of noise. 
Additionally, it also assists us in determining which filtering\nstrategy is most appropriate for a certain noise model based on the\ncircumstances.\n","authors":["Sahil Ali Akbar","Ananya Verma"],"pdf_url":"https://arxiv.org/pdf/2410.21946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01446v2","updated":"2024-10-30T05:56:30Z","published":"2024-03-03T09:04:34Z","title":"GuardT2I: Defending Text-to-Image Models from Adversarial Prompts","summary":" Recent advancements in Text-to-Image (T2I) models have raised significant\nsafety concerns about their potential misuse for generating inappropriate or\nNot-Safe-For-Work (NSFW) contents, despite existing countermeasures such as\nNSFW classifiers or model fine-tuning for inappropriate concept removal.\nAddressing this challenge, our study unveils GuardT2I, a novel moderation\nframework that adopts a generative approach to enhance T2I models' robustness\nagainst adversarial prompts. Instead of making a binary classification,\nGuardT2I utilizes a Large Language Model (LLM) to conditionally transform text\nguidance embeddings within the T2I models into natural language for effective\nadversarial prompt detection, without compromising the models' inherent\nperformance. Our extensive experiments reveal that GuardT2I outperforms leading\ncommercial solutions like OpenAI-Moderation and Microsoft Azure Moderator by a\nsignificant margin across diverse adversarial scenarios. Our framework is\navailable at https://github.com/cure-lab/GuardT2I.\n","authors":["Yijun Yang","Ruiyuan Gao","Xiao Yang","Jianyuan Zhong","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.01446v2.pdf","comment":"NeurIPS2024 Poster"},{"id":"http://arxiv.org/abs/2410.22715v1","updated":"2024-10-30T05:53:07Z","published":"2024-10-30T05:53:07Z","title":"SCRREAM : SCan, Register, REnder And Map:A Framework for Annotating\n Accurate and Dense 3D Indoor Scenes with a Benchmark","summary":" Traditionally, 3d indoor datasets have generally prioritized scale over\nground-truth accuracy in order to obtain improved generalization. However,\nusing these datasets to evaluate dense geometry tasks, such as depth rendering,\ncan be problematic as the meshes of the dataset are often incomplete and may\nproduce wrong ground truth to evaluate the details. In this paper, we propose\nSCRREAM, a dataset annotation framework that allows annotation of fully dense\nmeshes of objects in the scene and registers camera poses on the real image\nsequence, which can produce accurate ground truth for both sparse 3D as well as\ndense 3D tasks. We show the details of the dataset annotation pipeline and\nshowcase four possible variants of datasets that can be obtained from our\nframework with example scenes, such as indoor reconstruction and SLAM, scene\nediting & object removal, human reconstruction and 6d pose estimation. Recent\npipelines for indoor reconstruction and SLAM serve as new benchmarks. 
In\ncontrast to previous indoor datasets, our design allows evaluating dense\ngeometry tasks on eleven sample scenes against accurately rendered ground truth\ndepth maps.\n","authors":["HyunJun Jung","Weihang Li","Shun-Cheng Wu","William Bittner","Nikolas Brasch","Jifei Song","Eduardo Pérez-Pellitero","Zhensong Zhang","Arthur Moreau","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2410.22715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20427v2","updated":"2024-10-30T05:51:08Z","published":"2024-10-27T12:52:28Z","title":"YourSkatingCoach: A Figure Skating Video Benchmark for Fine-Grained\n Element Analysis","summary":" Combining sports and machine learning involves leveraging ML algorithms and\ntechniques to extract insight from sports-related data such as player\nstatistics, game footage, and other relevant information. However, datasets\nrelated to figure skating in the literature focus primarily on element\nclassification and are currently unavailable or exhibit only limited access,\nwhich greatly raises the entry barrier to developing visual sports technology\nfor it. Moreover, when using such data to help athletes improve their skills,\nwe find they are very coarse-grained: they work for learning what an element\nis, but they are poorly suited to learning whether the element is good or bad.\nHere we propose air time detection, a novel motion analysis task, the goal of\nwhich is to accurately detect the duration of the air time of a jump. We\npresent YourSkatingCoach, a large, novel figure skating dataset which contains\n454 videos of jump elements, the detected skater skeletons in each video, along\nwith the gold labels of the start and ending frames of each jump, together as a\nvideo benchmark for figure skating. In addition, although this type of task is\noften viewed as classification, we cast it as a sequential labeling problem and\npropose a Transformer-based model to calculate the duration. Experimental\nresults show that the proposed model yields favorable results compared to a strong\nbaseline. To further verify the generalizability of the fine-grained labels, we\napply the same process to other sports as cross-sports tasks, but for the\ncoarse-grained task of action classification. Here we fine-tune the classification\nto demonstrate that figure skating, as it contains the essential body\nmovements, constitutes a strong foundation for adaptation to other sports.\n","authors":["Wei-Yi Chen","Yi-Ling Lin","Yu-An Su","Wei-Hsin Yeh","Lun-Wei Ku"],"pdf_url":"https://arxiv.org/pdf/2410.20427v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22710v1","updated":"2024-10-30T05:38:07Z","published":"2024-10-30T05:38:07Z","title":"LoFLAT: Local Feature Matching using Focused Linear Attention\n Transformer","summary":" Local feature matching is an essential technique in image matching and plays\na critical role in a wide range of vision-based applications. However, existing\nTransformer-based detector-free local feature matching methods encounter\nchallenges due to the quadratic computational complexity of attention\nmechanisms, especially at high resolutions. While linear attention mechanisms\nhave reduced these computational costs, such methods still struggle to\ncapture detailed local interactions, which affects the accuracy and robustness\nof precise local correspondences. 
In order to enhance representations of\nattention mechanisms while preserving low computational complexity, we propose\nthe LoFLAT, a novel Local Feature matching using Focused Linear Attention\nTransformer in this paper. Our LoFLAT consists of three main modules: the\nFeature Extraction Module, the Feature Transformer Module, and the Matching\nModule. Specifically, the Feature Extraction Module firstly uses ResNet and a\nFeature Pyramid Network to extract hierarchical features. The Feature\nTransformer Module further employs the Focused Linear Attention to refine\nattention distribution with a focused mapping function and to enhance feature\ndiversity with a depth-wise convolution. Finally, the Matching Module predicts\naccurate and robust matches through a coarse-to-fine strategy. Extensive\nexperimental evaluations demonstrate that the proposed LoFLAT outperforms the\nLoFTR method in terms of both efficiency and accuracy.\n","authors":["Naijian Cao","Renjie He","Yuchao Dai","Mingyi He"],"pdf_url":"https://arxiv.org/pdf/2410.22710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22709v1","updated":"2024-10-30T05:38:03Z","published":"2024-10-30T05:38:03Z","title":"FilterViT and DropoutViT: Lightweight Vision Transformer Models for\n Efficient Attention Mechanisms","summary":" In this study, we introduce FilterViT, an enhanced version of MobileViT,\nwhich leverages an attention-based mechanism for early-stage downsampling.\nTraditional QKV operations on high-resolution feature maps are computationally\nintensive due to the abundance of tokens. To address this, we propose a filter\nattention mechanism using a convolutional neural network (CNN) to generate an\nimportance mask, focusing attention on key image regions. The method\nsignificantly reduces computational complexity while maintaining\ninterpretability, as it highlights essential image areas. Experimental results\nshow that FilterViT achieves substantial gains in both efficiency and accuracy\ncompared to other models. We also introduce DropoutViT, a variant that uses a\nstochastic approach for pixel selection, further enhancing robustness.\n","authors":["Bohang Sun"],"pdf_url":"https://arxiv.org/pdf/2410.22709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22707v1","updated":"2024-10-30T05:34:52Z","published":"2024-10-30T05:34:52Z","title":"Robotic State Recognition with Image-to-Text Retrieval Task of\n Pre-Trained Vision-Language Model and Black-Box Optimization","summary":" State recognition of the environment and objects, such as the open/closed\nstate of doors and the on/off of lights, is indispensable for robots that\nperform daily life support and security tasks. Until now, state recognition\nmethods have been based on training neural networks from manual annotations,\npreparing special sensors for the recognition, or manually programming to\nextract features from point clouds or raw images. In contrast, we propose a\nrobotic state recognition method using a pre-trained vision-language model,\nwhich is capable of Image-to-Text Retrieval (ITR) tasks. We prepare several\nkinds of language prompts in advance, calculate the similarity between these\nprompts and the current image by ITR, and perform state recognition. By\napplying the optimal weighting to each prompt using black-box optimization,\nstate recognition can be performed with higher accuracy. 
Experiments show that\nthis theory enables a variety of state recognitions by simply preparing\nmultiple prompts without retraining neural networks or manual programming. In\naddition, since only prompts and their weights need to be prepared for each\nrecognizer, there is no need to prepare multiple models, which facilitates\nresource management. It is possible to recognize the open/closed state of\ntransparent doors, the state of whether water is running or not from a faucet,\nand even the qualitative state of whether a kitchen is clean or not, which have\nbeen challenging so far, through language.\n","authors":["Kento Kawaharazuka","Yoshiki Obinata","Naoaki Kanazawa","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2410.22707v1.pdf","comment":"Accepted at Humanoids2024"},{"id":"http://arxiv.org/abs/2410.22705v1","updated":"2024-10-30T05:34:17Z","published":"2024-10-30T05:34:17Z","title":"Geometry Cloak: Preventing TGS-based 3D Reconstruction from Copyrighted\n Images","summary":" Single-view 3D reconstruction methods like Triplane Gaussian Splatting (TGS)\nhave enabled high-quality 3D model generation from just a single image input\nwithin seconds. However, this capability raises concerns about potential\nmisuse, where malicious users could exploit TGS to create unauthorized 3D\nmodels from copyrighted images. To prevent such infringement, we propose a\nnovel image protection approach that embeds invisible geometry perturbations,\ntermed \"geometry cloaks\", into images before supplying them to TGS. These\ncarefully crafted perturbations encode a customized message that is revealed\nwhen TGS attempts 3D reconstructions of the cloaked image. Unlike conventional\nadversarial attacks that simply degrade output quality, our method forces TGS\nto fail the 3D reconstruction in a specific way - by generating an identifiable\ncustomized pattern that acts as a watermark. This watermark allows copyright\nholders to assert ownership over any attempted 3D reconstructions made from\ntheir protected images. Extensive experiments have verified the effectiveness\nof our geometry cloak. Our project is available at\nhttps://qsong2001.github.io/geometry_cloak.\n","authors":["Qi Song","Ziyuan Luo","Ka Chun Cheung","Simon See","Renjie Wan"],"pdf_url":"https://arxiv.org/pdf/2410.22705v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.20234v2","updated":"2024-10-30T05:25:05Z","published":"2024-10-26T17:31:15Z","title":"Enhancing CNN Classification with Lamarckian Memetic Algorithms and\n Local Search","summary":" Optimization is critical for optimal performance in deep neural networks\n(DNNs). Traditional gradient-based methods often face challenges like local\nminima entrapment. This paper explores population-based metaheuristic\noptimization algorithms for image classification networks. We propose a novel\napproach integrating a two-stage training technique with population-based\noptimization algorithms incorporating local search capabilities. Our\nexperiments demonstrate that the proposed method outperforms state-of-the-art\ngradient-based techniques, such as ADAM, in accuracy and computational\nefficiency, particularly with high computational complexity and numerous\ntrainable parameters. The results suggest that our approach offers a robust\nalternative to traditional methods for weight optimization in convolutional\nneural networks (CNNs). 
Future work will explore integrating adaptive\nmechanisms for parameter tuning and applying the proposed method to other types\nof neural networks and real-time applications.\n","authors":["Akhilbaran Ghosh","Rama Sai Adithya Kalidindi"],"pdf_url":"https://arxiv.org/pdf/2410.20234v2.pdf","comment":"Accepted in IEEE SPARC 2024"},{"id":"http://arxiv.org/abs/2410.22023v2","updated":"2024-10-30T04:29:42Z","published":"2024-10-29T13:13:30Z","title":"Feature distribution Adaptation Network for Speech Emotion Recognition","summary":" In this paper, we propose a novel deep inductive transfer learning framework,\nnamed feature distribution adaptation network, to tackle the challenging\nmulti-modal speech emotion recognition problem. Our method aims to use deep\ntransfer learning strategies to align visual and audio feature distributions to\nobtain consistent representation of emotion, thereby improving the performance\nof speech emotion recognition. In our model, the pre-trained ResNet-34 is\nutilized for feature extraction for facial expression images and acoustic Mel\nspectrograms, respectively. Then, the cross-attention mechanism is introduced\nto model the intrinsic similarity relationships of multi-modal features.\nFinally, the multi-modal feature distribution adaptation is performed\nefficiently with feed-forward network, which is extended using the local\nmaximum mean discrepancy loss. Experiments are carried out on two benchmark\ndatasets, and the results demonstrate that our model can achieve excellent\nperformance compared with existing ones.\n","authors":["Shaokai Li","Yixuan Ji","Peng Song","Haoqin Sun","Wenming Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.22023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19942v2","updated":"2024-10-30T04:26:52Z","published":"2024-09-30T04:46:35Z","title":"CycleCrash: A Dataset of Bicycle Collision Videos for Collision\n Prediction and Analysis","summary":" Self-driving research often underrepresents cyclist collisions and safety. To\naddress this, we present CycleCrash, a novel dataset consisting of 3,000\ndashcam videos with 436,347 frames that capture cyclists in a range of critical\nsituations, from collisions to safe interactions. This dataset enables 9\ndifferent cyclist collision prediction and classification tasks focusing on\npotentially hazardous conditions for cyclists and is annotated with\ncollision-related, cyclist-related, and scene-related labels. Next, we propose\nVidNeXt, a novel method that leverages a ConvNeXt spatial encoder and a\nnon-stationary transformer to capture the temporal dynamics of videos for the\ntasks defined in our dataset. To demonstrate the effectiveness of our method\nand create additional baselines on CycleCrash, we apply and compare 7 models\nalong with a detailed ablation. We release the dataset and code at\nhttps://github.com/DeSinister/CycleCrash/ .\n","authors":["Nishq Poorav Desai","Ali Etemad","Michael Greenspan"],"pdf_url":"https://arxiv.org/pdf/2409.19942v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2410.22681v1","updated":"2024-10-30T04:24:40Z","published":"2024-10-30T04:24:40Z","title":"Persistent Homology for MCI Classification: A Comparative Analysis\n between Graph and Vietoris-Rips Filtrations","summary":" Mild cognitive impairment (MCI), often linked to early neurodegeneration, is\ncharacterized by subtle cognitive declines and disruptions in brain\nconnectivity. 
The present study offers a detailed analysis of topological\nchanges associated with MCI, focusing on two subtypes: Early MCI and Late MCI.\nThis analysis utilizes fMRI time series data from two distinct populations: the\npublicly available ADNI dataset (Western cohort) and the in-house TLSA dataset\n(Indian Urban cohort). Persistent Homology, a topological data analysis method,\nis employed with two distinct filtration techniques - Vietoris-Rips and graph\nfiltration-for classifying MCI subtypes. For Vietoris-Rips filtration,\ninter-ROI Wasserstein distance matrices between persistent diagrams are used\nfor classification, while graph filtration relies on the top ten most\npersistent homology features. Comparative analysis shows that the Vietoris-Rips\nfiltration significantly outperforms graph filtration, capturing subtle\nvariations in brain connectivity with greater accuracy. The Vietoris-Rips\nfiltration method achieved the highest classification accuracy of 85.7\\% for\ndistinguishing between age and gender matched healthy controls and MCI, whereas\ngraph filtration reached a maximum accuracy of 71.4\\% for the same task. This\nsuperior performance highlights the sensitivity of Vietoris-Rips filtration in\ndetecting intricate topological features associated with neurodegeneration. The\nfindings underscore the potential of persistent homology, particularly when\ncombined with the Wasserstein distance, as a powerful tool for early diagnosis\nand precise classification of cognitive impairments, offering valuable insights\ninto brain connectivity changes in MCI.\n","authors":["Debanjali Bhattacharya","Rajneet Kaur","Ninad Aithal","Neelam Sinha","Thomas Gregor Issac"],"pdf_url":"https://arxiv.org/pdf/2410.22681v1.pdf","comment":"17 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2410.21644v2","updated":"2024-10-30T04:18:59Z","published":"2024-10-29T01:13:22Z","title":"On filter design in deep convolutional neural network","summary":" The deep convolutional neural network (DCNN) in computer vision has given\npromising results. It is widely applied in many areas, from medicine,\nagriculture, self-driving car, biometric system, and almost all computer\nvision-based applications. Filters or weights are the critical elements\nresponsible for learning in DCNN. Backpropagation has been the primary learning\nalgorithm for DCNN and provides promising results, but the size and numbers of\nthe filters remain hyper-parameters. Various studies have been done in the last\ndecade on semi-supervised, self-supervised, and unsupervised methods and their\nproperties. The effects of filter initialization, size-shape selection, and the\nnumber of filters on learning and optimization have not been investigated in a\nseparate publication to collate all the options. Such attributes are often\ntreated as hyper-parameters and lack mathematical understanding. Computer\nvision algorithms have many limitations in real-life applications, and\nunderstanding the learning process is essential to have some significant\nimprovement. To the best of our knowledge, no separate investigation has been\npublished discussing the filters; this is our primary motivation. This study\nfocuses on arguments for choosing specific physical parameters of filters,\ninitialization, and learning technic over scattered methods. The promising\nunsupervised approaches have been evaluated. 
Additionally, the limitations,\ncurrent challenges, and future scope have been discussed in this paper.\n","authors":["Gaurav Hirani","Waleed Abdulla"],"pdf_url":"https://arxiv.org/pdf/2410.21644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22679v1","updated":"2024-10-30T04:18:48Z","published":"2024-10-30T04:18:48Z","title":"Practical and Accurate Reconstruction of an Illuminant's Spectral Power\n Distribution for Inverse Rendering Pipelines","summary":" Inverse rendering pipelines are gaining prominence in realizing\nphoto-realistic reconstruction of real-world objects for emulating them in\nvirtual reality scenes. Apart from material reflectances, spectral rendering\nand in-scene illuminants' spectral power distributions (SPDs) play important\nroles in producing photo-realistic images. We present a simple, low-cost\ntechnique to capture and reconstruct the SPD of uniform illuminants. Instead of\nrequiring a costly spectrometer for such measurements, our method uses a\ndiffractive compact disk (CD-ROM) and a machine learning approach for accurate\nestimation. We show our method to work well with spotlights under simulations\nand few real-world examples. Presented results clearly demonstrate the\nreliability of our approach through quantitative and qualitative evaluations,\nespecially in spectral rendering of iridescent materials.\n","authors":["Parisha Joshi","Daljit Singh J. Dhillon"],"pdf_url":"https://arxiv.org/pdf/2410.22679v1.pdf","comment":"3 pages, 3 Figures, Submitted as a Tiny Paper at ICVGIP'24,\n Bangalore, India"},{"id":"http://arxiv.org/abs/2410.22678v1","updated":"2024-10-30T04:06:12Z","published":"2024-10-30T04:06:12Z","title":"Backdoor Attack Against Vision Transformers via Attention Gradient-Based\n Image Erosion","summary":" Vision Transformers (ViTs) have outperformed traditional Convolutional Neural\nNetworks (CNN) across various computer vision tasks. However, akin to CNN, ViTs\nare vulnerable to backdoor attacks, where the adversary embeds the backdoor\ninto the victim model, causing it to make wrong predictions about testing\nsamples containing a specific trigger. Existing backdoor attacks against ViTs\nhave the limitation of failing to strike an optimal balance between attack\nstealthiness and attack effectiveness.\n In this work, we propose an Attention Gradient-based Erosion Backdoor (AGEB)\ntargeted at ViTs. Considering the attention mechanism of ViTs, AGEB selectively\nerodes pixels in areas of maximal attention gradient, embedding a covert\nbackdoor trigger. Unlike previous backdoor attacks against ViTs, AGEB achieves\nan optimal balance between attack stealthiness and attack effectiveness,\nensuring the trigger remains invisible to human detection while preserving the\nmodel's accuracy on clean samples. Extensive experimental evaluations across\nvarious ViT architectures and datasets confirm the effectiveness of AGEB,\nachieving a remarkable Attack Success Rate (ASR) without diminishing Clean Data\nAccuracy (CDA). 
Furthermore, the stealthiness of AGEB is rigorously validated,\ndemonstrating minimal visual discrepancies between the clean and the triggered\nimages.\n","authors":["Ji Guo","Hongwei Li","Wenbo Jiang","Guoming Lu"],"pdf_url":"https://arxiv.org/pdf/2410.22678v1.pdf","comment":"Accepted by IEEE GLOBECOM 2024"},{"id":"http://arxiv.org/abs/2408.06646v2","updated":"2024-10-30T03:41:42Z","published":"2024-08-13T05:30:41Z","title":"Hybrid SD: Edge-Cloud Collaborative Inference for Stable Diffusion\n Models","summary":" Stable Diffusion Models (SDMs) have shown remarkable proficiency in image\nsynthesis. However, their broad application is impeded by their large model\nsizes and intensive computational requirements, which typically require\nexpensive cloud servers for deployment. On the flip side, while there are many\ncompact models tailored for edge devices that can reduce these demands, they\noften compromise on semantic integrity and visual quality when compared to\nfull-sized SDMs. To bridge this gap, we introduce Hybrid SD, an innovative,\ntraining-free SDMs inference framework designed for edge-cloud collaborative\ninference. Hybrid SD distributes the early steps of the diffusion process to\nthe large models deployed on cloud servers, enhancing semantic planning.\nFurthermore, small efficient models deployed on edge devices can be integrated\nfor refining visual details in the later stages. Acknowledging the diversity of\nedge devices with differing computational and storage capacities, we employ\nstructural pruning to the SDMs U-Net and train a lightweight VAE. Empirical\nevaluations demonstrate that our compressed models achieve state-of-the-art\nparameter efficiency (225.8M) on edge devices with competitive image quality.\nAdditionally, Hybrid SD reduces the cloud cost by 66% with edge-cloud\ncollaborative inference.\n","authors":["Chenqian Yan","Songwei Liu","Hongjian Liu","Xurui Peng","Xiaojian Wang","Fangmin Chen","Lean Fu","Xing Mei"],"pdf_url":"https://arxiv.org/pdf/2408.06646v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19657v2","updated":"2024-10-30T03:34:49Z","published":"2024-10-25T16:08:08Z","title":"DiffGS: Functional Gaussian Splatting Diffusion","summary":" 3D Gaussian Splatting (3DGS) has shown convincing performance in rendering\nspeed and fidelity, yet the generation of Gaussian Splatting remains a\nchallenge due to its discreteness and unstructured nature. In this work, we\npropose DiffGS, a general Gaussian generator based on latent diffusion models.\nDiffGS is a powerful and efficient 3D generative model which is capable of\ngenerating Gaussian primitives at arbitrary numbers for high-fidelity rendering\nwith rasterization. The key insight is to represent Gaussian Splatting in a\ndisentangled manner via three novel functions to model Gaussian probabilities,\ncolors and transforms. Through the novel disentanglement of 3DGS, we represent\nthe discrete and unstructured 3DGS with continuous Gaussian Splatting\nfunctions, where we then train a latent diffusion model with the target of\ngenerating these Gaussian Splatting functions both unconditionally and\nconditionally. Meanwhile, we introduce a discretization algorithm to extract\nGaussians at arbitrary numbers from the generated functions via octree-guided\nsampling and optimization. We explore DiffGS for various tasks, including\nunconditional generation, conditional generation from text, image, and partial\n3DGS, as well as Point-to-Gaussian generation. 
We believe that DiffGS provides\na new direction for flexibly modeling and generating Gaussian Splatting.\n","authors":["Junsheng Zhou","Weiqi Zhang","Yu-Shen Liu"],"pdf_url":"https://arxiv.org/pdf/2410.19657v2.pdf","comment":"Accepted by NeurIPS 2024. Project page:\n https://junshengzhou.github.io/DiffGS"},{"id":"http://arxiv.org/abs/2410.22325v2","updated":"2024-10-30T03:33:08Z","published":"2024-10-29T17:58:13Z","title":"Robots Pre-train Robots: Manipulation-Centric Robotic Representation\n from Large-Scale Robot Datasets","summary":" The pre-training of visual representations has enhanced the efficiency of\nrobot learning. Due to the lack of large-scale in-domain robotic datasets,\nprior works utilize in-the-wild human videos to pre-train robotic visual\nrepresentation. Despite their promising results, representations from human\nvideos are inevitably subject to distribution shifts and lack the dynamics\ninformation crucial for task completion. We first evaluate various pre-trained\nrepresentations in terms of their correlation to the downstream robotic\nmanipulation tasks (i.e., manipulation centricity). Interestingly, we find that\nthe \"manipulation centricity\" is a strong indicator of success rates when\napplied to downstream tasks. Drawing from these findings, we propose\nManipulation Centric Representation (MCR), a foundation representation learning\nframework capturing both visual features and the dynamics information such as\nactions and proprioceptions of manipulation tasks to improve manipulation\ncentricity. Specifically, we pre-train a visual encoder on the DROID robotic\ndataset and leverage motion-relevant data such as robot proprioceptive states\nand actions. We introduce a novel contrastive loss that aligns visual\nobservations with the robot's proprioceptive state-action dynamics, combined\nwith a behavior cloning (BC)-like actor loss to predict actions during\npre-training, along with a time contrastive loss. Empirical results across 4\nsimulation domains with 20 tasks verify that MCR outperforms the strongest\nbaseline method by 14.8%. Moreover, MCR boosts the performance of\ndata-efficient learning with a UR5e arm on 3 real-world tasks by 76.9%. Project\nwebsite: https://robots-pretrain-robots.github.io/.\n","authors":["Guangqi Jiang","Yifei Sun","Tao Huang","Huanyu Li","Yongyuan Liang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2410.22325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11438v2","updated":"2024-10-30T03:19:39Z","published":"2024-08-21T08:50:19Z","title":"A Benchmark for AI-based Weather Data Assimilation","summary":" Recent advancements in Artificial Intelligence (AI) have led to the\ndevelopment of several Large Weather Models (LWMs) that rival State-Of-The-Art\n(SOTA) Numerical Weather Prediction (NWP) systems. Until now, these models have\nstill relied on traditional NWP-generated analysis fields as input and are far\nfrom autonomous. Currently, scientists are increasingly focusing on developing\ndata-driven data assimilation (DA) models for LWMs. To expedite advancements in\nthis field and facilitate the operationalization of data-driven end-to-end\nweather forecasting systems, we propose DABench, a benchmark constructed by\nsimulated observations, real-world observations, and ERA5 reanalysis. 
DABench\ncontributes four standard features: (1) sparse and noisy observations provided\nfor both simulated and real-world experiments; (2) a Skillful pre-trained\nTransformer-based weather prediction model, Sformer, designed to generate\nbackground fields while rigorously assessing the impact of assimilation\noutcomes on predictions; (3) standardized evaluation metrics for the model\ncomparison; (4) a strong DA baseline, 4DVarFormerV2. Our experimental results\ndemonstrate that the end-to-end weather forecasting system, integrating\n4DVarFormerV2 and Sformer, can assimilate real-world observations, thereby\nfacilitating a stable DA cycle lasting one year and achieving a skillful\nforecasting lead time of up to 7 days. The proposed DABench will significantly\nadvance research in AI-based DA, AI-based weather forecasting, and related\ndomains.\n","authors":["Wuxin Wang","Weicheng Ni","Tao Han","Taikang Yuan","Xiaoyong Li","Lei Bai","Boheng Duan","Kaijun Ren"],"pdf_url":"https://arxiv.org/pdf/2408.11438v2.pdf","comment":"38pages, 21 figures, 4 tables"},{"id":"http://arxiv.org/abs/2406.12293v2","updated":"2024-10-30T03:11:52Z","published":"2024-06-18T05:54:28Z","title":"Unleashing the Potential of Open-set Noisy Samples Against Label Noise\n for Medical Image Classification","summary":" Addressing mixed closed-set and open-set label noise in medical image\nclassification remains a largely unexplored challenge. Unlike natural image\nclassification, which often separates and processes closed-set and open-set\nnoisy samples from clean ones, medical image classification contends with high\ninter-class similarity, complicating the identification of open-set noisy\nsamples. Additionally, existing methods often fail to fully utilize open-set\nnoisy samples for label noise mitigation, leading to their exclusion or the\napplication of uniform soft labels. To address these challenges, we propose the\nExtended Noise-robust Contrastive and Open-set Feature Augmentation framework\nfor medical image classification tasks. This framework incorporates the\nExtended Noise-robust Supervised Contrastive Loss, which helps differentiate\nfeatures among both in-distribution and out-of-distribution classes. This loss\ntreats open-set noisy samples as an extended class, improving label noise\nmitigation by weighting contrastive pairs according to label reliability.\nAdditionally, we develop the Open-set Feature Augmentation module that enriches\nopen-set samples at the feature level and then assigns them dynamic class\nlabels, thereby leveraging the model's capacity and reducing overfitting to\nnoisy data. We evaluated the proposed framework on both a synthetic noisy\ndataset and a real-world noisy dataset. The results indicate the superiority of\nour framework over four existing methods and the effectiveness of leveraging\nopen-set noisy samples to combat label noise.\n","authors":["Zehui Liao","Shishuai Hu","Yanning Zhang","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2406.12293v2.pdf","comment":"14 pages, 6 figure"},{"id":"http://arxiv.org/abs/2410.22655v1","updated":"2024-10-30T02:48:50Z","published":"2024-10-30T02:48:50Z","title":"FlowDCN: Exploring DCN-like Architectures for Fast Image Generation with\n Arbitrary Resolution","summary":" Arbitrary-resolution image generation still remains a challenging task in\nAIGC, as it requires handling varying resolutions and aspect ratios while\nmaintaining high visual quality. 
Existing transformer-based diffusion methods\nsuffer from quadratic computation cost and limited resolution extrapolation\ncapabilities, making them less effective for this task. In this paper, we\npropose FlowDCN, a purely convolution-based generative model with linear time\nand memory complexity, that can efficiently generate high-quality images at\narbitrary resolutions. Equipped with a new design of learnable group-wise\ndeformable convolution block, our FlowDCN yields higher flexibility and\ncapability to handle different resolutions with a single model. FlowDCN\nachieves the state-of-the-art 4.30 sFID on $256\\times256$ ImageNet Benchmark\nand comparable resolution extrapolation results, surpassing transformer-based\ncounterparts in terms of convergence speed (only $\\frac{1}{5}$ images), visual\nquality, parameters ($8\\%$ reduction) and FLOPs ($20\\%$ reduction). We believe\nFlowDCN offers a promising solution to scalable and flexible image synthesis.\n","authors":["Shuai Wang","Zexian Li","Tianhui Song","Xubin Li","Tiezheng Ge","Bo Zheng","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2410.22655v1.pdf","comment":"Accepted on NeurIPS24"},{"id":"http://arxiv.org/abs/2408.04957v4","updated":"2024-10-30T02:38:29Z","published":"2024-08-09T09:22:40Z","title":"LLaVA-VSD: Large Language-and-Vision Assistant for Visual Spatial\n Description","summary":" Visual Spatial Description (VSD) aims to generate texts that describe the\nspatial relationships between objects within images. Traditional visual spatial\nrelationship classification (VSRC) methods typically output the spatial\nrelationship between two objects in an image, often neglecting world knowledge\nand lacking general language capabilities. In this paper, we propose a Large\nLanguage-and-Vision Assistant for Visual Spatial Description, named LLaVA-VSD,\nwhich is designed for the classification, description, and open-ended\ndescription of visual spatial relationships. Specifically, the model first\nconstructs a VSD instruction-following dataset using given figure-caption pairs\nfor the three tasks. It then employs LoRA to fine-tune a Large Language and\nVision Assistant for VSD, which has 13 billion parameters and supports\nhigh-resolution images. Finally, a large language model (Qwen-2) is used to\nrefine the generated sentences, enhancing their diversity and accuracy.\nLLaVA-VSD demonstrates excellent multimodal conversational capabilities and can\nfollow open-ended instructions to assist with inquiries about object\nrelationships in images.\n","authors":["Yizhang Jin","Jian Li","Jiangning Zhang","Jianlong Hu","Zhenye Gan","Xin Tan","Yong Liu","Yabiao Wang","Chengjie Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2408.04957v4.pdf","comment":"We have discovered a significant error in the paper that affects the\n main conclusions. To ensure the accuracy of our research, we have decided to\n withdraw this paper and will resubmit it after making the necessary\n corrections"},{"id":"http://arxiv.org/abs/2409.01573v2","updated":"2024-10-30T02:36:18Z","published":"2024-09-03T03:11:48Z","title":"Improving Apple Object Detection with Occlusion-Enhanced Distillation","summary":" Apples growing in natural environments often face severe visual obstructions\nfrom leaves and branches. This significantly increases the risk of false\ndetections in object detection tasks, thereby escalating the challenge.\nAddressing this issue, we introduce a technique called \"Occlusion-Enhanced\nDistillation\" (OED). 
This approach utilizes occlusion information to regularize\nthe learning of semantically aligned features on occluded datasets and employs\nExponential Moving Average (EMA) to enhance training stability. Specifically,\nwe first design an occlusion-enhanced dataset that integrates Grounding DINO\nand SAM methods to extract occluding elements such as leaves and branches from\neach sample, creating occlusion examples that reflect the natural growth state\nof fruits. Additionally, we propose a multi-scale knowledge distillation\nstrategy, where the student network uses images with increased occlusions as\ninputs, while the teacher network employs images without natural occlusions.\nThrough this setup, the strategy guides the student network to learn from the\nteacher across scales of semantic and local features alignment, effectively\nnarrowing the feature distance between occluded and non-occluded targets and\nenhancing the robustness of object detection. Lastly, to improve the stability\nof the student network, we introduce the EMA strategy, which aids the student\nnetwork in learning more generalized feature expressions that are less affected\nby the noise of individual image occlusions. Our method significantly\noutperforms current state-of-the-art techniques through extensive comparative\nexperiments.\n","authors":["Liang Geng"],"pdf_url":"https://arxiv.org/pdf/2409.01573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22648v1","updated":"2024-10-30T02:30:40Z","published":"2024-10-30T02:30:40Z","title":"SimpsonsVQA: Enhancing Inquiry-Based Learning with a Tailored Dataset","summary":" Visual Question Answering (VQA) has emerged as a promising area of research\nto develop AI-based systems for enabling interactive and immersive learning.\nNumerous VQA datasets have been introduced to facilitate various tasks, such as\nanswering questions or identifying unanswerable ones. However, most of these\ndatasets are constructed using real-world images, leaving the performance of\nexisting models on cartoon images largely unexplored. Hence, in this paper, we\npresent \"SimpsonsVQA\", a novel dataset for VQA derived from The Simpsons TV\nshow, designed to promote inquiry-based learning. Our dataset is specifically\ndesigned to address not only the traditional VQA task but also to identify\nirrelevant questions related to images, as well as the reverse scenario where a\nuser provides an answer to a question that the system must evaluate (e.g., as\ncorrect, incorrect, or ambiguous). It aims to cater to various visual\napplications, harnessing the visual content of \"The Simpsons\" to create\nengaging and informative interactive systems. SimpsonsVQA contains\napproximately 23K images, 166K QA pairs, and 500K judgments\n(https://simpsonsvqa.org). Our experiments show that current large\nvision-language models like ChatGPT4o underperform in zero-shot settings across\nall three tasks, highlighting the dataset's value for improving model\nperformance on cartoon images. 
We anticipate that SimpsonsVQA will inspire\nfurther research, innovation, and advancements in inquiry-based learning VQA.\n","authors":["Ngoc Dung Huynh","Mohamed Reda Bouadjenek","Sunil Aryal","Imran Razzak","Hakim Hacid"],"pdf_url":"https://arxiv.org/pdf/2410.22648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05160v3","updated":"2024-10-30T02:06:43Z","published":"2024-03-08T09:02:13Z","title":"MamMIL: Multiple Instance Learning for Whole Slide Images with State\n Space Models","summary":" Recently, pathological diagnosis has achieved superior performance by\ncombining deep learning models with the multiple instance learning (MIL)\nframework using whole slide images (WSIs). However, the giga-pixeled nature of\nWSIs poses a great challenge for efficient MIL. Existing studies either do not\nconsider global dependencies among instances, or use approximations such as\nlinear attentions to model the pair-to-pair instance interactions, which\ninevitably brings performance bottlenecks. To tackle this challenge, we propose\na framework named MamMIL for WSI analysis by cooperating the selective\nstructured state space model (i.e., Mamba) with MIL, enabling the modeling of\nglobal instance dependencies while maintaining linear complexity. Specifically,\nconsidering the irregularity of the tissue regions in WSIs, we represent each\nWSI as an undirected graph. To address the problem that Mamba can only process\n1D sequences, we further propose a topology-aware scanning mechanism to\nserialize the WSI graphs while preserving the topological relationships among\nthe instances. Finally, in order to further perceive the topological structures\namong the instances and incorporate short-range feature interactions, we\npropose an instance aggregation block based on graph neural networks.\nExperiments show that MamMIL can achieve advanced performance than the\nstate-of-the-art frameworks. The code can be accessed at\nhttps://github.com/Vison307/MamMIL.\n","authors":["Zijie Fang","Yifeng Wang","Ye Zhang","Zhi Wang","Jian Zhang","Xiangyang Ji","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05160v3.pdf","comment":"6 pages, 2 figures. Accepted by IEEE International Conference on\n Bioinformatics and Biomedicine (BIBM)"},{"id":"http://arxiv.org/abs/2410.22638v1","updated":"2024-10-30T02:05:18Z","published":"2024-10-30T02:05:18Z","title":"Unbiased Regression Loss for DETRs","summary":" In this paper, we introduce a novel unbiased regression loss for DETR-based\ndetectors. The conventional $L_{1}$ regression loss tends to bias towards\nlarger boxes, as they disproportionately contribute more towards the overall\nloss compared to smaller boxes. Consequently, the detection performance for\nsmall objects suffers. To alleviate this bias, the proposed new unbiased loss,\ntermed Sized $L_{1}$ loss, normalizes the size of all boxes based on their\nindividual width and height. Our experiments demonstrate consistent\nimprovements in both fully-supervised and semi-supervised settings using the\nMS-COCO benchmark dataset.\n","authors":[" Edric","Ueta Daisuke","Kurokawa Yukimasa","Karlekar Jayashree","Sugiri Pranata"],"pdf_url":"https://arxiv.org/pdf/2410.22638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22637v1","updated":"2024-10-30T02:04:23Z","published":"2024-10-30T02:04:23Z","title":"Consistency Diffusion Bridge Models","summary":" Diffusion models (DMs) have become the dominant paradigm of generative\nmodeling in a variety of domains by learning stochastic processes from noise to\ndata. 
Recently, diffusion denoising bridge models (DDBMs), a new formulation of\ngenerative modeling that builds stochastic processes between fixed data\nendpoints based on a reference diffusion process, have achieved empirical\nsuccess across tasks with coupled data distribution, such as image-to-image\ntranslation. However, DDBM's sampling process typically requires hundreds of\nnetwork evaluations to achieve decent performance, which may impede their\npractical deployment due to high computational demands. In this work, inspired\nby the recent advance of consistency models in DMs, we tackle this problem by\nlearning the consistency function of the probability-flow ordinary differential\nequation (PF-ODE) of DDBMs, which directly predicts the solution at a starting\nstep given any point on the ODE trajectory. Based on a dedicated general-form\nODE solver, we propose two paradigms: consistency bridge distillation and\nconsistency bridge training, which is flexible to apply on DDBMs with broad\ndesign choices. Experimental results show that our proposed method could sample\n$4\\times$ to $50\\times$ faster than the base DDBM and produce better visual\nquality given the same step in various tasks with pixel resolution ranging from\n$64 \\times 64$ to $256 \\times 256$, as well as supporting downstream tasks such\nas semantic interpolation in the data space.\n","authors":["Guande He","Kaiwen Zheng","Jianfei Chen","Fan Bao","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.22637v1.pdf","comment":"NeurIPS 2024"}]},"2024-10-31T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2410.24226v1","updated":"2024-10-31T17:59:59Z","published":"2024-10-31T17:59:59Z","title":"Tensegrity Robot Proprioceptive State Estimation with Geometric\n Constraints","summary":" Tensegrity robots, characterized by a synergistic assembly of rigid rods and\nelastic cables, form robust structures that are resistant to impacts. However,\nthis design introduces complexities in kinematics and dynamics, complicating\ncontrol and state estimation. This work presents a novel proprioceptive state\nestimator for tensegrity robots. The estimator initially uses the geometric\nconstraints of 3-bar prism tensegrity structures, combined with IMU and motor\nencoder measurements, to reconstruct the robot's shape and orientation. It then\nemploys a contact-aided invariant extended Kalman filter with forward\nkinematics to estimate the global position and orientation of the tensegrity\nrobot. The state estimator's accuracy is assessed against ground truth data in\nboth simulated environments and real-world tensegrity robot applications. It\nachieves an average drift percentage of 4.2%, comparable to the state\nestimation performance of traditional rigid robots. 
This state estimator\nadvances the state of the art in tensegrity robot state estimation and has the\npotential to run in real-time using onboard sensors, paving the way for full\nautonomy of tensegrity robots in unstructured environments.\n","authors":["Wenzhe Tong","Tzu-Yuan Lin","Jonathan Mi","Yicheng Jiang","Maani Ghaffari","Xiaonan Huang"],"pdf_url":"https://arxiv.org/pdf/2410.24226v1.pdf","comment":"Preprint; 8 pages, 11 figures, 2 tables; Code at\n https://github.com/Jonathan-Twz/tensegrity-robot-state-estimator"},{"id":"http://arxiv.org/abs/2410.24221v1","updated":"2024-10-31T17:59:55Z","published":"2024-10-31T17:59:55Z","title":"EgoMimic: Scaling Imitation Learning via Egocentric Video","summary":" The scale and diversity of demonstration data required for imitation learning\nis a significant challenge. We present EgoMimic, a full-stack framework which\nscales manipulation via human embodiment data, specifically egocentric human\nvideos paired with 3D hand tracking. EgoMimic achieves this through: (1) a\nsystem to capture human embodiment data using the ergonomic Project Aria\nglasses, (2) a low-cost bimanual manipulator that minimizes the kinematic gap\nto human data, (3) cross-domain data alignment techniques, and (4) an imitation\nlearning architecture that co-trains on human and robot data. Compared to prior\nworks that only extract high-level intent from human videos, our approach\ntreats human and robot data equally as embodied demonstration data and learns a\nunified policy from both data sources. EgoMimic achieves significant\nimprovement on a diverse set of long-horizon, single-arm and bimanual\nmanipulation tasks over state-of-the-art imitation learning methods and enables\ngeneralization to entirely new scenes. Finally, we show a favorable scaling\ntrend for EgoMimic, where adding 1 hour of additional hand data is\nsignificantly more valuable than 1 hour of additional robot data. Videos and\nadditional information can be found at https://egomimic.github.io/\n","authors":["Simar Kareer","Dhruv Patel","Ryan Punamiya","Pranay Mathur","Shuo Cheng","Chen Wang","Judy Hoffman","Danfei Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24218v1","updated":"2024-10-31T17:59:52Z","published":"2024-10-31T17:59:52Z","title":"Teaching Embodied Reinforcement Learning Agents: Informativeness and\n Diversity of Language Use","summary":" In real-world scenarios, it is desirable for embodied agents to have the\nability to leverage human language to gain explicit or implicit knowledge for\nlearning tasks. Despite recent progress, most previous approaches adopt simple\nlow-level instructions as language inputs, which may not reflect natural human\ncommunication. It's not clear how to incorporate rich language use to\nfacilitate task learning. To address this question, this paper studies\ndifferent types of language inputs in facilitating reinforcement learning (RL)\nembodied agents. More specifically, we examine how different levels of language\ninformativeness (i.e., feedback on past behaviors and future guidance) and\ndiversity (i.e., variation of language expressions) impact agent learning and\ninference. Our empirical results based on four RL benchmarks demonstrate that\nagents trained with diverse and informative language feedback can achieve\nenhanced generalization and fast adaptation to new tasks. These findings\nhighlight the pivotal role of language use in teaching embodied agents new\ntasks in an open world. 
Project website:\nhttps://github.com/sled-group/Teachable_RL\n","authors":["Jiajun Xi","Yinong He","Jianing Yang","Yinpei Dai","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2410.24218v1.pdf","comment":"EMNLP 2024 Main. Project website:\n https://github.com/sled-group/Teachable_RL"},{"id":"http://arxiv.org/abs/2406.15349v2","updated":"2024-10-31T17:58:34Z","published":"2024-06-21T17:59:02Z","title":"NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and\n Benchmarking","summary":" Benchmarking vision-based driving policies is challenging. On one hand,\nopen-loop evaluation with real data is easy, but these results do not reflect\nclosed-loop performance. On the other, closed-loop evaluation is possible in\nsimulation, but is hard to scale due to its significant computational demands.\nFurther, the simulators available today exhibit a large domain gap to real\ndata. This has resulted in an inability to draw clear conclusions from the\nrapidly growing body of research on end-to-end autonomous driving. In this\npaper, we present NAVSIM, a middle ground between these evaluation paradigms,\nwhere we use large datasets in combination with a non-reactive simulator to\nenable large-scale real-world benchmarking. Specifically, we gather\nsimulation-based metrics, such as progress and time to collision, by unrolling\nbird's eye view abstractions of the test scenes for a short simulation horizon.\nOur simulation is non-reactive, i.e., the evaluated policy and environment do\nnot influence each other. As we demonstrate empirically, this decoupling allows\nopen-loop metric computation while being better aligned with closed-loop\nevaluations than traditional displacement errors. NAVSIM enabled a new\ncompetition held at CVPR 2024, where 143 teams submitted 463 entries, resulting\nin several new insights. On a large set of challenging scenarios, we observe\nthat simple methods with moderate compute requirements such as TransFuser can\nmatch recent large-scale end-to-end driving architectures such as UniAD. Our\nmodular framework can potentially be extended with new datasets, data curation\nstrategies, and metrics, and will be continually maintained to host future\nchallenges. Our code is available at\nhttps://github.com/autonomousvision/navsim.\n","authors":["Daniel Dauner","Marcel Hallgarten","Tianyu Li","Xinshuo Weng","Zhiyu Huang","Zetong Yang","Hongyang Li","Igor Gilitschenski","Boris Ivanovic","Marco Pavone","Andreas Geiger","Kashyap Chitta"],"pdf_url":"https://arxiv.org/pdf/2406.15349v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2410.24205v1","updated":"2024-10-31T17:57:51Z","published":"2024-10-31T17:57:51Z","title":"Zonal RL-RRT: Integrated RL-RRT Path Planning with Collision Probability\n and Zone Connectivity","summary":" Path planning in high-dimensional spaces poses significant challenges,\nparticularly in achieving both time efficiency and a fair success rate. To\naddress these issues, we introduce a novel path-planning algorithm, Zonal\nRL-RRT, that leverages kd-tree partitioning to segment the map into zones while\naddressing zone connectivity, ensuring seamless transitions between zones. By\nbreaking down the complex environment into multiple zones and using Q-learning\nas the high-level decision-maker, our algorithm achieves a 3x improvement in\ntime efficiency compared to basic sampling methods such as RRT and RRT* in\nforest-like maps. 
Our approach outperforms heuristic-guided methods like BIT*\nand Informed RRT* by 1.5x in terms of runtime while maintaining robust and\nreliable success rates across 2D to 6D environments. Compared to learning-based\nmethods like NeuralRRT* and MPNetSMP, as well as the heuristic RRT*J, our\nalgorithm demonstrates, on average, 1.5x better performance in the same\nenvironments. We also evaluate the effectiveness of our approach through\nsimulations of the UR10e arm manipulator in the MuJoCo environment. A key\nobservation of our approach lies in its use of zone partitioning and\nReinforcement Learning (RL) for adaptive high-level planning allowing the\nalgorithm to accommodate flexible policies across diverse environments, making\nit a versatile tool for advanced path planning.\n","authors":["AmirMohammad Tahmasbi","MohammadSaleh Faghfoorian","Saeed Khodaygan","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2410.24205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24203v1","updated":"2024-10-31T17:57:02Z","published":"2024-10-31T17:57:02Z","title":"DiffPano: Scalable and Consistent Text to Panorama Generation with\n Spherical Epipolar-Aware Diffusion","summary":" Diffusion-based methods have achieved remarkable achievements in 2D image or\n3D object generation, however, the generation of 3D scenes and even\n$360^{\\circ}$ images remains constrained, due to the limited number of scene\ndatasets, the complexity of 3D scenes themselves, and the difficulty of\ngenerating consistent multi-view images. To address these issues, we first\nestablish a large-scale panoramic video-text dataset containing millions of\nconsecutive panoramic keyframes with corresponding panoramic depths, camera\nposes, and text descriptions. Then, we propose a novel text-driven panoramic\ngeneration framework, termed DiffPano, to achieve scalable, consistent, and\ndiverse panoramic scene generation. Specifically, benefiting from the powerful\ngenerative capabilities of stable diffusion, we fine-tune a single-view\ntext-to-panorama diffusion model with LoRA on the established panoramic\nvideo-text dataset. We further design a spherical epipolar-aware multi-view\ndiffusion model to ensure the multi-view consistency of the generated panoramic\nimages. Extensive experiments demonstrate that DiffPano can generate scalable,\nconsistent, and diverse panoramic images with given unseen text descriptions\nand camera poses.\n","authors":["Weicai Ye","Chenhao Ji","Zheng Chen","Junyao Gao","Xiaoshui Huang","Song-Hai Zhang","Wanli Ouyang","Tong He","Cairong Zhao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.24203v1.pdf","comment":"NeurIPS2024, Project: https://github.com/zju3dv/DiffPano; Code:\n https://github.com/zju3dv/DiffPano"},{"id":"http://arxiv.org/abs/2410.24196v1","updated":"2024-10-31T17:54:33Z","published":"2024-10-31T17:54:33Z","title":"A Sagittal Planar Ankle-Foot Prosthesis with Powered Plantarflexion and\n Socket Alignment","summary":" Powered ankle-foot prostheses can often reduce the energy cost of walking by\nassisting with push-off. However, focus on providing mechanical work may lead\nto ignoring or exacerbating common issues with chronic pain, irritation,\npressure ulcer development, and eventual osteoarthritis in persons with\namputation. This paper presents the design and validation of a novel\ntranstibial prosthesis informed by predictive biomechanical simulations of gait\nwhich minimize a combination of user effort and interaction loading from the\nprosthesis socket. 
From these findings, the device was designed with a\nnon-biomimetic anterior-posterior translation degree of freedom with a 10 cm\nrange of motion which is primarily position-controlled to change the alignment\nof the prosthetic foot with the residual limb. The system is both mobile and\ntethered, with the batteries, actuators, and majority of electronics located in\na small backpack. Mechanical loads are transmitted through cables to the\nprosthesis, minimizing the distal mass carriage required. We measured torque\nand force sensing accuracy, open loop actuator performance, closed loop torque\nand position control bandwidth, and torque and position tracking error during\nwalking. The system is capable of producing up to 160 N-m of plantarflexion\ntorque and 394 N of AP translation force with a closed loop control bandwidth\nof about 7 Hz in both degrees of freedom. Torque tracking during walking was\naccurate within about 10 N-m but position tracking was substantially affected\nby phase lag, possibly due to cable slack in the bidirectional mechanism. The\nprototype was capable of replicating our simulated prosthesis dynamics during\ngait and offers useful insights into the advantages and the practical\nconsiderations of using predictive biomechanical simulation as a design tool\nfor wearable robots.\n","authors":["Mark A. Price","Frank C. Sup IV"],"pdf_url":"https://arxiv.org/pdf/2410.24196v1.pdf","comment":"9 pages, 8 figures, 1 table"},{"id":"http://arxiv.org/abs/2410.24185v1","updated":"2024-10-31T17:48:45Z","published":"2024-10-31T17:48:45Z","title":"DexMimicGen: Automated Data Generation for Bimanual Dexterous\n Manipulation via Imitation Learning","summary":" Imitation learning from human demonstrations is an effective means to teach\nrobots manipulation skills. But data acquisition is a major bottleneck in\napplying this paradigm more broadly, due to the amount of cost and human effort\ninvolved. There has been significant interest in imitation learning for\nbimanual dexterous robots, like humanoids. Unfortunately, data collection is\neven more challenging here due to the challenges of simultaneously controlling\nmultiple arms and multi-fingered hands. Automated data generation in simulation\nis a compelling, scalable alternative to fuel this need for data. To this end,\nwe introduce DexMimicGen, a large-scale automated data generation system that\nsynthesizes trajectories from a handful of human demonstrations for humanoid\nrobots with dexterous hands. We present a collection of simulation environments\nin the setting of bimanual dexterous manipulation, spanning a range of\nmanipulation behaviors and different requirements for coordination among the\ntwo arms. We generate 21K demos across these tasks from just 60 source human\ndemos and study the effect of several data generation and policy learning\ndecisions on agent performance. Finally, we present a real-to-sim-to-real\npipeline and deploy it on a real-world humanoid can sorting task. 
Videos and\nmore are at https://dexmimicgen.github.io/\n","authors":["Zhenyu Jiang","Yuqi Xie","Kevin Lin","Zhenjia Xu","Weikang Wan","Ajay Mandlekar","Linxi Fan","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.24185v1.pdf","comment":"Project website: https://dexmimicgen.github.io/"},{"id":"http://arxiv.org/abs/2302.12610v3","updated":"2024-10-31T17:22:32Z","published":"2023-02-24T12:54:18Z","title":"A Joint Modeling of Vision-Language-Action for Target-oriented Grasping\n in Clutter","summary":" We focus on the task of language-conditioned grasping in clutter, in which a\nrobot is supposed to grasp the target object based on a language instruction.\nPrevious works separately conduct visual grounding to localize the target\nobject, and generate a grasp for that object. However, these works require\nobject labels or visual attributes for grounding, which calls for handcrafted\nrules in planner and restricts the range of language instructions. In this\npaper, we propose to jointly model vision, language and action with\nobject-centric representation. Our method is applicable under more flexible\nlanguage instructions, and not limited by visual grounding error. Besides, by\nutilizing the powerful priors from the pre-trained multi-modal model and grasp\nmodel, sample efficiency is effectively improved and the sim2real problem is\nrelived without additional data for transfer. A series of experiments carried\nout in simulation and real world indicate that our method can achieve better\ntask success rate by less times of motion under more flexible language\ninstructions. Moreover, our method is capable of generalizing better to\nscenarios with unseen objects and language instructions. Our code is available\nat https://github.com/xukechun/Vision-Language-Grasping\n","authors":["Kechun Xu","Shuqi Zhao","Zhongxiang Zhou","Zizhang Li","Huaijin Pi","Yue Wang","Rong Xiong"],"pdf_url":"https://arxiv.org/pdf/2302.12610v3.pdf","comment":"Accepted by ICRA 2023"},{"id":"http://arxiv.org/abs/2410.24164v1","updated":"2024-10-31T17:22:30Z","published":"2024-10-31T17:22:30Z","title":"$π_0$: A Vision-Language-Action Flow Model for General Robot Control","summary":" Robot learning holds tremendous promise to unlock the full potential of\nflexible, general, and dexterous robot systems, as well as to address some of\nthe deepest questions in artificial intelligence. However, bringing robot\nlearning to the level of generality required for effective real-world systems\nfaces major obstacles in terms of data, generalization, and robustness. In this\npaper, we discuss how generalist robot policies (i.e., robot foundation models)\ncan address these challenges, and how we can design effective generalist robot\npolicies for complex and highly dexterous tasks. We propose a novel flow\nmatching architecture built on top of a pre-trained vision-language model (VLM)\nto inherit Internet-scale semantic knowledge. We then discuss how this model\ncan be trained on a large and diverse dataset from multiple dexterous robot\nplatforms, including single-arm robots, dual-arm robots, and mobile\nmanipulators. 
We evaluate our model in terms of its ability to perform tasks in\nzero shot after pre-training, follow language instructions from people and from\na high-level VLM policy, and its ability to acquire new skills via fine-tuning.\nOur results cover a wide variety of tasks, such as laundry folding, table\ncleaning, and assembling boxes.\n","authors":["Kevin Black","Noah Brown","Danny Driess","Adnan Esmail","Michael Equi","Chelsea Finn","Niccolo Fusai","Lachy Groom","Karol Hausman","Brian Ichter","Szymon Jakubczak","Tim Jones","Liyiming Ke","Sergey Levine","Adrian Li-Bell","Mohith Mothukuri","Suraj Nair","Karl Pertsch","Lucy Xiaoyang Shi","James Tanner","Quan Vuong","Anna Walling","Haohuan Wang","Ury Zhilinsky"],"pdf_url":"https://arxiv.org/pdf/2410.24164v1.pdf","comment":"See project website for videos:\n https://physicalintelligence.company/blog/pi0"},{"id":"http://arxiv.org/abs/2406.00885v2","updated":"2024-10-31T17:12:57Z","published":"2024-06-02T22:40:05Z","title":"Visual place recognition for aerial imagery: A survey","summary":" Aerial imagery and its direct application to visual localization is an\nessential problem for many Robotics and Computer Vision tasks. While Global\nNavigation Satellite Systems (GNSS) are the standard default solution for\nsolving the aerial localization problem, it is subject to a number of\nlimitations, such as, signal instability or solution unreliability that make\nthis option not so desirable. Consequently, visual geolocalization is emerging\nas a viable alternative. However, adapting Visual Place Recognition (VPR) task\nto aerial imagery presents significant challenges, including weather variations\nand repetitive patterns. Current VPR reviews largely neglect the specific\ncontext of aerial data. This paper introduces a methodology tailored for\nevaluating VPR techniques specifically in the domain of aerial imagery,\nproviding a comprehensive assessment of various methods and their performance.\nHowever, we not only compare various VPR methods, but also demonstrate the\nimportance of selecting appropriate zoom and overlap levels when constructing\nmap tiles to achieve maximum efficiency of VPR algorithms in the case of aerial\nimagery. The code is available on our GitHub repository --\nhttps://github.com/prime-slam/aero-vloc.\n","authors":["Ivan Moskalenko","Anastasiia Kornilova","Gonzalo Ferrer"],"pdf_url":"https://arxiv.org/pdf/2406.00885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24152v1","updated":"2024-10-31T17:10:01Z","published":"2024-10-31T17:10:01Z","title":"Language-Driven Policy Distillation for Cooperative Driving in\n Multi-Agent Reinforcement Learning","summary":" The cooperative driving technology of Connected and Autonomous Vehicles\n(CAVs) is crucial for improving the efficiency and safety of transportation\nsystems. Learning-based methods, such as Multi-Agent Reinforcement Learning\n(MARL), have demonstrated strong capabilities in cooperative decision-making\ntasks. However, existing MARL approaches still face challenges in terms of\nlearning efficiency and performance. In recent years, Large Language Models\n(LLMs) have rapidly advanced and shown remarkable abilities in various\nsequential decision-making tasks. To enhance the learning capabilities of\ncooperative agents while ensuring decision-making efficiency and\ncost-effectiveness, we propose LDPD, a language-driven policy distillation\nmethod for guiding MARL exploration. 
In this framework, a teacher agent based\non LLM trains smaller student agents to achieve cooperative decision-making\nthrough its own decision-making demonstrations. The teacher agent enhances the\nobservation information of CAVs and utilizes LLMs to perform complex\ncooperative decision-making reasoning, which also leverages carefully designed\ndecision-making tools to achieve expert-level decisions, providing high-quality\nteaching experiences. The student agent then refines the teacher's prior\nknowledge into its own model through gradient policy updates. The experiments\ndemonstrate that the students can rapidly improve their capabilities with\nminimal guidance from the teacher and eventually surpass the teacher's\nperformance. Extensive experiments show that our approach demonstrates better\nperformance and learning efficiency compared to baseline methods.\n","authors":["Jiaqi Liu","Chengkai Xu","Peng Hang","Jian Sun","Mingyu Ding","Wei Zhan","Masayoshi Tomizuka"],"pdf_url":"https://arxiv.org/pdf/2410.24152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24091v1","updated":"2024-10-31T16:22:53Z","published":"2024-10-31T16:22:53Z","title":"3D-ViTac: Learning Fine-Grained Manipulation with Visuo-Tactile Sensing","summary":" Tactile and visual perception are both crucial for humans to perform\nfine-grained interactions with their environment. Developing similar\nmulti-modal sensing capabilities for robots can significantly enhance and\nexpand their manipulation skills. This paper introduces \\textbf{3D-ViTac}, a\nmulti-modal sensing and learning system designed for dexterous bimanual\nmanipulation. Our system features tactile sensors equipped with dense sensing\nunits, each covering an area of 3$mm^2$. These sensors are low-cost and\nflexible, providing detailed and extensive coverage of physical contacts,\neffectively complementing visual information. To integrate tactile and visual\ndata, we fuse them into a unified 3D representation space that preserves their\n3D structures and spatial relationships. The multi-modal representation can\nthen be coupled with diffusion policies for imitation learning. Through\nconcrete hardware experiments, we demonstrate that even low-cost robots can\nperform precise manipulations and significantly outperform vision-only\npolicies, particularly in safe interactions with fragile items and executing\nlong-horizon tasks involving in-hand manipulation. Our project page is\navailable at \\url{https://binghao-huang.github.io/3D-ViTac/}.\n","authors":["Binghao Huang","Yixuan Wang","Xinyi Yang","Yiyue Luo","Yunzhu Li"],"pdf_url":"https://arxiv.org/pdf/2410.24091v1.pdf","comment":"Accepted at Conference on Robot Learning (CoRL) 2024"},{"id":"http://arxiv.org/abs/2410.24090v1","updated":"2024-10-31T16:22:23Z","published":"2024-10-31T16:22:23Z","title":"Sparsh: Self-supervised touch representations for vision-based tactile\n sensing","summary":" In this work, we introduce general purpose touch representations for the\nincreasingly accessible class of vision-based tactile sensors. Such sensors\nhave led to many recent advances in robot manipulation as they markedly\ncomplement vision, yet solutions today often rely on task and sensor specific\nhandcrafted perception models. Collecting real data at scale with task centric\nground truth labels, like contact forces and slip, is a challenge further\ncompounded by sensors of various form factor differing in aspects like lighting\nand gel markings. 
To tackle this we turn to self-supervised learning (SSL) that\nhas demonstrated remarkable performance in computer vision. We present Sparsh,\na family of SSL models that can support various vision-based tactile sensors,\nalleviating the need for custom labels through pre-training on 460k+ tactile\nimages with masking and self-distillation in pixel and latent spaces. We also\nbuild TacBench, to facilitate standardized benchmarking across sensors and\nmodels, comprising of six tasks ranging from comprehending tactile properties\nto enabling physical perception and manipulation planning. In evaluations, we\nfind that SSL pre-training for touch representation outperforms task and\nsensor-specific end-to-end training by 95.1% on average over TacBench, and\nSparsh (DINO) and Sparsh (IJEPA) are the most competitive, indicating the\nmerits of learning in latent space for tactile images. Project page:\nhttps://sparsh-ssl.github.io/\n","authors":["Carolina Higuera","Akash Sharma","Chaithanya Krishna Bodduluri","Taosha Fan","Patrick Lancaster","Mrinal Kalakrishnan","Michael Kaess","Byron Boots","Mike Lambeta","Tingfan Wu","Mustafa Mukadam"],"pdf_url":"https://arxiv.org/pdf/2410.24090v1.pdf","comment":"Conference on Robot Learning (CoRL), 2024"},{"id":"http://arxiv.org/abs/2410.24035v1","updated":"2024-10-31T15:32:32Z","published":"2024-10-31T15:32:32Z","title":"State- and context-dependent robotic manipulation and grasping via\n uncertainty-aware imitation learning","summary":" Generating context-adaptive manipulation and grasping actions is a\nchallenging problem in robotics. Classical planning and control algorithms tend\nto be inflexible with regard to parameterization by external variables such as\nobject shapes. In contrast, Learning from Demonstration (LfD) approaches, due\nto their nature as function approximators, allow for introducing external\nvariables to modulate policies in response to the environment. In this paper,\nwe utilize this property by introducing an LfD approach to acquire\ncontext-dependent grasping and manipulation strategies. We treat the problem as\na kernel-based function approximation, where the kernel inputs include generic\ncontext variables describing task-dependent parameters such as the object\nshape. We build on existing work on policy fusion with uncertainty\nquantification to propose a state-dependent approach that automatically returns\nto demonstrations, avoiding unpredictable behavior while smoothly adapting to\ncontext changes. The approach is evaluated against the LASA handwriting dataset\nand on a real 7-DoF robot in two scenarios: adaptation to slippage while\ngrasping and manipulating a deformable food item.\n","authors":["Tim R. Winter","Ashok M. Sundaram","Werner Friedl","Maximo A. Roa","Freek Stulp","João Silvério"],"pdf_url":"https://arxiv.org/pdf/2410.24035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22931v2","updated":"2024-10-31T15:04:25Z","published":"2024-10-30T11:37:47Z","title":"GPTR: Gaussian Process Trajectory Representation for Continuous-Time\n Motion Estimation","summary":" Continuous-time trajectory representation has gained significant popularity\nin recent years, as it offers an elegant formulation that allows the fusion of\na larger number of sensors and sensing modalities, overcoming limitations of\ntraditional discrete-time frameworks. 
To bolster the adoption of the\ncontinuous-time paradigm, we propose a so-called Gaussian Process Trajectory\nRepresentation (GPTR) framework for continuous-time motion estimation (CTME)\ntasks. Our approach stands out by employing a third-order random jerk model,\nfeaturing closed-form expressions for both rotational and translational state\nderivatives. This model provides smooth, continuous trajectory representations\nthat are crucial for precise estimation of complex motion. To support the wider\nrobotics and computer vision communities, we have made the source code for GPTR\navailable as a light-weight header-only library. This format was chosen for its\nease of integration, allowing developers to incorporate GPTR into existing\nsystems without needing extensive code modifications. Moreover, we also provide\na set of optimization examples with LiDAR, camera, IMU, UWB factors, and\nclosed-form analytical Jacobians under the proposed GP framework. Our\nexperiments demonstrate the efficacy and efficiency of GP-based trajectory\nrepresentation in various motion estimation tasks, and the examples can serve\nas the prototype to help researchers quickly develop future applications such\nas batch optimization, calibration, sensor fusion, trajectory planning, etc.,\nwith continuous-time trajectory representation. Our project is accessible at\nhttps://github.com/brytsknguyen/gptr .\n","authors":["Thien-Minh Nguyen","Ziyu Cao","Kailai Li","Shenghai Yuan","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2410.22931v2.pdf","comment":"The source code has been released. All feedbacks are welcome"},{"id":"http://arxiv.org/abs/2211.15656v3","updated":"2024-10-31T15:01:41Z","published":"2022-11-28T18:59:02Z","title":"SuperFusion: Multilevel LiDAR-Camera Fusion for Long-Range HD Map\n Generation","summary":" High-definition (HD) semantic map generation of the environment is an\nessential component of autonomous driving. Existing methods have achieved good\nperformance in this task by fusing different sensor modalities, such as LiDAR\nand camera. However, current works are based on raw data or network\nfeature-level fusion and only consider short-range HD map generation, limiting\ntheir deployment to realistic autonomous driving applications. In this paper,\nwe focus on the task of building the HD maps in both short ranges, i.e., within\n30 m, and also predicting long-range HD maps up to 90 m, which is required by\ndownstream path planning and control tasks to improve the smoothness and safety\nof autonomous driving. To this end, we propose a novel network named\nSuperFusion, exploiting the fusion of LiDAR and camera data at multiple levels.\nWe use LiDAR depth to improve image depth estimation and use image features to\nguide long-range LiDAR feature prediction. We benchmark our SuperFusion on the\nnuScenes dataset and a self-recorded dataset and show that it outperforms the\nstate-of-the-art baseline methods with large margins on all intervals.\nAdditionally, we apply the generated HD map to a downstream path planning task,\ndemonstrating that the long-range HD maps predicted by our method can lead to\nbetter path planning for autonomous vehicles. 
Our code has been released at\nhttps://github.com/haomo-ai/SuperFusion.\n","authors":["Hao Dong","Weihao Gu","Xianjing Zhang","Jintao Xu","Rui Ai","Huimin Lu","Juho Kannala","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2211.15656v3.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2407.12582v2","updated":"2024-10-31T14:37:42Z","published":"2024-07-17T14:09:46Z","title":"Embracing Events and Frames with Hierarchical Feature Refinement Network\n for Object Detection","summary":" In frame-based vision, object detection faces substantial performance\ndegradation under challenging conditions due to the limited sensing capability\nof conventional cameras. Event cameras output sparse and asynchronous events,\nproviding a potential solution to solve these problems. However, effectively\nfusing two heterogeneous modalities remains an open issue. In this work, we\npropose a novel hierarchical feature refinement network for event-frame fusion.\nThe core concept is the design of the coarse-to-fine fusion module, denoted as\nthe cross-modality adaptive feature refinement (CAFR) module. In the initial\nphase, the bidirectional cross-modality interaction (BCI) part facilitates\ninformation bridging from two distinct sources. Subsequently, the features are\nfurther refined by aligning the channel-level mean and variance in the two-fold\nadaptive feature refinement (TAFR) part. We conducted extensive experiments on\ntwo benchmarks: the low-resolution PKU-DDD17-Car dataset and the\nhigh-resolution DSEC dataset. Experimental results show that our method\nsurpasses the state-of-the-art by an impressive margin of $\\textbf{8.0}\\%$ on\nthe DSEC dataset. Besides, our method exhibits significantly better robustness\n(\\textbf{69.5}\\% versus \\textbf{38.7}\\%) when introducing 15 different\ncorruption types to the frame images. The code can be found at the link\n(https://github.com/HuCaoFighting/FRN).\n","authors":["Hu Cao","Zehua Zhang","Yan Xia","Xinyi Li","Jiahao Xia","Guang Chen","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2407.12582v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2410.23978v1","updated":"2024-10-31T14:31:53Z","published":"2024-10-31T14:31:53Z","title":"GAMap: Zero-Shot Object Goal Navigation with Multi-Scale\n Geometric-Affordance Guidance","summary":" Zero-Shot Object Goal Navigation (ZS-OGN) enables robots or agents to\nnavigate toward objects of unseen categories without object-specific training.\nTraditional approaches often leverage categorical semantic information for\nnavigation guidance, which struggles when only objects are partially observed\nor detailed and functional representations of the environment are lacking. To\nresolve the above two issues, we propose \\textit{Geometric-part and Affordance\nMaps} (GAMap), a novel method that integrates object parts and affordance\nattributes as navigation guidance. Our method includes a multi-scale scoring\napproach to capture geometric-part and affordance attributes of objects at\ndifferent scales. 
Comprehensive experiments conducted on HM3D and Gibson\nbenchmark datasets demonstrate improvements in Success Rate and Success\nweighted by Path Length, underscoring the efficacy of our geometric-part and\naffordance-guided navigation approach in enhancing robot autonomy and\nversatility, without any additional object-specific training or fine-tuning\nwith the semantics of unseen objects and/or the locomotions of the robot.\n","authors":["Shuaihang Yuan","Hao Huang","Yu Hao","Congcong Wen","Anthony Tzes","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2410.23978v1.pdf","comment":"16 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2410.23968v1","updated":"2024-10-31T14:22:20Z","published":"2024-10-31T14:22:20Z","title":"EmbodiedRAG: Dynamic 3D Scene Graph Retrieval for Efficient and Scalable\n Robot Task Planning","summary":" Recent advances in Large Language Models (LLMs) have helped facilitate\nexciting progress for robotic planning in real, open-world environments. 3D\nscene graphs (3DSGs) offer a promising environment representation for grounding\nsuch LLM-based planners as they are compact and semantically rich. However, as\nthe robot's environment scales (e.g., number of entities tracked) and the\ncomplexity of scene graph information increases (e.g., maintaining more\nattributes), providing the 3DSG as-is to an LLM-based planner quickly becomes\ninfeasible due to input token count limits and attentional biases present in\nLLMs. Inspired by the successes of Retrieval-Augmented Generation (RAG) methods\nthat retrieve query-relevant document chunks for LLM question and answering, we\nadapt the paradigm for our embodied domain. Specifically, we propose a 3D scene\nsubgraph retrieval framework, called EmbodiedRAG, that we augment an LLM-based\nplanner with for executing natural language robotic tasks. Notably, our\nretrieved subgraphs adapt to changes in the environment as well as changes in\ntask-relevancy as the robot executes its plan. We demonstrate EmbodiedRAG's\nability to significantly reduce input token counts (by an order of magnitude)\nand planning time (up to 70% reduction in average time per planning step) while\nimproving success rates on AI2Thor simulated household tasks with a single-arm,\nmobile manipulator. Additionally, we implement EmbodiedRAG on a quadruped with\na manipulator to highlight the performance benefits for robot deployment at the\nedge in real environments.\n","authors":["Meghan Booker","Grayson Byrd","Bethany Kemp","Aurora Schmidt","Corban Rivera"],"pdf_url":"https://arxiv.org/pdf/2410.23968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23963v1","updated":"2024-10-31T14:15:54Z","published":"2024-10-31T14:15:54Z","title":"Exploiting Information Theory for Intuitive Robot Programming of Manual\n Activities","summary":" Observational learning is a promising approach to enable people without\nexpertise in programming to transfer skills to robots in a user-friendly\nmanner, since it mirrors how humans learn new behaviors by observing others.\nMany existing methods focus on instructing robots to mimic human trajectories,\nbut motion-level strategies often pose challenges in skills generalization\nacross diverse environments. This paper proposes a novel framework that allows\nrobots to achieve a \\textit{higher-level} understanding of human-demonstrated\nmanual tasks recorded in RGB videos. By recognizing the task structure and\ngoals, robots generalize what observed to unseen scenarios. 
We found our task\nrepresentation on Shannon's Information Theory (IT), which is applied for the\nfirst time to manual tasks. IT helps extract the active scene elements and\nquantify the information shared between hands and objects. We exploit scene\ngraph properties to encode the extracted interaction features in a compact\nstructure and segment the demonstration into blocks, streamlining the\ngeneration of Behavior Trees for robot replicas. Experiments validated the\neffectiveness of IT to automatically generate robot execution plans from a\nsingle human demonstration. Additionally, we provide HANDSOME, an open-source\ndataset of HAND Skills demOnstrated by Multi-subjEcts, to promote further\nresearch and evaluation in this field.\n","authors":["Elena Merlo","Marta Lagomarsino","Edoardo Lamon","Arash Ajoudani"],"pdf_url":"https://arxiv.org/pdf/2410.23963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13321v2","updated":"2024-10-31T13:42:58Z","published":"2024-03-20T05:57:20Z","title":"Robotics meets Fluid Dynamics: A Characterization of the Induced Airflow\n below a Quadrotor as a Turbulent Jet","summary":" The widespread adoption of quadrotors for diverse applications, from\nagriculture to public safety, necessitates an understanding of the aerodynamic\ndisturbances they create. This paper introduces a computationally lightweight\nmodel for estimating the time-averaged magnitude of the induced flow below\nquadrotors in hover. Unlike related approaches that rely on expensive\ncomputational fluid dynamics (CFD) simulations or drone specific time-consuming\nempirical measurements, our method leverages classical theory from turbulent\nflows. By analyzing over 16 hours of flight data from drones of varying sizes\nwithin a large motion capture system, we show for the first time that the\ncombined flow from all drone propellers is well-approximated by a turbulent jet\nafter 2.5 drone-diameters below the vehicle. Using a novel normalization and\nscaling, we experimentally identify model parameters that describe a unified\nmean velocity field below differently sized quadrotors. The model, which\nrequires only the drone's mass, propeller size, and drone size for\ncalculations, accurately describes the far-field airflow over a long-range in a\nvery large volume which is impractical to simulate using CFD. Our model offers\na practical tool for ensuring safer operations near humans, optimizing sensor\nplacements and drone control in multi-agent scenarios. We demonstrate the\nlatter by designing a controller that compensates for the downwash of another\ndrone, leading to a four times lower altitude deviation when passing below.\n","authors":["Leonard Bauersfeld","Koen Muller","Dominic Ziegler","Filippo Coletti","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.13321v2.pdf","comment":"7+1 pages"},{"id":"http://arxiv.org/abs/2410.23929v1","updated":"2024-10-31T13:36:04Z","published":"2024-10-31T13:36:04Z","title":"Redundant Observer-Based Tracking Control for Object Extraction Using a\n Cable Connected UAV","summary":" A new disturbance observer based control scheme is developed for a quadrotor\nunder the concurrent disturbances from a lightweight elastic tether cable and a\nlumped vertical disturbance. This elastic tether is unusual as it creates a\ndisturbance proportional to the multicopter's translational movement. 
This\npaper takes an observer-based approach to estimate the stiffness coefficient of\nthe cable and uses the system model to update the estimates of the external\nforces, which are then compensated in the control action. Given that the\ntethered cable force affects both horizontal channels of the quadrotor and is\nalso coupled with the vertical channel, the proposed disturbance observer is\nconstructed to exploit the redundant measurements across all three channels to\njointly estimate the cable stiffness and the vertical disturbance. A\npseudo-inverse method is used to determine the observer gain functions, such\nthat the estimation of the two quantities is decoupled and stable. Compared to\nstandard disturbance observers which assume nearly constant disturbances, the\nproposed approach can quickly adjust its total force estimate as the tethered\nquadrotor changes its position or tautness of the tether. This is applied to\ntwo experiments - a tracking performance test where the multicopter moves under\na constant tether strain, and an object extraction test. In the second test,\nthe multicopter manipulates a nonlinear mechanism mimicking the extraction of a\nwedged object. In both cases, the proposed approach shows significant\nimprovement over standard Disturbance Observer and Extended State Observer\napproaches. A video summary of the experiments can be found at\nhttps://youtu.be/9gKr13WTj-k.\n","authors":["Benjamin J. Marshall","Yunda Yan","James Knowles","Chenguang Yang","Cunjia Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23916v1","updated":"2024-10-31T13:23:10Z","published":"2024-10-31T13:23:10Z","title":"Transformer-based Model Predictive Control: Trajectory Optimization via\n Sequence Modeling","summary":" Model predictive control (MPC) has established itself as the primary\nmethodology for constrained control, enabling general-purpose robot autonomy in\ndiverse real-world scenarios. However, for most problems of interest, MPC\nrelies on the recursive solution of highly non-convex trajectory optimization\nproblems, leading to high computational complexity and strong dependency on\ninitialization. In this work, we present a unified framework to combine the\nmain strengths of optimization-based and learning-based methods for MPC. Our\napproach entails embedding high-capacity, transformer-based neural network\nmodels within the optimization process for trajectory generation, whereby the\ntransformer provides a near-optimal initial guess, or target plan, to a\nnon-convex optimization problem. Our experiments, performed in simulation and\nthe real world onboard a free flyer platform, demonstrate the capabilities of\nour framework to improve MPC convergence and runtime. Compared to purely\noptimization-based approaches, results show that our approach can improve\ntrajectory generation performance by up to 75%, reduce the number of solver\niterations by up to 45%, and improve overall MPC runtime by 7x without loss in\nperformance.\n","authors":["Davide Celestini","Daniele Gammelli","Tommaso Guffanti","Simone D'Amico","Elisa Capello","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2410.23916v1.pdf","comment":"8 pages, 7 figures. 
Datasets, videos and code available at:\n https://transformermpc.github.io"},{"id":"http://arxiv.org/abs/2408.14791v3","updated":"2024-10-31T12:23:42Z","published":"2024-08-27T05:53:02Z","title":"Optimizing Structured Data Processing through Robotic Process Automation","summary":" Robotic Process Automation (RPA) has emerged as a game-changing technology in\ndata extraction, revolutionizing the way organizations process and analyze\nlarge volumes of documents such as invoices, purchase orders, and payment\nadvices. This study investigates the use of RPA for structured data extraction\nand evaluates its advantages over manual processes. By comparing\nhuman-performed tasks with those executed by RPA software bots, we assess\nefficiency and accuracy in data extraction from invoices, focusing on the\neffectiveness of the RPA system. Through four distinct scenarios involving\nvarying numbers of invoices, we measure efficiency in terms of time and effort\nrequired for task completion, as well as accuracy by comparing error rates\nbetween manual and RPA processes. Our findings highlight the significant\nefficiency gains achieved by RPA, with bots completing tasks in significantly\nless time compared to manual efforts across all cases. Moreover, the RPA system\nconsistently achieves perfect accuracy, mitigating the risk of errors and\nenhancing process reliability. These results underscore the transformative\npotential of RPA in optimizing operational efficiency, reducing human labor\ncosts, and improving overall business performance.\n","authors":["Vivek Bhardwaj","Ajit Noonia","Sandeep Chaurasia","Mukesh Kumar","Abdulnaser Rashid","Mohamed Tahar Ben Othman"],"pdf_url":"https://arxiv.org/pdf/2408.14791v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23860v1","updated":"2024-10-31T12:10:20Z","published":"2024-10-31T12:10:20Z","title":"Analysing the Interplay of Vision and Touch for Dexterous Insertion\n Tasks","summary":" Robotic insertion tasks remain challenging due to uncertainties in perception\nand the need for precise control, particularly in unstructured environments.\nWhile humans seamlessly combine vision and touch for such tasks, effectively\nintegrating these modalities in robotic systems is still an open problem. Our\nwork presents an extensive analysis of the interplay between visual and tactile\nfeedback during dexterous insertion tasks, showing that tactile sensing can\ngreatly enhance success rates on challenging insertions with tight tolerances\nand varied hole orientations that vision alone cannot solve. These findings\nprovide valuable insights for designing more effective multi-modal robotic\ncontrol systems and highlight the critical role of tactile feedback in\ncontact-rich manipulation tasks.\n","authors":["Janis Lenz","Theo Gruner","Daniel Palenicek","Tim Schneider","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2410.23860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14927v3","updated":"2024-10-31T11:32:19Z","published":"2024-06-21T07:37:17Z","title":"GIC: Gaussian-Informed Continuum for Physical Property Identification\n and Simulation","summary":" This paper studies the problem of estimating physical properties (system\nidentification) through visual observations. To facilitate geometry-aware\nguidance in physical property estimation, we introduce a novel hybrid framework\nthat leverages 3D Gaussian representation to not only capture explicit shapes\nbut also enable the simulated continuum to render object masks as 2D shape\nsurrogates during training. 
We propose a new dynamic 3D Gaussian framework\nbased on motion factorization to recover the object as 3D Gaussian point sets\nacross different time states. Furthermore, we develop a coarse-to-fine filling\nstrategy to generate the density fields of the object from the Gaussian\nreconstruction, allowing for the extraction of object continuums along with\ntheir surfaces and the integration of Gaussian attributes into these continuums.\nIn addition to the extracted object surfaces, the Gaussian-informed continuum\nalso enables the rendering of object masks during simulations, serving as\n2D-shape guidance for physical property estimation. Extensive experimental\nevaluations demonstrate that our pipeline achieves state-of-the-art performance\nacross multiple benchmarks and metrics. Additionally, we illustrate the\neffectiveness of the proposed method through real-world demonstrations,\nshowcasing its practical utility. Our project page is at\nhttps://jukgei.github.io/project/gic.\n","authors":["Junhao Cai","Yuji Yang","Weihao Yuan","Yisheng He","Zilong Dong","Liefeng Bo","Hui Cheng","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14927v3.pdf","comment":"21 pages, 8 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2304.06923v5","updated":"2024-10-31T09:26:18Z","published":"2023-04-14T05:01:10Z","title":"An NMPC-ECBF Framework for Dynamic Motion Planning and Execution in\n vision-based Human-Robot Collaboration","summary":" To enable safe and effective human-robot collaboration (HRC) in smart\nmanufacturing, seamless integration of sensing, cognition, and prediction into\nthe robot controller is critical for real-time awareness, response, and\ncommunication inside a heterogeneous environment (robots, humans, and\nequipment). The proposed approach takes advantage of the prediction\ncapabilities of nonlinear model predictive control (NMPC) to execute safe\npath planning based on feedback from a vision system. In order to satisfy the\nrequirement of real-time path planning, an embedded solver based on a penalty\nmethod is applied. However, due to tight sampling times, NMPC solutions are\napproximate, and hence the safety of the system cannot be guaranteed. To\naddress this, we formulate a novel safety-critical paradigm with an exponential\ncontrol barrier function (ECBF) used as a safety filter. We also design a\nsimple human-robot collaboration scenario using V-REP to evaluate the\nperformance of the proposed controller and investigate whether integrating\nhuman pose prediction can help with safe and efficient collaboration. The robot\nuses OptiTrack cameras for perception and dynamically generates collision-free\ntrajectories to the predicted target interactive position. Results for a number\nof different configurations confirm the efficiency of the proposed motion\nplanning and execution framework. It yields a 19.8% reduction in execution time\nfor the HRC task considered.\n","authors":["Dianhao Zhang","Mien Van","Pantelis Sopasakis","Seán McLoone"],"pdf_url":"https://arxiv.org/pdf/2304.06923v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23747v1","updated":"2024-10-31T09:02:39Z","published":"2024-10-31T09:02:39Z","title":"A Comprehensive Review of Current Robot-Based Pollinators in Greenhouse\n Farming","summary":" The decline of bee and wind-based pollination systems in greenhouses due to\ncontrolled environments and limited access has boosted the importance of finding\nalternative pollination methods. 
Robot-based pollination systems have emerged\nas a promising solution, ensuring adequate crop yield even in challenging\npollination scenarios. This paper presents a comprehensive review of the\ncurrent robot-based pollinators employed in greenhouses. The review\ncategorizes pollinator technologies into major categories such as air-jet,\nwater-jet, linear actuator, ultrasonic wave, and air-liquid spray, each\nsuitable for specific crop pollination requirements. However, these\ntechnologies are often tailored to particular crops, limiting their\nversatility. The advancement of science and technology has led to the\nintegration of automated pollination technology, encompassing information\ntechnology, automatic perception, detection, control, and operation. This\nintegration not only reduces labor costs but also fosters the ongoing progress\nof modern agriculture by refining technology, enhancing automation, and\npromoting intelligence in agricultural practices. Finally, the challenges\nencountered in the design of pollinators are addressed, and a forward-looking\nperspective is taken towards future developments, aiming to contribute to the\nsustainable advancement of this technology.\n","authors":["Rajmeet Singh","Lakmal Seneviratne","Irfan Hussain"],"pdf_url":"https://arxiv.org/pdf/2410.23747v1.pdf","comment":"20 pages, 21 figures"},{"id":"http://arxiv.org/abs/2405.15223v3","updated":"2024-10-31T08:58:08Z","published":"2024-05-24T05:29:12Z","title":"iVideoGPT: Interactive VideoGPTs are Scalable World Models","summary":" World models empower model-based agents to interactively explore, reason, and\nplan within imagined environments for real-world decision-making. However, the\nhigh demand for interactivity poses challenges in harnessing recent\nadvancements in video generative models for developing world models at scale.\nThis work introduces Interactive VideoGPT (iVideoGPT), a scalable\nautoregressive transformer framework that integrates multimodal signals--visual\nobservations, actions, and rewards--into a sequence of tokens, facilitating an\ninteractive experience of agents via next-token prediction. iVideoGPT features\na novel compressive tokenization technique that efficiently discretizes\nhigh-dimensional visual observations. Leveraging its scalable architecture, we\nare able to pre-train iVideoGPT on millions of human and robotic manipulation\ntrajectories, establishing a versatile foundation that is adaptable to serve as\ninteractive world models for a wide range of downstream tasks. These include\naction-conditioned video prediction, visual planning, and model-based\nreinforcement learning, where iVideoGPT achieves competitive performance\ncompared with state-of-the-art methods. Our work advances the development of\ninteractive general world models, bridging the gap between generative video\nmodels and practical model-based reinforcement learning applications. Code and\npre-trained models are available at https://thuml.github.io/iVideoGPT.\n","authors":["Jialong Wu","Shaofeng Yin","Ningya Feng","Xu He","Dong Li","Jianye Hao","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2405.15223v3.pdf","comment":"NeurIPS 2024. 
Code is available at project website:\n https://thuml.github.io/iVideoGPT"},{"id":"http://arxiv.org/abs/2402.04555v2","updated":"2024-10-31T08:25:08Z","published":"2024-02-07T03:19:02Z","title":"FM-Fusion: Instance-aware Semantic Mapping Boosted by Vision-Language\n Foundation Models","summary":" Semantic mapping based on the supervised object detectors is sensitive to\nimage distribution. In real-world environments, the object detection and\nsegmentation performance can lead to a major drop, preventing the use of\nsemantic mapping in a wider domain. On the other hand, the development of\nvision-language foundation models demonstrates a strong zero-shot\ntransferability across data distribution. It provides an opportunity to\nconstruct generalizable instance-aware semantic maps. Hence, this work explores\nhow to boost instance-aware semantic mapping from object detection generated\nfrom foundation models. We propose a probabilistic label fusion method to\npredict close-set semantic classes from open-set label measurements. An\ninstance refinement module merges the over-segmented instances caused by\ninconsistent segmentation. We integrate all the modules into a unified semantic\nmapping system. Reading a sequence of RGB-D input, our work incrementally\nreconstructs an instance-aware semantic map. We evaluate the zero-shot\nperformance of our method in ScanNet and SceneNN datasets. Our method achieves\n40.3 mean average precision (mAP) on the ScanNet semantic instance segmentation\ntask. It outperforms the traditional semantic mapping method significantly.\n","authors":["Chuhao Liu","Ke Wang","Jieqi Shi","Zhijian Qiao","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2402.04555v2.pdf","comment":"Published in IEEE RAL"},{"id":"http://arxiv.org/abs/2410.23722v1","updated":"2024-10-31T08:19:08Z","published":"2024-10-31T08:19:08Z","title":"Features characterizing safe aerial-aquatic robots","summary":" This paper underscores the importance of environmental monitoring, and\nspecifically of freshwater ecosystems, which play a critical role in sustaining\nlife and global economy. Despite their importance, insufficient data\navailability prevents a comprehensive understanding of these ecosystems,\nthereby impeding informed decision-making concerning their preservation.\nAerial-aquatic robots are identified as effective tools for freshwater sensing,\noffering rapid deployment and avoiding the need of using ships and manned\nteams.\n To advance the field of aerial aquatic robots, this paper conducts a\ncomprehensive review of air-water transitions focusing on the water entry\nstrategy of existing prototypes. This analysis also highlights the safety risks\nassociated with each transition and proposes a set of design requirements\nrelating to robots' tasks, mission objectives, and safety measures. To further\nexplore the proposed design requirements, we present a novel robot with VTOL\ncapability, enabling seamless air water transitions.\n","authors":["Andrea Giordano","Luca Romanello","Diego Perez Gonzalez","Mirko Kovac","Sophie F. 
Armanini"],"pdf_url":"https://arxiv.org/pdf/2410.23722v1.pdf","comment":"Peer-reviewed and accepted in IEEE Ubiquitous Robots 2024, New York\n City"},{"id":"http://arxiv.org/abs/2404.09200v2","updated":"2024-10-31T07:46:38Z","published":"2024-04-14T09:29:37Z","title":"Tube RRT*: Efficient Homotopic Path Planning for Swarm Robotics\n Passing-Through Large-Scale Obstacle Environments","summary":" Recently, the concept of homotopic trajectory planning has emerged as a novel\nsolution to navigation in large-scale obstacle environments for swarm robotics,\noffering a wide ranging of applications. However, it lacks an efficient\nhomotopic path planning method in large-scale obstacle environments. This paper\nintroduces Tube RRT*, an innovative homotopic path planning method that builds\nupon and improves the Rapidly-exploring Random Tree (RRT) algorithm. Tube RRT*\nis specifically designed to generate homotopic paths, strategically considering\ngap volume and path length to mitigate swarm congestion and ensure agile\nnavigation. Through comprehensive simulations and experiments, the\neffectiveness of Tube RRT* is validated.\n","authors":["Pengda Mao","Shuli Lv","Quan Quan"],"pdf_url":"https://arxiv.org/pdf/2404.09200v2.pdf","comment":"8 pages, 8 figures, submitted to RA-L"},{"id":"http://arxiv.org/abs/2410.23701v1","updated":"2024-10-31T07:45:12Z","published":"2024-10-31T07:45:12Z","title":"Get a Grip: Multi-Finger Grasp Evaluation at Scale Enables Robust\n Sim-to-Real Transfer","summary":" This work explores conditions under which multi-finger grasping algorithms\ncan attain robust sim-to-real transfer. While numerous large datasets\nfacilitate learning generative models for multi-finger grasping at scale,\nreliable real-world dexterous grasping remains challenging, with most methods\ndegrading when deployed on hardware. An alternate strategy is to use\ndiscriminative grasp evaluation models for grasp selection and refinement,\nconditioned on real-world sensor measurements. This paradigm has produced\nstate-of-the-art results for vision-based parallel-jaw grasping, but remains\nunproven in the multi-finger setting. In this work, we find that existing\ndatasets and methods have been insufficient for training discriminitive models\nfor multi-finger grasping. To train grasp evaluators at scale, datasets must\nprovide on the order of millions of grasps, including both positive and\nnegative examples, with corresponding visual data resembling measurements at\ninference time. To that end, we release a new, open-source dataset of 3.5M\ngrasps on 4.3K objects annotated with RGB images, point clouds, and trained\nNeRFs. Leveraging this dataset, we train vision-based grasp evaluators that\noutperform both analytic and generative modeling-based baselines on extensive\nsimulated and real-world trials across a diverse range of objects. We show via\nnumerous ablations that the key factor for performance is indeed the evaluator,\nand that its quality degrades as the dataset shrinks, demonstrating the\nimportance of our new dataset. Project website at:\nhttps://sites.google.com/view/get-a-grip-dataset.\n","authors":["Tyler Ga Wei Lum","Albert H. Li","Preston Culbertson","Krishnan Srinivasan","Aaron D. 
Ames","Mac Schwager","Jeannette Bohg"],"pdf_url":"https://arxiv.org/pdf/2410.23701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23690v1","updated":"2024-10-31T07:25:39Z","published":"2024-10-31T07:25:39Z","title":"XRDSLAM: A Flexible and Modular Framework for Deep Learning based SLAM","summary":" In this paper, we propose a flexible SLAM framework, XRDSLAM. It adopts a\nmodular code design and a multi-process running mechanism, providing highly\nreusable foundational modules such as unified dataset management, 3d\nvisualization, algorithm configuration, and metrics evaluation. It can help\ndevelopers quickly build a complete SLAM system, flexibly combine different\nalgorithm modules, and conduct standardized benchmarking for accuracy and\nefficiency comparison. Within this framework, we integrate several\nstate-of-the-art SLAM algorithms with different types, including NeRF and 3DGS\nbased SLAM, and even odometry or reconstruction algorithms, which demonstrates\nthe flexibility and extensibility. We also conduct a comprehensive comparison\nand evaluation of these integrated algorithms, analyzing the characteristics of\neach. Finally, we contribute all the code, configuration and data to the\nopen-source community, which aims to promote the widespread research and\ndevelopment of SLAM technology within the open-source ecosystem.\n","authors":["Xiaomeng Wang","Nan Wang","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.23690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14329v2","updated":"2024-10-31T07:16:46Z","published":"2023-08-28T06:17:15Z","title":"End-to-End Driving via Self-Supervised Imitation Learning Using Camera\n and LiDAR Data","summary":" In autonomous driving, the end-to-end (E2E) driving approach that predicts\nvehicle control signals directly from sensor data is rapidly gaining attention.\nTo learn a safe E2E driving system, one needs an extensive amount of driving\ndata and human intervention. Vehicle control data is constructed by many hours\nof human driving, and it is challenging to construct large vehicle control\ndatasets. Often, publicly available driving datasets are collected with limited\ndriving scenes, and collecting vehicle control data is only available by\nvehicle manufacturers. To address these challenges, this letter proposes the\nfirst fully self-supervised learning framework, self-supervised imitation\nlearning (SSIL), for E2E driving, based on the self-supervised regression\nlearning framework. The proposed SSIL framework can learn E2E driving networks\nwithout using driving command data. To construct pseudo steering angle data,\nproposed SSIL predicts a pseudo target from the vehicle's poses at the current\nand previous time points that are estimated with light detection and ranging\nsensors. In addition, we propose two modified E2E driving networks that predict\ndriving commands depending on high-level instruction. Our numerical experiments\nwith three different benchmark datasets demonstrate that the proposed SSIL\nframework achieves very comparable E2E driving accuracy with the supervised\nlearning counterpart.\n","authors":["Jin Bok Park","Jinkyu Lee","Muhyun Back","Hyunmin Han","David T. 
Ma","Sang Min Won","Sung Soo Hwang","Il Yong Chun"],"pdf_url":"https://arxiv.org/pdf/2308.14329v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.23682v1","updated":"2024-10-31T07:12:35Z","published":"2024-10-31T07:12:35Z","title":"CubiXMusashi: Fusion of Wire-Driven CubiX and Musculoskeletal Humanoid\n Musashi toward Unlimited Performance","summary":" Humanoids exhibit a wide variety in terms of joint configuration, actuators,\nand degrees of freedom, resulting in different achievable movements and tasks\nfor each type. Particularly, musculoskeletal humanoids are developed to closely\nemulate human body structure and movement functions, consisting of a skeletal\nframework driven by numerous muscle actuators. The redundant arrangement of\nmuscles relative to the skeletal degrees of freedom has been used to represent\nthe flexible and complex body movements observed in humans. However, due to\nthis flexible body and high degrees of freedom, modeling, simulation, and\ncontrol become extremely challenging, limiting the feasible movements and\ntasks. In this study, we integrate the musculoskeletal humanoid Musashi with\nthe wire-driven robot CubiX, capable of connecting to the environment, to form\nCubiXMusashi. This combination addresses the shortcomings of traditional\nmusculoskeletal humanoids and enables movements beyond the capabilities of\nother humanoids. CubiXMusashi connects to the environment with wires and drives\nby winding them, successfully achieving movements such as pull-up, rising from\na lying pose, and mid-air kicking, which are difficult for Musashi alone. This\nconcept demonstrates that various humanoids, not limited to musculoskeletal\nhumanoids, can mitigate their physical constraints and acquire new abilities by\nconnecting to the environment and driving through wires.\n","authors":["Shintaro Inoue","Kento Kawaharazuka","Temma Suzuki","Sota Yuzaki","Yoshimoto Ribayashi","Yuta Sahara","Kei Okada"],"pdf_url":"https://arxiv.org/pdf/2410.23682v1.pdf","comment":"Accepted Humanoids2024, website -\n https://shin0805.github.io/cubixmusashi/, YouTube -\n https://youtu.be/IvzP98-r_mo"},{"id":"http://arxiv.org/abs/2410.23643v1","updated":"2024-10-31T05:29:30Z","published":"2024-10-31T05:29:30Z","title":"SceneComplete: Open-World 3D Scene Completion in Complex Real World\n Environments for Robot Manipulation","summary":" Careful robot manipulation in every-day cluttered environments requires an\naccurate understanding of the 3D scene, in order to grasp and place objects\nstably and reliably and to avoid mistakenly colliding with other objects. In\ngeneral, we must construct such a 3D interpretation of a complex scene based on\nlimited input, such as a single RGB-D image. We describe SceneComplete, a\nsystem for constructing a complete, segmented, 3D model of a scene from a\nsingle view. It provides a novel pipeline for composing general-purpose\npretrained perception modules (vision-language, segmentation, image-inpainting,\nimage-to-3D, and pose-estimation) to obtain high-accuracy results. 
We\ndemonstrate its accuracy and effectiveness with respect to ground-truth models\nin a large benchmark dataset and show that its accurate whole-object\nreconstruction enables robust grasp proposal generation, including for a\ndexterous hand.\n","authors":["Aditya Agarwal","Gaurav Singh","Bipasha Sen","Tomás Lozano-Pérez","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2410.23643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23640v1","updated":"2024-10-31T05:25:11Z","published":"2024-10-31T05:25:11Z","title":"SuctionPrompt: Visual-assisted Robotic Picking with a Suction Cup Using\n Vision-Language Models and Facile Hardware Design","summary":" The development of large language models and vision-language models (VLMs)\nhas resulted in the increasing use of robotic systems in various fields.\nHowever, the effective integration of these models into real-world robotic\ntasks is a key challenge. We developed a versatile robotic system called\nSuctionPrompt that utilizes prompting techniques of VLMs combined with 3D\ndetections to perform product-picking tasks in diverse and dynamic\nenvironments. Our method highlights the importance of integrating 3D spatial\ninformation with adaptive action planning to enable robots to approach and\nmanipulate objects in novel environments. In the validation experiments, the\nsystem accurately selected suction points 75.4% of the time, and achieved a 65.0% success\nrate in picking common items. This study highlights the effectiveness of VLMs\nin robotic manipulation tasks, even with simple 3D processing.\n","authors":["Tomohiro Motoda","Takahide Kitamura","Ryo Hanai","Yukiyasu Domae"],"pdf_url":"https://arxiv.org/pdf/2410.23640v1.pdf","comment":"11 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2410.23634v1","updated":"2024-10-31T05:01:20Z","published":"2024-10-31T05:01:20Z","title":"Tiny Learning-Based MPC for Multirotors: Solver-Aware Learning for\n Efficient Embedded Predictive Control","summary":" Tiny aerial robots show promise for applications like environmental\nmonitoring and search-and-rescue but face challenges in control due to their\nlimited computing power and complex dynamics. Model Predictive Control (MPC)\ncan achieve agile trajectory tracking and handle constraints. Although current\nlearning-based MPC methods, such as Gaussian Process (GP) MPC, improve control\nperformance by learning residual dynamics, they are computationally demanding,\nlimiting their onboard application on tiny robots. This paper introduces Tiny\nLearning-Based Model Predictive Control (LB MPC), a novel framework for\nresource-constrained micro multirotor platforms. By exploiting multirotor\ndynamics' structure and developing an efficient solver, our approach enables\nhigh-rate control at 100 Hz on a Crazyflie 2.1 with a Teensy 4.0\nmicrocontroller. 
We demonstrate a 23\\% average improvement in tracking\nperformance over existing embedded MPC methods, achieving the first onboard\nimplementation of learning-based MPC on a tiny multirotor (53 g).\n","authors":["Babak Akbari","Justin Frank","Melissa Greeff"],"pdf_url":"https://arxiv.org/pdf/2410.23634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05421v3","updated":"2024-10-31T04:53:19Z","published":"2024-02-08T05:26:40Z","title":"DiffTORI: Differentiable Trajectory Optimization for Deep Reinforcement\n and Imitation Learning","summary":" This paper introduces DiffTORI, which utilizes Differentiable Trajectory\nOptimization as the policy representation to generate actions for deep\nReinforcement and Imitation learning. Trajectory optimization is a powerful and\nwidely used algorithm in control, parameterized by a cost and a dynamics\nfunction. The key to our approach is to leverage the recent progress in\ndifferentiable trajectory optimization, which enables computing the gradients\nof the loss with respect to the parameters of trajectory optimization. As a\nresult, the cost and dynamics functions of trajectory optimization can be\nlearned end-to-end. DiffTORI addresses the ``objective mismatch'' issue of\nprior model-based RL algorithms, as the dynamics model in DiffTORI is learned\nto directly maximize task performance by differentiating the policy gradient\nloss through the trajectory optimization process. We further benchmark DiffTORI\nfor imitation learning on standard robotic manipulation task suites with\nhigh-dimensional sensory observations and compare our method to feed-forward\npolicy classes as well as Energy-Based Models (EBM) and Diffusion. Across 15\nmodel-based RL tasks and 35 imitation learning tasks with high-dimensional\nimage and point cloud inputs, DiffTORI outperforms prior state-of-the-art\nmethods in both domains.\n","authors":["Weikang Wan","Ziyu Wang","Yufei Wang","Zackory Erickson","David Held"],"pdf_url":"https://arxiv.org/pdf/2402.05421v3.pdf","comment":"NeurIPS 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2309.09017v2","updated":"2024-10-31T04:11:54Z","published":"2023-09-16T15:11:34Z","title":"Triple Regression for Camera Agnostic Sim2Real Robot Grasping and\n Manipulation Tasks","summary":" Sim2Real (Simulation to Reality) techniques have gained prominence in robotic\nmanipulation and motion planning due to their ability to enhance success rates\nby enabling agents to test and evaluate various policies and trajectories. In\nthis paper, we investigate the advantages of integrating Sim2Real into robotic\nframeworks. We introduce the Triple Regression Sim2Real framework, which\nconstructs a real-time digital twin. This twin serves as a replica of reality\nto simulate and evaluate multiple plans before their execution in real-world\nscenarios. Our triple regression approach addresses the reality gap by: (1)\nmitigating projection errors between real and simulated camera perspectives\nthrough the first two regression models, and (2) detecting discrepancies in\nrobot control using the third regression model. Experiments on 6-DoF grasp and\nmanipulation tasks (where the gripper can approach from any direction)\nhighlight the effectiveness of our framework. Remarkably, with only RGB input\nimages, our method achieves state-of-the-art success rates. 
This research\nadvances efficient robot training methods and sets the stage for rapid\nadvancements in robotics and automation.\n","authors":["Yuanhong Zeng","Yizhou Zhao","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2309.09017v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20927v3","updated":"2024-10-31T03:50:22Z","published":"2024-10-28T11:12:00Z","title":"VLMimic: Vision Language Models are Visual Imitation Learner for\n Fine-grained Actions","summary":" Visual imitation learning (VIL) provides an efficient and intuitive strategy\nfor robotic systems to acquire novel skills. Recent advancements in Vision\nLanguage Models (VLMs) have demonstrated remarkable performance in vision and\nlanguage reasoning capabilities for VIL tasks. Despite the progress, current\nVIL methods naively employ VLMs to learn high-level plans from human videos,\nrelying on pre-defined motion primitives for executing physical interactions,\nwhich remains a major bottleneck. In this work, we present VLMimic, a novel\nparadigm that harnesses VLMs to directly learn even fine-grained action levels,\nonly given a limited number of human videos. Specifically, VLMimic first\ngrounds object-centric movements from human videos, and learns skills using\nhierarchical constraint representations, facilitating the derivation of skills\nwith fine-grained action levels from limited human videos. These skills are\nrefined and updated through an iterative comparison strategy, enabling\nefficient adaptation to unseen environments. Our extensive experiments exhibit\nthat our VLMimic, using only 5 human videos, yields significant improvements of\nover 27% and 21% in RLBench and real-world manipulation tasks, and surpasses\nbaselines by over 37% in long-horizon tasks.\n","authors":["Guanyan Chen","Meiling Wang","Te Cui","Yao Mu","Haoyang Lu","Tianxing Zhou","Zicai Peng","Mengxiao Hu","Haizhou Li","Yuan Li","Yi Yang","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2410.20927v3.pdf","comment":"accepted for publication in the 38th Conference on Neural Information\n Processing Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2406.12095v2","updated":"2024-10-31T03:23:39Z","published":"2024-06-17T21:15:13Z","title":"DistillNeRF: Perceiving 3D Scenes from Single-Glance Images by\n Distilling Neural Fields and Foundation Model Features","summary":" We propose DistillNeRF, a self-supervised learning framework addressing the\nchallenge of understanding 3D environments from limited 2D observations in\noutdoor autonomous driving scenes. Our method is a generalizable feedforward\nmodel that predicts a rich neural scene representation from sparse,\nsingle-frame multi-view camera inputs with limited view overlap, and is trained\nself-supervised with differentiable rendering to reconstruct RGB, depth, or\nfeature images. Our first insight is to exploit per-scene optimized Neural\nRadiance Fields (NeRFs) by generating dense depth and virtual camera targets\nfrom them, which helps our model to learn enhanced 3D geometry from sparse\nnon-overlapping image inputs. Second, to learn a semantically rich 3D\nrepresentation, we propose distilling features from pre-trained 2D foundation\nmodels, such as CLIP or DINOv2, thereby enabling various downstream tasks\nwithout the need for costly 3D human annotations. To leverage these two\ninsights, we introduce a novel model architecture with a two-stage\nlift-splat-shoot encoder and a parameterized sparse hierarchical voxel\nrepresentation. 
Experimental results on the NuScenes and Waymo NOTR datasets\ndemonstrate that DistillNeRF significantly outperforms existing comparable\nstate-of-the-art self-supervised methods for scene reconstruction, novel view\nsynthesis, and depth estimation; and it allows for competitive zero-shot 3D\nsemantic occupancy prediction, as well as open-world scene understanding\nthrough distilled foundation model features. Demos and code will be available\nat https://distillnerf.github.io/.\n","authors":["Letian Wang","Seung Wook Kim","Jiawei Yang","Cunjun Yu","Boris Ivanovic","Steven L. Waslander","Yue Wang","Sanja Fidler","Marco Pavone","Peter Karkus"],"pdf_url":"https://arxiv.org/pdf/2406.12095v2.pdf","comment":"Accepted by Advances in Neural Information Processing Systems\n (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.23586v1","updated":"2024-10-31T02:56:19Z","published":"2024-10-31T02:56:19Z","title":"Multi-Robot Pursuit in Parameterized Formation via Imitation Learning","summary":" This paper studies the problem of multi-robot pursuit of how to coordinate a\ngroup of defending robots to capture a faster attacker before it enters a\nprotected area. Such operation for defending robots is challenging due to the\nunknown avoidance strategy and higher speed of the attacker, coupled with the\nlimited communication capabilities of defenders. To solve this problem, we\npropose a parameterized formation controller that allows defending robots to\nadapt their formation shape using five adjustable parameters. Moreover, we\ndevelop an imitation-learning based approach integrated with model predictive\ncontrol to optimize these shape parameters. We make full use of these two\ntechniques to enhance the capture capabilities of defending robots through\nongoing training. Both simulation and experiment are provided to verify the\neffectiveness and robustness of our proposed controller. Simulation results\nshow that defending robots can rapidly learn an effective strategy for\ncapturing the attacker, and moreover the learned strategy remains effective\nacross varying numbers of defenders. Experiment results on real robot platforms\nfurther validated these findings.\n","authors":["Jinyong Chen","Rui Zhou","Zhaozong Wang","Yunjie Zhang","Guibin Sun"],"pdf_url":"https://arxiv.org/pdf/2410.23586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02444v2","updated":"2024-10-31T02:49:27Z","published":"2024-08-05T13:08:53Z","title":"RIs-Calib: An Open-Source Spatiotemporal Calibrator for Multiple 3D\n Radars and IMUs Based on Continuous-Time Estimation","summary":" Aided inertial navigation system (INS), typically consisting of an inertial\nmeasurement unit (IMU) and an exteroceptive sensor, has been widely accepted as\na feasible solution for navigation. Compared with vision-aided and LiDAR-aided\nINS, radar-aided INS could achieve better performance in adverse weather\nconditions since the radar utilizes low-frequency measuring signals with less\nattenuation effect in atmospheric gases and rain. For such a radar-aided INS,\naccurate spatiotemporal transformation is a fundamental prerequisite to\nachieving optimal information fusion. In this work, we present RIs-Calib: a\nspatiotemporal calibrator for multiple 3D radars and IMUs based on\ncontinuous-time estimation, which enables accurate spatiotemporal calibration\nand does not require any additional artificial infrastructure or prior\nknowledge. 
Our approach starts with a rigorous and robust procedure for state\ninitialization, followed by batch optimizations, where all parameters can be\nsteadily refined to globally optimal states. We validate and evaluate RIs-Calib\nin both simulated and real-world experiments, and the results demonstrate that\nRIs-Calib is capable of accurate and consistent calibration. We open-source our\nimplementations at (https://github.com/Unsigned-Long/RIs-Calib) to benefit the\nresearch community.\n","authors":["Shuolong Chen","Xingxing Li","Shengyu Li","Yuxuan Zhou","Shiwen Wang"],"pdf_url":"https://arxiv.org/pdf/2408.02444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23581v1","updated":"2024-10-31T02:47:42Z","published":"2024-10-31T02:47:42Z","title":"Distributed Formation Shape Control of Identity-less Robot Swarms","summary":" Different from most of the formation strategies where robots require unique\nlabels to identify topological neighbors to satisfy the predefined shape\nconstraints, we here study the problem of identity-less distributed shape\nformation in homogeneous swarms, which is rarely studied in the literature. The\nabsence of identities creates a unique challenge: how to design appropriate\ntarget formations and local behaviors that are suitable for identity-less\nformation shape control. To address this challenge, we propose the following\nnovel results. First, to avoid using unique identities, we propose a dynamic\nformation description method and solve the formation consensus of robots in a\nlocally distributed manner. Second, to handle identity-less distributed\nformations, we propose a fully distributed control law for homogeneous swarms\nbased on locally sensed information. While the existing methods are applicable\nto simple cases where the target formation is stationary, ours can tackle more\ngeneral maneuvering formations such as translation, rotation, or even shape\ndeformation. Both numerical simulation and flight experiment are presented to\nverify the effectiveness and robustness of our proposed formation strategy.\n","authors":["Guibin Sun","Yang Xu","Kexin Liu","Jinhu Lü"],"pdf_url":"https://arxiv.org/pdf/2410.23581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19291v2","updated":"2024-10-31T02:43:00Z","published":"2024-05-29T17:19:15Z","title":"Grasp as You Say: Language-guided Dexterous Grasp Generation","summary":" This paper explores a novel task \"Dexterous Grasp as You Say\" (DexGYS),\nenabling robots to perform dexterous grasping based on human commands expressed\nin natural language. However, the development of this field is hindered by the\nlack of datasets with natural human guidance; thus, we propose a\nlanguage-guided dexterous grasp dataset, named DexGYSNet, offering high-quality\ndexterous grasp annotations along with flexible and fine-grained human language\nguidance. Our dataset construction is cost-efficient, with the carefully designed\nhand-object interaction retargeting strategy, and the LLM-assisted language\nguidance annotation system. Equipped with this dataset, we introduce the\nDexGYSGrasp framework for generating dexterous grasps based on human language\ninstructions, with the capability of producing grasps that are intent-aligned,\nhigh-quality, and diverse. To achieve this capability, our framework\ndecomposes the complex learning process into two manageable progressive\nobjectives and introduces two components to realize them. 
The first component\nlearns the grasp distribution, focusing on intention alignment and generation\ndiversity. The second component refines the grasp quality while maintaining\nintention consistency. Extensive experiments are conducted on DexGYSNet and\nin real-world environments for validation.\n","authors":["Yi-Lin Wei","Jian-Jian Jiang","Chengyi Xing","Xian-Tuo Tan","Xiao-Ming Wu","Hao Li","Mark Cutkosky","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.19291v2.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2409.07116v2","updated":"2024-10-31T02:38:49Z","published":"2024-09-11T09:09:25Z","title":"iKalibr-RGBD: Partially-Specialized Target-Free Visual-Inertial\n Spatiotemporal Calibration For RGBDs via Continuous-Time Velocity Estimation","summary":" Visual-inertial systems have been widely studied and applied in the last two\ndecades (from the early 2000s to the present), mainly due to their low cost and\npower consumption, small footprint, and high availability. Such a trend\nsimultaneously leads to a large number of visual-inertial calibration methods\nbeing presented, as accurate spatiotemporal parameters between sensors are a\nprerequisite for visual-inertial fusion. In our previous work, i.e., iKalibr, a\ncontinuous-time-based visual-inertial calibration method was proposed as a part\nof one-shot multi-sensor resilient spatiotemporal calibration. While requiring\nno artificial target brings considerable convenience, computationally expensive\npose estimation is demanded in initialization and batch optimization, limiting\nits availability. Fortunately, this could be vastly improved for the RGBDs with\nadditional depth information, by employing mapping-free ego-velocity estimation\ninstead of mapping-based pose estimation. In this paper, we present the\ncontinuous-time ego-velocity estimation-based RGBD-inertial spatiotemporal\ncalibration, termed as iKalibr-RGBD, which is also targetless but\ncomputationally efficient. The general pipeline of iKalibr-RGBD is inherited\nfrom iKalibr, composed of a rigorous initialization procedure and several\ncontinuous-time batch optimizations. The implementation of iKalibr-RGBD is\nopen-sourced at (https://github.com/Unsigned-Long/iKalibr) to benefit the\nresearch community.\n","authors":["Shuolong Chen","Xingxing Li","Shengyu Li","Yuxuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.07116v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23571v1","updated":"2024-10-31T02:27:25Z","published":"2024-10-31T02:27:25Z","title":"Dual Agent Learning Based Aerial Trajectory Tracking","summary":" This paper presents a novel reinforcement learning framework for trajectory\ntracking of unmanned aerial vehicles in cluttered environments using a\ndual-agent architecture. Traditional optimization methods for trajectory\ntracking face significant computational challenges and lack robustness in\ndynamic environments. Our approach employs deep reinforcement learning (RL) to\novercome these limitations, leveraging 3D pointcloud data to perceive the\nenvironment without relying on memory-intensive obstacle representations like\noccupancy grids. The proposed system features two RL agents: one for predicting\nUAV velocities to follow a reference trajectory and another for managing\ncollision avoidance in the presence of obstacles. This architecture ensures\nreal-time performance and adaptability to uncertainties. 
We demonstrate the\nefficacy of our approach through simulated and real-world experiments,\nhighlighting improvements over state-of-the-art RL and optimization-based\nmethods. Additionally, a curriculum learning paradigm is employed to scale the\nalgorithms to more complex environments, ensuring robust trajectory tracking\nand obstacle avoidance in both static and dynamic scenarios.\n","authors":["Shaswat Garg","Houman Masnavi","Baris Fidan","Farrokh Janabi-Sharifi"],"pdf_url":"https://arxiv.org/pdf/2410.23571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00769v3","updated":"2024-10-31T02:07:10Z","published":"2024-03-31T18:51:52Z","title":"An Active Perception Game for Robust Information Gathering","summary":" Active perception approaches select future viewpoints by using some estimate\nof the information gain. An inaccurate estimate can be detrimental in critical\nsituations, e.g., locating a person in distress. However, the true information\ngained can only be calculated post hoc, i.e., after the observation is\nrealized. We present an approach for estimating the discrepancy between the\ninformation gain (which is the average over putative future observations) and\nthe true information gain. The key idea is to analyze the mathematical\nrelationship between active perception and the estimation error of the\ninformation gain in a game-theoretic setting. Using this, we develop an online\nestimation approach that achieves sub-linear regret (in the number of\ntime-steps) for the estimation of the true information gain and reduces the\nsub-optimality of active perception systems.\n We demonstrate our approach for active perception using a comprehensive set\nof experiments on: (a) different types of environments, including a quadrotor\nin a photorealistic simulation, real-world robotic data, and real-world\nexperiments with ground robots exploring indoor and outdoor scenes; (b)\ndifferent types of robotic perception data; and (c) different map\nrepresentations. On average, our approach reduces information gain estimation\nerrors by 42%, increases the information gain by 7%, PSNR by 5%, and semantic\naccuracy (measured as the number of objects that are localized correctly) by\n6%. In real-world experiments with a Jackal ground robot, our approach\ndemonstrated complex trajectories to explore occluded regions.\n","authors":["Siming He","Yuezhan Tao","Igor Spasojevic","Vijay Kumar","Pratik Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2404.00769v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23535v1","updated":"2024-10-31T00:56:08Z","published":"2024-10-31T00:56:08Z","title":"Simulating User Agents for Embodied Conversational-AI","summary":" Embodied agents designed to assist users with tasks must engage in natural\nlanguage interactions, interpret instructions, execute actions, and communicate\neffectively to resolve issues. However, collecting large-scale, diverse\ndatasets of situated human-robot dialogues to train and evaluate such agents is\nexpensive, labor-intensive, and time-consuming. To address this challenge, we\npropose building a large language model (LLM)-based user agent that can\nsimulate user behavior during interactions with an embodied agent in a virtual\nenvironment. Given a user goal (e.g., make breakfast), at each time step, the\nuser agent may \"observe\" the robot actions or \"speak\" to either intervene with\nthe robot or answer questions. 
Such a user agent assists in improving the\nscalability and efficiency of embodied dialogues dataset generation and is\ncritical for enhancing and evaluating the robot's interaction and task\ncompletion ability, as well as for research in reinforcement learning using AI\nfeedback. We evaluate our user agent's ability to generate human-like behaviors\nby comparing its simulated dialogues with the TEACh dataset. We perform three\nexperiments: zero-shot prompting to predict dialogue acts, few-shot prompting,\nand fine-tuning on the TEACh training subset. Results show the LLM-based user\nagent achieves an F-measure of 42% with zero-shot prompting and 43.4% with\nfew-shot prompting in mimicking human speaking behavior. Through fine-tuning,\nperformance in deciding when to speak remained stable, while deciding what to\nsay improved from 51.1% to 62.5%. These findings showcase the feasibility of\nthe proposed approach for assessing and enhancing the effectiveness of robot\ntask completion through natural language communication.\n","authors":["Daniel Philipov","Vardhan Dongre","Gokhan Tur","Dilek Hakkani-Tür"],"pdf_url":"https://arxiv.org/pdf/2410.23535v1.pdf","comment":"8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2410.23522v1","updated":"2024-10-31T00:08:36Z","published":"2024-10-31T00:08:36Z","title":"LBurst: Learning-Based Robotic Burst Feature Extraction for 3D\n Reconstruction in Low Light","summary":" Drones have revolutionized the fields of aerial imaging, mapping, and\ndisaster recovery. However, the deployment of drones in low-light conditions is\nconstrained by the image quality produced by their on-board cameras. In this\npaper, we present a learning architecture for improving 3D reconstructions in\nlow-light conditions by finding features in a burst. Our approach enhances\nvisual reconstruction by detecting and describing high quality true features\nand less spurious features in low signal-to-noise ratio images. We demonstrate\nthat our method is capable of handling challenging scenes in millilux\nillumination, making it a significant step towards drones operating at night\nand in extremely low-light applications such as underground mining and search\nand rescue operations.\n","authors":["Ahalya Ravendran","Mitch Bryson","Donald G. Dansereau"],"pdf_url":"https://arxiv.org/pdf/2410.23522v1.pdf","comment":"7 pages, 8 figures, 3 tables, for associated project page, see\n https://roboticimaging.org/Projects/LBurst/"},{"id":"http://arxiv.org/abs/2403.14597v3","updated":"2024-10-31T21:33:32Z","published":"2024-03-21T17:50:22Z","title":"Extended Reality for Enhanced Human-Robot Collaboration: a\n Human-in-the-Loop Approach","summary":" The rise of automation has provided an opportunity to achieve higher\nefficiency in manufacturing processes, yet it often compromises the flexibility\nrequired to promptly respond to evolving market needs and meet the demand for\ncustomization. Human-robot collaboration attempts to tackle these challenges by\ncombining the strength and precision of machines with human ingenuity and\nperceptual understanding. In this paper, we conceptualize and propose an\nimplementation framework for an autonomous, machine learning-based manipulator\nthat incorporates human-in-the-loop principles and leverages Extended Reality\n(XR) to facilitate intuitive communication and programming between humans and\nrobots. 
Furthermore, the conceptual framework foresees human involvement\ndirectly in the robot learning process, resulting in higher adaptability and\ntask generalization. The paper highlights key technologies enabling the\nproposed framework, emphasizing the importance of developing the digital\necosystem as a whole. Additionally, we review the existent implementation\napproaches of XR in human-robot collaboration, showcasing diverse perspectives\nand methodologies. The challenges and future outlooks are discussed, delving\ninto the major obstacles and potential research avenues of XR for more natural\nhuman-robot interaction and integration in the industrial landscape.\n","authors":["Yehor Karpichev","Todd Charter","Jayden Hong","Amir M. Soufi Enayati","Homayoun Honari","Mehran Ghafarian Tamizi","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2403.14597v3.pdf","comment":"Published in IEEE International Conference on Robot and Human\n Interactive Communication (RO-MAN) 2024"},{"id":"http://arxiv.org/abs/2403.16275v2","updated":"2024-10-31T19:45:39Z","published":"2024-03-24T19:47:37Z","title":"M^3RS: Multi-robot, Multi-objective, and Multi-mode Routing and\n Scheduling","summary":" The quality of task execution can significantly impact a multi-robot mission.\nWhile higher quality is desirable, it may not always be feasible due to mission\nconstraints. Existing multi-robot task allocation literature generally\noverlooks quality of service as a decision variable. Addressing this gap, we\nintroduce the multi-robot, multi-objective, and multi-mode routing and\nscheduling (M^3RS) problem, designed for time-bound, multi-robot,\nmulti-objective missions. In M^3RS, each task offers multiple execution modes,\neach with different resource requirements, execution time, and quality. M^3RS\noptimizes task sequences and execution modes for each agent. The need for M^3RS\ncomes from multi-robot applications in which a trade-off between multiple\ncriteria can be achieved by varying the task level quality of service through\ntask execution modes. Such ability is particularly useful for service robot\napplications. We use M^3RS for the application of multi-robot disinfection in\nhealthcare environments and other public locations. The objectives considered\nfor disinfection application are disinfection quality and number of tasks\ncompleted. A mixed-integer linear programming (MIP) model is proposed for\nM^3RS. Further, a clustering-based column generation (CCG) algorithm is\nproposed to handle larger problem instances. Through synthetic, simulated, and\nhardware case studies, we demonstrate the advantages of M^3RS, showing it\nprovides flexibility and strong performance across multiple metrics. Our CCG\nalgorithm generates solutions 2.5x faster than a baseline MIP optimizer,\nmaintaining competitive performance. 
The videos for the experiments are\navailable on the project website: https://sites.google.com/view/g-robot/m3rs/\n","authors":["Ishaan Mehta","Junseo Kim","Sharareh Taghipour","Sajad Saeedi"],"pdf_url":"https://arxiv.org/pdf/2403.16275v2.pdf","comment":"Submitted to IEEE Systems"},{"id":"http://arxiv.org/abs/2410.07801v3","updated":"2024-10-31T18:06:49Z","published":"2024-10-10T10:40:42Z","title":"LucidGrasp: Robotic Framework for Autonomous Manipulation of Laboratory\n Equipment with Different Degrees of Transparency via 6D Pose Estimation","summary":" Many modern robotic systems operate autonomously, however they often lack the\nability to accurately analyze the environment and adapt to changing external\nconditions, while teleoperation systems often require special operator skills.\nIn the field of laboratory automation, the number of automated processes is\ngrowing, however such systems are usually developed to perform specific tasks.\nIn addition, many of the objects used in this field are transparent, making it\ndifficult to analyze them using visual channels. The contributions of this work\ninclude the development of a robotic framework with autonomous mode for\nmanipulating liquid-filled objects with different degrees of transparency in\ncomplex pose combinations. The conducted experiments demonstrated the\nrobustness of the designed visual perception system to accurately estimate\nobject poses for autonomous manipulation, and confirmed the performance of the\nalgorithms in dexterous operations such as liquid dispensing. The proposed\nrobotic framework can be applied for laboratory automation, since it allows\nsolving the problem of performing non-trivial manipulation tasks with the\nanalysis of object poses of varying degrees of transparency and liquid levels,\nrequiring high accuracy and repeatability.\n","authors":["Maria Makarova","Daria Trinitatova","Qian Liu","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2410.07801v3.pdf","comment":"Accepted to the 2024 IEEE International Conference on Robotics and\n Biomimetics (IEEE ROBIO 2024), 6 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.23277v2","updated":"2024-10-31T18:03:51Z","published":"2024-10-30T17:55:52Z","title":"SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video\n Generation","summary":" Human beings are endowed with a complementary learning system, which bridges\nthe slow learning of general world dynamics with fast storage of episodic\nmemory from a new experience. Previous video generation models, however,\nprimarily focus on slow learning by pre-training on vast amounts of data,\noverlooking the fast learning phase crucial for episodic memory storage. This\noversight leads to inconsistencies across temporally distant frames when\ngenerating longer videos, as these frames fall beyond the model's context\nwindow. To this end, we introduce SlowFast-VGen, a novel dual-speed learning\nsystem for action-driven long video generation. Our approach incorporates a\nmasked conditional video diffusion model for the slow learning of world\ndynamics, alongside an inference-time fast learning strategy based on a\ntemporal LoRA module. Specifically, the fast learning process updates its\ntemporal LoRA parameters based on local inputs and outputs, thereby efficiently\nstoring episodic memory in its parameters. 
We further propose a slow-fast\nlearning loop algorithm that seamlessly integrates the inner fast learning loop\ninto the outer slow learning loop, enabling the recall of prior multi-episode\nexperiences for context-aware skill learning. To facilitate the slow learning\nof an approximate world model, we collect a large-scale dataset of 200k videos\nwith language action annotations, covering a wide range of scenarios. Extensive\nexperiments show that SlowFast-VGen outperforms baselines across various\nmetrics for action-driven video generation, achieving an FVD score of 514\ncompared to 782, and maintaining consistency in longer videos, with an average\nof 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm\nsignificantly enhances performances on long-horizon planning tasks as well.\nProject Website: https://slowfast-vgen.github.io\n","authors":["Yining Hong","Beide Liu","Maxine Wu","Yuanhao Zhai","Kai-Wei Chang","Linjie Li","Kevin Lin","Chung-Ching Lin","Jianfeng Wang","Zhengyuan Yang","Yingnian Wu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00221v1","updated":"2024-10-31T21:36:53Z","published":"2024-10-31T21:36:53Z","title":"BOMP: Bin-Optimized Motion Planning","summary":" In logistics, the ability to quickly compute and execute pick-and-place\nmotions from bins is critical to increasing productivity. We present\nBin-Optimized Motion Planning (BOMP), a motion planning framework that plans\narm motions for a six-axis industrial robot with a long-nosed suction tool to\nremove boxes from deep bins. BOMP considers robot arm kinematics, actuation\nlimits, the dimensions of a grasped box, and a varying height map of a bin\nenvironment to rapidly generate time-optimized, jerk-limited, and\ncollision-free trajectories. The optimization is warm-started using a deep\nneural network trained offline in simulation with 25,000 scenes and\ncorresponding trajectories. Experiments with 96 simulated and 15 physical\nenvironments suggest that BOMP generates collision-free trajectories that are\nup to 58 % faster than baseline sampling-based planners and up to 36 % faster\nthan an industry-standard Up-Over-Down algorithm, which has an extremely low 15\n% success rate in this context. BOMP also generates jerk-limited trajectories\nwhile baselines do not. Website: https://sites.google.com/berkeley.edu/bomp.\n","authors":["Zachary Tam","Karthik Dharmarajan","Tianshuang Qiu","Yahav Avigal","Jeffrey Ichnowski","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2411.00221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00174v1","updated":"2024-10-31T19:42:42Z","published":"2024-10-31T19:42:42Z","title":"Pedestrian Trajectory Prediction with Missing Data: Datasets,\n Imputation, and Benchmarking","summary":" Pedestrian trajectory prediction is crucial for several applications such as\nrobotics and self-driving vehicles. Significant progress has been made in the\npast decade thanks to the availability of pedestrian trajectory datasets, which\nenable trajectory prediction methods to learn from pedestrians' past movements\nand predict future trajectories. However, these datasets and methods typically\nassume that the observed trajectory sequence is complete, ignoring real-world\nissues such as sensor failure, occlusion, and limited fields of view that can\nresult in missing values in observed trajectories. 
To address this challenge,\nwe present TrajImpute, a pedestrian trajectory prediction dataset that\nsimulates missing coordinates in the observed trajectory, enhancing real-world\napplicability. TrajImpute maintains a uniform distribution of missing data\nwithin the observed trajectories. In this work, we comprehensively examine\nseveral imputation methods to reconstruct the missing coordinates and benchmark\nthem for imputing pedestrian trajectories. Furthermore, we provide a thorough\nanalysis of recent trajectory prediction methods and evaluate the performance\nof these models on the imputed trajectories. Our experimental evaluation of the\nimputation and trajectory prediction methods offers several valuable insights.\nOur dataset provides a foundational resource for future research on\nimputation-aware pedestrian trajectory prediction, potentially accelerating the\ndeployment of these methods in real-world applications. Publicly accessible\nlinks to the datasets and code files are available at\nhttps://github.com/Pranav-chib/TrajImpute.\n","authors":["Pranav Singh Chib","Pravendra Singh"],"pdf_url":"https://arxiv.org/pdf/2411.00174v1.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.00138v1","updated":"2024-10-31T18:37:22Z","published":"2024-10-31T18:37:22Z","title":"Learning Low-Dimensional Strain Models of Soft Robots by Looking at the\n Evolution of Their Shape with Application to Model-Based Control","summary":" Obtaining dynamic models of continuum soft robots is central to the analysis\nand control of soft robots, and researchers have devoted much attention to the\nchallenge of proposing both data-driven and first-principle solutions. Both\navenues have, however, shown their limitations; the former lacks structure and\nperforms poorly outside training data, while the latter requires significant\nsimplifications and extensive expert knowledge to be used in practice. This\npaper introduces a streamlined method for learning low-dimensional,\nphysics-based models that are both accurate and easy to interpret. We start\nwith an algorithm that uses image data (i.e., shape evolutions) to determine\nthe minimal necessary segments for describing a soft robot's movement.\nFollowing this, we apply a dynamic regression and strain sparsification\nalgorithm to identify relevant strains and define the model's dynamics. We\nvalidate our approach through simulations with various planar soft\nmanipulators, comparing its performance against other learning strategies,\nshowing that our models are both computationally efficient and 25x more\naccurate on out-of-training distribution inputs. Finally, we demonstrate that\nthanks to the capability of the method of generating physically compatible\nmodels, the learned models can be straightforwardly combined with model-based\ncontrol policies.\n","authors":["Ricardo Valadas","Maximilian Stölzle","Jingyue Liu","Cosimo Della Santina"],"pdf_url":"https://arxiv.org/pdf/2411.00138v1.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2411.00137v1","updated":"2024-10-31T18:35:03Z","published":"2024-10-31T18:35:03Z","title":"Cost-Aware Query Policies in Active Learning for Efficient Autonomous\n Robotic Exploration","summary":" In missions constrained by finite resources, efficient data collection is\ncritical. Informative path planning, driven by automated decision-making,\noptimizes exploration by reducing the costs associated with accurate\ncharacterization of a target in an environment. 
Previous implementations of\nactive learning did not consider the action cost for regression problems or\nonly considered the action cost for classification problems. This paper\nanalyzes an AL algorithm for Gaussian Process regression while incorporating\naction cost. The algorithm's performance is compared on various regression\nproblems, including terrain mapping on diverse simulated surfaces, along metrics\nof root mean square error, samples and distance until convergence, and model\nvariance upon convergence. The cost-dependent acquisition policy does not\norganically optimize information gain over distance. Instead, the traditional\nuncertainty metric with a distance constraint best minimizes root-mean-square\nerror over trajectory distance. This study's impact is to provide insight into\nincorporating action cost with AL methods to optimize exploration under\nrealistic mission constraints.\n","authors":["Sapphira Akins","Hans Mertens","Frances Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.00137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00107v1","updated":"2024-10-31T18:02:30Z","published":"2024-10-31T18:02:30Z","title":"First, Learn What You Don't Know: Active Information Gathering for\n Driving at the Limits of Handling","summary":" Combining data-driven models that adapt online and model predictive control\n(MPC) has enabled effective control of nonlinear systems. However, when\ndeployed on unstable systems, online adaptation may not be fast enough to\nensure reliable simultaneous learning and control. For example, controllers on\na vehicle executing highly dynamic maneuvers may push the tires to their\nfriction limits, destabilizing the vehicle and allowing modeling errors to\nquickly compound and cause a loss of control. In this work, we present a\nBayesian meta-learning MPC framework. We propose an expressive vehicle dynamics\nmodel that leverages Bayesian last-layer meta-learning to enable rapid online\nadaptation. The model's uncertainty estimates are used to guide informative\ndata collection and quickly improve the model prior to deployment. Experiments\non a Toyota Supra show that (i) the framework enables reliable control in\ndynamic drifting maneuvers, (ii) online adaptation alone may not suffice for\nzero-shot control of a vehicle at the edge of stability, and (iii) active data\ncollection helps achieve reliable performance.\n","authors":["Alexander Davydov","Franck Djeumou","Marcus Greiff","Makoto Suminaka","Michael Thompson","John Subosits","Thomas Lew"],"pdf_url":"https://arxiv.org/pdf/2411.00107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00083v1","updated":"2024-10-31T17:59:49Z","published":"2024-10-31T17:59:49Z","title":"Learning Visual Parkour from Generated Images","summary":" Fast and accurate physics simulation is an essential component of robot\nlearning, where robots can explore failure scenarios that are difficult to\nproduce in the real world and learn from unlimited on-policy data. Yet, it\nremains challenging to incorporate RGB-color perception into the sim-to-real\npipeline that matches the real world in its richness and realism. In this work,\nwe train a robot dog in simulation for visual parkour. We propose a way to use\ngenerative models to synthesize diverse and physically accurate image sequences\nof the scene from the robot's ego-centric perspective. We present\ndemonstrations of zero-shot transfer to the RGB-only observations of the real\nworld on a robot equipped with a low-cost, off-the-shelf color camera. 
Website:\nhttps://lucidsim.github.io\n","authors":["Alan Yu","Ge Yang","Ran Choi","Yajvan Ravan","John Leonard","Phillip Isola"],"pdf_url":"https://arxiv.org/pdf/2411.00083v1.pdf","comment":"17 pages, 19 figures"},{"id":"http://arxiv.org/abs/2411.00081v1","updated":"2024-10-31T17:53:12Z","published":"2024-10-31T17:53:12Z","title":"PARTNR: A Benchmark for Planning and Reasoning in Embodied Multi-agent\n Tasks","summary":" We present a benchmark for Planning And Reasoning Tasks in humaN-Robot\ncollaboration (PARTNR) designed to study human-robot coordination in household\nactivities. PARTNR tasks exhibit characteristics of everyday tasks, such as\nspatial, temporal, and heterogeneous agent capability constraints. We employ a\nsemi-automated task generation pipeline using Large Language Models (LLMs),\nincorporating simulation in the loop for grounding and verification. PARTNR\nstands as the largest benchmark of its kind, comprising 100,000 natural\nlanguage tasks, spanning 60 houses and 5,819 unique objects. We analyze\nstate-of-the-art LLMs on PARTNR tasks, across the axes of planning, perception\nand skill execution. The analysis reveals significant limitations in SoTA\nmodels, such as poor coordination and failures in task tracking and recovery\nfrom errors. When LLMs are paired with real humans, they require 1.5x as many\nsteps as two humans collaborating and 1.1x more steps than a single human,\nunderscoring the potential for improvement in these models. We further show\nthat fine-tuning smaller LLMs with planning data can achieve performance on par\nwith models 9 times larger, while being 8.6x faster at inference. Overall,\nPARTNR highlights significant challenges facing collaborative embodied agents\nand aims to drive research in this direction.\n","authors":["Matthew Chang","Gunjan Chhablani","Alexander Clegg","Mikael Dallaire Cote","Ruta Desai","Michal Hlavac","Vladimir Karashchuk","Jacob Krantz","Roozbeh Mottaghi","Priyam Parashar","Siddharth Patki","Ishita Prasad","Xavier Puig","Akshara Rai","Ram Ramrakhya","Daniel Tran","Joanne Truong","John M. Turner","Eric Undersander","Tsung-Yen Yang"],"pdf_url":"https://arxiv.org/pdf/2411.00081v1.pdf","comment":"Alphabetical author order"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2410.24223v1","updated":"2024-10-31T17:59:56Z","published":"2024-10-31T17:59:56Z","title":"URAvatar: Universal Relightable Gaussian Codec Avatars","summary":" We present a new approach to creating photorealistic and relightable head\navatars from a phone scan with unknown illumination. The reconstructed avatars\ncan be animated and relit in real time with the global illumination of diverse\nenvironments. Unlike existing approaches that estimate parametric reflectance\nparameters via inverse rendering, our approach directly models learnable\nradiance transfer that incorporates global light transport in an efficient\nmanner for real-time rendering. However, learning such a complex light\ntransport that can generalize across identities is non-trivial. A phone scan in\na single environment lacks sufficient information to infer how the head would\nappear in general environments. To address this, we build a universal\nrelightable avatar model represented by 3D Gaussians. We train on hundreds of\nhigh-quality multi-view human scans with controllable point lights.\nHigh-resolution geometric guidance further enhances the reconstruction accuracy\nand generalization. 
Once trained, we finetune the pretrained model on a phone\nscan using inverse rendering to obtain a personalized relightable avatar. Our\nexperiments establish the efficacy of our design, outperforming existing\napproaches while retaining real-time rendering capability.\n","authors":["Junxuan Li","Chen Cao","Gabriel Schwartz","Rawal Khirodkar","Christian Richardt","Tomas Simon","Yaser Sheikh","Shunsuke Saito"],"pdf_url":"https://arxiv.org/pdf/2410.24223v1.pdf","comment":"SIGGRAPH Asia 2024. Website:\n https://junxuan-li.github.io/urgca-website/"},{"id":"http://arxiv.org/abs/2410.24221v1","updated":"2024-10-31T17:59:55Z","published":"2024-10-31T17:59:55Z","title":"EgoMimic: Scaling Imitation Learning via Egocentric Video","summary":" The scale and diversity of demonstration data required for imitation learning\nis a significant challenge. We present EgoMimic, a full-stack framework which\nscales manipulation via human embodiment data, specifically egocentric human\nvideos paired with 3D hand tracking. EgoMimic achieves this through: (1) a\nsystem to capture human embodiment data using the ergonomic Project Aria\nglasses, (2) a low-cost bimanual manipulator that minimizes the kinematic gap\nto human data, (3) cross-domain data alignment techniques, and (4) an imitation\nlearning architecture that co-trains on human and robot data. Compared to prior\nworks that only extract high-level intent from human videos, our approach\ntreats human and robot data equally as embodied demonstration data and learns a\nunified policy from both data sources. EgoMimic achieves significant\nimprovement on a diverse set of long-horizon, single-arm and bimanual\nmanipulation tasks over state-of-the-art imitation learning methods and enables\ngeneralization to entirely new scenes. Finally, we show a favorable scaling\ntrend for EgoMimic, where adding 1 hour of additional hand data is\nsignificantly more valuable than 1 hour of additional robot data. Videos and\nadditional information can be found at https://egomimic.github.io/\n","authors":["Simar Kareer","Dhruv Patel","Ryan Punamiya","Pranay Mathur","Shuo Cheng","Chen Wang","Judy Hoffman","Danfei Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24219v1","updated":"2024-10-31T17:59:53Z","published":"2024-10-31T17:59:53Z","title":"Enhancing Motion in Text-to-Video Generation with Decomposed Encoding\n and Conditioning","summary":" Despite advancements in Text-to-Video (T2V) generation, producing videos with\nrealistic motion remains challenging. Current models often yield static or\nminimally dynamic outputs, failing to capture complex motions described by\ntext. This issue stems from the internal biases in text encoding, which\noverlooks motions, and inadequate conditioning mechanisms in T2V generation\nmodels. To address this, we propose a novel framework called DEcomposed MOtion\n(DEMO), which enhances motion synthesis in T2V generation by decomposing both\ntext encoding and conditioning into content and motion components. Our method\nincludes a content encoder for static elements and a motion encoder for\ntemporal dynamics, alongside separate content and motion conditioning\nmechanisms. Crucially, we introduce text-motion and video-motion supervision to\nimprove the model's understanding and generation of motion. 
Evaluations on\nbenchmarks such as MSR-VTT, UCF-101, WebVid-10M, EvalCrafter, and VBench\ndemonstrate DEMO's superior ability to produce videos with enhanced motion\ndynamics while maintaining high visual quality. Our approach significantly\nadvances T2V generation by integrating comprehensive motion understanding\ndirectly from textual descriptions. Project page:\nhttps://PR-Ryan.github.io/DEMO-project/\n","authors":["Penghui Ruan","Pichao Wang","Divya Saxena","Jiannong Cao","Yuhui Shi"],"pdf_url":"https://arxiv.org/pdf/2410.24219v1.pdf","comment":"Accepted at NeurIPS 2024, code available at\n https://github.com/PR-Ryan/DEMO"},{"id":"http://arxiv.org/abs/2410.24218v1","updated":"2024-10-31T17:59:52Z","published":"2024-10-31T17:59:52Z","title":"Teaching Embodied Reinforcement Learning Agents: Informativeness and\n Diversity of Language Use","summary":" In real-world scenarios, it is desirable for embodied agents to have the\nability to leverage human language to gain explicit or implicit knowledge for\nlearning tasks. Despite recent progress, most previous approaches adopt simple\nlow-level instructions as language inputs, which may not reflect natural human\ncommunication. It's not clear how to incorporate rich language use to\nfacilitate task learning. To address this question, this paper studies\ndifferent types of language inputs in facilitating reinforcement learning (RL)\nembodied agents. More specifically, we examine how different levels of language\ninformativeness (i.e., feedback on past behaviors and future guidance) and\ndiversity (i.e., variation of language expressions) impact agent learning and\ninference. Our empirical results based on four RL benchmarks demonstrate that\nagents trained with diverse and informative language feedback can achieve\nenhanced generalization and fast adaptation to new tasks. These findings\nhighlight the pivotal role of language use in teaching embodied agents new\ntasks in an open world. Project website:\nhttps://github.com/sled-group/Teachable_RL\n","authors":["Jiajun Xi","Yinong He","Jianing Yang","Yinpei Dai","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2410.24218v1.pdf","comment":"EMNLP 2024 Main. Project website:\n https://github.com/sled-group/Teachable_RL"},{"id":"http://arxiv.org/abs/2410.24214v1","updated":"2024-10-31T17:59:37Z","published":"2024-10-31T17:59:37Z","title":"ARQ: A Mixed-Precision Quantization Framework for Accurate and\n Certifiably Robust DNNs","summary":" Mixed precision quantization has become an important technique for enabling\nthe execution of deep neural networks (DNNs) on limited resource computing\nplatforms. Traditional quantization methods have primarily concentrated on\nmaintaining neural network accuracy, either ignoring the impact of quantization\non the robustness of the network, or using only empirical techniques for\nimproving robustness. In contrast, techniques for robustness certification,\nwhich can provide strong guarantees about the robustness of DNNs have not been\nused during quantization due to their high computation cost.\n This paper introduces ARQ, an innovative mixed-precision quantization method\nthat not only preserves the clean accuracy of the smoothed classifiers but also\nmaintains their certified robustness. 
ARQ uses reinforcement learning to find\naccurate and robust DNN quantization, while efficiently leveraging randomized\nsmoothing, a popular class of statistical DNN verification algorithms, to guide\nthe search process.\n We compare ARQ with multiple state-of-the-art quantization techniques on\nseveral DNN architectures commonly used in quantization studies: ResNet-20 on\nCIFAR-10, ResNet-50 on ImageNet, and MobileNetV2 on ImageNet. We demonstrate\nthat ARQ consistently performs better than these baselines across all the\nbenchmarks and the input perturbation levels. In many cases, the performance of\nARQ quantized networks can reach that of the original DNN with floating-point\nweights, but with only 1.5% of the instructions.\n","authors":["Yuchen Yang","Shubham Ugare","Yifan Zhao","Gagandeep Singh","Sasa Misailovic"],"pdf_url":"https://arxiv.org/pdf/2410.24214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24213v1","updated":"2024-10-31T17:59:30Z","published":"2024-10-31T17:59:30Z","title":"Learning Video Representations without Natural Videos","summary":" In this paper, we show that useful video representations can be learned from\nsynthetic videos and natural images, without incorporating natural videos in\nthe training. We propose a progression of video datasets synthesized by simple\ngenerative processes, that model a growing set of natural video properties\n(e.g. motion, acceleration, and shape transformations). The downstream\nperformance of video models pre-trained on these generated datasets gradually\nincreases with the dataset progression. A VideoMAE model pre-trained on our\nsynthetic videos closes 97.2% of the performance gap on UCF101 action\nclassification between training from scratch and self-supervised pre-training\nfrom natural videos, and outperforms the pre-trained model on HMDB51.\nIntroducing crops of static images to the pre-training stage results in similar\nperformance to UCF101 pre-training and outperforms the UCF101 pre-trained model\non 11 out of 14 out-of-distribution datasets of UCF101-P. Analyzing the\nlow-level properties of the datasets, we identify correlations between frame\ndiversity, frame similarity to natural data, and downstream performance. Our\napproach provides a more controllable and transparent alternative to video data\ncuration processes for pre-training.\n","authors":["Xueyang Yu","Xinlei Chen","Yossi Gandelsman"],"pdf_url":"https://arxiv.org/pdf/2410.24213v1.pdf","comment":"Project page: https://unicorn53547.github.io/video_syn_rep/"},{"id":"http://arxiv.org/abs/2410.24211v1","updated":"2024-10-31T17:59:01Z","published":"2024-10-31T17:59:01Z","title":"DELTA: Dense Efficient Long-range 3D Tracking for any video","summary":" Tracking dense 3D motion from monocular videos remains challenging,\nparticularly when aiming for pixel-level precision over long sequences. We\nintroduce \\Approach, a novel method that efficiently tracks every pixel in 3D\nspace, enabling accurate motion estimation across entire videos. Our approach\nleverages a joint global-local attention mechanism for reduced-resolution\ntracking, followed by a transformer-based upsampler to achieve high-resolution\npredictions. Unlike existing methods, which are limited by computational\ninefficiency or sparse tracking, \\Approach delivers dense 3D tracking at scale,\nrunning over 8x faster than previous methods while achieving state-of-the-art\naccuracy. 
Furthermore, we explore the impact of depth representation on\ntracking performance and identify log-depth as the optimal choice. Extensive\nexperiments demonstrate the superiority of \\Approach on multiple benchmarks,\nachieving new state-of-the-art results in both 2D and 3D dense tracking tasks.\nOur method provides a robust solution for applications requiring fine-grained,\nlong-term motion tracking in 3D space.\n","authors":["Tuan Duc Ngo","Peiye Zhuang","Chuang Gan","Evangelos Kalogerakis","Sergey Tulyakov","Hsin-Ying Lee","Chaoyang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.24211v1.pdf","comment":"Project Page: https://snap-research.github.io/DELTA/"},{"id":"http://arxiv.org/abs/2406.15349v2","updated":"2024-10-31T17:58:34Z","published":"2024-06-21T17:59:02Z","title":"NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and\n Benchmarking","summary":" Benchmarking vision-based driving policies is challenging. On one hand,\nopen-loop evaluation with real data is easy, but these results do not reflect\nclosed-loop performance. On the other, closed-loop evaluation is possible in\nsimulation, but is hard to scale due to its significant computational demands.\nFurther, the simulators available today exhibit a large domain gap to real\ndata. This has resulted in an inability to draw clear conclusions from the\nrapidly growing body of research on end-to-end autonomous driving. In this\npaper, we present NAVSIM, a middle ground between these evaluation paradigms,\nwhere we use large datasets in combination with a non-reactive simulator to\nenable large-scale real-world benchmarking. Specifically, we gather\nsimulation-based metrics, such as progress and time to collision, by unrolling\nbird's eye view abstractions of the test scenes for a short simulation horizon.\nOur simulation is non-reactive, i.e., the evaluated policy and environment do\nnot influence each other. As we demonstrate empirically, this decoupling allows\nopen-loop metric computation while being better aligned with closed-loop\nevaluations than traditional displacement errors. NAVSIM enabled a new\ncompetition held at CVPR 2024, where 143 teams submitted 463 entries, resulting\nin several new insights. On a large set of challenging scenarios, we observe\nthat simple methods with moderate compute requirements such as TransFuser can\nmatch recent large-scale end-to-end driving architectures such as UniAD. Our\nmodular framework can potentially be extended with new datasets, data curation\nstrategies, and metrics, and will be continually maintained to host future\nchallenges. Our code is available at\nhttps://github.com/autonomousvision/navsim.\n","authors":["Daniel Dauner","Marcel Hallgarten","Tianyu Li","Xinshuo Weng","Zhiyu Huang","Zetong Yang","Hongyang Li","Igor Gilitschenski","Boris Ivanovic","Marco Pavone","Andreas Geiger","Kashyap Chitta"],"pdf_url":"https://arxiv.org/pdf/2406.15349v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2410.24207v1","updated":"2024-10-31T17:58:22Z","published":"2024-10-31T17:58:22Z","title":"No Pose, No Problem: Surprisingly Simple 3D Gaussian Splats from Sparse\n Unposed Images","summary":" We introduce NoPoSplat, a feed-forward model capable of reconstructing 3D\nscenes parameterized by 3D Gaussians from \\textit{unposed} sparse multi-view\nimages. Our model, trained exclusively with photometric loss, achieves\nreal-time 3D Gaussian reconstruction during inference. 
To eliminate the need\nfor accurate pose input during reconstruction, we anchor one input view's local\ncamera coordinates as the canonical space and train the network to predict\nGaussian primitives for all views within this space. This approach obviates the\nneed to transform Gaussian primitives from local coordinates into a global\ncoordinate system, thus avoiding errors associated with per-frame Gaussians and\npose estimation. To resolve scale ambiguity, we design and compare various\nintrinsic embedding methods, ultimately opting to convert camera intrinsics\ninto a token embedding and concatenate it with image tokens as input to the\nmodel, enabling accurate scene scale prediction. We utilize the reconstructed\n3D Gaussians for novel view synthesis and pose estimation tasks and propose a\ntwo-stage coarse-to-fine pipeline for accurate pose estimation. Experimental\nresults demonstrate that our pose-free approach can achieve superior novel view\nsynthesis quality compared to pose-required methods, particularly in scenarios\nwith limited input image overlap. For pose estimation, our method, trained\nwithout ground truth depth or explicit matching loss, significantly outperforms\nthe state-of-the-art methods with substantial improvements. This work makes\nsignificant advances in pose-free generalizable 3D reconstruction and\ndemonstrates its applicability to real-world scenarios. Code and trained models\nare available at https://noposplat.github.io/.\n","authors":["Botao Ye","Sifei Liu","Haofei Xu","Xueting Li","Marc Pollefeys","Ming-Hsuan Yang","Songyou Peng"],"pdf_url":"https://arxiv.org/pdf/2410.24207v1.pdf","comment":"Project page: https://noposplat.github.io/"},{"id":"http://arxiv.org/abs/2410.24204v1","updated":"2024-10-31T17:57:07Z","published":"2024-10-31T17:57:07Z","title":"GeoSplatting: Towards Geometry Guided Gaussian Splatting for\n Physically-based Inverse Rendering","summary":" We consider the problem of physically-based inverse rendering using 3D\nGaussian Splatting (3DGS) representations. While recent 3DGS methods have\nachieved remarkable results in novel view synthesis (NVS), accurately capturing\nhigh-fidelity geometry, physically interpretable materials and lighting remains\nchallenging, as it requires precise geometry modeling to provide accurate\nsurface normals, along with physically-based rendering (PBR) techniques to\nensure correct material and lighting disentanglement. Previous 3DGS methods\nresort to approximating surface normals, but often struggle with noisy local\ngeometry, leading to inaccurate normal estimation and suboptimal\nmaterial-lighting decomposition. In this paper, we introduce GeoSplatting, a\nnovel hybrid representation that augments 3DGS with explicit geometric guidance\nand differentiable PBR equations. Specifically, we bridge isosurface and 3DGS\ntogether, where we first extract isosurface mesh from a scalar field, then\nconvert it into 3DGS points and formulate PBR equations for them in a fully\ndifferentiable manner. In GeoSplatting, 3DGS is grounded on the mesh geometry,\nenabling precise surface normal modeling, which facilitates the use of PBR\nframeworks for material decomposition. This approach further maintains the\nefficiency and quality of NVS from 3DGS while ensuring accurate geometry from\nthe isosurface. 
Comprehensive evaluations across diverse datasets demonstrate\nthe superiority of GeoSplatting, consistently outperforming existing methods\nboth quantitatively and qualitatively.\n","authors":["Kai Ye","Chong Gao","Guanbin Li","Wenzheng Chen","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2410.24204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24203v1","updated":"2024-10-31T17:57:02Z","published":"2024-10-31T17:57:02Z","title":"DiffPano: Scalable and Consistent Text to Panorama Generation with\n Spherical Epipolar-Aware Diffusion","summary":" Diffusion-based methods have achieved remarkable achievements in 2D image or\n3D object generation, however, the generation of 3D scenes and even\n$360^{\\circ}$ images remains constrained, due to the limited number of scene\ndatasets, the complexity of 3D scenes themselves, and the difficulty of\ngenerating consistent multi-view images. To address these issues, we first\nestablish a large-scale panoramic video-text dataset containing millions of\nconsecutive panoramic keyframes with corresponding panoramic depths, camera\nposes, and text descriptions. Then, we propose a novel text-driven panoramic\ngeneration framework, termed DiffPano, to achieve scalable, consistent, and\ndiverse panoramic scene generation. Specifically, benefiting from the powerful\ngenerative capabilities of stable diffusion, we fine-tune a single-view\ntext-to-panorama diffusion model with LoRA on the established panoramic\nvideo-text dataset. We further design a spherical epipolar-aware multi-view\ndiffusion model to ensure the multi-view consistency of the generated panoramic\nimages. Extensive experiments demonstrate that DiffPano can generate scalable,\nconsistent, and diverse panoramic images with given unseen text descriptions\nand camera poses.\n","authors":["Weicai Ye","Chenhao Ji","Zheng Chen","Junyao Gao","Xiaoshui Huang","Song-Hai Zhang","Wanli Ouyang","Tong He","Cairong Zhao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.24203v1.pdf","comment":"NeurIPS2024, Project: https://github.com/zju3dv/DiffPano; Code:\n https://github.com/zju3dv/DiffPano"},{"id":"http://arxiv.org/abs/2410.24187v1","updated":"2024-10-31T17:49:44Z","published":"2024-10-31T17:49:44Z","title":"Chasing Better Deep Image Priors between Over- and\n Under-parameterization","summary":" Deep Neural Networks (DNNs) are well-known to act as over-parameterized deep\nimage priors (DIP) that regularize various image inverse problems. Meanwhile,\nresearchers also proposed extremely compact, under-parameterized image priors\n(e.g., deep decoder) that are strikingly competent for image restoration too,\ndespite a loss of accuracy. These two extremes push us to think whether there\nexists a better solution in the middle: between over- and under-parameterized\nimage priors, can one identify \"intermediate\" parameterized image priors that\nachieve better trade-offs between performance, efficiency, and even preserving\nstrong transferability? Drawing inspirations from the lottery ticket hypothesis\n(LTH), we conjecture and study a novel \"lottery image prior\" (LIP) by\nexploiting DNN inherent sparsity, stated as: given an over-parameterized\nDNN-based image prior, it will contain a sparse subnetwork that can be trained\nin isolation, to match the original DNN's performance when being applied as a\nprior to various image inverse problems. Our results validate the superiority\nof LIPs: we can successfully locate the LIP subnetworks from over-parameterized\nDIPs at substantial sparsity ranges. 
Those LIP subnetworks significantly\noutperform deep decoders under comparably compact model sizes (by often fully\npreserving the effectiveness of their over-parameterized counterparts), and\nthey also possess high transferability across different images as well as\nrestoration task types. Besides, we also extend LIP to compressive sensing\nimage reconstruction, where a pre-trained GAN generator is used as the prior\n(in contrast to untrained DIP or deep decoder), and confirm its validity in\nthis setting too. To our best knowledge, this is the first time that LTH is\ndemonstrated to be relevant in the context of inverse problems or image priors.\n","authors":["Qiming Wu","Xiaohan Chen","Yifan Jiang","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.24187v1.pdf","comment":"Codes are available at\n https://github.com/VITA-Group/Chasing-Better-DIPs"},{"id":"http://arxiv.org/abs/2410.24185v1","updated":"2024-10-31T17:48:45Z","published":"2024-10-31T17:48:45Z","title":"DexMimicGen: Automated Data Generation for Bimanual Dexterous\n Manipulation via Imitation Learning","summary":" Imitation learning from human demonstrations is an effective means to teach\nrobots manipulation skills. But data acquisition is a major bottleneck in\napplying this paradigm more broadly, due to the amount of cost and human effort\ninvolved. There has been significant interest in imitation learning for\nbimanual dexterous robots, like humanoids. Unfortunately, data collection is\neven more challenging here due to the challenges of simultaneously controlling\nmultiple arms and multi-fingered hands. Automated data generation in simulation\nis a compelling, scalable alternative to fuel this need for data. To this end,\nwe introduce DexMimicGen, a large-scale automated data generation system that\nsynthesizes trajectories from a handful of human demonstrations for humanoid\nrobots with dexterous hands. We present a collection of simulation environments\nin the setting of bimanual dexterous manipulation, spanning a range of\nmanipulation behaviors and different requirements for coordination among the\ntwo arms. We generate 21K demos across these tasks from just 60 source human\ndemos and study the effect of several data generation and policy learning\ndecisions on agent performance. Finally, we present a real-to-sim-to-real\npipeline and deploy it on a real-world humanoid can sorting task. Videos and\nmore are at https://dexmimicgen.github.io/\n","authors":["Zhenyu Jiang","Yuqi Xie","Kevin Lin","Zhenjia Xu","Weikang Wan","Ajay Mandlekar","Linxi Fan","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.24185v1.pdf","comment":"Project website: https://dexmimicgen.github.io/"},{"id":"http://arxiv.org/abs/2409.06711v2","updated":"2024-10-31T17:48:06Z","published":"2024-08-25T13:14:59Z","title":"Quantized neural network for complex hologram generation","summary":" Computer-generated holography (CGH) is a promising technology for augmented\nreality displays, such as head-mounted or head-up displays. However, its high\ncomputational demand makes it impractical for implementation. Recent efforts to\nintegrate neural networks into CGH have successfully accelerated computing\nspeed, demonstrating the potential to overcome the trade-off between\ncomputational cost and image quality. Nevertheless, deploying neural\nnetwork-based CGH algorithms on computationally limited embedded systems\nrequires more efficient models with lower computational cost, memory footprint,\nand power consumption. 
In this study, we developed a lightweight model for\ncomplex hologram generation by introducing neural network quantization.\nSpecifically, we built a model based on tensor holography and quantized it from\n32-bit floating-point precision (FP32) to 8-bit integer precision (INT8). Our\nperformance evaluation shows that the proposed INT8 model achieves hologram\nquality comparable to that of the FP32 model while reducing the model size by\napproximately 70% and increasing the speed fourfold. Additionally, we\nimplemented the INT8 model on a system-on-module to demonstrate its\ndeployability on embedded platforms and high power efficiency.\n","authors":["Yutaka Endo","Minoru Oikawa","Timothy D. Wilkinson","Tomoyoshi Shimobaba","Tomoyoshi Ito"],"pdf_url":"https://arxiv.org/pdf/2409.06711v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.04312v2","updated":"2024-10-31T17:47:54Z","published":"2024-06-06T17:56:40Z","title":"ReNO: Enhancing One-step Text-to-Image Models through Reward-based Noise\n Optimization","summary":" Text-to-Image (T2I) models have made significant advancements in recent\nyears, but they still struggle to accurately capture intricate details\nspecified in complex compositional prompts. While fine-tuning T2I models with\nreward objectives has shown promise, it suffers from \"reward hacking\" and may\nnot generalize well to unseen prompt distributions. In this work, we propose\nReward-based Noise Optimization (ReNO), a novel approach that enhances T2I\nmodels at inference by optimizing the initial noise based on the signal from\none or multiple human preference reward models. Remarkably, solving this\noptimization problem with gradient ascent for 50 iterations yields impressive\nresults on four different one-step models across two competitive benchmarks,\nT2I-CompBench and GenEval. Within a computational budget of 20-50 seconds,\nReNO-enhanced one-step models consistently surpass the performance of all\ncurrent open-source Text-to-Image models. Extensive user studies demonstrate\nthat our model is preferred nearly twice as often compared to the popular SDXL\nmodel and is on par with the proprietary Stable Diffusion 3 with 8B parameters.\nMoreover, given the same computational resources, a ReNO-optimized one-step\nmodel outperforms widely-used open-source models such as SDXL and\nPixArt-$\\alpha$, highlighting the efficiency and effectiveness of ReNO in\nenhancing T2I model performance at inference time. Code is available at\nhttps://github.com/ExplainableML/ReNO.\n","authors":["Luca Eyring","Shyamgopal Karthik","Karsten Roth","Alexey Dosovitskiy","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2406.04312v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2306.01953v3","updated":"2024-10-31T17:47:21Z","published":"2023-06-02T23:29:28Z","title":"Invisible Image Watermarks Are Provably Removable Using Generative AI","summary":" Invisible watermarks safeguard images' copyrights by embedding hidden\nmessages only detectable by owners. They also prevent people from misusing\nimages, especially those generated by AI models. We propose a family of\nregeneration attacks to remove these invisible watermarks. The proposed attack\nmethod first adds random noise to an image to destroy the watermark and then\nreconstructs the image. This approach is flexible and can be instantiated with\nmany existing image-denoising algorithms and pre-trained generative models such\nas diffusion models. 
Through formal proofs and extensive empirical evaluations,\nwe demonstrate that pixel-level invisible watermarks are vulnerable to this\nregeneration attack. Our results reveal that, across four different pixel-level\nwatermarking schemes, the proposed method consistently achieves superior\nperformance compared to existing attack techniques, with lower detection rates\nand higher image quality. However, watermarks that keep the image semantically\nsimilar can be an alternative defense against our attacks. Our finding\nunderscores the need for a shift in research/industry emphasis from invisible\nwatermarks to semantic-preserving watermarks. Code is available at\nhttps://github.com/XuandongZhao/WatermarkAttacker\n","authors":["Xuandong Zhao","Kexun Zhang","Zihao Su","Saastha Vasan","Ilya Grishchenko","Christopher Kruegel","Giovanni Vigna","Yu-Xiang Wang","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2306.01953v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24183v1","updated":"2024-10-31T17:46:54Z","published":"2024-10-31T17:46:54Z","title":"Extended Object Tracking and Classification based on Linear Splines","summary":" This paper introduces a framework based on linear splines for 2-dimensional\nextended object tracking and classification. Unlike state of the art models,\nlinear splines allow to represent extended objects whose contour is an\narbitrarily complex curve. An exact likelihood is derived for the case in which\nnoisy measurements can be scattered from any point on the contour of the\nextended object, while an approximate Monte Carlo likelihood is provided for\nthe case wherein scattering points can be anywhere, i.e. inside or on the\ncontour, on the object surface. Exploiting such likelihood to measure how well\nthe observed data fit a given shape, a suitable estimator is developed. The\nproposed estimator models the extended object in terms of a kinematic state,\nproviding object position and orientation, along with a shape vector,\ncharacterizing object contour and surface. The kinematic state is estimated via\na nonlinear Kalman filter, while the shape vector is estimated via a Bayesian\nclassifier so that classification is implicitly solved during shape estimation.\nNumerical experiments are provided to assess, compared to state of the art\nextended object estimators, the effectiveness of the proposed one.\n","authors":["Matteo Tesori","Giorgio Battistelli","Luigi Chisci"],"pdf_url":"https://arxiv.org/pdf/2410.24183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24181v1","updated":"2024-10-31T17:45:09Z","published":"2024-10-31T17:45:09Z","title":"Federated Black-Box Adaptation for Semantic Segmentation","summary":" Federated Learning (FL) is a form of distributed learning that allows\nmultiple institutions or clients to collaboratively learn a global model to\nsolve a task. This allows the model to utilize the information from every\ninstitute while preserving data privacy. However, recent studies show that the\npromise of protecting the privacy of data is not upheld by existing methods and\nthat it is possible to recreate the training data from the different\ninstitutions. This is done by utilizing gradients transferred between the\nclients and the global server during training or by knowing the model\narchitecture at the client end. 
In this paper, we propose a federated learning\nframework for semantic segmentation without knowing the model architecture nor\ntransferring gradients between the client and the server, thus enabling better\nprivacy preservation. We propose BlackFed - a black-box adaptation of neural\nnetworks that utilizes zero order optimization (ZOO) to update the client model\nweights and first order optimization (FOO) to update the server weights. We\nevaluate our approach on several computer vision and medical imaging datasets\nto demonstrate its effectiveness. To the best of our knowledge, this work is\none of the first works in employing federated learning for segmentation, devoid\nof gradients or model information exchange. Code:\nhttps://github.com/JayParanjape/blackfed/tree/master\n","authors":["Jay N. Paranjape","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2410.24181v1.pdf","comment":"Accepted at NEURIPS 2024"},{"id":"http://arxiv.org/abs/2404.13046v2","updated":"2024-10-31T17:39:34Z","published":"2024-04-19T17:59:48Z","title":"MoVA: Adapting Mixture of Vision Experts to Multimodal Context","summary":" As the key component in multimodal large language models (MLLMs), the ability\nof the visual encoder greatly affects MLLM's understanding on diverse image\ncontent. Although some large-scale pretrained vision encoders such as vision\nencoders in CLIP and DINOv2 have brought promising performance, we found that\nthere is still no single vision encoder that can dominate various image content\nunderstanding, e.g., the CLIP vision encoder leads to outstanding results on\ngeneral image understanding but poor performance on document or chart content.\nTo alleviate the bias of CLIP vision encoder, we first delve into the inherent\nbehavior of different pre-trained vision encoders and then propose the MoVA, a\npowerful and novel MLLM, adaptively routing and fusing task-specific vision\nexperts with a coarse-to-fine mechanism. In the coarse-grained stage, we design\na context-aware expert routing strategy to dynamically select the most suitable\nvision experts according to the user instruction, input image, and expertise of\nvision experts. This benefits from the powerful model function understanding\nability of the large language model (LLM). In the fine-grained stage, we\nelaborately conduct the mixture-of-vision-expert adapter (MoV-Adapter) to\nextract and fuse task-specific knowledge from various experts. This\ncoarse-to-fine paradigm effectively leverages representations from experts\nbased on multimodal context and model expertise, further enhancing the\ngeneralization ability. We conduct extensive experiments to evaluate the\neffectiveness of the proposed approach. Without any bells and whistles, MoVA\ncan achieve significant performance gains over current state-of-the-art methods\nin a wide range of challenging multimodal benchmarks.\n","authors":["Zhuofan Zong","Bingqi Ma","Dazhong Shen","Guanglu Song","Hao Shao","Dongzhi Jiang","Hongsheng Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.13046v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2403.04690v3","updated":"2024-10-31T17:32:26Z","published":"2024-03-07T17:35:58Z","title":"Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self\n Attention at the Threadblock Level","summary":" Neighborhood attention reduces the cost of self attention by restricting each\ntoken's attention span to its nearest neighbors. 
This restriction,\nparameterized by a window size and dilation factor, draws a spectrum of\npossible attention patterns between linear projection and self attention.\nNeighborhood attention, and more generally sliding window attention patterns,\nhave long been bounded by infrastructure, particularly in higher-rank spaces\n(2-D and 3-D), calling for the development of custom kernels, which have been\nlimited in either functionality, or performance, if not both. In this work, we\naim to massively improve upon existing infrastructure by providing two new\nmethods for implementing neighborhood attention. We first show that\nneighborhood attention can be represented as a batched GEMM problem, similar to\nstandard attention, and implement it for 1-D and 2-D neighborhood attention.\nThese kernels on average provide 895% and 272% improvement in full precision\nruntime compared to existing naive CUDA kernels for 1-D and 2-D neighborhood\nattention respectively. We find that aside from being heavily bound by memory\nbandwidth, certain inherent inefficiencies exist in all unfused implementations\nof neighborhood attention, which in most cases undo their theoretical\nefficiency gain. Motivated by the progress made into fused dot-product\nattention kernels, we developed fused neighborhood attention; an adaptation of\nfused dot-product attention kernels that allow fine-grained control over\nattention across different spatial axes. Known for reducing the quadratic time\ncomplexity of self attention to a linear complexity, neighborhood attention can\nnow enjoy a reduced and constant memory footprint, and record-breaking half\nprecision runtime. We observe that our fused implementation successfully\ncircumvents some of the unavoidable inefficiencies in unfused\nimplementations...\n","authors":["Ali Hassani","Wen-Mei Hwu","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2403.04690v3.pdf","comment":"To appear in 38th Conference on Neural Information Processing Systems\n (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.24160v1","updated":"2024-10-31T17:19:03Z","published":"2024-10-31T17:19:03Z","title":"Redefining in Dictionary: Towards a Enhanced Semantic\n Understanding of Creative Generation","summary":" Creativity, both in human and diffusion models, remains an inherently\nabstract concept; thus, simply adding \"creative\" to a prompt does not yield\nreliable semantic recognition by the model. In this work, we concretize the\nabstract notion of \"creative\" through the TP2O task, which aims to merge two\nunrelated concepts, and introduce CreTok, redefining \"creative\" as the token\n$\\texttt{}$. This redefinition offers a more concrete and universally\nadaptable representation for concept blending. This redefinition occurs\ncontinuously, involving the repeated random sampling of text pairs with\ndifferent concepts and optimizing cosine similarity between target and constant\nprompts. This approach enables $\\texttt{}$ to learn a method for\ncreative concept fusion. Extensive experiments demonstrate that the creative\ncapability enabled by $\\texttt{}$ substantially surpasses recent SOTA\ndiffusion models and achieves superior creative generation. 
CreTok exhibits\ngreater flexibility and reduced time overhead, as $\\texttt{}$ can\nfunction as a universal token for any concept, facilitating creative generation\nwithout retraining.\n","authors":["Fu Feng","Yucheng Xie","Jing Wang","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2410.24160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00885v2","updated":"2024-10-31T17:12:57Z","published":"2024-06-02T22:40:05Z","title":"Visual place recognition for aerial imagery: A survey","summary":" Aerial imagery and its direct application to visual localization is an\nessential problem for many Robotics and Computer Vision tasks. While Global\nNavigation Satellite Systems (GNSS) are the standard default solution for\nsolving the aerial localization problem, it is subject to a number of\nlimitations, such as, signal instability or solution unreliability that make\nthis option not so desirable. Consequently, visual geolocalization is emerging\nas a viable alternative. However, adapting Visual Place Recognition (VPR) task\nto aerial imagery presents significant challenges, including weather variations\nand repetitive patterns. Current VPR reviews largely neglect the specific\ncontext of aerial data. This paper introduces a methodology tailored for\nevaluating VPR techniques specifically in the domain of aerial imagery,\nproviding a comprehensive assessment of various methods and their performance.\nHowever, we not only compare various VPR methods, but also demonstrate the\nimportance of selecting appropriate zoom and overlap levels when constructing\nmap tiles to achieve maximum efficiency of VPR algorithms in the case of aerial\nimagery. The code is available on our GitHub repository --\nhttps://github.com/prime-slam/aero-vloc.\n","authors":["Ivan Moskalenko","Anastasiia Kornilova","Gonzalo Ferrer"],"pdf_url":"https://arxiv.org/pdf/2406.00885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24151v1","updated":"2024-10-31T17:09:55Z","published":"2024-10-31T17:09:55Z","title":"Scaling Concept With Text-Guided Diffusion Models","summary":" Text-guided diffusion models have revolutionized generative tasks by\nproducing high-fidelity content from text descriptions. They have also enabled\nan editing paradigm where concepts can be replaced through text conditioning\n(e.g., a dog to a tiger). In this work, we explore a novel approach: instead of\nreplacing a concept, can we enhance or suppress the concept itself? Through an\nempirical study, we identify a trend where concepts can be decomposed in\ntext-guided diffusion models. Leveraging this insight, we introduce\nScalingConcept, a simple yet effective method to scale decomposed concepts up\nor down in real input without introducing new elements. To systematically\nevaluate our approach, we present the WeakConcept-10 dataset, where concepts\nare imperfect and need to be enhanced. 
More importantly, ScalingConcept enables\na variety of novel zero-shot applications across image and audio domains,\nincluding tasks such as canonical pose generation and generative sound\nhighlighting or removal.\n","authors":["Chao Huang","Susan Liang","Yunlong Tang","Yapeng Tian","Anurag Kumar","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24151v1.pdf","comment":"Project page: https://wikichao.github.io/ScalingConcept/"},{"id":"http://arxiv.org/abs/2410.24148v1","updated":"2024-10-31T17:09:19Z","published":"2024-10-31T17:09:19Z","title":"Exploring Vision Language Models for Facial Attribute Recognition:\n Emotion, Race, Gender, and Age","summary":" Technologies for recognizing facial attributes like race, gender, age, and\nemotion have several applications, such as surveillance, advertising content,\nsentiment analysis, and the study of demographic trends and social behaviors.\nAnalyzing demographic characteristics based on images and analyzing facial\nexpressions have several challenges due to the complexity of humans' facial\nattributes. Traditional approaches have employed CNNs and various other deep\nlearning techniques, trained on extensive collections of labeled images. While\nthese methods demonstrated effective performance, there remains potential for\nfurther enhancements. In this paper, we propose to utilize vision language\nmodels (VLMs) such as generative pre-trained transformer (GPT), GEMINI, large\nlanguage and vision assistant (LLAVA), PaliGemma, and Microsoft Florence2 to\nrecognize facial attributes such as race, gender, age, and emotion from images\nwith human faces. Various datasets like FairFace, AffectNet, and UTKFace have\nbeen utilized to evaluate the solutions. The results show that VLMs are\ncompetitive if not superior to traditional techniques. Additionally, we propose\n\"FaceScanPaliGemma\"--a fine-tuned PaliGemma model--for race, gender, age, and\nemotion recognition. The results show an accuracy of 81.1%, 95.8%, 80%, and\n59.4% for race, gender, age group, and emotion classification, respectively,\noutperforming pre-trained version of PaliGemma, other VLMs, and SotA methods.\nFinally, we propose \"FaceScanGPT\", which is a GPT-4o model to recognize the\nabove attributes when several individuals are present in the image using a\nprompt engineered for a person with specific facial and/or physical attributes.\nThe results underscore the superior multitasking capability of FaceScanGPT to\ndetect the individual's attributes like hair cut, clothing color, postures,\netc., using only a prompt to drive the detection and recognition tasks.\n","authors":["Nouar AlDahoul","Myles Joshua Toledo Tan","Harishwar Reddy Kasireddy","Yasir Zaki"],"pdf_url":"https://arxiv.org/pdf/2410.24148v1.pdf","comment":"52 pages, 13 figures"},{"id":"http://arxiv.org/abs/2410.24144v1","updated":"2024-10-31T17:05:44Z","published":"2024-10-31T17:05:44Z","title":"HoloChrome: Polychromatic Illumination for Speckle Reduction in\n Holographic Near-Eye Displays","summary":" Holographic displays hold the promise of providing authentic depth cues,\nresulting in enhanced immersive visual experiences for near-eye applications.\nHowever, current holographic displays are hindered by speckle noise, which\nlimits accurate reproduction of color and texture in displayed images. We\npresent HoloChrome, a polychromatic holographic display framework designed to\nmitigate these limitations. 
HoloChrome utilizes an ultrafast,\nwavelength-adjustable laser and a dual-Spatial Light Modulator (SLM)\narchitecture, enabling the multiplexing of a large set of discrete wavelengths\nacross the visible spectrum. By leveraging spatial separation in our dual-SLM\nsetup, we independently manipulate speckle patterns across multiple\nwavelengths. This novel approach effectively reduces speckle noise through\nincoherent averaging achieved by wavelength multiplexing. Our method is\ncomplementary to existing speckle reduction techniques, offering a new pathway\nto address this challenge. Furthermore, the use of polychromatic illumination\nbroadens the achievable color gamut compared to traditional three-color primary\nholographic displays.\n Our simulations and tabletop experiments validate that HoloChrome\nsignificantly reduces speckle noise and expands the color gamut. These\nadvancements enhance the performance of holographic near-eye displays, moving\nus closer to practical, immersive next-generation visual experiences.\n","authors":["Florian Schiffers","Grace Kuo","Nathan Matsuda","Douglas Lanman","Oliver Cossairt"],"pdf_url":"https://arxiv.org/pdf/2410.24144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24139v1","updated":"2024-10-31T17:03:38Z","published":"2024-10-31T17:03:38Z","title":"COSNet: A Novel Semantic Segmentation Network using Enhanced Boundaries\n in Cluttered Scenes","summary":" Automated waste recycling aims to efficiently separate the recyclable objects\nfrom the waste by employing vision-based systems. However, the presence of\nvarying shaped objects having different material types makes it a challenging\nproblem, especially in cluttered environments. Existing segmentation methods\nperform reasonably on many semantic segmentation datasets by employing\nmulti-contextual representations, however, their performance is degraded when\nutilized for waste object segmentation in cluttered scenarios. In addition,\nplastic objects further increase the complexity of the problem due to their\ntranslucent nature. To address these limitations, we introduce an efficacious\nsegmentation network, named COSNet, that uses boundary cues along with\nmulti-contextual information to accurately segment the objects in cluttered\nscenes. COSNet introduces novel components including feature sharpening block\n(FSB) and boundary enhancement module (BEM) for enhancing the features and\nhighlighting the boundary information of irregular waste objects in cluttered\nenvironment. Extensive experiments on three challenging datasets including\nZeroWaste-f, SpectralWaste, and ADE20K demonstrate the effectiveness of the\nproposed method. Our COSNet achieves a significant gain of 1.8% on ZeroWaste-f\nand 2.1% on SpectralWaste datasets respectively in terms of mIoU metric.\n","authors":["Muhammad Ali","Mamoona Javaid","Mubashir Noman","Mustansar Fiaz","Salman Khan"],"pdf_url":"https://arxiv.org/pdf/2410.24139v1.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2311.00371v2","updated":"2024-10-31T17:01:50Z","published":"2023-11-01T08:53:05Z","title":"Learning Cooperative Trajectory Representations for Motion Forecasting","summary":" Motion forecasting is an essential task for autonomous driving, and utilizing\ninformation from infrastructure and other vehicles can enhance forecasting\ncapabilities. 
Existing research mainly focuses on leveraging single-frame\ncooperative information to enhance the limited perception capability of the ego\nvehicle, while underutilizing the motion and interaction context of traffic\nparticipants observed from cooperative devices. In this paper, we propose a\nforecasting-oriented representation paradigm to utilize motion and interaction\nfeatures from cooperative information. Specifically, we present V2X-Graph, a\nrepresentative framework to achieve interpretable and end-to-end trajectory\nfeature fusion for cooperative motion forecasting. V2X-Graph is evaluated on\nV2X-Seq in vehicle-to-infrastructure (V2I) scenarios. To further evaluate on\nvehicle-to-everything (V2X) scenario, we construct the first real-world V2X\nmotion forecasting dataset V2X-Traj, which contains multiple autonomous\nvehicles and infrastructure in every scenario. Experimental results on both\nV2X-Seq and V2X-Traj show the advantage of our method. We hope both V2X-Graph\nand V2X-Traj will benefit the further development of cooperative motion\nforecasting. Find the project at https://github.com/AIR-THU/V2X-Graph.\n","authors":["Hongzhi Ruan","Haibao Yu","Wenxian Yang","Siqi Fan","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2311.00371v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2401.01650v3","updated":"2024-10-31T16:53:49Z","published":"2024-01-03T10:07:11Z","title":"De-Confusing Pseudo-Labels in Source-Free Domain Adaptation","summary":" Source-free domain adaptation aims to adapt a source-trained model to an\nunlabeled target domain without access to the source data. It has attracted\ngrowing attention in recent years, where existing approaches focus on\nself-training that usually includes pseudo-labeling techniques. In this paper,\nwe introduce a novel noise-learning approach tailored to address noise\ndistribution in domain adaptation settings and learn to de-confuse the\npseudo-labels. More specifically, we learn a noise transition matrix of the\npseudo-labels to capture the label corruption of each class and learn the\nunderlying true label distribution. Estimating the noise transition matrix\nenables a better true class-posterior estimation, resulting in better\nprediction accuracy. We demonstrate the effectiveness of our approach when\ncombined with several source-free domain adaptation methods: SHOT, SHOT++, and\nAaD. We obtain state-of-the-art results on three domain adaptation datasets:\nVisDA, DomainNet, and OfficeHome.\n","authors":["Idit Diamant","Amir Rosenfeld","Idan Achituve","Jacob Goldberger","Arnon Netzer"],"pdf_url":"https://arxiv.org/pdf/2401.01650v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01903v2","updated":"2024-10-31T16:49:26Z","published":"2024-07-02T03:08:20Z","title":"Text-Aware Diffusion for Policy Learning","summary":" Training an agent to achieve particular goals or perform desired behaviors is\noften accomplished through reinforcement learning, especially in the absence of\nexpert demonstrations. However, supporting novel goals or behaviors through\nreinforcement learning requires the ad-hoc design of appropriate reward\nfunctions, which quickly becomes intractable. To address this challenge, we\npropose Text-Aware Diffusion for Policy Learning (TADPoLe), which uses a\npretrained, frozen text-conditioned diffusion model to compute dense zero-shot\nreward signals for text-aligned policy learning. 
We hypothesize that\nlarge-scale pretrained generative models encode rich priors that can supervise\na policy to behave not only in a text-aligned manner, but also in alignment\nwith a notion of naturalness summarized from internet-scale training data. In\nour experiments, we demonstrate that TADPoLe is able to learn policies for\nnovel goal-achievement and continuous locomotion behaviors specified by natural\nlanguage, in both Humanoid and Dog environments. The behaviors are learned\nzero-shot without ground-truth rewards or expert demonstrations, and are\nqualitatively more natural according to human evaluation. We further show that\nTADPoLe performs competitively when applied to robotic manipulation tasks in\nthe Meta-World environment, without having access to any in-domain\ndemonstrations.\n","authors":["Calvin Luo","Mandy He","Zilai Zeng","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24116v1","updated":"2024-10-31T16:46:23Z","published":"2024-10-31T16:46:23Z","title":"AIDOVECL: AI-generated Dataset of Outpainted Vehicles for Eye-level\n Classification and Localization","summary":" Image labeling is a critical bottleneck in the development of computer vision\ntechnologies, often constraining the potential of machine learning models due\nto the time-intensive nature of manual annotations. This work introduces a\nnovel approach that leverages outpainting to address the problem of annotated\ndata scarcity by generating artificial contexts and annotations, significantly\nreducing manual labeling efforts. We apply this technique to a particularly\nacute challenge in autonomous driving, urban planning, and environmental\nmonitoring: the lack of diverse, eye-level vehicle images in desired classes.\nOur dataset comprises AI-generated vehicle images obtained by detecting and\ncropping vehicles from manually selected seed images, which are then outpainted\nonto larger canvases to simulate varied real-world conditions. The outpainted\nimages include detailed annotations, providing high-quality ground truth data.\nAdvanced outpainting techniques and image quality assessments ensure visual\nfidelity and contextual relevance. Augmentation with outpainted vehicles\nimproves overall performance metrics by up to 8\\% and enhances prediction of\nunderrepresented classes by up to 20\\%. This approach, exemplifying outpainting\nas a self-annotating paradigm, presents a solution that enhances dataset\nversatility across multiple domains of machine learning. The code and links to\ndatasets used in this study are available for further research and replication\nat https://github.com/amir-kazemi/aidovecl.\n","authors":["Amir Kazemi","Qurat ul ain Fatima","Volodymyr Kindratenko","Christopher Tessum"],"pdf_url":"https://arxiv.org/pdf/2410.24116v1.pdf","comment":"19 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2410.24114v1","updated":"2024-10-31T16:44:10Z","published":"2024-10-31T16:44:10Z","title":"Nearest Neighbor Normalization Improves Multimodal Retrieval","summary":" Multimodal models leverage large-scale pre-training to achieve strong but\nstill imperfect performance on tasks such as image captioning, visual question\nanswering, and cross-modal retrieval. In this paper, we present a simple and\nefficient method for correcting errors in trained contrastive image-text\nretrieval models with no additional training, called Nearest Neighbor\nNormalization (NNN). 
We show an improvement on retrieval metrics in both text\nretrieval and image retrieval for all of the contrastive models that we tested\n(CLIP, BLIP, ALBEF, SigLIP, BEiT) and for both of the datasets that we used\n(MS-COCO and Flickr30k). NNN requires a reference database, but does not\nrequire any training on this database, and can even increase the retrieval\naccuracy of a model after finetuning.\n","authors":["Neil Chowdhury","Franklin Wang","Sumedh Shenoy","Douwe Kiela","Sarah Schwettmann","Tristan Thrush"],"pdf_url":"https://arxiv.org/pdf/2410.24114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14919v2","updated":"2024-10-31T16:36:14Z","published":"2024-10-19T00:33:51Z","title":"Adversarial Score identity Distillation: Rapidly Surpassing the Teacher\n in One Step","summary":" Score identity Distillation (SiD) is a data-free method that has achieved\nstate-of-the-art performance in image generation by leveraging only a\npretrained diffusion model, without requiring any training data. However, the\nultimate performance of SiD is constrained by the accuracy with which the\npretrained model captures the true data scores at different stages of the\ndiffusion process. In this paper, we introduce SiDA (SiD with Adversarial\nLoss), which not only enhances generation quality but also improves\ndistillation efficiency by incorporating real images and adversarial loss. SiDA\nutilizes the encoder from the generator's score network as a discriminator,\nboosting its ability to distinguish between real images and those generated by\nSiD. The adversarial loss is batch-normalized within each GPU and then combined\nwith the original SiD loss. This integration effectively incorporates the\naverage \"fakeness\" per GPU batch into the pixel-based SiD loss, enabling SiDA\nto distill a single-step generator either from scratch or by fine-tuning an\nexisting one. SiDA converges significantly faster than its predecessor when\ntrained from scratch, and swiftly improves upon the original model's\nperformance after an initial warmup period during fine-tuning from a\npre-distilled SiD generator. This one-step adversarial distillation method\nestablishes new benchmarks in generation performance when distilling EDM\ndiffusion models pretrained on CIFAR-10 (32x32) and ImageNet (64x64), achieving\nFID score of 1.110 on ImageNet 64x64. It sets record-low FID scores when\ndistilling EDM2 models trained on ImageNet (512x512), surpassing even the\nlargest teacher model, EDM2-XXL. Our SiDA's results record FID scores of 2.156\nfor EDM2-XS, 1.669 for EDM2-S, 1.488 for EDM2-M, and 1.465 for EDM2-L,\ndemonstrating significant improvements across all model sizes. Our open-source\ncode will be integrated into the SiD codebase.\n","authors":["Mingyuan Zhou","Huangjie Zheng","Yi Gu","Zhendong Wang","Hai Huang"],"pdf_url":"https://arxiv.org/pdf/2410.14919v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24098v1","updated":"2024-10-31T16:28:49Z","published":"2024-10-31T16:28:49Z","title":"Parameter choices in HaarPSI for IQA with medical images","summary":" When developing machine learning models, image quality assessment (IQA)\nmeasures are a crucial component for evaluation. However, commonly used IQA\nmeasures have been primarily developed and optimized for natural images. In\nmany specialized settings, such as medical images, this poses an\noften-overlooked problem regarding suitability. 
In previous studies, the IQA\nmeasure HaarPSI showed promising behavior for natural and medical images.\nHaarPSI is based on Haar wavelet representations and the framework allows\noptimization of two parameters. So far, these parameters have been aligned for\nnatural images. Here, we optimize these parameters for two annotated medical\ndata sets, a photoacoustic and a chest X-Ray data set. We observe that they are\nmore sensitive to the parameter choices than the employed natural images, and\non the other hand both medical data sets lead to similar parameter values when\noptimized. We denote the optimized setting, which improves the performance for\nthe medical images notably, by HaarPSI$_{MED}$. The results suggest that\nadapting common IQA measures within their frameworks for medical images can\nprovide a valuable, generalizable addition to the employment of more specific\ntask-based measures.\n","authors":["Clemens Karner","Janek Gröhl","Ian Selby","Judith Babar","Jake Beckford","Thomas R Else","Timothy J Sadler","Shahab Shahipasand","Arthikkaa Thavakumar","Michael Roberts","James H. F. Rudd","Carola-Bibiane Schönlieb","Jonathan R Weir-McCall","Anna Breger"],"pdf_url":"https://arxiv.org/pdf/2410.24098v1.pdf","comment":"5 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.08570v2","updated":"2024-10-31T16:20:26Z","published":"2024-08-16T07:12:47Z","title":"EraW-Net: Enhance-Refine-Align W-Net for Scene-Associated Driver\n Attention Estimation","summary":" Associating driver attention with driving scene across two fields of views\n(FOVs) is a hard cross-domain perception problem, which requires comprehensive\nconsideration of cross-view mapping, dynamic driving scene analysis, and driver\nstatus tracking. Previous methods typically focus on a single view or map\nattention to the scene via estimated gaze, failing to exploit the implicit\nconnection between them. Moreover, simple fusion modules are insufficient for\nmodeling the complex relationships between the two views, making information\nintegration challenging. To address these issues, we propose a novel method for\nend-to-end scene-associated driver attention estimation, called EraW-Net. This\nmethod enhances the most discriminative dynamic cues, refines feature\nrepresentations, and facilitates semantically aligned cross-domain integration\nthrough a W-shaped architecture, termed W-Net. Specifically, a Dynamic Adaptive\nFilter Module (DAF-Module) is proposed to address the challenges of frequently\nchanging driving environments by extracting vital regions. It suppresses the\nindiscriminately recorded dynamics and highlights crucial ones by innovative\njoint frequency-spatial analysis, enhancing the model's ability to parse\ncomplex dynamics. Additionally, to track driver states during non-fixed facial\nposes, we propose a Global Context Sharing Module (GCS-Module) to construct\nrefined feature representations by capturing hierarchical features that adapt\nto various scales of head and eye movements. Finally, W-Net achieves systematic\ncross-view information integration through its \"Encoding-Independent Partial\nDecoding-Fusion Decoding\" structure, addressing semantic misalignment in\nheterogeneous data integration. 
Experiments demonstrate that the proposed\nmethod robustly and accurately estimates the mapping of driver attention in\nscene on large public datasets.\n","authors":["Jun Zhou","Chunsheng Liu","Faliang Chang","Wenqian Wang","Penghui Hao","Yiming Huang","Zhiqiang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08570v2.pdf","comment":"13pages, 9 figures"},{"id":"http://arxiv.org/abs/2410.24075v1","updated":"2024-10-31T16:13:55Z","published":"2024-10-31T16:13:55Z","title":"Identifying Spatio-Temporal Drivers of Extreme Events","summary":" The spatio-temporal relations of impacts of extreme events and their drivers\nin climate data are not fully understood and there is a need of machine\nlearning approaches to identify such spatio-temporal relations from data. The\ntask, however, is very challenging since there are time delays between extremes\nand their drivers, and the spatial response of such drivers is inhomogeneous.\nIn this work, we propose a first approach and benchmarks to tackle this\nchallenge. Our approach is trained end-to-end to predict spatio-temporally\nextremes and spatio-temporally drivers in the physical input variables jointly.\nBy enforcing the network to predict extremes from spatio-temporal binary masks\nof identified drivers, the network successfully identifies drivers that are\ncorrelated with extremes. We evaluate our approach on three newly created\nsynthetic benchmarks, where two of them are based on remote sensing or\nreanalysis climate data, and on two real-world reanalysis datasets. The source\ncode and datasets are publicly available at the project page\nhttps://hakamshams.github.io/IDE.\n","authors":["Mohamad Hakam Shams Eddin","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2410.24075v1.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.22551v2","updated":"2024-10-31T16:04:48Z","published":"2024-10-29T21:37:03Z","title":"FairSkin: Fair Diffusion for Skin Disease Image Generation","summary":" Image generation is a prevailing technique for clinical data augmentation for\nadvancing diagnostic accuracy and reducing healthcare disparities. Diffusion\nModel (DM) has become a leading method in generating synthetic medical images,\nbut it suffers from a critical twofold bias: (1) The quality of images\ngenerated for Caucasian individuals is significantly higher, as measured by the\nFrechet Inception Distance (FID). (2) The ability of the downstream-task\nlearner to learn critical features from disease images varies across different\nskin tones. These biases pose significant risks, particularly in skin disease\ndetection, where underrepresentation of certain skin tones can lead to\nmisdiagnosis or neglect of specific conditions. To address these challenges, we\npropose FairSkin, a novel DM framework that mitigates these biases through a\nthree-level resampling mechanism, ensuring fairer representation across racial\nand disease categories. 
Our approach significantly improves the diversity and\nquality of generated images, contributing to more equitable skin disease\ndetection in clinical settings.\n","authors":["Ruichen Zhang","Yuguang Yao","Zhen Tan","Zhiming Li","Pan Wang","Huan Liu","Jingtong Hu","Sijia Liu","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2410.22551v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24060v1","updated":"2024-10-31T15:57:04Z","published":"2024-10-31T15:57:04Z","title":"Understanding Generalizability of Diffusion Models Requires Rethinking\n the Hidden Gaussian Structure","summary":" In this work, we study the generalizability of diffusion models by looking\ninto the hidden properties of the learned score functions, which are\nessentially a series of deep denoisers trained on various noise levels. We\nobserve that as diffusion models transition from memorization to\ngeneralization, their corresponding nonlinear diffusion denoisers exhibit\nincreasing linearity. This discovery leads us to investigate the linear\ncounterparts of the nonlinear diffusion models, which are a series of linear\nmodels trained to match the function mappings of the nonlinear diffusion\ndenoisers. Surprisingly, these linear denoisers are approximately the optimal\ndenoisers for a multivariate Gaussian distribution characterized by the\nempirical mean and covariance of the training dataset. This finding implies\nthat diffusion models have the inductive bias towards capturing and utilizing\nthe Gaussian structure (covariance information) of the training dataset for\ndata generation. We empirically demonstrate that this inductive bias is a\nunique property of diffusion models in the generalization regime, which becomes\nincreasingly evident when the model's capacity is relatively small compared to\nthe training dataset size. In the case that the model is highly\noverparameterized, this inductive bias emerges during the initial training\nphases before the model fully memorizes its training data. Our study provides\ncrucial insights into understanding the notable strong generalization\nphenomenon recently observed in real-world diffusion models.\n","authors":["Xiang Li","Yixiang Dai","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2410.24060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08557v2","updated":"2024-10-31T15:52:52Z","published":"2023-11-14T21:39:15Z","title":"Low-light Pedestrian Detection in Visible and Infrared Image Feeds:\n Issues and Challenges","summary":" Pedestrian detection has become a cornerstone for several high-level tasks,\nincluding autonomous driving, intelligent transportation, and traffic\nsurveillance. There are several works focussed on pedestrian detection using\nvisible images, mainly in the daytime. However, this task is very intriguing\nwhen the environmental conditions change to poor lighting or nighttime.\nRecently, new ideas have been spurred to use alternative sources, such as Far\nInfraRed (FIR) temperature sensor feeds for detecting pedestrians in low-light\nconditions. This study reviews recent developments in low-light pedestrian\ndetection approaches. It systematically categorizes and analyses various\nalgorithms from region-based to non-region-based and graph-based learning\nmethodologies by highlighting their methodologies, implementation issues, and\nchallenges. 
It also outlines the key benchmark datasets that can be used for\nresearch and development of advanced pedestrian detection algorithms,\nparticularly in low-light situations.\n","authors":["Thangarajah Akilan","Hrishikesh Vachhani"],"pdf_url":"https://arxiv.org/pdf/2311.08557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24055v1","updated":"2024-10-31T15:48:36Z","published":"2024-10-31T15:48:36Z","title":"Advanced Predictive Quality Assessment for Ultrasonic Additive\n Manufacturing with Deep Learning Model","summary":" Ultrasonic Additive Manufacturing (UAM) employs ultrasonic welding to bond\nsimilar or dissimilar metal foils to a substrate, resulting in solid,\nconsolidated metal components. However, certain processing conditions can lead\nto inter-layer defects, affecting the final product's quality. This study\ndevelops a method to monitor in-process quality using deep learning-based\nconvolutional neural networks (CNNs). The CNN models were evaluated on their\nability to classify samples with and without embedded thermocouples across five\npower levels (300W, 600W, 900W, 1200W, 1500W) using thermal images with\nsupervised labeling. Four distinct CNN classification models were created for\ndifferent scenarios including without (baseline) and with thermocouples, only\nwithout thermocouples across power levels, only with thermocouples across power\nlevels, and combined without and with thermocouples across power levels. The\nmodels achieved 98.29% accuracy on combined baseline and thermocouple images,\n97.10% for baseline images across power levels, 97.43% for thermocouple images,\nand 97.27% for both types across power levels. The high accuracy, above 97%,\ndemonstrates the system's effectiveness in identifying and classifying\nconditions within the UAM process, providing a reliable tool for quality\nassurance and process control in manufacturing environments.\n","authors":["Lokendra Poudel","Sushant Jha","Ryan Meeker","Duy-Nhat Phan","Rahul Bhowmik"],"pdf_url":"https://arxiv.org/pdf/2410.24055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16129v2","updated":"2024-10-31T15:46:45Z","published":"2024-06-23T15:03:35Z","title":"UDHF2-Net: Uncertainty-diffusion-model-based High-Frequency TransFormer\n Network for Remotely Sensed Imagery Interpretation","summary":" Remotely sensed imagery interpretation (RSII) faces the three major problems:\n(1) objective representation of spatial distribution patterns; (2) edge\nuncertainty problem caused by downsampling encoder and intrinsic edge noises\n(e.g., mixed pixel and edge occlusion etc.); and (3) false detection problem\ncaused by geometric registration error in change detection. To solve the\naforementioned problems, uncertainty-diffusion-model-based high-Frequency\nTransFormer network (UDHF2-Net) is the first to be proposed, whose\nsuperiorities are as follows: (1) a spatially-stationary-and-non-stationary\nhigh-frequency connection paradigm (SHCP) is proposed to enhance the\ninteraction of spatially frequency-wise stationary and non-stationary features\nto yield high-fidelity edge extraction result. 
Inspired by HRFormer, SHCP\nproposes high-frequency-wise stream to replace high-resolution-wise stream in\nHRFormer through the whole encoder-decoder process with parallel frequency-wise\nhigh-to-low streams, so it improves the edge extraction accuracy by\ncontinuously remaining high-frequency information; (2) a\nmask-and-geo-knowledge-based uncertainty diffusion module (MUDM), which is a\nself-supervised learning strategy, is proposed to improve the edge accuracy of\nextraction and change detection by gradually removing the simulated spectrum\nnoises based on geo-knowledge and the generated diffused spectrum noises; (3) a\nfrequency-wise semi-pseudo-Siamese UDHF2-Net is the first to be proposed to\nbalance accuracy and complexity for change detection. Besides the\naforementioned spectrum noises in semantic segmentation, MUDM is also a\nself-supervised learning strategy to effectively reduce the edge false change\ndetection from the generated imagery with geometric registration error.\n","authors":["Pengfei Zhang","Chang Li","Yongjun Zhang","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2406.16129v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09168v2","updated":"2024-10-31T15:44:36Z","published":"2024-06-13T14:30:35Z","title":"SR-CACO-2: A Dataset for Confocal Fluorescence Microscopy Image\n Super-Resolution","summary":" Confocal fluorescence microscopy is one of the most accessible and widely\nused imaging techniques for the study of biological processes at the cellular\nand subcellular levels. Scanning confocal microscopy allows the capture of\nhigh-quality images from thick three-dimensional (3D) samples, yet suffers from\nwell-known limitations such as photobleaching and phototoxicity of specimens\ncaused by intense light exposure, limiting its applications. Cellular damage\ncan be alleviated by changing imaging parameters to reduce light exposure,\noften at the expense of image quality. Machine/deep learning methods for\nsingle-image super-resolution (SISR) can be applied to restore image quality by\nupscaling lower-resolution (LR) images to yield high-resolution images (HR).\nThese SISR methods have been successfully applied to photo-realistic images due\npartly to the abundance of publicly available data. In contrast, the lack of\npublicly available data partly limits their application and success in scanning\nconfocal microscopy. In this paper, we introduce a large scanning confocal\nmicroscopy dataset named SR-CACO-2 that is comprised of low- and\nhigh-resolution image pairs marked for three different fluorescent markers. It\nallows the evaluation of performance of SISR methods on three different\nupscaling levels (X2, X4, X8). SR-CACO-2 contains the human epithelial cell\nline Caco-2 (ATCC HTB-37), and it is composed of 2,200 unique images, captured\nwith four resolutions and three markers, forming 9,937 image patches for SISR\nmethods. We provide benchmarking results for 16 state-of-the-art methods of the\nmain SISR families. Results show that these methods have limited success in\nproducing high-resolution textures. The dataset is freely accessible under a\nCreative Commons license (CC BY-NC-SA 4.0). 
Our dataset, code and pretrained\nweights for SISR methods are available: https://github.com/sbelharbi/sr-caco-2.\n","authors":["Soufiane Belharbi","Mara KM Whitford","Phuong Hoang","Shakeeb Murtaza","Luke McCaffrey","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2406.09168v2.pdf","comment":"27 pages, 15 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24046v1","updated":"2024-10-31T15:42:24Z","published":"2024-10-31T15:42:24Z","title":"Deep Learning with HM-VGG: AI Strategies for Multi-modal Image Analysis","summary":" This study introduces the Hybrid Multi-modal VGG (HM-VGG) model, a\ncutting-edge deep learning approach for the early diagnosis of glaucoma. The\nHM-VGG model utilizes an attention mechanism to process Visual Field (VF) data,\nenabling the extraction of key features that are vital for identifying early\nsigns of glaucoma. Despite the common reliance on large annotated datasets, the\nHM-VGG model excels in scenarios with limited data, achieving remarkable\nresults with small sample sizes. The model's performance is underscored by its\nhigh metrics in Precision, Accuracy, and F1-Score, indicating its potential for\nreal-world application in glaucoma detection. The paper also discusses the\nchallenges associated with ophthalmic image analysis, particularly the\ndifficulty of obtaining large volumes of annotated data. It highlights the\nimportance of moving beyond single-modality data, such as VF or Optical\nCoherence Tomography (OCT) images alone, to a multimodal approach that can\nprovide a richer, more comprehensive dataset. This integration of different\ndata types is shown to significantly enhance diagnostic accuracy. The HM- VGG\nmodel offers a promising tool for doctors, streamlining the diagnostic process\nand improving patient outcomes. Furthermore, its applicability extends to\ntelemedicine and mobile healthcare, making diagnostic services more accessible.\nThe research presented in this paper is a significant step forward in the field\nof medical image processing and has profound implications for clinical\nophthalmology.\n","authors":["Junliang Du","Yiru Cang","Tong Zhou","Jiacheng Hu","Weijie He"],"pdf_url":"https://arxiv.org/pdf/2410.24046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24037v1","updated":"2024-10-31T15:34:49Z","published":"2024-10-31T15:34:49Z","title":"TPC: Test-time Procrustes Calibration for Diffusion-based Human Image\n Animation","summary":" Human image animation aims to generate a human motion video from the inputs\nof a reference human image and a target motion video. Current diffusion-based\nimage animation systems exhibit high precision in transferring human identity\ninto targeted motion, yet they still exhibit irregular quality in their\noutputs. Their optimal precision is achieved only when the physical\ncompositions (i.e., scale and rotation) of the human shapes in the reference\nimage and target pose frame are aligned. In the absence of such alignment,\nthere is a noticeable decline in fidelity and consistency. Especially, in\nreal-world environments, this compositional misalignment commonly occurs,\nposing significant challenges to the practical usage of current systems. To\nthis end, we propose Test-time Procrustes Calibration (TPC), which enhances the\nrobustness of diffusion-based image animation systems by maintaining optimal\nperformance even when faced with compositional misalignment, effectively\naddressing real-world scenarios. 
The TPC provides a calibrated reference image\nfor the diffusion model, enhancing its capability to understand the\ncorrespondence between human shapes in the reference and target images. Our\nmethod is simple and can be applied to any diffusion-based image animation\nsystem in a model-agnostic manner, improving the effectiveness at test time\nwithout additional training.\n","authors":["Sunjae Yoon","Gwanhyeong Koo","Younghwan Lee","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2410.24037v1.pdf","comment":"24 pages, 16 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.24034v1","updated":"2024-10-31T15:32:14Z","published":"2024-10-31T15:32:14Z","title":"Handwriting Recognition in Historical Documents with Multimodal LLM","summary":" There is an immense quantity of historical and cultural documentation that\nexists only as handwritten manuscripts. At the same time, performing OCR across\nscripts and different handwriting styles has proven to be an enormously\ndifficult problem relative to the process of digitizing print. While recent\nTransformer based models have achieved relatively strong performance, they rely\nheavily on manually transcribed training data and have difficulty generalizing\nacross writers. Multimodal LLMs, such as GPT-4v and Gemini, have demonstrated\neffectiveness in performing OCR and computer vision tasks with few shot\nprompting. In this paper, I evaluate the accuracy of handwritten document\ntranscriptions generated by Gemini against the current state of the art\nTransformer based methods.\n Keywords: Optical Character Recognition, Multimodal Language Models, Cultural\nPreservation, Mass digitization, Handwriting Recognition\n","authors":["Lucian Li"],"pdf_url":"https://arxiv.org/pdf/2410.24034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24031v1","updated":"2024-10-31T15:29:51Z","published":"2024-10-31T15:29:51Z","title":"A Multi-Modal Approach for Face Anti-Spoofing in Non-Calibrated Systems\n using Disparity Maps","summary":" Face recognition technologies are increasingly used in various applications,\nyet they are vulnerable to face spoofing attacks. These spoofing attacks often\ninvolve unique 3D structures, such as printed papers or mobile device screens.\nAlthough stereo-depth cameras can detect such attacks effectively, their\nhigh cost limits their widespread adoption. Conversely, two-sensor systems\nwithout extrinsic calibration offer a cost-effective alternative but are unable\nto calculate depth using stereo techniques. In this work, we propose a method\nto overcome this challenge by leveraging facial attributes to derive disparity\ninformation and estimate relative depth for anti-spoofing purposes, using\nnon-calibrated systems. We introduce a multi-modal anti-spoofing model, coined\nDisparity Model, that incorporates created disparity maps as a third modality\nalongside the two original sensor modalities. We demonstrate the effectiveness\nof the Disparity Model in countering various spoof attacks using a\ncomprehensive dataset collected from the Intel RealSense ID Solution F455. Our\nmethod outperformed existing methods in the literature, achieving an Equal\nError Rate (EER) of 1.71% and a False Negative Rate (FNR) of 2.77% at a False\nPositive Rate (FPR) of 1%. These errors are lower by 2.45% and 7.94% than the\nerrors of the best comparison method, respectively. Additionally, we introduce\na model ensemble that addresses 3D spoof attacks as well, achieving an EER of\n2.04% and an FNR of 3.83% at an FPR of 1%. 
Overall, our work provides a\nstate-of-the-art solution for the challenging task of anti-spoofing in\nnon-calibrated systems that lack depth information.\n","authors":["Ariel Larey","Eyal Rond","Omer Achrack"],"pdf_url":"https://arxiv.org/pdf/2410.24031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24018v1","updated":"2024-10-31T15:20:43Z","published":"2024-10-31T15:20:43Z","title":"Bayesian-guided Label Mapping for Visual Reprogramming","summary":" Visual reprogramming (VR) leverages the intrinsic capabilities of pretrained\nvision models by adapting their input or output interfaces to solve downstream\ntasks whose labels (i.e., downstream labels) might be totally different from\nthe labels associated with the pretrained models (i.e., pretrained labels).\nWhen adapting the output interface, label mapping methods transform the\npretrained labels to downstream labels by establishing a gradient-free\none-to-one correspondence between the two sets of labels. However, in this\npaper, we reveal that one-to-one mappings may overlook the complex relationship\nbetween pretrained and downstream labels. Motivated by this observation, we\npropose a Bayesian-guided Label Mapping (BLM) method. BLM constructs an\niteratively-updated probabilistic label mapping matrix, with each element\nquantifying a pairwise relationship between pretrained and downstream labels.\nThe assignment of values to the constructed matrix is guided by Bayesian\nconditional probability, considering the joint distribution of the downstream\nlabels and the labels predicted by the pretrained model on downstream samples.\nExperiments conducted on both pretrained vision models (e.g., ResNeXt) and\nvision-language models (e.g., CLIP) demonstrate the superior performance of BLM\nover existing label mapping methods. The success of BLM also offers a\nprobabilistic lens through which to understand and analyze the effectiveness of\nVR. Our code is available at https://github.com/tmlr-group/BayesianLM.\n","authors":["Chengyi Cai","Zesheng Ye","Lei Feng","Jianzhong Qi","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.24018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24015v1","updated":"2024-10-31T15:17:14Z","published":"2024-10-31T15:17:14Z","title":"Unveiling Synthetic Faces: How Synthetic Datasets Can Expose Real\n Identities","summary":" Synthetic data generation is gaining increasing popularity in different\ncomputer vision applications. Existing state-of-the-art face recognition models\nare trained using large-scale face datasets, which are crawled from the\nInternet and raise privacy and ethical concerns. To address such concerns,\nseveral works have proposed generating synthetic face datasets to train face\nrecognition models. However, these methods depend on generative models, which\nare trained on real face images. In this work, we design a simple yet effective\nmembership inference attack to systematically study if any of the existing\nsynthetic face recognition datasets leak any information from the real data\nused to train the generator model. We provide an extensive study on 6\nstate-of-the-art synthetic face recognition datasets, and show that in all\nthese synthetic datasets, several samples from the original real dataset are\nleaked. To our knowledge, this paper is the first work which shows the leakage\nfrom training data of generator models into the generated synthetic face\nrecognition datasets. 
Our study demonstrates privacy pitfalls in synthetic face\nrecognition datasets and paves the way for future studies on generating\nresponsible synthetic face datasets.\n","authors":["Hatef Otroshi Shahreza","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2410.24015v1.pdf","comment":"Accepted in NeurIPS 2024 Workshop on New Frontiers in Adversarial\n Machine Learning"},{"id":"http://arxiv.org/abs/2410.24010v1","updated":"2024-10-31T15:10:38Z","published":"2024-10-31T15:10:38Z","title":"Re-assembling the past: The RePAIR dataset and benchmark for real world\n 2D and 3D puzzle solving","summary":" This paper proposes the RePAIR dataset that represents a challenging\nbenchmark to test modern computational and data driven methods for\npuzzle-solving and reassembly tasks. Our dataset has unique properties that are\nuncommon to current benchmarks for 2D and 3D puzzle solving. The fragments and\nfractures are realistic, caused by a collapse of a fresco during a World War II\nbombing at the Pompeii archaeological park. The fragments are also eroded and\nhave missing pieces with irregular shapes and different dimensions, challenging\nfurther the reassembly algorithms. The dataset is multi-modal providing high\nresolution images with characteristic pictorial elements, detailed 3D scans of\nthe fragments and meta-data annotated by the archaeologists. Ground truth has\nbeen generated through several years of unceasing fieldwork, including the\nexcavation and cleaning of each fragment, followed by manual puzzle solving by\narchaeologists of a subset of approx. 1000 pieces among the 16000 available.\nAfter digitizing all the fragments in 3D, a benchmark was prepared to challenge\ncurrent reassembly and puzzle-solving methods that often solve more simplistic\nsynthetic scenarios. The tested baselines show that there clearly exists a gap\nto fill in solving this computationally complex problem.\n","authors":["Theodore Tsesmelis","Luca Palmieri","Marina Khoroshiltseva","Adeela Islam","Gur Elkin","Ofir Itzhak Shahar","Gianluca Scarpellini","Stefano Fiorini","Yaniv Ohayon","Nadav Alali","Sinem Aslan","Pietro Morerio","Sebastiano Vascon","Elena Gravina","Maria Cristina Napolitano","Giuseppe Scarpati","Gabriel Zuchtriegel","Alexandra Spühler","Michel E. Fuchs","Stuart James","Ohad Ben-Shahar","Marcello Pelillo","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2410.24010v1.pdf","comment":"NeurIPS 2024, Track Datasets and Benchmarks, 10 pages"},{"id":"http://arxiv.org/abs/2410.24006v1","updated":"2024-10-31T15:09:36Z","published":"2024-10-31T15:09:36Z","title":"DiffPAD: Denoising Diffusion-based Adversarial Patch Decontamination","summary":" In the ever-evolving adversarial machine learning landscape, developing\neffective defenses against patch attacks has become a critical challenge,\nnecessitating reliable solutions to safeguard real-world AI systems. Although\ndiffusion models have shown remarkable capacity in image synthesis and have\nbeen recently utilized to counter $\\ell_p$-norm bounded attacks, their\npotential in mitigating localized patch attacks remains largely underexplored.\nIn this work, we propose DiffPAD, a novel framework that harnesses the power of\ndiffusion models for adversarial patch decontamination. DiffPAD first performs\nsuper-resolution restoration on downsampled input images, then adopts\nbinarization, dynamic thresholding scheme and sliding window for effective\nlocalization of adversarial patches. 
Such a design is inspired by the\ntheoretically derived correlation between patch size and diffusion restoration\nerror that is generalized across diverse patch attack scenarios. Finally,\nDiffPAD applies inpainting techniques to the original input images with the\nestimated patch region being masked. By integrating closed-form solutions for\nsuper-resolution restoration and image inpainting into the conditional reverse\nsampling process of a pre-trained diffusion model, DiffPAD obviates the need\nfor text guidance or fine-tuning. Through comprehensive experiments, we\ndemonstrate that DiffPAD not only achieves state-of-the-art adversarial\nrobustness against patch attacks but also excels in recovering naturalistic\nimages without patch remnants.\n","authors":["Jia Fu","Xiao Zhang","Sepideh Pashami","Fatemeh Rahimian","Anders Holst"],"pdf_url":"https://arxiv.org/pdf/2410.24006v1.pdf","comment":"Accepted to 2025 IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV)"},{"id":"http://arxiv.org/abs/2405.10723v2","updated":"2024-10-31T15:05:50Z","published":"2024-05-17T12:11:58Z","title":"Eddeep: Fast eddy-current distortion correction for diffusion MRI with\n deep learning","summary":" Modern diffusion MRI sequences commonly acquire a large number of volumes\nwith diffusion sensitization gradients of differing strengths or directions.\nSuch sequences rely on echo-planar imaging (EPI) to achieve reasonable scan\nduration. However, EPI is vulnerable to off-resonance effects, leading to\ntissue susceptibility and eddy-current induced distortions. The latter is\nparticularly problematic because it causes misalignment between volumes,\ndisrupting downstream modelling and analysis. The essential correction of eddy\ndistortions is typically done post-acquisition, with image registration.\nHowever, this is non-trivial because correspondence between volumes can be\nseverely disrupted due to volume-specific signal attenuations induced by\nvarying directions and strengths of the applied gradients. This challenge has\nbeen successfully addressed by the popular FSL~Eddy tool but at considerable\ncomputational cost. We propose an alternative approach, leveraging recent\nadvances in image processing enabled by deep learning (DL). It consists of two\nconvolutional neural networks: 1) An image translator to restore correspondence\nbetween images; 2) A registration model to align the translated images. Results\ndemonstrate comparable distortion estimates to FSL~Eddy, while requiring only\nmodest training sample sizes. This work, to the best of our knowledge, is the\nfirst to tackle this problem with deep learning. Together with recently\ndeveloped DL-based susceptibility correction techniques, they pave the way for\nreal-time preprocessing of diffusion MRI, facilitating its wider uptake in the\nclinic.\n","authors":["Antoine Legouhy","Ross Callaghan","Whitney Stee","Philippe Peigneux","Hojjat Azadbakht","Hui Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.10723v2.pdf","comment":"accepted in MICCAI 2024 conference"},{"id":"http://arxiv.org/abs/2410.24002v1","updated":"2024-10-31T15:02:16Z","published":"2024-10-31T15:02:16Z","title":"Assessing the Efficacy of Classical and Deep Neuroimaging Biomarkers in\n Early Alzheimer's Disease Diagnosis","summary":" Alzheimer's disease (AD) is the leading cause of dementia, and its early\ndetection is crucial for effective intervention, yet current diagnostic methods\noften fall short in sensitivity and specificity. 
This study aims to detect\nsignificant indicators of early AD by extracting and integrating various\nimaging biomarkers, including radiomics, hippocampal texture descriptors,\ncortical thickness measurements, and deep learning features. We analyze\nstructural magnetic resonance imaging (MRI) scans from the Alzheimer's Disease\nNeuroimaging Initiative (ADNI) cohorts, utilizing comprehensive image analysis\nand machine learning techniques. Our results show that combining multiple\nbiomarkers significantly improves detection accuracy. Radiomics and texture\nfeatures emerged as the most effective predictors for early AD, achieving AUCs\nof 0.88 and 0.72 for AD and MCI detection, respectively. Although deep learning\nfeatures proved to be less effective than traditional approaches, incorporating\nage with other biomarkers notably enhanced MCI detection performance.\nAdditionally, our findings emphasize the continued importance of classical\nimaging biomarkers in the face of modern deep-learning approaches, providing a\nrobust framework for early AD diagnosis.\n","authors":["Milla E. Nielsen","Mads Nielsen","Mostafa Mehdipour Ghazi"],"pdf_url":"https://arxiv.org/pdf/2410.24002v1.pdf","comment":"SPIE Medical Imaging (MI25)"},{"id":"http://arxiv.org/abs/2410.24001v1","updated":"2024-10-31T15:02:05Z","published":"2024-10-31T15:02:05Z","title":"ImOV3D: Learning Open-Vocabulary Point Clouds 3D Object Detection from\n Only 2D Images","summary":" Open-vocabulary 3D object detection (OV-3Det) aims to generalize beyond the\nlimited number of base categories labeled during the training phase. The\nbiggest bottleneck is the scarcity of annotated 3D data, whereas 2D image\ndatasets are abundant and richly annotated. Consequently, it is intuitive to\nleverage the wealth of annotations in 2D images to alleviate the inherent data\nscarcity in OV-3Det. In this paper, we push the task setup to its limits by\nexploring the potential of using solely 2D images to learn OV-3Det. The major\nchallenges for this setup is the modality gap between training images and\ntesting point clouds, which prevents effective integration of 2D knowledge into\nOV-3Det. To address this challenge, we propose a novel framework ImOV3D to\nleverage pseudo multimodal representation containing both images and point\nclouds (PC) to close the modality gap. The key of ImOV3D lies in flexible\nmodality conversion where 2D images can be lifted into 3D using monocular depth\nestimation and can also be derived from 3D scenes through rendering. This\nallows unifying both training images and testing point clouds into a common\nimage-PC representation, encompassing a wealth of 2D semantic information and\nalso incorporating the depth and structural characteristics of 3D spatial data.\nWe carefully conduct such conversion to minimize the domain gap between\ntraining and test cases. Extensive experiments on two benchmark datasets,\nSUNRGBD and ScanNet, show that ImOV3D significantly outperforms existing\nmethods, even in the absence of ground truth 3D training data. With the\ninclusion of a minimal amount of real 3D data for fine-tuning, the performance\nalso significantly surpasses previous state-of-the-art. Codes and pre-trained\nmodels are released on the https://github.com/yangtiming/ImOV3D.\n","authors":["Timing Yang","Yuanliang Ju","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2410.24001v1.pdf","comment":"Accepted by NeurIPS 2024. 
Code link\n https://github.com/yangtiming/ImOV3D"},{"id":"http://arxiv.org/abs/2211.15656v3","updated":"2024-10-31T15:01:41Z","published":"2022-11-28T18:59:02Z","title":"SuperFusion: Multilevel LiDAR-Camera Fusion for Long-Range HD Map\n Generation","summary":" High-definition (HD) semantic map generation of the environment is an\nessential component of autonomous driving. Existing methods have achieved good\nperformance in this task by fusing different sensor modalities, such as LiDAR\nand camera. However, current works are based on raw data or network\nfeature-level fusion and only consider short-range HD map generation, limiting\ntheir deployment to realistic autonomous driving applications. In this paper,\nwe focus on the task of building the HD maps in both short ranges, i.e., within\n30 m, and also predicting long-range HD maps up to 90 m, which is required by\ndownstream path planning and control tasks to improve the smoothness and safety\nof autonomous driving. To this end, we propose a novel network named\nSuperFusion, exploiting the fusion of LiDAR and camera data at multiple levels.\nWe use LiDAR depth to improve image depth estimation and use image features to\nguide long-range LiDAR feature prediction. We benchmark our SuperFusion on the\nnuScenes dataset and a self-recorded dataset and show that it outperforms the\nstate-of-the-art baseline methods with large margins on all intervals.\nAdditionally, we apply the generated HD map to a downstream path planning task,\ndemonstrating that the long-range HD maps predicted by our method can lead to\nbetter path planning for autonomous vehicles. Our code has been released at\nhttps://github.com/haomo-ai/SuperFusion.\n","authors":["Hao Dong","Weihao Gu","Xianjing Zhang","Jintao Xu","Rui Ai","Huimin Lu","Juho Kannala","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2211.15656v3.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2407.03550v2","updated":"2024-10-31T14:51:32Z","published":"2024-07-04T00:07:50Z","title":"CoMix: A Comprehensive Benchmark for Multi-Task Comic Understanding","summary":" The comic domain is rapidly advancing with the development of single-page\nanalysis and synthesis models. However, evaluation metrics and datasets lag\nbehind, often limited to small-scale or single-style test sets. We introduce a\nnovel benchmark, CoMix, designed to evaluate the multi-task capabilities of\nmodels in comic analysis. Unlike existing benchmarks that focus on isolated\ntasks such as object detection or text recognition, CoMix addresses a broader\nrange of tasks including object detection, speaker identification, character\nre-identification, reading order, and multi-modal reasoning tasks like\ncharacter naming and dialogue generation. Our benchmark comprises three\nexisting datasets with expanded annotations to support multi-task evaluation.\nTo mitigate the over-representation of manga-style data, we have incorporated a\nnew dataset of carefully selected American comic-style books, thereby enriching\nthe diversity of comic styles. CoMix is designed to assess pre-trained models\nin zero-shot and limited fine-tuning settings, probing their transfer\ncapabilities across different comic styles and tasks. The validation split of\nthe benchmark is publicly available for research purposes, and an evaluation\nserver for the held-out test split is also provided. 
Comparative results\nbetween human performance and state-of-the-art models reveal a significant\nperformance gap, highlighting substantial opportunities for advancements in\ncomic understanding. The dataset, baseline models, and code are accessible at\nhttps://github.com/emanuelevivoli/CoMix-dataset. This initiative sets a new\nstandard for comprehensive comic analysis, providing the community with a\ncommon benchmark for evaluation on a large and varied set.\n","authors":["Emanuele Vivoli","Marco Bertini","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2407.03550v2.pdf","comment":"Accepted at NeurIPS 2024 (D&B)"},{"id":"http://arxiv.org/abs/2410.23991v1","updated":"2024-10-31T14:50:48Z","published":"2024-10-31T14:50:48Z","title":"Localization, balance and affinity: a stronger multifaceted\n collaborative salient object detector in remote sensing images","summary":" Despite significant advancements in salient object detection(SOD) in optical\nremote sensing images(ORSI), challenges persist due to the intricate edge\nstructures of ORSIs and the complexity of their contextual relationships.\nCurrent deep learning approaches encounter difficulties in accurately\nidentifying boundary features and lack efficiency in collaboratively modeling\nthe foreground and background by leveraging contextual features. To address\nthese challenges, we propose a stronger multifaceted collaborative salient\nobject detector in ORSIs, termed LBA-MCNet, which incorporates aspects of\nlocalization, balance, and affinity. The network focuses on accurately locating\ntargets, balancing detailed features, and modeling image-level global context\ninformation. Specifically, we design the Edge Feature Adaptive Balancing and\nAdjusting(EFABA) module for precise edge localization, using edge features to\nguide attention to boundaries and preserve spatial details. Moreover, we design\nthe Global Distributed Affinity Learning(GDAL) module to model global context.\nIt captures global context by generating an affinity map from the encoders\nfinal layer, ensuring effective modeling of global patterns. Additionally, deep\nsupervision during deconvolution further enhances feature representation.\nFinally, we compared with 28 state of the art approaches on three publicly\navailable datasets. The results clearly demonstrate the superiority of our\nmethod.\n","authors":["Yakun Xie","Suning Liu","Hongyu Chen","Shaohan Cao","Huixin Zhang","Dejun Feng","Qian Wan","Jun Zhu","Qing Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.23991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15856v2","updated":"2024-10-31T14:48:23Z","published":"2023-12-26T02:50:42Z","title":"SERF: Fine-Grained Interactive 3D Segmentation and Editing with Radiance\n Fields","summary":" Although significant progress has been made in the field of 2D-based\ninteractive editing, fine-grained 3D-based interactive editing remains\nrelatively unexplored. This limitation can be attributed to two main\nchallenges: the lack of an efficient 3D representation robust to different\nmodifications and the absence of an effective 3D interactive segmentation\nmethod. In this paper, we introduce a novel fine-grained interactive 3D\nsegmentation and editing algorithm with radiance fields, which we refer to as\nSERF. Our method entails creating a neural mesh representation by integrating\nmulti-view algorithms with pre-trained 2D models. 
Building upon this\nrepresentation, we introduce a novel surface rendering technique that preserves\nlocal information and is robust to deformation. Moreover, this representation\nforms the basis for achieving accurate and interactive 3D segmentation without\nrequiring 3D supervision. Harnessing this representation facilitates a range of\ninteractive 3D editing operations, encompassing tasks such as interactive\ngeometry editing and texture painting. Extensive experiments and visualization\nexamples of editing on both real and synthetic data demonstrate the superiority\nof our method on representation quality and editing ability.\n","authors":["Kaichen Zhou","Lanqing Hong","Enze Xie","Yongxin Yang","Zhenguo Li","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22629v2","updated":"2024-10-31T14:44:44Z","published":"2024-10-30T01:22:37Z","title":"CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable\n Remote Sensing Semantic Segmentation","summary":" The field of Remote Sensing Domain Generalization (RSDG) has emerged as a\ncritical and valuable research frontier, focusing on developing models that\ngeneralize effectively across diverse scenarios. Despite the substantial domain\ngaps in RS images that are characterized by variabilities such as location,\nwavelength, and sensor type, research in this area remains underexplored: (1)\nCurrent cross-domain methods primarily focus on Domain Adaptation (DA), which\nadapts models to predefined domains rather than to unseen ones; (2) Few studies\ntargeting the RSDG issue, especially for semantic segmentation tasks, where\nexisting models are developed for specific unknown domains, struggling with\nissues of underfitting on other unknown scenarios; (3) Existing RS foundation\nmodels tend to prioritize in-domain performance over cross-domain\ngeneralization. To this end, we introduce the first vision foundation model for\nRSDG semantic segmentation, CrossEarth. CrossEarth demonstrates strong\ncross-domain generalization through a specially designed data-level Earth-Style\nInjection pipeline and a model-level Multi-Task Training pipeline. In addition,\nfor the semantic segmentation task, we have curated an RSDG benchmark\ncomprising 28 cross-domain settings across various regions, spectral bands,\nplatforms, and climates, providing a comprehensive framework for testing the\ngeneralizability of future RSDG models. Extensive experiments on this benchmark\ndemonstrate the superiority of CrossEarth over existing state-of-the-art\nmethods.\n","authors":["Ziyang Gong","Zhixiang Wei","Di Wang","Xianzheng Ma","Hongruixuan Chen","Yuru Jia","Yupeng Deng","Zhenming Ji","Xiangwei Zhu","Naoto Yokoya","Jing Zhang","Bo Du","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.22629v2.pdf","comment":"The codes and models will be available at\n https://github.com/Cuzyoung/CrossEarth"},{"id":"http://arxiv.org/abs/2410.23988v1","updated":"2024-10-31T14:42:26Z","published":"2024-10-31T14:42:26Z","title":"JEMA: A Joint Embedding Framework for Scalable Co-Learning with\n Multimodal Alignment","summary":" This work introduces JEMA (Joint Embedding with Multimodal Alignment), a\nnovel co-learning framework tailored for laser metal deposition (LMD), a\npivotal process in metal additive manufacturing. As Industry 5.0 gains traction\nin industrial applications, efficient process monitoring becomes increasingly\ncrucial. 
However, limited data and the opaque nature of AI present challenges\nfor its application in an industrial setting. JEMA addresses these challenges by\nleveraging multimodal data, including multi-view images and metadata such as\nprocess parameters, to learn transferable semantic representations. By applying\na supervised contrastive loss function, JEMA enables robust learning and\nsubsequent process monitoring using only the primary modality, simplifying\nhardware requirements and computational overhead. We investigate the\neffectiveness of JEMA in LMD process monitoring, focusing specifically on its\ngeneralization to downstream tasks such as melt pool geometry prediction,\nachieved without extensive fine-tuning. Our empirical evaluation demonstrates\nthe high scalability and performance of JEMA, particularly when combined with\nVision Transformer models. We report an 8% increase in performance in\nmultimodal settings and a 1% improvement in unimodal settings compared to\nsupervised contrastive learning. Additionally, the learned embedding\nrepresentation enables the prediction of metadata, enhancing interpretability\nand making possible the assessment of the added metadata's contributions. Our\nframework lays the foundation for integrating multisensor data with metadata,\nenabling diverse downstream tasks within the LMD domain and beyond.\n","authors":["Joao Sousa","Roya Darabi","Armando Sousa","Frank Brueckner","Luís Paulo Reis","Ana Reis"],"pdf_url":"https://arxiv.org/pdf/2410.23988v1.pdf","comment":"26 pages, 14 figures"},{"id":"http://arxiv.org/abs/2312.14556v2","updated":"2024-10-31T14:37:59Z","published":"2023-12-22T09:29:45Z","title":"CaptainCook4D: A Dataset for Understanding Errors in Procedural\n Activities","summary":" Following step-by-step procedures is an essential component of various\nactivities carried out by individuals in their daily lives. These procedures\nserve as a guiding framework that helps to achieve goals efficiently, whether\nit is assembling furniture or preparing a recipe. However, the complexity and\nduration of procedural activities inherently increase the likelihood of making\nerrors. Understanding such procedural activities from a sequence of frames is a\nchallenging task that demands an accurate interpretation of visual information\nand the ability to reason about the structure of the activity. To this end, we\ncollect a new egocentric 4D dataset, CaptainCook4D, comprising 384 recordings\n(94.5 hours) of people performing recipes in real kitchen environments. This\ndataset consists of two distinct types of activity: one in which participants\nadhere to the provided recipe instructions and another in which they deviate\nand induce errors. 
We provide 5.3K step annotations and 10K fine-grained action\nannotations and benchmark the dataset for the following tasks: supervised error\nrecognition, multistep localization, and procedure learning\n","authors":["Rohith Peddi","Shivvrat Arya","Bharath Challa","Likhitha Pallapothula","Akshay Vyas","Bhavya Gouripeddi","Jikai Wang","Qifan Zhang","Vasundhara Komaragiri","Eric Ragan","Nicholas Ruozzi","Yu Xiang","Vibhav Gogate"],"pdf_url":"https://arxiv.org/pdf/2312.14556v2.pdf","comment":"Accepted to the 2024 Neural Information Processing Systems Datasets\n and Benchmarks Track, Project Page:\n https://captaincook4d.github.io/captain-cook/"},{"id":"http://arxiv.org/abs/2407.12582v2","updated":"2024-10-31T14:37:42Z","published":"2024-07-17T14:09:46Z","title":"Embracing Events and Frames with Hierarchical Feature Refinement Network\n for Object Detection","summary":" In frame-based vision, object detection faces substantial performance\ndegradation under challenging conditions due to the limited sensing capability\nof conventional cameras. Event cameras output sparse and asynchronous events,\nproviding a potential solution to solve these problems. However, effectively\nfusing two heterogeneous modalities remains an open issue. In this work, we\npropose a novel hierarchical feature refinement network for event-frame fusion.\nThe core concept is the design of the coarse-to-fine fusion module, denoted as\nthe cross-modality adaptive feature refinement (CAFR) module. In the initial\nphase, the bidirectional cross-modality interaction (BCI) part facilitates\ninformation bridging from two distinct sources. Subsequently, the features are\nfurther refined by aligning the channel-level mean and variance in the two-fold\nadaptive feature refinement (TAFR) part. We conducted extensive experiments on\ntwo benchmarks: the low-resolution PKU-DDD17-Car dataset and the\nhigh-resolution DSEC dataset. Experimental results show that our method\nsurpasses the state-of-the-art by an impressive margin of $\\textbf{8.0}\\%$ on\nthe DSEC dataset. Besides, our method exhibits significantly better robustness\n(\\textbf{69.5}\\% versus \\textbf{38.7}\\%) when introducing 15 different\ncorruption types to the frame images. The code can be found at the link\n(https://github.com/HuCaoFighting/FRN).\n","authors":["Hu Cao","Zehua Zhang","Yan Xia","Xinyi Li","Jiahao Xia","Guang Chen","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2407.12582v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2410.22637v2","updated":"2024-10-31T14:35:31Z","published":"2024-10-30T02:04:23Z","title":"Consistency Diffusion Bridge Models","summary":" Diffusion models (DMs) have become the dominant paradigm of generative\nmodeling in a variety of domains by learning stochastic processes from noise to\ndata. Recently, diffusion denoising bridge models (DDBMs), a new formulation of\ngenerative modeling that builds stochastic processes between fixed data\nendpoints based on a reference diffusion process, have achieved empirical\nsuccess across tasks with coupled data distribution, such as image-to-image\ntranslation. However, DDBM's sampling process typically requires hundreds of\nnetwork evaluations to achieve decent performance, which may impede their\npractical deployment due to high computational demands. 
In this work, inspired\nby the recent advance of consistency models in DMs, we tackle this problem by\nlearning the consistency function of the probability-flow ordinary differential\nequation (PF-ODE) of DDBMs, which directly predicts the solution at a starting\nstep given any point on the ODE trajectory. Based on a dedicated general-form\nODE solver, we propose two paradigms: consistency bridge distillation and\nconsistency bridge training, which is flexible to apply on DDBMs with broad\ndesign choices. Experimental results show that our proposed method could sample\n$4\\times$ to $50\\times$ faster than the base DDBM and produce better visual\nquality given the same step in various tasks with pixel resolution ranging from\n$64 \\times 64$ to $256 \\times 256$, as well as supporting downstream tasks such\nas semantic interpolation in the data space.\n","authors":["Guande He","Kaiwen Zheng","Jianfei Chen","Fan Bao","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.22637v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23970v1","updated":"2024-10-31T14:25:55Z","published":"2024-10-31T14:25:55Z","title":"TrAct: Making First-layer Pre-Activations Trainable","summary":" We consider the training of the first layer of vision models and notice the\nclear relationship between pixel values and gradient update magnitudes: the\ngradients arriving at the weights of a first layer are by definition directly\nproportional to (normalized) input pixel values. Thus, an image with low\ncontrast has a smaller impact on learning than an image with higher contrast,\nand a very bright or very dark image has a stronger impact on the weights than\nan image with moderate brightness. In this work, we propose performing gradient\ndescent on the embeddings produced by the first layer of the model. However,\nswitching to discrete inputs with an embedding layer is not a reasonable option\nfor vision models. Thus, we propose the conceptual procedure of (i) a gradient\ndescent step on first layer activations to construct an activation proposal,\nand (ii) finding the optimal weights of the first layer, i.e., those weights\nwhich minimize the squared distance to the activation proposal. We provide a\nclosed form solution of the procedure and adjust it for robust stochastic\ntraining while computing everything efficiently. Empirically, we find that\nTrAct (Training Activations) speeds up training by factors between 1.25x and 4x\nwhile requiring only a small computational overhead. We demonstrate the utility\nof TrAct with different optimizers for a range of different vision models\nincluding convolutional and transformer architectures.\n","authors":["Felix Petersen","Christian Borgelt","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2410.23970v1.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23962v1","updated":"2024-10-31T14:14:30Z","published":"2024-10-31T14:14:30Z","title":"Image Synthesis with Class-Aware Semantic Diffusion Models for Surgical\n Scene Segmentation","summary":" Surgical scene segmentation is essential for enhancing surgical precision,\nyet it is frequently compromised by the scarcity and imbalance of available\ndata. To address these challenges, semantic image synthesis methods based on\ngenerative adversarial networks and diffusion models have been developed.\nHowever, these models often yield non-diverse images and fail to capture small,\ncritical tissue classes, limiting their effectiveness. 
In response, we propose\nthe Class-Aware Semantic Diffusion Model (CASDM), a novel approach which\nutilizes segmentation maps as conditions for image synthesis to tackle data\nscarcity and imbalance. Novel class-aware mean squared error and class-aware\nself-perceptual loss functions have been defined to prioritize critical, less\nvisible classes, thereby enhancing image quality and relevance. Furthermore, to\nour knowledge, we are the first to generate multi-class segmentation maps using\ntext prompts in a novel fashion to specify their contents. These maps are then\nused by CASDM to generate surgical scene images, enhancing datasets for\ntraining and validating segmentation models. Our evaluation, which assesses\nboth image quality and downstream segmentation performance, demonstrates the\nstrong effectiveness and generalisability of CASDM in producing realistic\nimage-map pairs, significantly advancing surgical scene segmentation across\ndiverse and challenging datasets.\n","authors":["Yihang Zhou","Rebecca Towning","Zaid Awad","Stamatia Giannarou"],"pdf_url":"https://arxiv.org/pdf/2410.23962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23946v1","updated":"2024-10-31T14:02:40Z","published":"2024-10-31T14:02:40Z","title":"MV-CC: Mask Enhanced Video Model for Remote Sensing Change Caption","summary":" Remote sensing image change caption (RSICC) aims to provide natural language\ndescriptions for bi-temporal remote sensing images. Since Change Caption (CC)\ntask requires both spatial and temporal features, previous works follow an\nencoder-fusion-decoder architecture. They use an image encoder to extract\nspatial features and the fusion module to integrate spatial features and\nextract temporal features, which leads to increasingly complex manual design of\nthe fusion module. In this paper, we introduce a novel video model-based\nparadigm without design of the fusion module and propose a Mask-enhanced Video\nmodel for Change Caption (MV-CC). Specifically, we use the off-the-shelf video\nencoder to simultaneously extract the temporal and spatial features of\nbi-temporal images. Furthermore, the types of changes in the CC are set based\non specific task requirements, and to enable the model to better focus on the\nregions of interest, we employ masks obtained from the Change Detection (CD)\nmethod to explicitly guide the CC model. Experimental results demonstrate that\nour proposed method can obtain better performance compared with other\nstate-of-the-art RSICC methods. The code is available at\nhttps://github.com/liuruixun/MV-CC.\n","authors":["Ruixun Liu","Kaiyu Li","Jiayi Song","Dongwei Sun","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2410.23946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03438v2","updated":"2024-10-31T13:41:19Z","published":"2024-10-04T13:52:22Z","title":"Dessie: Disentanglement for Articulated 3D Horse Shape and Pose\n Estimation from Images","summary":" In recent years, 3D parametric animal models have been developed to aid in\nestimating 3D shape and pose from images and video. While progress has been\nmade for humans, it's more challenging for animals due to limited annotated\ndata. To address this, we introduce the first method using synthetic data\ngeneration and disentanglement to learn to regress 3D shape and pose. 
Focusing\non horses, we use text-based texture generation and a synthetic data pipeline\nto create varied shapes, poses, and appearances, learning disentangled spaces.\nOur method, Dessie, surpasses existing 3D horse reconstruction methods and\ngeneralizes to other large animals like zebras, cows, and deer. See the project\nwebsite at: \\url{https://celiali.github.io/Dessie/}.\n","authors":["Ci Li","Yi Yang","Zehang Weng","Elin Hernlund","Silvia Zuffi","Hedvig Kjellström"],"pdf_url":"https://arxiv.org/pdf/2410.03438v2.pdf","comment":"ACCV2024"},{"id":"http://arxiv.org/abs/2410.23931v1","updated":"2024-10-31T13:41:16Z","published":"2024-10-31T13:41:16Z","title":"Manipulating Vehicle 3D Shapes through Latent Space Editing","summary":" Although 3D object editing has the potential to significantly influence\nvarious industries, recent research in 3D generation and editing has primarily\nfocused on converting text and images into 3D models, often overlooking the\nneed for fine-grained control over the editing of existing 3D objects. This\npaper introduces a framework that employs a pre-trained regressor, enabling\ncontinuous, precise, attribute-specific modifications to both the stylistic and\ngeometric attributes of vehicle 3D models. Our method not only preserves the\ninherent identity of vehicle 3D objects, but also supports multi-attribute\nediting, allowing for extensive customization without compromising the model's\nstructural integrity. Experimental results demonstrate the efficacy of our\napproach in achieving detailed edits on various vehicle 3D models.\n","authors":["JiangDong Miao","Tatsuya Ikeda","Bisser Raytchev","Ryota Mizoguchi","Takenori Hiraoka","Takuji Nakashima","Keigo Shimizu","Toru Higaki","Kazufumi Kaneda"],"pdf_url":"https://arxiv.org/pdf/2410.23931v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2410.23918v1","updated":"2024-10-31T13:26:11Z","published":"2024-10-31T13:26:11Z","title":"BitStack: Fine-Grained Size Control for Compressed Large Language Models\n in Variable Memory Environments","summary":" Large language models (LLMs) have revolutionized numerous applications, yet\ntheir deployment remains challenged by memory constraints on local devices.\nWhile scaling laws have enhanced LLM capabilities, the primary bottleneck has\nshifted from \\textit{capability} to \\textit{availability}, emphasizing the need\nfor efficient memory management. Traditional compression methods, such as\nquantization, often require predefined compression ratios and separate\ncompression processes for each setting, complicating deployment in variable\nmemory environments. In this paper, we introduce \\textbf{BitStack}, a novel,\ntraining-free weight compression approach that enables megabyte-level\ntrade-offs between memory usage and model performance. By leveraging weight\ndecomposition, BitStack can dynamically adjust the model size with minimal\ntransmission between running memory and storage devices. Our approach\niteratively decomposes weight matrices while considering the significance of\neach parameter, resulting in an approximately 1-bit per parameter residual\nblock in each decomposition iteration. These blocks are sorted and stacked in\nstorage as basic transmission units, with different quantities loaded based on\ncurrent memory availability. 
Extensive experiments across a wide range of tasks\ndemonstrate that, despite offering fine-grained size control, BitStack\nconsistently matches or surpasses strong quantization baselines, particularly\nat extreme compression ratios. To the best of our knowledge, this is the first\ndecomposition-based method that effectively bridges the gap to practical\ncompression techniques like quantization. Code is available at\nhttps://github.com/xinghaow99/BitStack.\n","authors":["Xinghao Wang","Pengyu Wang","Bo Wang","Dong Zhang","Yunhua Zhou","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2410.23918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04230v2","updated":"2024-10-31T13:18:14Z","published":"2024-06-06T16:30:41Z","title":"M3LEO: A Multi-Modal, Multi-Label Earth Observation Dataset Integrating\n Interferometric SAR and Multispectral Data","summary":" Satellite-based remote sensing has revolutionised the way we address global\nchallenges. Huge quantities of Earth Observation (EO) data are generated by\nsatellite sensors daily, but processing these large datasets for use in ML\npipelines is technically and computationally challenging. While some\npreprocessed Earth observation datasets exist, their content is often limited\nto optical or near-optical wavelength data, which is ineffective at night or in\nadverse weather conditions. Synthetic Aperture Radar (SAR), an active sensing\ntechnique based on microwave length radiation, offers a viable alternative.\nHowever, the application of machine learning to SAR has been limited due to a\nlack of ML-ready data and pipelines, particularly for the full diversity of SAR\ndata, including polarimetry, coherence and interferometry. In this work, we\nintroduce M3LEO, a multi-modal, multi-label Earth observation dataset that\nincludes polarimetric, interferometric, and coherence SAR data derived from\nSentinel-1, alongside multispectral Sentinel-2 imagery and auxiliary data\ndescribing terrain properties such as land use. M3LEO spans approximately 17M\n4x4 km data chips from six diverse geographic regions. The dataset is\ncomplemented by a flexible PyTorch Lightning framework configured using Hydra\nto accommodate its use across diverse ML applications in Earth observation. We\nprovide tools to process any dataset available on popular platforms such as\nGoogle Earth Engine for seamless integration with our framework. We show that\nthe distribution shift in self-supervised embeddings is substantial across\ngeographic regions, even when controlling for terrain properties. Data:\nhuggingface.co/M3LEO, Code: github.com/spaceml-org/M3LEO.\n","authors":["Matthew J Allen","Francisco Dorr","Joseph Alejandro Gallego Mejia","Laura Martínez-Ferrer","Anna Jungbluth","Freddie Kalaitzis","Raúl Ramos-Pollán"],"pdf_url":"https://arxiv.org/pdf/2406.04230v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.23910v1","updated":"2024-10-31T13:13:32Z","published":"2024-10-31T13:13:32Z","title":"Uncertainty Estimation for 3D Object Detection via Evidential Learning","summary":" 3D object detection is an essential task for computer vision applications in\nautonomous vehicles and robotics. However, models often struggle to quantify\ndetection reliability, leading to poor performance on unfamiliar scenes. We\nintroduce a framework for quantifying uncertainty in 3D object detection by\nleveraging an evidential learning loss on Bird's Eye View representations in\nthe 3D detector. 
These uncertainty estimates require minimal computational\noverhead and are generalizable across different architectures. We demonstrate\nboth the efficacy and importance of these uncertainty estimates on identifying\nout-of-distribution scenes, poorly localized objects, and missing (false\nnegative) detections; our framework consistently improves over baselines by\n10-20% on average. Finally, we integrate this suite of tasks into a system\nwhere a 3D object detector auto-labels driving scenes and our uncertainty\nestimates verify label correctness before the labels are used to train a second\nmodel. Here, our uncertainty-driven verification results in a 1% improvement in\nmAP and a 1-2% improvement in NDS.\n","authors":["Nikita Durasov","Rafid Mahmood","Jiwoong Choi","Marc T. Law","James Lucas","Pascal Fua","Jose M. Alvarez"],"pdf_url":"https://arxiv.org/pdf/2410.23910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23906v1","updated":"2024-10-31T13:11:09Z","published":"2024-10-31T13:11:09Z","title":"From Web Data to Real Fields: Low-Cost Unsupervised Domain Adaptation\n for Agricultural Robots","summary":" In precision agriculture, vision models often struggle with new, unseen\nfields where crops and weeds have been influenced by external factors,\nresulting in compositions and appearances that differ from the learned\ndistribution. This paper aims to adapt to specific fields at low cost using\nUnsupervised Domain Adaptation (UDA). We explore a novel domain shift from a\ndiverse, large pool of internet-sourced data to a small set of data collected\nby a robot at specific locations, minimizing the need for extensive on-field\ndata collection. Additionally, we introduce a novel module -- the Multi-level\nAttention-based Adversarial Discriminator (MAAD) -- which can be integrated at\nthe feature extractor level of any detection model. In this study, we\nincorporate MAAD with CenterNet to simultaneously detect leaf, stem, and vein\ninstances. Our results show significant performance improvements in the\nunlabeled target domain compared to baseline models, with a 7.5% increase in\nobject detection accuracy and a 5.1% improvement in keypoint detection.\n","authors":["Vasileios Tzouras","Lazaros Nalpantidis","Ronja Güldenring"],"pdf_url":"https://arxiv.org/pdf/2410.23906v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2410.23905v1","updated":"2024-10-31T13:10:50Z","published":"2024-10-31T13:10:50Z","title":"Text-DiFuse: An Interactive Multi-Modal Image Fusion Framework based on\n Text-modulated Diffusion Model","summary":" Existing multi-modal image fusion methods fail to address the compound\ndegradations presented in source images, resulting in fusion images plagued by\nnoise, color bias, improper exposure, \\textit{etc}. Additionally, these methods\noften overlook the specificity of foreground objects, weakening the salience of\nthe objects of interest within the fused images. To address these challenges,\nthis study proposes a novel interactive multi-modal image fusion framework\nbased on the text-modulated diffusion model, called Text-DiFuse. First, this\nframework integrates feature-level information integration into the diffusion\nprocess, allowing adaptive degradation removal and multi-modal information\nfusion. This is the first attempt to deeply and explicitly embed information\nfusion within the diffusion process, effectively addressing compound\ndegradation in image fusion. 
Second, by embedding the combination of the text\nand zero-shot location model into the diffusion fusion process, a\ntext-controlled fusion re-modulation strategy is developed. This enables\nuser-customized text control to improve fusion performance and highlight\nforeground objects in the fused images. Extensive experiments on diverse public\ndatasets show that our Text-DiFuse achieves state-of-the-art fusion performance\nacross various scenarios with complex degradation. Moreover, the semantic\nsegmentation experiment validates the significant enhancement in semantic\nperformance achieved by our text-controlled fusion re-modulation strategy. The\ncode is publicly available at https://github.com/Leiii-Cao/Text-DiFuse.\n","authors":["Hao Zhang","Lei Cao","Jiayi Ma"],"pdf_url":"https://arxiv.org/pdf/2410.23905v1.pdf","comment":"Accepted by the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2303.15124v2","updated":"2024-10-31T13:10:11Z","published":"2023-03-27T11:56:20Z","title":"Blind Inpainting with Object-aware Discrimination for Artificial Marker\n Removal","summary":" Medical images often incorporate doctor-added markers that can hinder\nAI-based diagnosis. This issue highlights the need of inpainting techniques to\nrestore the corrupted visual contents. However, existing methods require manual\nmask annotation as input, limiting the application scenarios. In this paper, we\npropose a novel blind inpainting method that automatically reconstructs visual\ncontents within the corrupted regions without mask input as guidance. Our model\nincludes a blind reconstruction network and an object-aware discriminator for\nadversarial training. The reconstruction network contains two branches that\npredict corrupted regions in images and simultaneously restore the missing\nvisual contents. Leveraging the potent recognition capability of a dense object\ndetector, the object-aware discriminator ensures markers undetectable after\ninpainting. Thus, the restored images closely resemble the clean ones. We\nevaluate our method on three datasets of various medical imaging modalities,\nconfirming better performance over other state-of-the-art methods.\n","authors":["Xuechen Guo","Wenhao Hu","Chiming Ni","Wenhao Chai","Shiyan Li","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2303.15124v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23904v1","updated":"2024-10-31T13:06:29Z","published":"2024-10-31T13:06:29Z","title":"EZ-HOI: VLM Adaptation via Guided Prompt Learning for Zero-Shot HOI\n Detection","summary":" Detecting Human-Object Interactions (HOI) in zero-shot settings, where models\nmust handle unseen classes, poses significant challenges. Existing methods that\nrely on aligning visual encoders with large Vision-Language Models (VLMs) to\ntap into the extensive knowledge of VLMs, require large, computationally\nexpensive models and encounter training difficulties. Adapting VLMs with prompt\nlearning offers an alternative to direct alignment. However, fine-tuning on\ntask-specific datasets often leads to overfitting to seen classes and\nsuboptimal performance on unseen classes, due to the absence of unseen class\nlabels. To address these challenges, we introduce a novel prompt learning-based\nframework for Efficient Zero-Shot HOI detection (EZ-HOI). 
First, we introduce\nLarge Language Model (LLM) and VLM guidance for learnable prompts, integrating\ndetailed HOI descriptions and visual semantics to adapt VLMs to HOI tasks.\nHowever, because training datasets contain seen-class labels alone, fine-tuning\nVLMs on such datasets tends to optimize learnable prompts for seen classes\ninstead of unseen ones. Therefore, we design prompt learning for unseen classes\nusing information from related seen classes, with LLMs utilized to highlight\nthe differences between unseen and related seen classes. Quantitative\nevaluations on benchmark datasets demonstrate that our EZ-HOI achieves\nstate-of-the-art performance across various zero-shot settings with only 10.35%\nto 33.95% of the trainable parameters compared to existing methods. Code is\navailable at https://github.com/ChelsieLei/EZ-HOI.\n","authors":["Qinqian Lei","Bo Wang","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2410.23904v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2402.04857v4","updated":"2024-10-31T13:01:13Z","published":"2024-02-07T13:54:56Z","title":"Advancing Video Anomaly Detection: A Concise Review and a New Dataset","summary":" Video Anomaly Detection (VAD) finds widespread applications in security\nsurveillance, traffic monitoring, industrial monitoring, and healthcare.\nDespite extensive research efforts, there remains a lack of concise reviews\nthat provide insightful guidance for researchers. Such reviews would serve as\nquick references to grasp current challenges, research trends, and future\ndirections. In this paper, we present such a review, examining models and\ndatasets from various perspectives. We emphasize the critical relationship\nbetween model and dataset, where the quality and diversity of datasets\nprofoundly influence model performance, and dataset development adapts to the\nevolving needs of emerging approaches. Our review identifies practical issues,\nincluding the absence of comprehensive datasets with diverse scenarios. To\naddress this, we introduce a new dataset, Multi-Scenario Anomaly Detection\n(MSAD), comprising 14 distinct scenarios captured from various camera views.\nOur dataset has diverse motion patterns and challenging variations, such as\ndifferent lighting and weather conditions, providing a robust foundation for\ntraining superior models. We conduct an in-depth analysis of recent\nrepresentative models using MSAD and highlight its potential in addressing the\nchallenges of detecting anomalies across diverse and evolving surveillance\nscenarios. [Project website: https://msad-dataset.github.io/]\n","authors":["Liyun Zhu","Lei Wang","Arjun Raj","Tom Gedeon","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04857v4.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2407.08447v2","updated":"2024-10-31T12:58:08Z","published":"2024-07-11T12:41:32Z","title":"WildGaussians: 3D Gaussian Splatting in the Wild","summary":" While the field of 3D scene reconstruction is dominated by NeRFs due to their\nphotorealistic quality, 3D Gaussian Splatting (3DGS) has recently emerged,\noffering similar quality with real-time rendering speeds. However, both methods\nprimarily excel with well-controlled 3D scenes, while in-the-wild data -\ncharacterized by occlusions, dynamic objects, and varying illumination -\nremains challenging. 
NeRFs can adapt to such conditions easily through\nper-image embedding vectors, but 3DGS struggles due to its explicit\nrepresentation and lack of shared parameters. To address this, we introduce\nWildGaussians, a novel approach to handle occlusions and appearance changes\nwith 3DGS. By leveraging robust DINO features and integrating an appearance\nmodeling module within 3DGS, our method achieves state-of-the-art results. We\ndemonstrate that WildGaussians matches the real-time rendering speed of 3DGS\nwhile surpassing both 3DGS and NeRF baselines in handling in-the-wild data, all\nwithin a simple architectural framework.\n","authors":["Jonas Kulhanek","Songyou Peng","Zuzana Kukelova","Marc Pollefeys","Torsten Sattler"],"pdf_url":"https://arxiv.org/pdf/2407.08447v2.pdf","comment":"NeurIPS 2024; Project page: https://wild-gaussians.github.io/"},{"id":"http://arxiv.org/abs/2410.23891v1","updated":"2024-10-31T12:52:52Z","published":"2024-10-31T12:52:52Z","title":"AllClear: A Comprehensive Dataset and Benchmark for Cloud Removal in\n Satellite Imagery","summary":" Clouds in satellite imagery pose a significant challenge for downstream\napplications. A major challenge in current cloud removal research is the\nabsence of a comprehensive benchmark and a sufficiently large and diverse\ntraining dataset. To address this problem, we introduce the largest public\ndataset -- $\\textit{AllClear}$ for cloud removal, featuring 23,742 globally\ndistributed regions of interest (ROIs) with diverse land-use patterns,\ncomprising 4 million images in total. Each ROI includes complete temporal\ncaptures from the year 2022, with (1) multi-spectral optical imagery from\nSentinel-2 and Landsat 8/9, (2) synthetic aperture radar (SAR) imagery from\nSentinel-1, and (3) auxiliary remote sensing products such as cloud masks and\nland cover maps. We validate the effectiveness of our dataset by benchmarking\nperformance, demonstrating the scaling law -- the PSNR rises from $28.47$ to\n$33.87$ with $30\\times$ more data, and conducting ablation studies on the\ntemporal length and the importance of individual modalities. This dataset aims\nto provide comprehensive coverage of the Earth's surface and promote better\ncloud removal results.\n","authors":["Hangyu Zhou","Chia-Hsiang Kao","Cheng Perng Phoo","Utkarsh Mall","Bharath Hariharan","Kavita Bala"],"pdf_url":"https://arxiv.org/pdf/2410.23891v1.pdf","comment":"Accepted at NeurIPS 2024 Datasets and Benchmarks Track. Code and data\n available at https://allclear.cs.cornell.edu/"},{"id":"http://arxiv.org/abs/2410.10356v2","updated":"2024-10-31T12:49:09Z","published":"2024-10-14T10:17:24Z","title":"FasterDiT: Towards Faster Diffusion Transformers Training without\n Architecture Modification","summary":" Diffusion Transformers (DiT) have attracted significant attention in\nresearch. However, they suffer from a slow convergence rate. In this paper, we\naim to accelerate DiT training without any architectural modification. We\nidentify the following issues in the training process: firstly, certain\ntraining strategies do not consistently perform well across different data.\nSecondly, the effectiveness of supervision at specific timesteps is limited. In\nresponse, we propose the following contributions: (1) We introduce a new\nperspective for interpreting the failure of the strategies. 
Specifically, we\nslightly extend the definition of Signal-to-Noise Ratio (SNR) and suggest\nobserving the Probability Density Function (PDF) of SNR to understand the\nessence of the data robustness of the strategy. (2) We conduct numerous\nexperiments and report over one hundred experimental results to empirically\nsummarize a unified accelerating strategy from the perspective of PDF. (3) We\ndevelop a new supervision method that further accelerates the training process\nof DiT. Based on them, we propose FasterDiT, an exceedingly simple and\npracticable design strategy. With few lines of code modifications, it achieves\n2.30 FID on ImageNet 256 resolution at 1000k iterations, which is comparable to\nDiT (2.27 FID) but 7 times faster in training.\n","authors":["Jingfeng Yao","Wang Cheng","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.10356v2.pdf","comment":"NeurIPS 2024 (poster); update to camera-ready version"},{"id":"http://arxiv.org/abs/2407.16430v2","updated":"2024-10-31T12:48:05Z","published":"2024-07-23T12:28:59Z","title":"Rethinking Out-of-Distribution Detection on Imbalanced Data Distribution","summary":" Detecting and rejecting unknown out-of-distribution (OOD) samples is critical\nfor deployed neural networks to avoid unreliable predictions. In real-world\nscenarios, however, the efficacy of existing OOD detection methods is often\nimpeded by the inherent imbalance of in-distribution (ID) data, which causes\nsignificant performance decline. Through statistical observations, we have\nidentified two common challenges faced by different OOD detectors:\nmisidentifying tail class ID samples as OOD, while erroneously predicting OOD\nsamples as head class from ID. To explain this phenomenon, we introduce a\ngeneralized statistical framework, termed ImOOD, to formulate the OOD detection\nproblem on imbalanced data distribution. Consequently, the theoretical analysis\nreveals that there exists a class-aware bias item between balanced and\nimbalanced OOD detection, which contributes to the performance gap. Building\nupon this finding, we present a unified training-time regularization technique\nto mitigate the bias and boost imbalanced OOD detectors across architecture\ndesigns. Our theoretically grounded method translates into consistent\nimprovements on the representative CIFAR10-LT, CIFAR100-LT, and ImageNet-LT\nbenchmarks against several state-of-the-art OOD detection approaches. Code is\navailable at https://github.com/alibaba/imood.\n","authors":["Kai Liu","Zhihang Fu","Sheng Jin","Chao Chen","Ze Chen","Rongxin Jiang","Fan Zhou","Yaowu Chen","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2407.16430v2.pdf","comment":"This paper has been accepted by NeurIPS 2024. Code is available at\n https://github.com/alibaba/imood"},{"id":"http://arxiv.org/abs/2408.12282v2","updated":"2024-10-31T12:30:46Z","published":"2024-08-22T10:34:01Z","title":"Subsurface Scattering for 3D Gaussian Splatting","summary":" 3D reconstruction and relighting of objects made from scattering materials\npresent a significant challenge due to the complex light transport beneath the\nsurface. 3D Gaussian Splatting introduced high-quality novel view synthesis at\nreal-time speeds. While 3D Gaussians efficiently approximate an object's\nsurface, they fail to capture the volumetric properties of subsurface\nscattering. We propose a framework for optimizing an object's shape together\nwith the radiance transfer field given multi-view OLAT (one light at a time)\ndata. 
Our method decomposes the scene into an explicit surface represented as\n3D Gaussians, with a spatially varying BRDF, and an implicit volumetric\nrepresentation of the scattering component. A learned incident light field\naccounts for shadowing. We optimize all parameters jointly via ray-traced\ndifferentiable rendering. Our approach enables material editing, relighting and\nnovel view synthesis at interactive rates. We show successful application on\nsynthetic data and introduce a newly acquired multi-view multi-light dataset of\nobjects in a light-stage setup. Compared to previous work we achieve comparable\nor better results at a fraction of optimization and rendering time while\nenabling detailed control over material attributes. Project page\nhttps://sss.jdihlmann.com/\n","authors":["Jan-Niklas Dihlmann","Arjun Majumdar","Andreas Engelhardt","Raphael Braun","Hendrik P. A. Lensch"],"pdf_url":"https://arxiv.org/pdf/2408.12282v2.pdf","comment":"Project page: https://sss.jdihlmann.com/"},{"id":"http://arxiv.org/abs/2405.15688v2","updated":"2024-10-31T12:24:34Z","published":"2024-05-24T16:27:05Z","title":"UNION: Unsupervised 3D Object Detection using Object Appearance-based\n Pseudo-Classes","summary":" Unsupervised 3D object detection methods have emerged to leverage vast\namounts of data without requiring manual labels for training. Recent approaches\nrely on dynamic objects for learning to detect mobile objects but penalize the\ndetections of static instances during training. Multiple rounds of (self)\ntraining are used to add detected static instances to the set of training\ntargets; this procedure to improve performance is computationally expensive. To\naddress this, we propose the method UNION. We use spatial clustering and\nself-supervised scene flow to obtain a set of static and dynamic object\nproposals from LiDAR. Subsequently, object proposals' visual appearances are\nencoded to distinguish static objects in the foreground and background by\nselecting static instances that are visually similar to dynamic objects. As a\nresult, static and dynamic mobile objects are obtained together, and existing\ndetectors can be trained with a single training. In addition, we extend 3D\nobject discovery to detection by using object appearance-based cluster labels\nas pseudo-class labels for training object classification. We conduct extensive\nexperiments on the nuScenes dataset and increase the state-of-the-art\nperformance for unsupervised 3D object discovery, i.e. UNION more than doubles\nthe average precision to 38.4. The code is available at\ngithub.com/TedLentsch/UNION.\n","authors":["Ted Lentsch","Holger Caesar","Dariu M. Gavrila"],"pdf_url":"https://arxiv.org/pdf/2405.15688v2.pdf","comment":"NeurIPS'24"},{"id":"http://arxiv.org/abs/2312.10112v3","updated":"2024-10-31T12:19:37Z","published":"2023-12-15T09:09:25Z","title":"NM-FlowGAN: Modeling sRGB Noise without Paired Images using a Hybrid\n Approach of Normalizing Flows and GAN","summary":" Modeling and synthesizing real sRGB noise is crucial for various low-level\nvision tasks, such as building datasets for training image denoising systems.\nThe distribution of real sRGB noise is highly complex and affected by a\nmultitude of factors, making its accurate modeling extremely challenging.\nTherefore, recent studies have proposed methods that employ data-driven\ngenerative models, such as Generative Adversarial Networks (GAN) and\nNormalizing Flows. 
These studies achieve more accurate modeling of sRGB noise\ncompared to traditional noise modeling methods. However, there are performance\nlimitations due to the inherent characteristics of each generative model. To\naddress this issue, we propose NM-FlowGAN, a hybrid approach that exploits the\nstrengths of both GAN and Normalizing Flows. We combine pixel-wise noise\nmodeling networks based on Normalizing Flows and spatial correlation modeling\nnetworks based on GAN. Specifically, the pixel-wise noise modeling network\nleverages the high training stability of Normalizing Flows to capture noise\ncharacteristics that are affected by a multitude of factors, and the spatial\ncorrelation networks efficiently model pixel-to-pixel relationships. In\nparticular, unlike recent methods that rely on paired noisy images, our method\nsynthesizes noise using clean images and factors that affect noise\ncharacteristics, such as easily obtainable parameters like camera type and ISO\nsettings, making it applicable to various fields where obtaining noisy-clean\nimage pairs is not feasible. In our experiments, our NM-FlowGAN outperforms\nother baselines in the sRGB noise synthesis task. Moreover, the denoising\nneural network trained with synthesized image pairs from our model shows\nsuperior performance compared to other baselines. Our code is available at:\n\\url{https://github.com/YoungJooHan/NM-FlowGAN}.\n","authors":["Young Joo Han","Ha-Jin Yu"],"pdf_url":"https://arxiv.org/pdf/2312.10112v3.pdf","comment":"13 pages, 10 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.16022v2","updated":"2024-10-31T12:17:39Z","published":"2024-04-24T17:55:33Z","title":"PuLID: Pure and Lightning ID Customization via Contrastive Alignment","summary":" We propose Pure and Lightning ID customization (PuLID), a novel tuning-free\nID customization method for text-to-image generation. By incorporating a\nLightning T2I branch with a standard diffusion one, PuLID introduces both\ncontrastive alignment loss and accurate ID loss, minimizing disruption to the\noriginal model and ensuring high ID fidelity. Experiments show that PuLID\nachieves superior performance in both ID fidelity and editability. Another\nattractive property of PuLID is that the image elements (e.g., background,\nlighting, composition, and style) before and after the ID insertion are kept as\nconsistent as possible. Codes and models are available at\nhttps://github.com/ToTheBeginning/PuLID\n","authors":["Zinan Guo","Yanze Wu","Zhuowei Chen","Lang Chen","Peng Zhang","Qian He"],"pdf_url":"https://arxiv.org/pdf/2404.16022v2.pdf","comment":"NeurIPS 2024. Codes and models are available at\n https://github.com/ToTheBeginning/PuLID"},{"id":"http://arxiv.org/abs/2410.23854v1","updated":"2024-10-31T12:04:30Z","published":"2024-10-31T12:04:30Z","title":"Airway Labeling Meets Clinical Applications: Reflecting Topology\n Consistency and Outliers via Learnable Attentions","summary":" Accurate airway anatomical labeling is crucial for clinicians to identify and\nnavigate complex bronchial structures during bronchoscopy. Automatic airway\nanatomical labeling is challenging due to significant individual variability\nand anatomical variations. Previous methods are prone to generate inconsistent\npredictions, which is harmful for preoperative planning and intraoperative\nnavigation. 
This paper aims to address these challenges by proposing a novel\nmethod that enhances topological consistency and improves the detection of\nabnormal airway branches.\n We propose a novel approach incorporating two modules: the Soft Subtree\nConsistency (SSC) and the Abnormal Branch Saliency (ABS). The SSC module\nconstructs a soft subtree to capture clinically relevant topological\nrelationships, allowing for flexible feature aggregation within and across\nsubtrees. The ABS module facilitates the interaction between node features and\nprototypes to distinguish abnormal branches, preventing the erroneous\naggregation of features between normal and abnormal nodes.\n Evaluated on a challenging dataset characterized by severe airway distortion\nand atrophy, our method achieves superior performance compared to\nstate-of-the-art approaches. Specifically, it attains a 91.4% accuracy at the\nsegmental level and an 83.7% accuracy at the subsegmental level, representing a\n1.4% increase in subsegmental accuracy and a 3.1% increase in topological\nconsistency. Notably, the method demonstrates reliable performance in cases\nwith disease-induced airway deformities, ensuring consistent and accurate\nlabeling.\n","authors":["Chenyu Li","Minghui Zhang","Chuyan Zhang","Yun Gu"],"pdf_url":"https://arxiv.org/pdf/2410.23854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19370v2","updated":"2024-10-31T12:00:26Z","published":"2024-09-28T14:50:45Z","title":"MambaEviScrib: Mamba and Evidence-Guided Consistency Enhance CNN\n Robustness for Scribble-Based Weakly Supervised Ultrasound Image Segmentation","summary":" Segmenting anatomical structures and lesions from ultrasound images\ncontributes to disease assessment. Weakly supervised learning (WSL) based on\nsparse annotation has achieved encouraging performance and demonstrated the\npotential to reduce annotation costs. This study attempts to introduce\nscribble-based WSL into ultrasound image segmentation tasks. However,\nultrasound images often suffer from poor contrast and unclear edges, coupled\nwith insufficient supervision signals for edges, posing challenges to edge\nprediction. Uncertainty modeling has been proven to facilitate models in\ndealing with these issues. Nevertheless, existing uncertainty estimation\nparadigms are not robust enough and often filter out predictions near decision\nboundaries, resulting in unstable edge predictions. Therefore, we propose\nleveraging predictions near decision boundaries effectively. Specifically, we\nintroduce Dempster-Shafer Theory (DST) of evidence to design an Evidence-Guided\nConsistency strategy. This strategy utilizes high-evidence predictions, which\nare more likely to occur near high-density regions, to guide the optimization\nof low-evidence predictions that may appear near decision boundaries.\nFurthermore, the diverse sizes and locations of lesions in ultrasound images\npose a challenge for CNNs with local receptive fields, as they struggle to\nmodel global information. Therefore, we introduce Visual Mamba based on\nstructured state space sequence models, which achieves long-range dependency\nwith linear computational complexity, and we construct a novel hybrid CNN-Mamba\nframework. During training, the collaboration between the CNN branch and the\nMamba branch in the proposed framework draws inspiration from each other based\non the EGC strategy. Experiments demonstrate the competitiveness of the\nproposed method. 
Dataset and code will be available on\nhttps://github.com/GtLinyer/MambaEviScrib.\n","authors":["Xiaoxiang Han","Xinyu Li","Jiang Shang","Yiman Liu","Keyan Chen","Shugong Xu","Qiaohong Liu","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.19370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12490v2","updated":"2024-10-31T11:42:07Z","published":"2024-10-16T12:13:17Z","title":"Stabilize the Latent Space for Image Autoregressive Modeling: A Unified\n Perspective","summary":" Latent-based image generative models, such as Latent Diffusion Models (LDMs)\nand Mask Image Models (MIMs), have achieved notable success in image generation\ntasks. These models typically leverage reconstructive autoencoders like VQGAN\nor VAE to encode pixels into a more compact latent space and learn the data\ndistribution in the latent space instead of directly from pixels. However, this\npractice raises a pertinent question: Is it truly the optimal choice? In\nresponse, we begin with an intriguing observation: despite sharing the same\nlatent space, autoregressive models significantly lag behind LDMs and MIMs in\nimage generation. This finding contrasts sharply with the field of NLP, where\nthe autoregressive model GPT has established a commanding presence. To address\nthis discrepancy, we introduce a unified perspective on the relationship\nbetween latent space and generative models, emphasizing the stability of latent\nspace in image generative modeling. Furthermore, we propose a simple but\neffective discrete image tokenizer to stabilize the latent space for image\ngenerative modeling by applying K-Means on the latent features of\nself-supervised learning models. Experimental results show that image\nautoregressive modeling with our tokenizer (DiGIT) benefits both image\nunderstanding and image generation with the next token prediction principle,\nwhich is inherently straightforward for GPT models but challenging for other\ngenerative models. Remarkably, for the first time, a GPT-style autoregressive\nmodel for images outperforms LDMs, which also exhibits substantial improvement\nakin to GPT when scaling up model size. Our findings underscore the potential\nof an optimized latent space and the integration of discrete tokenization in\nadvancing the capabilities of image generative models. The code is available at\n\\url{https://github.com/DAMO-NLP-SG/DiGIT}.\n","authors":["Yongxin Zhu","Bocheng Li","Hang Zhang","Xin Li","Linli Xu","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2410.12490v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23836v1","updated":"2024-10-31T11:32:33Z","published":"2024-10-31T11:32:33Z","title":"Stereo-Talker: Audio-driven 3D Human Synthesis with Prior-Guided\n Mixture-of-Experts","summary":" This paper introduces Stereo-Talker, a novel one-shot audio-driven human\nvideo synthesis system that generates 3D talking videos with precise lip\nsynchronization, expressive body gestures, temporally consistent\nphoto-realistic quality, and continuous viewpoint control. The process follows\na two-stage approach. In the first stage, the system maps audio input to\nhigh-fidelity motion sequences, encompassing upper-body gestures and facial\nexpressions. To enrich motion diversity and authenticity, large language model\n(LLM) priors are integrated with text-aligned semantic audio features,\nleveraging LLMs' cross-modal generalization power to enhance motion quality. 
In\nthe second stage, we improve diffusion-based video generation models by\nincorporating a prior-guided Mixture-of-Experts (MoE) mechanism: a view-guided\nMoE focuses on view-specific attributes, while a mask-guided MoE enhances\nregion-based rendering stability. Additionally, a mask prediction module is\ndevised to derive human masks from motion data, enhancing the stability and\naccuracy of masks and enabling mask guiding during inference. We also introduce\na comprehensive human video dataset with 2,203 identities, covering diverse\nbody gestures and detailed annotations, facilitating broad generalization. The\ncode, data, and pre-trained models will be released for research purposes.\n","authors":["Xiang Deng","Youxin Pang","Xiaochen Zhao","Chao Xu","Lizhen Wang","Hongjiang Xiao","Shi Yan","Hongwen Zhang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14927v3","updated":"2024-10-31T11:32:19Z","published":"2024-06-21T07:37:17Z","title":"GIC: Gaussian-Informed Continuum for Physical Property Identification\n and Simulation","summary":" This paper studies the problem of estimating physical properties (system\nidentification) through visual observations. To facilitate geometry-aware\nguidance in physical property estimation, we introduce a novel hybrid framework\nthat leverages 3D Gaussian representation to not only capture explicit shapes\nbut also enable the simulated continuum to render object masks as 2D shape\nsurrogates during training. We propose a new dynamic 3D Gaussian framework\nbased on motion factorization to recover the object as 3D Gaussian point sets\nacross different time states. Furthermore, we develop a coarse-to-fine filling\nstrategy to generate the density fields of the object from the Gaussian\nreconstruction, allowing for the extraction of object continuums along with\ntheir surfaces and the integration of Gaussian attributes into these continuum.\nIn addition to the extracted object surfaces, the Gaussian-informed continuum\nalso enables the rendering of object masks during simulations, serving as\n2D-shape guidance for physical property estimation. Extensive experimental\nevaluations demonstrate that our pipeline achieves state-of-the-art performance\nacross multiple benchmarks and metrics. Additionally, we illustrate the\neffectiveness of the proposed method through real-world demonstrations,\nshowcasing its practical utility. Our project page is at\nhttps://jukgei.github.io/project/gic.\n","authors":["Junhao Cai","Yuji Yang","Weihao Yuan","Yisheng He","Zilong Dong","Liefeng Bo","Hui Cheng","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14927v3.pdf","comment":"21 pages, 8 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23835v1","updated":"2024-10-31T11:29:41Z","published":"2024-10-31T11:29:41Z","title":"Counterfactual MRI Data Augmentation using Conditional Denoising\n Diffusion Generative Models","summary":" Deep learning (DL) models in medical imaging face challenges in\ngeneralizability and robustness due to variations in image acquisition\nparameters (IAP). In this work, we introduce a novel method using conditional\ndenoising diffusion generative models (cDDGMs) to generate counterfactual\nmagnetic resonance (MR) images that simulate different IAP without altering\npatient anatomy. 
We demonstrate that using these counterfactual images for data\naugmentation can improve segmentation accuracy, particularly in\nout-of-distribution settings, enhancing the overall generalizability and\nrobustness of DL models across diverse imaging conditions. Our approach shows\npromise in addressing domain and covariate shifts in medical imaging. The code\nis publicly available at https:\n//github.com/pedromorao/Counterfactual-MRI-Data-Augmentation\n","authors":["Pedro Morão","Joao Santinha","Yasna Forghani","Nuno Loução","Pedro Gouveia","Mario A. T. Figueiredo"],"pdf_url":"https://arxiv.org/pdf/2410.23835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07966v3","updated":"2024-10-31T11:25:40Z","published":"2024-09-12T11:53:05Z","title":"ProbTalk3D: Non-Deterministic Emotion Controllable Speech-Driven 3D\n Facial Animation Synthesis Using VQ-VAE","summary":" Audio-driven 3D facial animation synthesis has been an active field of\nresearch with attention from both academia and industry. While there are\npromising results in this area, recent approaches largely focus on lip-sync and\nidentity control, neglecting the role of emotions and emotion control in the\ngenerative process. That is mainly due to the lack of emotionally rich facial\nanimation data and algorithms that can synthesize speech animations with\nemotional expressions at the same time. In addition, majority of the models are\ndeterministic, meaning given the same audio input, they produce the same output\nmotion. We argue that emotions and non-determinism are crucial to generate\ndiverse and emotionally-rich facial animations. In this paper, we propose\nProbTalk3D a non-deterministic neural network approach for emotion controllable\nspeech-driven 3D facial animation synthesis using a two-stage VQ-VAE model and\nan emotionally rich facial animation dataset 3DMEAD. We provide an extensive\ncomparative analysis of our model against the recent 3D facial animation\nsynthesis approaches, by evaluating the results objectively, qualitatively, and\nwith a perceptual user study. We highlight several objective metrics that are\nmore suitable for evaluating stochastic outputs and use both in-the-wild and\nground truth data for subjective evaluation. To our knowledge, that is the\nfirst non-deterministic 3D facial animation synthesis method incorporating a\nrich emotion dataset and emotion control with emotion labels and intensity\nlevels. Our evaluation demonstrates that the proposed model achieves superior\nperformance compared to state-of-the-art emotion-controlled, deterministic and\nnon-deterministic models. We recommend watching the supplementary video for\nquality judgement. The entire codebase is publicly available\n(https://github.com/uuembodiedsocialai/ProbTalk3D/).\n","authors":["Sichun Wu","Kazi Injamamul Haque","Zerrin Yumak"],"pdf_url":"https://arxiv.org/pdf/2409.07966v3.pdf","comment":"14 pages, 9 figures, 3 tables. Includes code. Accepted at ACM\n SIGGRAPH MIG 2024"},{"id":"http://arxiv.org/abs/2410.23834v1","updated":"2024-10-31T11:23:19Z","published":"2024-10-31T11:23:19Z","title":"Denoising Diffusion Models for Anomaly Localization in Medical Images","summary":" This chapter explores anomaly localization in medical images using denoising\ndiffusion models. 
After providing a brief methodological background of these\nmodels, including their application to image reconstruction and their\nconditioning using guidance mechanisms, we provide an overview of available\ndatasets and evaluation metrics suitable for their application to anomaly\nlocalization in medical images. In this context, we discuss supervision schemes\nranging from fully supervised segmentation to semi-supervised, weakly\nsupervised, self-supervised, and unsupervised methods, and provide insights\ninto the effectiveness and limitations of these approaches. Furthermore, we\nhighlight open challenges in anomaly localization, including detection bias,\ndomain shift, computational cost, and model interpretability. Our goal is to\nprovide an overview of the current state of the art in the field, outline\nresearch gaps, and highlight the potential of diffusion models for robust\nanomaly localization in medical images.\n","authors":["Cosmin I. Bercea","Philippe C. Cattin","Julia A. Schnabel","Julia Wolleb"],"pdf_url":"https://arxiv.org/pdf/2410.23834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23831v1","updated":"2024-10-31T11:21:21Z","published":"2024-10-31T11:21:21Z","title":"FRoundation: Are Foundation Models Ready for Face Recognition?","summary":" Foundation models are predominantly trained in an unsupervised or\nself-supervised manner on highly diverse and large-scale datasets, making them\nbroadly applicable to various downstream tasks. In this work, we investigate\nfor the first time whether such models are suitable for the specific domain of\nface recognition. We further propose and demonstrate the adaptation of these\nmodels for face recognition across different levels of data availability.\nExtensive experiments are conducted on multiple foundation models and datasets\nof varying scales for training and fine-tuning, with evaluation on a wide range\nof benchmarks. Our results indicate that, despite their versatility,\npre-trained foundation models underperform in face recognition compared to\nsimilar architectures trained specifically for this task. However, fine-tuning\nfoundation models yields promising results, often surpassing models trained\nfrom scratch when training data is limited. Even with access to large-scale\nface recognition training datasets, fine-tuned foundation models perform\ncomparably to models trained from scratch, but with lower training\ncomputational costs and without relying on the assumption of extensive data\navailability. Our analysis also explores bias in face recognition, with\nslightly higher bias observed in some settings when using foundation models.\n","authors":["Tahar Chettaoui","Naser Damer","Fadi Boutros"],"pdf_url":"https://arxiv.org/pdf/2410.23831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23828v1","updated":"2024-10-31T11:20:13Z","published":"2024-10-31T11:20:13Z","title":"Show Me What and Where has Changed? Question Answering and Grounding for\n Remote Sensing Change Detection","summary":" Remote sensing change detection aims to perceive changes occurring on the\nEarth's surface from remote sensing data in different periods, and feed these\nchanges back to humans. However, most existing methods only focus on detecting\nchange regions, lacking the ability to interact with users to identify changes\nthat the users expect. 
In this paper, we introduce a new task named Change\nDetection Question Answering and Grounding (CDQAG), which extends the\ntraditional change detection task by providing interpretable textual answers\nand intuitive visual evidence. To this end, we construct the first CDQAG\nbenchmark dataset, termed QAG-360K, comprising over 360K triplets of questions,\ntextual answers, and corresponding high-quality visual masks. It encompasses 10\nessential land-cover categories and 8 comprehensive question types, which\nprovides a large-scale and diverse dataset for remote sensing applications.\nBased on this, we present VisTA, a simple yet effective baseline method that\nunifies the tasks of question answering and grounding by delivering both visual\nand textual answers. Our method achieves state-of-the-art results on both the\nclassic CDVQA and the proposed CDQAG datasets. Extensive qualitative and\nquantitative experimental results provide useful insights for the development\nof better CDQAG models, and we hope that our work can inspire further research\nin this important yet underexplored direction. The proposed benchmark dataset\nand method are available at https://github.com/like413/VisTA.\n","authors":["Ke Li","Fuyu Dong","Di Wang","Shaofeng Li","Quan Wang","Xinbo Gao","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.23828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01762v4","updated":"2024-10-31T11:11:24Z","published":"2023-05-27T06:00:51Z","title":"Rapid Plug-in Defenders","summary":" In the realm of daily services, the deployment of deep neural networks\nunderscores the paramount importance of their reliability. However, the\nvulnerability of these networks to adversarial attacks, primarily\nevasion-based, poses a concerning threat to their functionality. Common methods\nfor enhancing robustness involve heavy adversarial training or leveraging\nlearned knowledge from clean data, both necessitating substantial computational\nresources. This inherent time-intensive nature severely limits the agility of\nlarge foundational models to swiftly counter adversarial perturbations. To\naddress this challenge, this paper focuses on the Rapid Plug-in Defender\n(RaPiD) problem, aiming to rapidly counter adversarial perturbations without\naltering the deployed model. Drawing inspiration from the generalization and\nthe universal computation ability of pre-trained transformer models, we propose\na novel method termed CeTaD (Considering Pre-trained Transformers as Defenders)\nfor RaPiD, optimized for efficient computation. CeTaD strategically fine-tunes\nthe normalization layer parameters within the defender using a limited set of\nclean and adversarial examples. Our evaluation centers on assessing CeTaD's\neffectiveness, transferability, and the impact of different components in\nscenarios involving one-shot adversarial examples. The proposed method is\ncapable of rapidly adapting to various attacks and different application\nscenarios without altering the target model and clean training data. We also\nexplore the influence of varying training data conditions on CeTaD's\nperformance. 
Notably, CeTaD exhibits adaptability across differentiable service\nmodels and proves the potential of continuous learning.\n","authors":["Kai Wu","Yujian Betterest Li","Jian Lou","Xiaoyu Zhang","Handing Wang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.01762v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23822v1","updated":"2024-10-31T11:07:26Z","published":"2024-10-31T11:07:26Z","title":"Parameter-Efficient Fine-Tuning Medical Multimodal Large Language Models\n for Medical Visual Grounding","summary":" Multimodal Large Language Models (MLLMs) inherit the superior text\nunderstanding capabilities of LLMs and extend these capabilities to multimodal\nscenarios. These models achieve excellent results in the general domain of\nmultimodal tasks. However, in the medical domain, the substantial training\ncosts and the requirement for extensive medical data pose challenges to the\ndevelopment of medical MLLMs. Furthermore, due to the free-text form of\nanswers, tasks such as visual grounding that need to produce output in a\nprescribed form become difficult for MLLMs. So far, there have been no medical\nMLLMs works in medical visual grounding area. For the medical vision grounding\ntask, which involves identifying locations in medical images based on short\ntext descriptions, we propose Parameter-efficient Fine-tuning medical\nmultimodal large language models for Medcial Visual Grounding (PFMVG). To\nvalidate the performance of the model, we evaluate it on a public benchmark\ndataset for medical visual grounding, where it achieves competitive results,\nand significantly outperforming GPT-4v. Our code will be open sourced after\npeer review.\n","authors":["Jinlong He","Pengfei Li","Gang Liu","Shenjun Zhong"],"pdf_url":"https://arxiv.org/pdf/2410.23822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23820v1","updated":"2024-10-31T11:05:09Z","published":"2024-10-31T11:05:09Z","title":"Disentangling Disentangled Representations: Towards Improved Latent\n Units via Diffusion Models","summary":" Disentangled representation learning (DRL) aims to break down observed data\ninto core intrinsic factors for a profound understanding of the data. In\nreal-world scenarios, manually defining and labeling these factors are\nnon-trivial, making unsupervised methods attractive. Recently, there have been\nlimited explorations of utilizing diffusion models (DMs), which are already\nmainstream in generative modeling, for unsupervised DRL. They implement their\nown inductive bias to ensure that each latent unit input to the DM expresses\nonly one distinct factor. In this context, we design Dynamic Gaussian Anchoring\nto enforce attribute-separated latent units for more interpretable DRL. This\nunconventional inductive bias explicitly delineates the decision boundaries\nbetween attributes while also promoting the independence among latent units.\nAdditionally, we also propose Skip Dropout technique, which easily modifies the\ndenoising U-Net to be more DRL-friendly, addressing its uncooperative nature\nwith the disentangling feature extractor. 
Our methods, which carefully consider\nthe latent unit semantics and the distinct DM structure, enhance the\npracticality of DM-based disentangled representations, demonstrating\nstate-of-the-art disentanglement performance on both synthetic and real data,\nas well as advantages in downstream tasks.\n","authors":["Youngjun Jun","Jiwoo Park","Kyobin Choo","Tae Eun Choi","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2410.23820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23806v1","updated":"2024-10-31T10:46:11Z","published":"2024-10-31T10:46:11Z","title":"Human Action Recognition (HAR) Using Skeleton-based Quantum Spatial\n Temporal Relative Transformer Network: ST-RTR","summary":" Quantum Human Action Recognition (HAR) is an interesting research area in\nhuman-computer interaction used to monitor the activities of elderly and\ndisabled individuals affected by physical and mental health. In the recent era,\nskeleton-based HAR has received much attention because skeleton data has shown\nthat it can handle changes in striking, body size, camera views, and complex\nbackgrounds. One key characteristic of ST-GCN is automatically learning spatial\nand temporal patterns from skeleton sequences. It has some limitations, as this\nmethod only works for short-range correlation due to its limited receptive\nfield. Consequently, understanding human action requires long-range\ninterconnection. To address this issue, we developed a quantum spatial-temporal\nrelative transformer ST-RTR model. The ST-RTR includes joint and relay nodes,\nwhich allow efficient communication and data transmission within the network.\nThese nodes help to break the inherent spatial and temporal skeleton\ntopologies, which enables the model to understand long-range human action\nbetter. Furthermore, we combine quantum ST-RTR with a fusion model for further\nperformance improvements. To assess the performance of the quantum ST-RTR\nmethod, we conducted experiments on three skeleton-based HAR benchmarks: NTU\nRGB+D 60, NTU RGB+D 120, and UAV-Human. It boosted CS and CV by 2.11 % and\n1.45% on NTU RGB+D 60, 1.25% and 1.05% on NTU RGB+D 120. On UAV-Human datasets,\naccuracy improved by 2.54%. The experimental outcomes explain that the proposed\nST-RTR model significantly improves action recognition associated with the\nstandard ST-GCN method.\n","authors":["Faisal Mehmood","Enqing Chen","Touqeer Abbas","Samah M. Alzanin"],"pdf_url":"https://arxiv.org/pdf/2410.23806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23800v1","updated":"2024-10-31T10:35:59Z","published":"2024-10-31T10:35:59Z","title":"SOAR: Self-Occluded Avatar Recovery from a Single Video In the Wild","summary":" Self-occlusion is common when capturing people in the wild, where the\nperformer do not follow predefined motion scripts. This challenges existing\nmonocular human reconstruction systems that assume full body visibility. We\nintroduce Self-Occluded Avatar Recovery (SOAR), a method for complete human\nreconstruction from partial observations where parts of the body are entirely\nunobserved. SOAR leverages structural normal prior and generative diffusion\nprior to address such an ill-posed reconstruction problem. For structural\nnormal prior, we model human with an reposable surfel model with well-defined\nand easily readable shapes. For generative diffusion prior, we perform an\ninitial reconstruction and refine it using score distillation. 
On various\nbenchmarks, we show that SOAR performs favorably than state-of-the-art\nreconstruction and generation methods, and on-par comparing to concurrent\nworks. Additional video results and code are available at\nhttps://soar-avatar.github.io/.\n","authors":["Zhuoyang Pan","Angjoo Kanazawa","Hang Gao"],"pdf_url":"https://arxiv.org/pdf/2410.23800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23788v1","updated":"2024-10-31T10:13:05Z","published":"2024-10-31T10:13:05Z","title":"EDT: An Efficient Diffusion Transformer Framework Inspired by Human-like\n Sketching","summary":" Transformer-based Diffusion Probabilistic Models (DPMs) have shown more\npotential than CNN-based DPMs, yet their extensive computational requirements\nhinder widespread practical applications. To reduce the computation budget of\ntransformer-based DPMs, this work proposes the Efficient Diffusion Transformer\n(EDT) framework. The framework includes a lightweight-design diffusion model\narchitecture, and a training-free Attention Modulation Matrix and its\nalternation arrangement in EDT inspired by human-like sketching. Additionally,\nwe propose a token relation-enhanced masking training strategy tailored\nexplicitly for EDT to augment its token relation learning capability. Our\nextensive experiments demonstrate the efficacy of EDT. The EDT framework\nreduces training and inference costs and surpasses existing transformer-based\ndiffusion models in image synthesis performance, thereby achieving a\nsignificant overall enhancement. With lower FID, EDT-S, EDT-B, and EDT-XL\nattained speed-ups of 3.93x, 2.84x, and 1.92x respectively in the training\nphase, and 2.29x, 2.29x, and 2.22x respectively in inference, compared to the\ncorresponding sizes of MDTv2. The source code is released at\nhttps://github.com/xinwangChen/EDT.\n","authors":["Xinwang Chen","Ning Liu","Yichen Zhu","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2410.23788v1.pdf","comment":"Xinwang Chen and Ning Liu are with equal contributions. This paper\n has been accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.07085v2","updated":"2024-10-31T10:06:16Z","published":"2024-06-11T09:22:39Z","title":"CAT: Coordinating Anatomical-Textual Prompts for Multi-Organ and Tumor\n Segmentation","summary":" Existing promptable segmentation methods in the medical imaging field\nprimarily consider either textual or visual prompts to segment relevant\nobjects, yet they often fall short when addressing anomalies in medical images,\nlike tumors, which may vary greatly in shape, size, and appearance. Recognizing\nthe complexity of medical scenarios and the limitations of textual or visual\nprompts, we propose a novel dual-prompt schema that leverages the complementary\nstrengths of visual and textual prompts for segmenting various organs and\ntumors. Specifically, we introduce CAT, an innovative model that Coordinates\nAnatomical prompts derived from 3D cropped images with Textual prompts enriched\nby medical domain knowledge. The model architecture adopts a general\nquery-based design, where prompt queries facilitate segmentation queries for\nmask prediction. To synergize two types of prompts within a unified framework,\nwe implement a ShareRefiner, which refines both segmentation and prompt queries\nwhile disentangling the two types of prompts. Trained on a consortium of 10\npublic CT datasets, CAT demonstrates superior performance in multiple\nsegmentation tasks. 
Further validation on a specialized in-house dataset\nreveals the remarkable capacity of segmenting tumors across multiple cancer\nstages. This approach confirms that coordinating multimodal prompts is a\npromising avenue for addressing complex scenarios in the medical domain.\n","authors":["Zhongzhen Huang","Yankai Jiang","Rongzhao Zhang","Shaoting Zhang","Xiaofan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.07085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10897v2","updated":"2024-10-31T10:01:59Z","published":"2024-07-15T16:46:14Z","title":"Optical Diffusion Models for Image Generation","summary":" Diffusion models generate new samples by progressively decreasing the noise\nfrom the initially provided random distribution. This inference procedure\ngenerally utilizes a trained neural network numerous times to obtain the final\noutput, creating significant latency and energy consumption on digital\nelectronic hardware such as GPUs. In this study, we demonstrate that the\npropagation of a light beam through a semi-transparent medium can be programmed\nto implement a denoising diffusion model on image samples. This framework\nprojects noisy image patterns through passive diffractive optical layers, which\ncollectively only transmit the predicted noise term in the image. The optical\ntransparent layers, which are trained with an online training approach,\nbackpropagating the error to the analytical model of the system, are passive\nand kept the same across different steps of denoising. Hence this method\nenables high-speed image generation with minimal power consumption, benefiting\nfrom the bandwidth and energy efficiency of optical information processing.\n","authors":["Ilker Oguz","Niyazi Ulas Dinc","Mustafa Yildirim","Junjie Ke","Innfarn Yoo","Qifei Wang","Feng Yang","Christophe Moser","Demetri Psaltis"],"pdf_url":"https://arxiv.org/pdf/2407.10897v2.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2410.23782v1","updated":"2024-10-31T09:55:32Z","published":"2024-10-31T09:55:32Z","title":"Video Token Merging for Long-form Video Understanding","summary":" As the scale of data and models for video understanding rapidly expand,\nhandling long-form video input in transformer-based models presents a practical\nchallenge. Rather than resorting to input sampling or token dropping, which may\nresult in information loss, token merging shows promising results when used in\ncollaboration with transformers. However, the application of token merging for\nlong-form video processing is not trivial. We begin with the premise that token\nmerging should not rely solely on the similarity of video tokens; the saliency\nof tokens should also be considered. To address this, we explore various video\ntoken merging strategies for long-form video classification, starting with a\nsimple extension of image token merging, moving to region-concentrated merging,\nand finally proposing a learnable video token merging (VTM) algorithm that\ndynamically merges tokens based on their saliency. Extensive experimental\nresults show that we achieve better or comparable performances on the LVU,\nCOIN, and Breakfast datasets. 
Moreover, our approach significantly reduces\nmemory costs by 84% and boosts throughput by approximately 6.89 times compared\nto baseline algorithms.\n","authors":["Seon-Ho Lee","Jue Wang","Zhikang Zhang","David Fan","Xinyu Li"],"pdf_url":"https://arxiv.org/pdf/2410.23782v1.pdf","comment":"21 pages, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23780v1","updated":"2024-10-31T09:53:21Z","published":"2024-10-31T09:53:21Z","title":"Driving by the Rules: A Benchmark for Integrating Traffic Sign\n Regulations into Vectorized HD Map","summary":" Ensuring adherence to traffic sign regulations is essential for both human\nand autonomous vehicle navigation. While current benchmark datasets concentrate\non lane perception or basic traffic sign recognition, they often overlook the\nintricate task of integrating these regulations into lane operations.\nAddressing this gap, we introduce MapDR, a novel dataset designed for the\nextraction of Driving Rules from traffic signs and their association with\nvectorized, locally perceived HD Maps. MapDR features over 10,000 annotated\nvideo clips that capture the intricate correlation between traffic sign\nregulations and lanes. We define two pivotal sub-tasks: 1) Rule Extraction from\nTraffic Sign, which accurately deciphers regulatory instructions, and 2)\nRule-Lane Correspondence Reasoning, which aligns these rules with their\nrespective lanes. Built upon this benchmark, we provide a multimodal solution\nthat offers a strong baseline for advancing autonomous driving technologies. It\nfills a critical gap in the integration of traffic sign rules, contributing to\nthe development of reliable autonomous navigation systems.\n","authors":["Xinyuan Chang","Maixuan Xue","Xinran Liu","Zheng Pan","Xing Wei"],"pdf_url":"https://arxiv.org/pdf/2410.23780v1.pdf","comment":"27 pages, 13 figures"},{"id":"http://arxiv.org/abs/2410.23775v1","updated":"2024-10-31T09:45:00Z","published":"2024-10-31T09:45:00Z","title":"In-Context LoRA for Diffusion Transformers","summary":" Recent research arXiv:2410.15027 has explored the use of diffusion\ntransformers (DiTs) for task-agnostic image generation by simply concatenating\nattention tokens across images. However, despite substantial computational\nresources, the fidelity of the generated images remains suboptimal. In this\nstudy, we reevaluate and streamline this framework by hypothesizing that\ntext-to-image DiTs inherently possess in-context generation capabilities,\nrequiring only minimal tuning to activate them. Through diverse task\nexperiments, we qualitatively demonstrate that existing text-to-image DiTs can\neffectively perform in-context generation without any tuning. Building on this\ninsight, we propose a remarkably simple pipeline to leverage the in-context\nabilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint\ncaptioning of multiple images, and (3) apply task-specific LoRA tuning using\nsmall datasets (e.g., $20\\sim 100$ samples) instead of full-parameter tuning\nwith large datasets. We name our models In-Context LoRA (IC-LoRA). This\napproach requires no modifications to the original DiT models, only changes to\nthe training data. Remarkably, our pipeline generates high-fidelity image sets\nthat better adhere to prompts. While task-specific in terms of tuning data, our\nframework remains task-agnostic in architecture and pipeline, offering a\npowerful tool for the community and providing valuable insights for further\nresearch on product-level task-agnostic generation systems. 
We release our\ncode, data, and models at https://github.com/ali-vilab/In-Context-LoRA\n","authors":["Lianghua Huang","Wei Wang","Zhi-Fan Wu","Yupeng Shi","Huanzhang Dou","Chen Liang","Yutong Feng","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.23775v1.pdf","comment":"Project page: https://ali-vilab.github.io/In-Context-Lora-Page/"},{"id":"http://arxiv.org/abs/2410.14790v2","updated":"2024-10-31T09:34:45Z","published":"2024-10-02T13:42:38Z","title":"SSL-NBV: A Self-Supervised-Learning-Based Next-Best-View algorithm for\n Efficient 3D Plant Reconstruction by a Robot","summary":" The 3D reconstruction of plants is challenging due to their complex shape\ncausing many occlusions. Next-Best-View (NBV) methods address this by\niteratively selecting new viewpoints to maximize information gain (IG).\nDeep-learning-based NBV (DL-NBV) methods demonstrate higher computational\nefficiency over classic voxel-based NBV approaches but current methods require\nextensive training using ground-truth plant models, making them impractical for\nreal-world plants. These methods, moreover, rely on offline training with\npre-collected data, limiting adaptability in changing agricultural\nenvironments. This paper proposes a self-supervised learning-based NBV method\n(SSL-NBV) that uses a deep neural network to predict the IG for candidate\nviewpoints. The method allows the robot to gather its own training data during\ntask execution by comparing new 3D sensor data to the earlier gathered data and\nby employing weakly-supervised learning and experience replay for efficient\nonline learning. Comprehensive evaluations were conducted in simulation and\nreal-world environments using cross-validation. The results showed that SSL-NBV\nrequired fewer views for plant reconstruction than non-NBV methods and was over\n800 times faster than a voxel-based method. SSL-NBV reduced training\nannotations by over 90% compared to a baseline DL-NBV. Furthermore, SSL-NBV\ncould adapt to novel scenarios through online fine-tuning. Also using real\nplants, the results showed that the proposed method can learn to effectively\nplan new viewpoints for 3D plant reconstruction. Most importantly, SSL-NBV\nautomated the entire network training and uses continuous online learning,\nallowing it to operate in changing agricultural environments.\n","authors":["Jianchao Ci","Eldert J. van Henten","Xin Wang","Akshay K. Burusa","Gert Kootstra"],"pdf_url":"https://arxiv.org/pdf/2410.14790v2.pdf","comment":"22 pages, 11 figures, 1 table"},{"id":"http://arxiv.org/abs/2410.23767v1","updated":"2024-10-31T09:29:55Z","published":"2024-10-31T09:29:55Z","title":"Open-Set 3D object detection in LiDAR data as an Out-of-Distribution\n problem","summary":" 3D Object Detection from LiDAR data has achieved industry-ready performance\nin controlled environments through advanced deep learning methods. However,\nthese neural network models are limited by a finite set of inlier object\ncategories. Our work redefines the open-set 3D Object Detection problem in\nLiDAR data as an Out-Of-Distribution (OOD) problem to detect outlier objects.\nThis approach brings additional information in comparison with traditional\nobject detection. We establish a comparative benchmark and show that two-stage\nOOD methods, notably autolabelling, show promising results for 3D OOD Object\nDetection. 
Our contributions include setting a rigorous evaluation protocol by\nexamining the evaluation of hyperparameters and evaluating strategies for\ngenerating additional data to train an OOD-aware 3D object detector. This\ncomprehensive analysis is essential for developing robust 3D object detection\nsystems that can perform reliably in diverse and unpredictable real-world\nscenarios.\n","authors":["Louis Soum-Fontez","Jean-Emmanuel Deschaud","François Goulette"],"pdf_url":"https://arxiv.org/pdf/2410.23767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23758v1","updated":"2024-10-31T09:25:02Z","published":"2024-10-31T09:25:02Z","title":"Reverse Attitude Statistics Based Star Map Identification Method","summary":" The star tracker is generally affected by the atmospheric background light\nand the aerodynamic environment when working in near space, which results in\nmissing stars or false stars. Moreover, high-speed maneuvering may cause star\ntrailing, which reduces the accuracy of the star position. To address the\nchallenges for starmap identification, a reverse attitude statistics based\nmethod is proposed to handle position noise, false stars, and missing stars.\nConversely to existing methods which match before solving for attitude, this\nmethod introduces attitude solving into the matching process, and obtains the\nfinal match and the correct attitude simultaneously by frequency statistics.\nFirstly, based on stable angular distance features, the initial matching is\nobtained by utilizing spatial hash indexing. Then, the dual-vector attitude\ndetermination is introduced to calculate potential attitude. Finally, the star\npairs are accurately matched by applying a frequency statistics filtering\nmethod. In addition, Bayesian optimization is employed to find optimal\nparameters under the impact of noises, which is able to enhance the algorithm\nperformance further. In this work, the proposed method is validated in\nsimulation, field test and on-orbit experiment. Compared with the\nstate-of-the-art, the identification rate is improved by more than 14.3%, and\nthe solving time is reduced by over 28.5%.\n","authors":["Shunmei Dong","Qinglong Wang","Haiqing Wang","Qianqian Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23758v1.pdf","comment":"10 pages, 17figures, 4 tables, 4663 words, submitted to IEEE Sensors\n Journal"},{"id":"http://arxiv.org/abs/2403.09400v3","updated":"2024-10-31T09:21:29Z","published":"2024-03-14T13:50:44Z","title":"ConDiSR: Contrastive Disentanglement and Style Regularization for Single\n Domain Generalization","summary":" Medical data often exhibits distribution shifts, which cause test-time\nperformance degradation for deep learning models trained using standard\nsupervised learning pipelines. This challenge is addressed in the field of\nDomain Generalization (DG) with the sub-field of Single Domain Generalization\n(SDG) being specifically interesting due to the privacy- or logistics-related\nissues often associated with medical data. Existing disentanglement-based SDG\nmethods heavily rely on structural information embedded in segmentation masks,\nhowever classification labels do not provide such dense information. This work\nintroduces a novel SDG method aimed at medical image classification that\nleverages channel-wise contrastive disentanglement. It is further enhanced with\nreconstruction-based style regularization to ensure extraction of distinct\nstyle and structure feature representations. 
We evaluate our method on the\ncomplex task of multicenter histopathology image classification, comparing it\nagainst state-of-the-art (SOTA) SDG baselines. Results demonstrate that our\nmethod surpasses the SOTA by a margin of 1% in average accuracy while also\nshowing more stable performance. This study highlights the importance and\nchallenges of exploring SDG frameworks in the context of the classification\ntask. The code is publicly available at\nhttps://github.com/BioMedIA-MBZUAI/ConDiSR\n","authors":["Aleksandr Matsun","Numan Saeed","Fadillah Adamsyah Maani","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.09400v3.pdf","comment":"A flaw was found in the results acquisition"},{"id":"http://arxiv.org/abs/2410.23751v1","updated":"2024-10-31T09:11:56Z","published":"2024-10-31T09:11:56Z","title":"EXACFS -- A CIL Method to mitigate Catastrophic Forgetting","summary":" Deep neural networks (DNNS) excel at learning from static datasets but\nstruggle with continual learning, where data arrives sequentially. Catastrophic\nforgetting, the phenomenon of forgetting previously learned knowledge, is a\nprimary challenge. This paper introduces EXponentially Averaged Class-wise\nFeature Significance (EXACFS) to mitigate this issue in the class incremental\nlearning (CIL) setting. By estimating the significance of model features for\neach learned class using loss gradients, gradually aging the significance\nthrough the incremental tasks and preserving the significant features through a\ndistillation loss, EXACFS effectively balances remembering old knowledge\n(stability) and learning new knowledge (plasticity). Extensive experiments on\nCIFAR-100 and ImageNet-100 demonstrate EXACFS's superior performance in\npreserving stability while acquiring plasticity.\n","authors":["S Balasubramanian","M Sai Subramaniam","Sai Sriram Talasu","P Yedu Krishna","Manepalli Pranav Phanindra Sai","Ravi Mukkamala","Darshan Gera"],"pdf_url":"https://arxiv.org/pdf/2410.23751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13495v2","updated":"2024-10-31T09:11:37Z","published":"2024-06-19T12:35:02Z","title":"DF40: Toward Next-Generation Deepfake Detection","summary":" We propose a new comprehensive benchmark to revolutionize the current\ndeepfake detection field to the next generation. Predominantly, existing works\nidentify top-notch detection algorithms and models by adhering to the common\npractice: training detectors on one specific dataset (e.g., FF++) and testing\nthem on other prevalent deepfake datasets. This protocol is often regarded as a\n\"golden compass\" for navigating SoTA detectors. But can these stand-out\n\"winners\" be truly applied to tackle the myriad of realistic and diverse\ndeepfakes lurking in the real world? If not, what underlying factors contribute\nto this gap? In this work, we found the dataset (both train and test) can be\nthe \"primary culprit\" due to: (1) forgery diversity: Deepfake techniques are\ncommonly referred to as both face forgery and entire image synthesis. Most\nexisting datasets only contain partial types of them, with limited forgery\nmethods implemented; (2) forgery realism: The dominated training dataset, FF++,\ncontains out-of-date forgery techniques from the past four years. \"Honing\nskills\" on these forgeries makes it difficult to guarantee effective detection\ngeneralization toward nowadays' SoTA deepfakes; (3) evaluation protocol: Most\ndetection works perform evaluations on one type, which hinders the development\nof universal deepfake detectors. 
To address this dilemma, we construct a highly\ndiverse deepfake detection dataset called DF40, which comprises 40 distinct\ndeepfake techniques. We then conduct comprehensive evaluations using 4 standard\nevaluation protocols and 8 representative detection methods, resulting in over\n2,000 evaluations. Through these evaluations, we provide an extensive analysis\nfrom various perspectives, leading to 7 new insightful findings. We also open\nup 4 valuable yet previously underexplored research questions to inspire future\nworks. Our project page is https://github.com/YZY-stack/DF40.\n","authors":["Zhiyuan Yan","Taiping Yao","Shen Chen","Yandan Zhao","Xinghe Fu","Junwei Zhu","Donghao Luo","Chengjie Wang","Shouhong Ding","Yunsheng Wu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2406.13495v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2108.05080 by other authors"},{"id":"http://arxiv.org/abs/2405.14702v2","updated":"2024-10-31T09:08:48Z","published":"2024-05-23T15:37:06Z","title":"G3: An Effective and Adaptive Framework for Worldwide Geolocalization\n Using Large Multi-Modality Models","summary":" Worldwide geolocalization aims to locate the precise location at the\ncoordinate level of photos taken anywhere on the Earth. It is very challenging\ndue to 1) the difficulty of capturing subtle location-aware visual semantics,\nand 2) the heterogeneous geographical distribution of image data. As a result,\nexisting studies have clear limitations when scaled to a worldwide context.\nThey may easily confuse distant images with similar visual contents, or cannot\nadapt to various locations worldwide with different amounts of relevant data.\nTo resolve these limitations, we propose G3, a novel framework based on\nRetrieval-Augmented Generation (RAG). In particular, G3 consists of three\nsteps, i.e., Geo-alignment, Geo-diversification, and Geo-verification to\noptimize both retrieval and generation phases of worldwide geolocalization.\nDuring Geo-alignment, our solution jointly learns expressive multi-modal\nrepresentations for images, GPS and textual descriptions, which allows us to\ncapture location-aware semantics for retrieving nearby images for a given\nquery. During Geo-diversification, we leverage a prompt ensembling method that\nis robust to inconsistent retrieval performance for different image queries.\nFinally, we combine both retrieved and generated GPS candidates in\nGeo-verification for location prediction. Experiments on two well-established\ndatasets IM2GPS3k and YFCC4k verify the superiority of G3 compared to other\nstate-of-the-art methods. Our code and data are available online for\nreproduction.\n","authors":["Pengyue Jia","Yiding Liu","Xiaopeng Li","Yuhao Wang","Yantong Du","Xiao Han","Xuetao Wei","Shuaiqiang Wang","Dawei Yin","Xiangyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.14702v2.pdf","comment":"Accepted to NeurIPS2024"},{"id":"http://arxiv.org/abs/2312.05508v3","updated":"2024-10-31T09:04:41Z","published":"2023-12-09T09:08:03Z","title":"Improving Adversarial Robust Fairness via Anti-Bias Soft Label\n Distillation","summary":" Adversarial Training (AT) has been widely proved to be an effective method to\nimprove the adversarial robustness against adversarial examples for Deep Neural\nNetworks (DNNs). As a variant of AT, Adversarial Robustness Distillation (ARD)\nhas demonstrated its superior performance in improving the robustness of small\nstudent models with the guidance of large teacher models. 
However, both AT and\nARD encounter the robust fairness problem: these models exhibit strong\nrobustness when facing part of classes (easy class), but weak robustness when\nfacing others (hard class). In this paper, we give an in-depth analysis of the\npotential factors and argue that the smoothness degree of samples' soft labels\nfor different classes (i.e., hard class or easy class) will affect the robust\nfairness of DNNs from both empirical observation and theoretical analysis.\nBased on the above finding, we propose an Anti-Bias Soft Label Distillation\n(ABSLD) method to mitigate the adversarial robust fairness problem within the\nframework of Knowledge Distillation (KD). Specifically, ABSLD adaptively\nreduces the student's error risk gap between different classes to achieve\nfairness by adjusting the class-wise smoothness degree of samples' soft labels\nduring the training process, and the smoothness degree of soft labels is\ncontrolled by assigning different temperatures in KD to different classes.\nExtensive experiments demonstrate that ABSLD outperforms state-of-the-art AT,\nARD, and robust fairness methods in the comprehensive metric (Normalized\nStandard Deviation) of robustness and fairness.\n","authors":["Shiji Zhao","Ranjie Duan","Xizhe Wang","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2312.05508v3.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2410.23744v1","updated":"2024-10-31T08:59:34Z","published":"2024-10-31T08:59:34Z","title":"EchoNarrator: Generating natural text explanations for ejection fraction\n predictions","summary":" Ejection fraction (EF) of the left ventricle (LV) is considered as one of the\nmost important measurements for diagnosing acute heart failure and can be\nestimated during cardiac ultrasound acquisition. While recent successes in deep\nlearning research successfully estimate EF values, the proposed models often\nlack an explanation for the prediction. However, providing clear and intuitive\nexplanations for clinical measurement predictions would increase the trust of\ncardiologists in these models. In this paper, we explore predicting EF\nmeasurements with Natural Language Explanation (NLE). We propose a model that\nin a single forward pass combines estimation of the LV contour over multiple\nframes, together with a set of modules and routines for computing various\nmotion and shape attributes that are associated with ejection fraction. It then\nfeeds the attributes into a large language model to generate text that helps to\nexplain the network's outcome in a human-like manner. We provide experimental\nevaluation of our explanatory output, as well as EF prediction, and show that\nour model can provide EF comparable to state-of-the-art together with\nmeaningful and accurate natural language explanation to the prediction. The\nproject page can be found at https://github.com/guybenyosef/EchoNarrator .\n","authors":["Sarina Thomas","Qing Cao","Anna Novikova","Daria Kulikova","Guy Ben-Yosef"],"pdf_url":"https://arxiv.org/pdf/2410.23744v1.pdf","comment":"accepted for MICCAI 2024"},{"id":"http://arxiv.org/abs/2405.15223v3","updated":"2024-10-31T08:58:08Z","published":"2024-05-24T05:29:12Z","title":"iVideoGPT: Interactive VideoGPTs are Scalable World Models","summary":" World models empower model-based agents to interactively explore, reason, and\nplan within imagined environments for real-world decision-making. 
However, the\nhigh demand for interactivity poses challenges in harnessing recent\nadvancements in video generative models for developing world models at scale.\nThis work introduces Interactive VideoGPT (iVideoGPT), a scalable\nautoregressive transformer framework that integrates multimodal signals--visual\nobservations, actions, and rewards--into a sequence of tokens, facilitating an\ninteractive experience of agents via next-token prediction. iVideoGPT features\na novel compressive tokenization technique that efficiently discretizes\nhigh-dimensional visual observations. Leveraging its scalable architecture, we\nare able to pre-train iVideoGPT on millions of human and robotic manipulation\ntrajectories, establishing a versatile foundation that is adaptable to serve as\ninteractive world models for a wide range of downstream tasks. These include\naction-conditioned video prediction, visual planning, and model-based\nreinforcement learning, where iVideoGPT achieves competitive performance\ncompared with state-of-the-art methods. Our work advances the development of\ninteractive general world models, bridging the gap between generative video\nmodels and practical model-based reinforcement learning applications. Code and\npre-trained models are available at https://thuml.github.io/iVideoGPT.\n","authors":["Jialong Wu","Shaofeng Yin","Ningya Feng","Xu He","Dong Li","Jianye Hao","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2405.15223v3.pdf","comment":"NeurIPS 2024. Code is available at project website:\n https://thuml.github.io/iVideoGPT"},{"id":"http://arxiv.org/abs/2410.23742v1","updated":"2024-10-31T08:58:00Z","published":"2024-10-31T08:58:00Z","title":"Scaled Inverse Graphics: Efficiently Learning Large Sets of 3D Scenes","summary":" While the field of inverse graphics has been witnessing continuous growth,\ntechniques devised thus far predominantly focus on learning individual scene\nrepresentations. In contrast, learning large sets of scenes has been a\nconsiderable bottleneck in NeRF developments, as repeatedly applying inverse\ngraphics on a sequence of scenes, though essential for various applications,\nremains largely prohibitive in terms of resource costs. We introduce a\nframework termed \"scaled inverse graphics\", aimed at efficiently learning large\nsets of scene representations, and propose a novel method to this end. It\noperates in two stages: (i) training a compression model on a subset of scenes,\nthen (ii) training NeRF models on the resulting smaller representations,\nthereby reducing the optimization space per new scene. In practice, we compact\nthe representation of scenes by learning NeRFs in a latent space to reduce the\nimage resolution, and sharing information across scenes to reduce NeRF\nrepresentation complexity. We experimentally show that our method presents both\nthe lowest training time and memory footprint in scaled inverse graphics\ncompared to other methods applied independently on each scene. Our codebase is\npublicly available as open-source. 
Our project page can be found at\nhttps://scaled-ig.github.io .\n","authors":["Karim Kassab","Antoine Schnepf","Jean-Yves Franceschi","Laurent Caraffa","Flavian Vasile","Jeremie Mary","Andrew Comport","Valérie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2410.23742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23738v1","updated":"2024-10-31T08:54:23Z","published":"2024-10-31T08:54:23Z","title":"MLLA-UNet: Mamba-like Linear Attention in an Efficient U-Shape Model for\n Medical Image Segmentation","summary":" Recent advancements in medical imaging have resulted in more complex and\ndiverse images, with challenges such as high anatomical variability, blurred\ntissue boundaries, low organ contrast, and noise. Traditional segmentation\nmethods struggle to address these challenges, making deep learning approaches,\nparticularly U-shaped architectures, increasingly prominent. However, the\nquadratic complexity of standard self-attention makes Transformers\ncomputationally prohibitive for high-resolution images. To address these\nchallenges, we propose MLLA-UNet (Mamba-Like Linear Attention UNet), a novel\narchitecture that achieves linear computational complexity while maintaining\nhigh segmentation accuracy through its innovative combination of linear\nattention and Mamba-inspired adaptive mechanisms, complemented by an efficient\nsymmetric sampling structure for enhanced feature processing. Our architecture\neffectively preserves essential spatial features while capturing long-range\ndependencies at reduced computational complexity. Additionally, we introduce a\nnovel sampling strategy for multi-scale feature fusion. Experiments demonstrate\nthat MLLA-UNet achieves state-of-the-art performance on six challenging\ndatasets with 24 different segmentation tasks, including but not limited to\nFLARE22, AMOS CT, and ACDC, with an average DSC of 88.32%. These results\nunderscore the superiority of MLLA-UNet over existing methods. Our\ncontributions include the novel 2D segmentation architecture and its empirical\nvalidation. The code is available via https://github.com/csyfjiang/MLLA-UNet.\n","authors":["Yufeng Jiang","Zongxi Li","Xiangyan Chen","Haoran Xie","Jing Cai"],"pdf_url":"https://arxiv.org/pdf/2410.23738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23736v1","updated":"2024-10-31T08:49:05Z","published":"2024-10-31T08:49:05Z","title":"MoTaDual: Modality-Task Dual Alignment for Enhanced Zero-shot Composed\n Image Retrieval","summary":" Composed Image Retrieval (CIR) is a challenging vision-language task,\nutilizing bi-modal (image+text) queries to retrieve target images. Despite the\nimpressive performance of supervised CIR, the dependence on costly,\nmanually-labeled triplets limits its scalability and zero-shot capability. To\naddress this issue, zero-shot composed image retrieval (ZS-CIR) is presented\nalong with projection-based approaches. However, such methods face two major\nproblems, i.e., task discrepancy between pre-training (image $\\leftrightarrow$\ntext) and inference (image+text $\\rightarrow$ image), and modality discrepancy.\nThe latter pertains to approaches based on text-only projection training due to\nthe necessity of feature extraction from the reference image during inference.\nIn this paper, we propose a two-stage framework to tackle both discrepancies.\nFirst, to ensure efficiency and scalability, a textual inversion network is\npre-trained on large-scale caption datasets. 
Subsequently, we put forward\nModality-Task Dual Alignment (MoTaDual) as the second stage, where\nlarge-language models (LLMs) generate triplet data for fine-tuning, and\nadditionally, prompt learning is introduced in a multi-modal context to\neffectively alleviate both modality and task discrepancies. The experimental\nresults show that our MoTaDual achieves the state-of-the-art performance across\nfour widely used ZS-CIR benchmarks, while maintaining low training time and\ncomputational cost. The code will be released soon.\n","authors":["Haiwen Li","Fei Su","Zhicheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.23736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08845v4","updated":"2024-10-31T08:38:26Z","published":"2024-06-13T06:09:22Z","title":"Rethinking Human Evaluation Protocol for Text-to-Video Models: Enhancing\n Reliability,Reproducibility, and Practicality","summary":" Recent text-to-video (T2V) technology advancements, as demonstrated by models\nsuch as Gen2, Pika, and Sora, have significantly broadened its applicability\nand popularity. Despite these strides, evaluating these models poses\nsubstantial challenges. Primarily, due to the limitations inherent in automatic\nmetrics, manual evaluation is often considered a superior method for assessing\nT2V generation. However, existing manual evaluation protocols face\nreproducibility, reliability, and practicality issues. To address these\nchallenges, this paper introduces the Text-to-Video Human Evaluation (T2VHE)\nprotocol, a comprehensive and standardized protocol for T2V models. The T2VHE\nprotocol includes well-defined metrics, thorough annotator training, and an\neffective dynamic evaluation module. Experimental results demonstrate that this\nprotocol not only ensures high-quality annotations but can also reduce\nevaluation costs by nearly 50\\%. We will open-source the entire setup of the\nT2VHE protocol, including the complete protocol workflow, the dynamic\nevaluation component details, and the annotation interface code. This will help\ncommunities establish more sophisticated human assessment protocols.\n","authors":["Tianle Zhang","Langtian Ma","Yuchen Yan","Yuchen Zhang","Kai Wang","Yue Yang","Ziyao Guo","Wenqi Shao","Yang You","Yu Qiao","Ping Luo","Kaipeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.08845v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16666v4","updated":"2024-10-31T08:37:22Z","published":"2024-04-25T15:06:58Z","title":"PhyRecon: Physically Plausible Neural Scene Reconstruction","summary":" We address the issue of physical implausibility in multi-view neural\nreconstruction. While implicit representations have gained popularity in\nmulti-view 3D reconstruction, previous work struggles to yield physically\nplausible results, limiting their utility in domains requiring rigorous\nphysical accuracy. This lack of plausibility stems from the absence of physics\nmodeling in existing methods and their inability to recover intricate\ngeometrical structures. In this paper, we introduce PHYRECON, the first\napproach to leverage both differentiable rendering and differentiable physics\nsimulation to learn implicit surface representations. PHYRECON features a novel\ndifferentiable particle-based physical simulator built on neural implicit\nrepresentations. 
Central to this design is an efficient transformation between\nSDF-based implicit representations and explicit surface points via our proposed\nSurface Points Marching Cubes (SP-MC), enabling differentiable learning with\nboth rendering and physical losses. Additionally, PHYRECON models both\nrendering and physical uncertainty to identify and compensate for inconsistent\nand inaccurate monocular geometric priors. The physical uncertainty further\nfacilitates physics-guided pixel sampling to enhance the learning of slender\nstructures. By integrating these techniques, our model supports differentiable\njoint modeling of appearance, geometry, and physics. Extensive experiments\ndemonstrate that PHYRECON significantly improves the reconstruction quality.\nOur results also exhibit superior physical stability in physical simulators,\nwith at least a 40% improvement across all datasets, paving the way for future\nphysics-based applications.\n","authors":["Junfeng Ni","Yixin Chen","Bohan Jing","Nan Jiang","Bin Wang","Bo Dai","Puhao Li","Yixin Zhu","Song-Chun Zhu","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.16666v4.pdf","comment":"NeurIPS'24. Project page: https://phyrecon.github.io/"},{"id":"http://arxiv.org/abs/2410.23730v1","updated":"2024-10-31T08:33:30Z","published":"2024-10-31T08:33:30Z","title":"An Empirical Analysis of GPT-4V's Performance on Fashion Aesthetic\n Evaluation","summary":" Fashion aesthetic evaluation is the task of estimating how well the outfits\nworn by individuals in images suit them. In this work, we examine the zero-shot\nperformance of GPT-4V on this task for the first time. We show that its\npredictions align fairly well with human judgments on our datasets, and also\nfind that it struggles with ranking outfits in similar colors. The code is\navailable at https://github.com/st-tech/gpt4v-fashion-aesthetic-evaluation.\n","authors":["Yuki Hirakawa","Takashi Wada","Kazuya Morishita","Ryotaro Shimizu","Takuya Furusawa","Sai Htaung Kham","Yuki Saito"],"pdf_url":"https://arxiv.org/pdf/2410.23730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.13453v2","updated":"2024-10-31T08:29:51Z","published":"2022-03-25T05:27:28Z","title":"Model LEGO: Creating Models Like Disassembling and Assembling Building\n Blocks","summary":" With the rapid development of deep learning, the increasing complexity and\nscale of parameters make training a new model increasingly resource-intensive.\nIn this paper, we start from the classic convolutional neural network (CNN) and\nexplore a paradigm that does not require training to obtain new models. Similar\nto the birth of CNN inspired by receptive fields in the biological visual\nsystem, we draw inspiration from the information subsystem pathways in the\nbiological visual system and propose Model Disassembling and Assembling (MDA).\nDuring model disassembling, we introduce the concept of relative contribution\nand propose a component locating technique to extract task-aware components\nfrom trained CNN classifiers. For model assembling, we present the alignment\npadding strategy and parameter scaling strategy to construct a new model\ntailored for a specific task, utilizing the disassembled task-aware components.\nThe entire process is akin to playing with LEGO bricks, enabling arbitrary\nassembly of new models, and providing a novel perspective for model creation\nand reuse. 
Extensive experiments showcase that task-aware components\ndisassembled from CNN classifiers or new models assembled using these\ncomponents closely match or even surpass the performance of the baseline,\ndemonstrating its promising results for model reuse. Furthermore, MDA exhibits\ndiverse potential applications, with comprehensive experiments exploring model\ndecision route analysis, model compression, knowledge distillation, and more.\nThe code is available at https://github.com/jiaconghu/Model-LEGO.\n","authors":["Jiacong Hu","Jing Gao","Jingwen Ye","Yang Gao","Xingen Wang","Zunlei Feng","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2203.13453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22733v2","updated":"2024-10-31T08:26:18Z","published":"2024-10-30T06:39:27Z","title":"ETO:Efficient Transformer-based Local Feature Matching by Organizing\n Multiple Homography Hypotheses","summary":" We tackle the efficiency problem of learning local feature matching. Recent\nadvancements have given rise to purely CNN-based and transformer-based\napproaches, each augmented with deep learning techniques. While CNN-based\nmethods often excel in matching speed, transformer-based methods tend to\nprovide more accurate matches. We propose an efficient transformer-based\nnetwork architecture for local feature matching. This technique is built on\nconstructing multiple homography hypotheses to approximate the continuous\ncorrespondence in the real world and uni-directional cross-attention to\naccelerate the refinement. On the YFCC100M dataset, our matching accuracy is\ncompetitive with LoFTR, a state-of-the-art transformer-based architecture,\nwhile the inference speed is boosted to 4 times, even outperforming the\nCNN-based methods. Comprehensive evaluations on other open datasets such as\nMegadepth, ScanNet, and HPatches demonstrate our method's efficacy,\nhighlighting its potential to significantly enhance a wide array of downstream\napplications.\n","authors":["Junjie Ni","Guofeng Zhang","Guanglin Li","Yijin Li","Xinyang Liu","Zhaoyang Huang","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2410.22733v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04555v2","updated":"2024-10-31T08:25:08Z","published":"2024-02-07T03:19:02Z","title":"FM-Fusion: Instance-aware Semantic Mapping Boosted by Vision-Language\n Foundation Models","summary":" Semantic mapping based on the supervised object detectors is sensitive to\nimage distribution. In real-world environments, the object detection and\nsegmentation performance can lead to a major drop, preventing the use of\nsemantic mapping in a wider domain. On the other hand, the development of\nvision-language foundation models demonstrates a strong zero-shot\ntransferability across data distribution. It provides an opportunity to\nconstruct generalizable instance-aware semantic maps. Hence, this work explores\nhow to boost instance-aware semantic mapping from object detection generated\nfrom foundation models. We propose a probabilistic label fusion method to\npredict close-set semantic classes from open-set label measurements. An\ninstance refinement module merges the over-segmented instances caused by\ninconsistent segmentation. We integrate all the modules into a unified semantic\nmapping system. Reading a sequence of RGB-D input, our work incrementally\nreconstructs an instance-aware semantic map. We evaluate the zero-shot\nperformance of our method in ScanNet and SceneNN datasets. 
Our method achieves\n40.3 mean average precision (mAP) on the ScanNet semantic instance segmentation\ntask. It outperforms the traditional semantic mapping method significantly.\n","authors":["Chuhao Liu","Ke Wang","Jieqi Shi","Zhijian Qiao","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2402.04555v2.pdf","comment":"Published in IEEE RAL"},{"id":"http://arxiv.org/abs/2312.10175v3","updated":"2024-10-31T08:10:48Z","published":"2023-12-15T19:57:07Z","title":"UniAR: A Unified model for predicting human Attention and Responses on\n visual content","summary":" Progress in human behavior modeling involves understanding both implicit,\nearly-stage perceptual behavior, such as human attention, and explicit,\nlater-stage behavior, such as subjective preferences or likes. Yet most prior\nresearch has focused on modeling implicit and explicit human behavior in\nisolation; and often limited to a specific type of visual content. We propose\nUniAR -- a unified model of human attention and preference behavior across\ndiverse visual content. UniAR leverages a multimodal transformer to predict\nsubjective feedback, such as satisfaction or aesthetic quality, along with the\nunderlying human attention or interaction heatmaps and viewing order. We train\nUniAR on diverse public datasets spanning natural images, webpages, and graphic\ndesigns, and achieve SOTA performance on multiple benchmarks across various\nimage domains and behavior modeling tasks. Potential applications include\nproviding instant feedback on the effectiveness of UIs/visual content, and\nenabling designers and content-creation models to optimize their creation for\nhuman-centric improvements.\n","authors":["Peizhao Li","Junfeng He","Gang Li","Rachit Bhargava","Shaolei Shen","Nachiappan Valliappan","Youwei Liang","Hongxiang Gu","Venky Ramachandran","Golnaz Farhadi","Yang Li","Kai J Kohlhoff","Vidhya Navalpakkam"],"pdf_url":"https://arxiv.org/pdf/2312.10175v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23718v1","updated":"2024-10-31T08:08:54Z","published":"2024-10-31T08:08:54Z","title":"GaussianMarker: Uncertainty-Aware Copyright Protection of 3D Gaussian\n Splatting","summary":" 3D Gaussian Splatting (3DGS) has become a crucial method for acquiring 3D\nassets. To protect the copyright of these assets, digital watermarking\ntechniques can be applied to embed ownership information discreetly within 3DGS\nmodels. However, existing watermarking methods for meshes, point clouds, and\nimplicit radiance fields cannot be directly applied to 3DGS models, as 3DGS\nmodels use explicit 3D Gaussians with distinct structures and do not rely on\nneural networks. Naively embedding the watermark on a pre-trained 3DGS can\ncause obvious distortion in rendered images. In our work, we propose an\nuncertainty-based method that constrains the perturbation of model parameters\nto achieve invisible watermarking for 3DGS. At the message decoding stage, the\ncopyright messages can be reliably extracted from both 3D Gaussians and 2D\nrendered images even under various forms of 3D and 2D distortions. 
We conduct\nextensive experiments on the Blender, LLFF and MipNeRF-360 datasets to validate\nthe effectiveness of our proposed method, demonstrating state-of-the-art\nperformance on both message decoding accuracy and view synthesis quality.\n","authors":["Xiufeng Huang","Ruiqi Li","Yiu-ming Cheung","Ka Chun Cheung","Simon See","Renjie Wan"],"pdf_url":"https://arxiv.org/pdf/2410.23718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22725v2","updated":"2024-10-31T08:08:07Z","published":"2024-10-30T06:17:20Z","title":"One Prompt to Verify Your Models: Black-Box Text-to-Image Models\n Verification via Non-Transferable Adversarial Attacks","summary":" Recently, the success of Text-to-Image (T2I) models has led to the rise of\nnumerous third-party platforms, which claim to provide cheaper API services and\nmore flexibility in model options. However, this also raises a new security\nconcern: Are these third-party services truly offering the models they claim?\nTo address this problem, we propose the first T2I model verification method\nnamed Text-to-Image Model Verification via Non-Transferable Adversarial Attacks\n(TVN). The non-transferability of adversarial examples means that these\nexamples are only effective on a target model and ineffective on other models,\nthereby allowing for the verification of the target model. TVN utilizes the\nNon-dominated Sorting Genetic Algorithm II (NSGA-II) to optimize the cosine\nsimilarity of a prompt's text encoding, generating non-transferable adversarial\nprompts. By calculating the CLIP-text scores between the non-transferable\nadversarial prompts without perturbations and the images, we can verify if the\nmodel matches the claimed target model, based on a 3-sigma threshold. The\nexperiments showed that TVN performed well in both closed-set and open-set\nscenarios, achieving a verification accuracy of over 90\\%. Moreover, the\nadversarial prompts generated by TVN significantly reduced the CLIP-text scores\nof the target model, while having little effect on other models.\n","authors":["Ji Guo","Wenbo Jiang","Rui Zhang","Guoming Lu","Hongwei Li"],"pdf_url":"https://arxiv.org/pdf/2410.22725v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20568v2","updated":"2024-10-31T07:43:52Z","published":"2024-10-27T19:45:15Z","title":"Detection of adrenal anomalous findings in spinal CT images using multi\n model graph aggregation","summary":" Low back pain is the symptom that is the second most frequently reported to\nprimary care physicians, effecting 50 to 80 percent of the population in a\nlifetime, resulting in multiple referrals of patients suffering from back\nproblems, to CT and MRI scans, which are then examined by radiologists. The\nradiologists examining these spinal scans naturally focus on spinal pathologies\nand might miss other types of abnormalities, and in particular, abdominal ones,\nsuch as malignancies. Nevertheless, the patients whose spine was scanned might\nas well have malignant and other abdominal pathologies. Thus, clinicians have\nsuggested the need for computerized assistance and decision support in\nscreening spinal scans for additional abnormalities. In the current study, We\nhave addressed the important case of detecting suspicious lesions in the\nadrenal glands as an example for the overall methodology we have developed. A\npatient CT scan is integrated from multiple slices with an axial orientation.\nOur method determines whether a patient has an abnormal adrenal gland, and\nlocalises the abnormality if it exists. 
Our method is composed of three deep\nlearning models; each model has a different task for achieving the final goal.\nWe call our compound method the Multi Model Graph Aggregation MMGA method. The\nnovelty in this study is twofold. First, the use, for an important screening\ntask, of CT scans that are originally focused and tuned for imaging the spine,\nwhich were acquired from patients with potential spinal disorders, for\ndetection of a totally different set of abnormalities such as abdominal Adrenal\nglands pathologies. Second, we have built a complex pipeline architecture\ncomposed from three deep learning models that can be utilized for other organs\n(such as the pancreas or the kidney), or for similar applications, but using\nother types of imaging, such as MRI.\n","authors":["Shabalin Carmel","Shenkman Israel","Shelef Ilan","Ben-Arie Gal","Alex Geftler","Shahar Yuval"],"pdf_url":"https://arxiv.org/pdf/2410.20568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10667v2","updated":"2024-10-31T07:43:14Z","published":"2024-04-16T15:43:22Z","title":"VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time","summary":" We introduce VASA, a framework for generating lifelike talking faces with\nappealing visual affective skills (VAS) given a single static image and a\nspeech audio clip. Our premiere model, VASA-1, is capable of not only\ngenerating lip movements that are exquisitely synchronized with the audio, but\nalso producing a large spectrum of facial nuances and natural head motions that\ncontribute to the perception of authenticity and liveliness. The core\ninnovations include a holistic facial dynamics and head movement generation\nmodel that works in a face latent space, and the development of such an\nexpressive and disentangled face latent space using videos. Through extensive\nexperiments including evaluation on a set of new metrics, we show that our\nmethod significantly outperforms previous methods along various dimensions\ncomprehensively. Our method not only delivers high video quality with realistic\nfacial and head dynamics but also supports the online generation of 512x512\nvideos at up to 40 FPS with negligible starting latency. It paves the way for\nreal-time engagements with lifelike avatars that emulate human conversational\nbehaviors.\n","authors":["Sicheng Xu","Guojun Chen","Yu-Xiao Guo","Jiaolong Yang","Chong Li","Zhenyu Zang","Yizhong Zhang","Xin Tong","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2404.10667v2.pdf","comment":"NeurIPS 2024 (Oral) Camera ready. Project webpage:\n https://www.microsoft.com/en-us/research/project/vasa-1/"},{"id":"http://arxiv.org/abs/2410.23698v1","updated":"2024-10-31T07:41:13Z","published":"2024-10-31T07:41:13Z","title":"Aggregate-and-Adapt Natural Language Prompts for Downstream\n Generalization of CLIP","summary":" Large pretrained vision-language models like CLIP have shown promising\ngeneralization capability, but may struggle in specialized domains (e.g.,\nsatellite imagery) or fine-grained classification (e.g., car models) where the\nvisual concepts are unseen or under-represented during pretraining. Prompt\nlearning offers a parameter-efficient finetuning framework that can adapt CLIP\nto downstream tasks even when limited annotation data are available. In this\npaper, we improve prompt learning by distilling the textual knowledge from\nnatural language prompts (either human- or LLM-generated) to provide rich\npriors for those under-represented concepts. 
We first obtain a prompt\n``summary'' aligned to each input image via a learned prompt aggregator. Then\nwe jointly train a prompt generator, optimized to produce a prompt embedding\nthat stays close to the aggregated summary while minimizing task loss at the\nsame time. We dub such prompt embedding as Aggregate-and-Adapted Prompt\nEmbedding (AAPE). AAPE is shown to be able to generalize to different\ndownstream data distributions and tasks, including vision-language\nunderstanding tasks (e.g., few-shot classification, VQA) and generation tasks\n(image captioning) where AAPE achieves competitive performance. We also show\nAAPE is particularly helpful to handle non-canonical and OOD examples.\nFurthermore, AAPE learning eliminates LLM-based inference cost as required by\nbaselines, and scales better with data and LLM model size.\n","authors":["Chen Huang","Skyler Seto","Samira Abnar","David Grangier","Navdeep Jaitly","Josh Susskind"],"pdf_url":"https://arxiv.org/pdf/2410.23698v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.09774v2","updated":"2024-10-31T07:40:40Z","published":"2024-09-15T15:46:03Z","title":"Generalizing Alignment Paradigm of Text-to-Image Generation with\n Preferences through $f$-divergence Minimization","summary":" Direct Preference Optimization (DPO) has recently expanded its successful\napplication from aligning large language models (LLMs) to aligning\ntext-to-image models with human preferences, which has generated considerable\ninterest within the community. However, we have observed that these approaches\nrely solely on minimizing the reverse Kullback-Leibler divergence during\nalignment process between the fine-tuned model and the reference model,\nneglecting the incorporation of other divergence constraints. In this study, we\nfocus on extending reverse Kullback-Leibler divergence in the alignment\nparadigm of text-to-image models to $f$-divergence, which aims to garner better\nalignment performance as well as good generation diversity. We provide the\ngeneralized formula of the alignment paradigm under the $f$-divergence\ncondition and thoroughly analyze the impact of different divergence constraints\non alignment process from the perspective of gradient fields. We conduct\ncomprehensive evaluation on image-text alignment performance, human value\nalignment performance and generation diversity performance under different\ndivergence constraints, and the results indicate that alignment based on\nJensen-Shannon divergence achieves the best trade-off among them. The option of\ndivergence employed for aligning text-to-image models significantly impacts the\ntrade-off between alignment performance (especially human value alignment) and\ngeneration diversity, which highlights the necessity of selecting an\nappropriate divergence for practical applications.\n","authors":["Haoyuan Sun","Bo Xia","Yongzhe Chang","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2409.09774v2.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2410.23690v1","updated":"2024-10-31T07:25:39Z","published":"2024-10-31T07:25:39Z","title":"XRDSLAM: A Flexible and Modular Framework for Deep Learning based SLAM","summary":" In this paper, we propose a flexible SLAM framework, XRDSLAM. It adopts a\nmodular code design and a multi-process running mechanism, providing highly\nreusable foundational modules such as unified dataset management, 3d\nvisualization, algorithm configuration, and metrics evaluation. 
It can help\ndevelopers quickly build a complete SLAM system, flexibly combine different\nalgorithm modules, and conduct standardized benchmarking for accuracy and\nefficiency comparison. Within this framework, we integrate several\nstate-of-the-art SLAM algorithms with different types, including NeRF and 3DGS\nbased SLAM, and even odometry or reconstruction algorithms, which demonstrates\nthe flexibility and extensibility. We also conduct a comprehensive comparison\nand evaluation of these integrated algorithms, analyzing the characteristics of\neach. Finally, we contribute all the code, configuration and data to the\nopen-source community, which aims to promote the widespread research and\ndevelopment of SLAM technology within the open-source ecosystem.\n","authors":["Xiaomeng Wang","Nan Wang","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.23690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21991v2","updated":"2024-10-31T07:24:06Z","published":"2024-10-29T12:22:07Z","title":"From Explicit Rules to Implicit Reasoning in an Interpretable Violence\n Monitoring System","summary":" Recently, research based on pre-trained models has demonstrated outstanding\nperformance in violence surveillance tasks. However, these black-box systems\nface challenges regarding explainability during training and inference\nprocesses. An important question is how to incorporate explicit knowledge into\nthese implicit models, thereby designing expert-driven and interpretable\nviolence surveillance systems. This paper proposes a new paradigm for weakly\nsupervised violence monitoring (WSVM) called Rule base Violence monitoring\n(RuleVM). The proposed RuleVM uses a dual-branch structure for different\ndesigns for images and text. One of the branches is called the implicit branch,\nwhich uses only visual features for coarse-grained binary classification. In\nthis branch, image feature extraction is divided into two channels: one\nresponsible for extracting scene frames and the other focusing on extracting\nactions. The other branch is called the explicit branch, which utilizes\nlanguage-image alignment to perform fine-grained classification. For the\nlanguage channel design in the explicit branch, the proposed RuleCLIP uses the\nstate-of-the-art YOLO-World model to detect objects and actions in video\nframes, and association rules are identified through data mining methods as\ndescriptions of the video. Leveraging the dual-branch architecture, RuleVM\nachieves interpretable coarse-grained and fine-grained violence surveillance.\nExtensive experiments were conducted on two commonly used benchmarks, and the\nresults show that RuleCLIP achieved the best performance in both coarse-grained\nand fine-grained detection, significantly outperforming existing\nstate-of-the-art methods. Moreover, interpretability experiments uncovered some\ninteresting rules, such as the observation that as the number of people\nincreases, the risk level of violent behavior also rises.\n","authors":["Wen-Dong Jiang","Chih-Yung Chang","Hsiang-Chuan Chang","Diptendu Sinha Roy"],"pdf_url":"https://arxiv.org/pdf/2410.21991v2.pdf","comment":"12 pages,7 figures"},{"id":"http://arxiv.org/abs/2410.23687v1","updated":"2024-10-31T07:22:51Z","published":"2024-10-31T07:22:51Z","title":"Adversarial Attacks of Vision Tasks in the Past 10 Years: A Survey","summary":" Adversarial attacks, which manipulate input data to undermine model\navailability and integrity, pose significant security threats during machine\nlearning inference. 
With the advent of Large Vision-Language Models (LVLMs),\nnew attack vectors, such as cognitive bias, prompt injection, and jailbreak\ntechniques, have emerged. Understanding these attacks is crucial for developing\nmore robust systems and demystifying the inner workings of neural networks.\nHowever, existing reviews often focus on attack classifications and lack\ncomprehensive, in-depth analysis. The research community currently needs: 1)\nunified insights into adversariality, transferability, and generalization; 2)\ndetailed evaluations of existing methods; 3) motivation-driven attack\ncategorizations; and 4) an integrated perspective on both traditional and LVLM\nattacks. This article addresses these gaps by offering a thorough summary of\ntraditional and LVLM adversarial attacks, emphasizing their connections and\ndistinctions, and providing actionable insights for future research.\n","authors":["Chiyu Zhang","Xiaogang Xu","Jiafei Wu","Zhe Liu","Lu Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.23687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21898v2","updated":"2024-10-31T07:13:08Z","published":"2024-10-29T09:42:54Z","title":"A Longitudinal Analysis of Racial and Gender Bias in New York Times and\n Fox News Images and Articles","summary":" The manner in which different racial and gender groups are portrayed in news\ncoverage plays a large role in shaping public opinion. As such, understanding\nhow such groups are portrayed in news media is of notable societal value, and\nhas thus been a significant endeavour in both the computer and social sciences.\nYet, the literature still lacks a longitudinal study examining both the\nfrequency of appearance of different racial and gender groups in online news\narticles, as well as the context in which such groups are discussed. To fill\nthis gap, we propose two machine learning classifiers to detect the race and\nage of a given subject. Next, we compile a dataset of 123,337 images and\n441,321 online news articles from New York Times (NYT) and Fox News (Fox), and\nexamine representation through two computational approaches. Firstly, we\nexamine the frequency and prominence of appearance of racial and gender groups\nin images embedded in news articles, revealing that racial and gender\nminorities are largely under-represented, and when they do appear, they are\nfeatured less prominently compared to majority groups. Furthermore, we find\nthat NYT largely features more images of racial minority groups compared to\nFox. Secondly, we examine both the frequency and context with which racial\nminority groups are presented in article text. This reveals the narrow scope in\nwhich certain racial groups are covered and the frequency with which different\ngroups are presented as victims and/or perpetrators in a given conflict. 
Taken\ntogether, our analysis contributes to the literature by providing two novel\nopen-source classifiers to detect race and age from images, and shedding light\non the racial and gender biases in news articles from venues on opposite ends\nof the American political spectrum.\n","authors":["Hazem Ibrahim","Nouar AlDahoul","Syed Mustafa Ali Abbasi","Fareed Zaffar","Talal Rahwan","Yasir Zaki"],"pdf_url":"https://arxiv.org/pdf/2410.21898v2.pdf","comment":"13 pages, and 11 figures"},{"id":"http://arxiv.org/abs/2410.12419v2","updated":"2024-10-31T07:11:29Z","published":"2024-10-16T10:04:22Z","title":"Mind the Context: Attention-Guided Weak-to-Strong Consistency for\n Enhanced Semi-Supervised Medical Image Segmentation","summary":" Medical image segmentation is a pivotal step in diagnostic and therapeutic\nprocesses, relying on high-quality annotated data that is often challenging and\ncostly to obtain. Semi-supervised learning offers a promising approach to\nenhance model performance by leveraging unlabeled data. Although weak-to-strong\nconsistency is a prevalent method in semi-supervised image segmentation, there\nis a scarcity of research on perturbation strategies specifically tailored for\nsemi-supervised medical image segmentation tasks. To address this challenge,\nthis paper introduces a simple yet efficient semi-supervised learning framework\nnamed Attention-Guided weak-to-strong Consistency Match (AIGCMatch). The\nAIGCMatch framework incorporates attention-guided perturbation strategies at\nboth the image and feature levels to achieve weak-to-strong consistency\nregularization. This method not only preserves the structural information of\nmedical images but also enhances the model's ability to process complex\nsemantic information. Extensive experiments conducted on the ACDC and ISIC-2017\ndatasets have validated the effectiveness of AIGCMatch. Our method achieved a\n90.4\\% Dice score in the 7-case scenario on the ACDC dataset, surpassing the\nstate-of-the-art methods and demonstrating its potential and efficacy in\nclinical settings. Additionally, on the ISIC-2017 dataset, we significantly\noutperformed our baseline, indicating the robustness and generalizability of\nAIGCMatch across different medical image segmentation tasks.\n","authors":["Yuxuan Cheng","Chenxi Shao","Jie Ma","Guoliang Li"],"pdf_url":"https://arxiv.org/pdf/2410.12419v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22817v2","updated":"2024-10-31T07:07:27Z","published":"2024-10-30T08:51:29Z","title":"Epipolar-Free 3D Gaussian Splatting for Generalizable Novel View\n Synthesis","summary":" Generalizable 3D Gaussian splitting (3DGS) can reconstruct new scenes from\nsparse-view observations in a feed-forward inference manner, eliminating the\nneed for scene-specific retraining required in conventional 3DGS. However,\nexisting methods rely heavily on epipolar priors, which can be unreliable in\ncomplex realworld scenes, particularly in non-overlapping and occluded regions.\nIn this paper, we propose eFreeSplat, an efficient feed-forward 3DGS-based\nmodel for generalizable novel view synthesis that operates independently of\nepipolar line constraints. To enhance multiview feature extraction with 3D\nperception, we employ a selfsupervised Vision Transformer (ViT) with cross-view\ncompletion pre-training on large-scale datasets. Additionally, we introduce an\nIterative Cross-view Gaussians Alignment method to ensure consistent depth\nscales across different views. 
Our eFreeSplat represents an innovative approach\nfor generalizable novel view synthesis. Different from the existing pure\ngeometry-free methods, eFreeSplat focuses more on achieving epipolar-free\nfeature matching and encoding by providing 3D priors through cross-view\npretraining. We evaluate eFreeSplat on wide-baseline novel view synthesis tasks\nusing the RealEstate10K and ACID datasets. Extensive experiments demonstrate\nthat eFreeSplat surpasses state-of-the-art baselines that rely on epipolar\npriors, achieving superior geometry reconstruction and novel view synthesis\nquality. Project page: https://tatakai1.github.io/efreesplat/.\n","authors":["Zhiyuan Min","Yawei Luo","Jianwen Sun","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2410.22817v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23677v1","updated":"2024-10-31T06:55:57Z","published":"2024-10-31T06:55:57Z","title":"Wide Two-Layer Networks can Learn from Adversarial Perturbations","summary":" Adversarial examples have raised several open questions, such as why they can\ndeceive classifiers and transfer between different models. A prevailing\nhypothesis to explain these phenomena suggests that adversarial perturbations\nappear as random noise but contain class-specific features. This hypothesis is\nsupported by the success of perturbation learning, where classifiers trained\nsolely on adversarial examples and the corresponding incorrect labels\ngeneralize well to correctly labeled test data. Although this hypothesis and\nperturbation learning are effective in explaining intriguing properties of\nadversarial examples, their solid theoretical foundation is limited. In this\nstudy, we theoretically explain the counterintuitive success of perturbation\nlearning. We assume wide two-layer networks and the results hold for any data\ndistribution. We prove that adversarial perturbations contain sufficient\nclass-specific features for networks to generalize from them. Moreover, the\npredictions of classifiers trained on mislabeled adversarial examples coincide\nwith those of classifiers trained on correctly labeled clean samples. The code\nis available at https://github.com/s-kumano/perturbation-learning.\n","authors":["Soichiro Kumano","Hiroshi Kera","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2410.23677v1.pdf","comment":"NeurIPS24"},{"id":"http://arxiv.org/abs/2410.23676v1","updated":"2024-10-31T06:55:24Z","published":"2024-10-31T06:55:24Z","title":"Web-Scale Visual Entity Recognition: An LLM-Driven Data Approach","summary":" Web-scale visual entity recognition, the task of associating images with\ntheir corresponding entities within vast knowledge bases like Wikipedia,\npresents significant challenges due to the lack of clean, large-scale training\ndata. In this paper, we propose a novel methodology to curate such a dataset,\nleveraging a multimodal large language model (LLM) for label verification,\nmetadata generation, and rationale explanation. Instead of relying on the\nmultimodal LLM to directly annotate data, which we found to be suboptimal, we\nprompt it to reason about potential candidate entity labels by accessing\nadditional contextually relevant information (such as Wikipedia), resulting in\nmore accurate annotations. We further use the multimodal LLM to enrich the\ndataset by generating question-answer pairs and a grounded finegrained textual\ndescription (referred to as \"rationale\") that explains the connection between\nimages and their assigned entities. 
Experiments demonstrate that models trained\non this automatically curated data achieve state-of-the-art performance on\nweb-scale visual entity recognition tasks (e.g. +6.9% improvement in OVEN\nentity task), underscoring the importance of high-quality training data in this\ndomain.\n","authors":["Mathilde Caron","Alireza Fathi","Cordelia Schmid","Ahmet Iscen"],"pdf_url":"https://arxiv.org/pdf/2410.23676v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2402.12138v3","updated":"2024-10-31T06:38:27Z","published":"2024-02-19T13:38:15Z","title":"Perceiving Longer Sequences With Bi-Directional Cross-Attention\n Transformers","summary":" We present a novel bi-directional Transformer architecture (BiXT) which\nscales linearly with input size in terms of computational cost and memory\nconsumption, but does not suffer the drop in performance or limitation to only\none input modality seen with other efficient Transformer-based approaches. BiXT\nis inspired by the Perceiver architectures but replaces iterative attention\nwith an efficient bi-directional cross-attention module in which input tokens\nand latent variables attend to each other simultaneously, leveraging a\nnaturally emerging attention-symmetry between the two. This approach unlocks a\nkey bottleneck experienced by Perceiver-like architectures and enables the\nprocessing and interpretation of both semantics ('what') and location ('where')\nto develop alongside each other over multiple layers -- allowing its direct\napplication to dense and instance-based tasks alike. By combining efficiency\nwith the generality and performance of a full Transformer architecture, BiXT\ncan process longer sequences like point clouds, text or images at higher\nfeature resolutions and achieves competitive performance across a range of\ntasks like point cloud part segmentation, semantic image segmentation, image\nclassification, hierarchical sequence modeling and document retrieval. Our\nexperiments demonstrate that BiXT models outperform larger competitors by\nleveraging longer sequences more efficiently on vision tasks like\nclassification and segmentation, and perform on par with full Transformer\nvariants on sequence modeling and document retrieval -- but require $28\\%$\nfewer FLOPs and are up to $8.4\\times$ faster.\n","authors":["Markus Hiller","Krista A. Ehinger","Tom Drummond"],"pdf_url":"https://arxiv.org/pdf/2402.12138v3.pdf","comment":"Accepted at NeurIPS 2024; Code and models will be available at\n https://github.com/mrkshllr/BiXT"},{"id":"http://arxiv.org/abs/2410.23663v1","updated":"2024-10-31T06:26:00Z","published":"2024-10-31T06:26:00Z","title":"DIP: Diffusion Learning of Inconsistency Pattern for General DeepFake\n Detection","summary":" With the advancement of deepfake generation techniques, the importance of\ndeepfake detection in protecting multimedia content integrity has become\nincreasingly obvious. Recently, temporal inconsistency clues have been explored\nto improve the generalizability of deepfake video detection. According to our\nobservation, the temporal artifacts of forged videos in terms of motion\ninformation usually exhibits quite distinct inconsistency patterns along\nhorizontal and vertical directions, which could be leveraged to improve the\ngeneralizability of detectors. In this paper, a transformer-based framework for\nDiffusion Learning of Inconsistency Pattern (DIP) is proposed, which exploits\ndirectional inconsistencies for deepfake video detection. 
Specifically, DIP\nbegins with a spatiotemporal encoder to represent spatiotemporal information. A\ndirectional inconsistency decoder is adopted accordingly, where direction-aware\nattention and inconsistency diffusion are incorporated to explore potential\ninconsistency patterns and jointly learn the inherent relationships. In\naddition, the SpatioTemporal Invariant Loss (STI Loss) is introduced to\ncontrast spatiotemporally augmented sample pairs and prevent the model from\noverfitting nonessential forgery artifacts. Extensive experiments on several\npublic datasets demonstrate that our method could effectively identify\ndirectional forgery clues and achieve state-of-the-art performance.\n","authors":["Fan Nie","Jiangqun Ni","Jian Zhang","Bin Zhang","Weizhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.23663v1.pdf","comment":"13 pages, accepted with IEEE Trans. on Multimedia"},{"id":"http://arxiv.org/abs/2410.23658v1","updated":"2024-10-31T06:17:16Z","published":"2024-10-31T06:17:16Z","title":"GS-Blur: A 3D Scene-Based Dataset for Realistic Image Deblurring","summary":" To train a deblurring network, an appropriate dataset with paired blurry and\nsharp images is essential. Existing datasets collect blurry images either\nsynthetically by aggregating consecutive sharp frames or using sophisticated\ncamera systems to capture real blur. However, these methods offer limited\ndiversity in blur types (blur trajectories) or require extensive human effort\nto reconstruct large-scale datasets, failing to fully reflect real-world blur\nscenarios. To address this, we propose GS-Blur, a dataset of synthesized\nrealistic blurry images created using a novel approach. To this end, we first\nreconstruct 3D scenes from multi-view images using 3D Gaussian Splatting\n(3DGS), then render blurry images by moving the camera view along the randomly\ngenerated motion trajectories. By adopting various camera trajectories in\nreconstructing our GS-Blur, our dataset contains realistic and diverse types of\nblur, offering a large-scale dataset that generalizes well to real-world blur.\nUsing GS-Blur with various deblurring methods, we demonstrate its ability to\ngeneralize effectively compared to previous synthetic or real blur datasets,\nshowing significant improvements in deblurring performance.\n","authors":["Dongwoo Lee","Joonkyu Park","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2410.23658v1.pdf","comment":"Accepted at NeurIPS 2024 Datasets & Benchmarks Track"},{"id":"http://arxiv.org/abs/2410.11187v2","updated":"2024-10-31T06:09:11Z","published":"2024-10-15T02:04:05Z","title":"Multiview Scene Graph","summary":" A proper scene representation is central to the pursuit of spatial\nintelligence where agents can robustly reconstruct and efficiently understand\n3D scenes. A scene representation is either metric, such as landmark maps in 3D\nreconstruction, 3D bounding boxes in object detection, or voxel grids in\noccupancy prediction, or topological, such as pose graphs with loop closures in\nSLAM or visibility graphs in SfM. In this work, we propose to build Multiview\nScene Graphs (MSG) from unposed images, representing a scene topologically with\ninterconnected place and object nodes. The task of building MSG is challenging\nfor existing representation learning methods since it needs to jointly address\nboth visual place recognition, object detection, and object association from\nimages with limited fields of view and potentially large viewpoint changes. 
To\nevaluate any method tackling this task, we developed an MSG dataset and\nannotation based on a public 3D dataset. We also propose an evaluation metric\nbased on the intersection-over-union score of MSG edges. Moreover, we develop a\nnovel baseline method built on mainstream pretrained vision models, combining\nvisual place recognition and object association into one Transformer decoder\narchitecture. Experiments demonstrate that our method has superior performance\ncompared to existing relevant baselines.\n","authors":["Juexiao Zhang","Gao Zhu","Sihang Li","Xinhao Liu","Haorui Song","Xinran Tang","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2410.11187v2.pdf","comment":"To be published in NeurIPS 2024. Website at\n https://ai4ce.github.io/MSG/"},{"id":"http://arxiv.org/abs/2405.14325v3","updated":"2024-10-31T05:47:33Z","published":"2024-05-23T08:55:20Z","title":"Dinomaly: The Less Is More Philosophy in Multi-Class Unsupervised\n Anomaly Detection","summary":" Recent studies highlighted a practical setting of unsupervised anomaly\ndetection (UAD) that builds a unified model for multi-class images, serving as\nan alternative to the conventional one-class-one-model setup. Despite various\nadvancements addressing this challenging task, the detection performance under\nthe multi-class setting still lags far behind state-of-the-art class-separated\nmodels. Our research aims to bridge this substantial performance gap. In this\npaper, we introduce a minimalistic reconstruction-based anomaly detection\nframework, namely Dinomaly, which leverages pure Transformer architectures\nwithout relying on complex designs, additional modules, or specialized tricks.\nGiven this powerful framework consisted of only Attentions and MLPs, we found\nfour simple components that are essential to multi-class anomaly detection: (1)\nFoundation Transformers that extracts universal and discriminative features,\n(2) Noisy Bottleneck where pre-existing Dropouts do all the noise injection\ntricks, (3) Linear Attention that naturally cannot focus, and (4) Loose\nReconstruction that does not force layer-to-layer and point-by-point\nreconstruction. Extensive experiments are conducted across popular anomaly\ndetection benchmarks including MVTec-AD, VisA, and Real-IAD. Our proposed\nDinomaly achieves impressive image-level AUROC of 99.6%, 98.7%, and 89.3% on\nthe three datasets respectively, which is not only superior to state-of-the-art\nmulti-class UAD methods, but also achieves the most advanced class-separated\nUAD records.\n","authors":["Jia Guo","Shuai Lu","Weihang Zhang","Fang Chen","Hongen Liao","Huiqi Li"],"pdf_url":"https://arxiv.org/pdf/2405.14325v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14596v3","updated":"2024-10-31T05:38:39Z","published":"2024-06-20T17:45:02Z","title":"VLM Agents Generate Their Own Memories: Distilling Experience into\n Embodied Programs","summary":" Large-scale generative language and vision-language models excel in\nin-context learning for decision making. However, they require high-quality\nexemplar demonstrations to be included in their context window. In this work,\nwe ask: Can LLMs and VLMs generate their own examples from generic, sub-optimal\ndemonstrations? We propose In-Context Abstraction Learning (ICAL), a method\nthat builds a memory of multimodal experience from sub-optimal demonstrations\nand human feedback. 
Given a task demonstration that may contain inefficiencies\nor mistakes, a VLM abstracts the trajectory into a generalized program by\ncorrecting inefficient actions and annotating cognitive abstractions: causal\nrelationships, object state changes, temporal subgoals, and task-relevant\nvisual elements. These abstractions are iteratively improved through human\nfeedback while the agent attempts to execute the trajectory. The resulting\nexamples, when used as exemplars in the prompt, significantly improve\ndecision-making in retrieval-augmented LLM and VLM agents. Moreover, as the\nagent's library of examples grows, it becomes more efficient, relying less on\nhuman feedback and requiring fewer environment interactions per demonstration.\nOur ICAL agent surpasses the state-of-the-art in dialogue-based instruction\nfollowing in TEACh, multimodal web agents in VisualWebArena, and action\nanticipation in Ego4D. In TEACh, we achieve a 12.6% improvement in\ngoal-condition success. In VisualWebArena, our task success rate improves over\nthe SOTA from 14.3% to 22.7% using GPT4V. In Ego4D action forecasting, we\nimprove over few-shot GPT-4V and remain competitive with supervised models. We\nshow finetuning our retrieval-augmented in-context agent yields additional\nimprovements. Our approach significantly reduces reliance on manual prompt\nengineering and consistently outperforms in-context learning from action plans\nthat lack such abstractions.\n","authors":["Gabriel Sarch","Lawrence Jang","Michael J. Tarr","William W. Cohen","Kenneth Marino","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2406.14596v3.pdf","comment":"Project website: http://ical-learning.github.io/"},{"id":"http://arxiv.org/abs/2410.23642v1","updated":"2024-10-31T05:29:18Z","published":"2024-10-31T05:29:18Z","title":"Novel Clinical-Grade Prostate Cancer Detection and Grading Model:\n Development and Prospective Validation Using Real World Data, with\n Performance Assessment on IHC Requested Cases","summary":" Artificial intelligence may assist healthcare systems in meeting increasing\ndemand for pathology services while maintaining diagnostic quality and reducing\nturnaround time and costs. We aimed to investigate the performance of an\ninstitutionally developed system for prostate cancer detection, grading, and\nworkflow optimization and to contrast this with commercial alternatives. From\nAugust 2021 to March 2023, we scanned 21,396 slides from 1,147 patients with\npositive biopsies. We developed models for cancer detection, grading, and\nscreening of equivocal cases for IHC ordering. We compared a task-specific\nmodel trained using the PANDA dataset of prostate cancer biopsies with one\nbuilt using features extracted by the general-purpose histology foundation\nmodel, UNI and compare their performance in an unfiltered prospectively\ncollected dataset that reflects our patient population (1737 slides,95\npatients). We evaluated the contributions of a bespoke model designed to\nimprove sensitivity in detecting small cancer foci and scoring of broader\npatterns observed at lower resolution. We found high concordance between the\ndeveloped systems and pathologist reference in detection (AUC 98.5, sensitivity\n95.0, and specificity 97.8), ISUP grading (quadratic Cohen's kappa 0.869),\ngrade group 3 or higher (AUC 97.5, sensitivity 94.9, specificity 96.6) and\ncomparable to published data from commercial systems. 
Screening could reduce\nIHC ordering for equivocal cases by 44.5% with an overall error rate of 1.8%\n(1.4% false positive, 0.4% false negative rates). Institutions like academic\nmedical centers that have high scanning volumes and report abstraction\ncapabilities can develop accurate computational pathology models for internal\nuse. These models have the potential to aid in quality control role and to\nimprove workflow in the pathology lab to help meet future challenges in\nprostate cancer diagnosis.\n","authors":["Ramin Nateghi","Ruoji Zhou","Madeline Saft","Marina Schnauss","Clayton Neill","Ridwan Alam","Nicole Handa","Mitchell Huang","Eric V Li","Jeffery A Goldstein","Edward M Schaeffer","Menatalla Nadim","Fattaneh Pourakpour","Bogdan Isaila","Christopher Felicelli","Vikas Mehta","Behtash G Nezami","Ashley Ross","Ximing Yang","Lee AD Cooper"],"pdf_url":"https://arxiv.org/pdf/2410.23642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23641v1","updated":"2024-10-31T05:27:58Z","published":"2024-10-31T05:27:58Z","title":"Recovering Complete Actions for Cross-dataset Skeleton Action\n Recognition","summary":" Despite huge progress in skeleton-based action recognition, its\ngeneralizability to different domains remains a challenging issue. In this\npaper, to solve the skeleton action generalization problem, we present a\nrecover-and-resample augmentation framework based on a novel complete action\nprior. We observe that human daily actions are confronted with temporal\nmismatch across different datasets, as they are usually partial observations of\ntheir complete action sequences. By recovering complete actions and resampling\nfrom these full sequences, we can generate strong augmentations for unseen\ndomains. At the same time, we discover the nature of general action\ncompleteness within large datasets, indicated by the per-frame diversity over\ntime. This allows us to exploit two assets of transferable knowledge that can\nbe shared across action samples and be helpful for action completion: boundary\nposes for determining the action start, and linear temporal transforms for\ncapturing global action patterns. Therefore, we formulate the recovering stage\nas a two-step stochastic action completion with boundary pose-conditioned\nextrapolation followed by smooth linear transforms. Both the boundary poses and\nlinear transforms can be efficiently learned from the whole dataset via\nclustering. We validate our approach on a cross-dataset setting with three\nskeleton action datasets, outperforming other domain generalization approaches\nby a considerable margin.\n","authors":["Hanchao Liu","Yujiang Li","Tai-Jiang Mu","Shi-Min Hu"],"pdf_url":"https://arxiv.org/pdf/2410.23641v1.pdf","comment":"accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2403.19137v3","updated":"2024-10-31T05:22:26Z","published":"2024-03-28T04:15:58Z","title":"CLAP4CLIP: Continual Learning with Probabilistic Finetuning for\n Vision-Language Models","summary":" Continual learning (CL) aims to help deep neural networks learn new knowledge\nwhile retaining what has been learned. Owing to their powerful\ngeneralizability, pre-trained vision-language models such as Contrastive\nLanguage-Image Pre-training (CLIP) have lately gained traction as practical CL\ncandidates. However, the domain mismatch between the pre-training and the\ndownstream CL tasks often calls for finetuning of the CLIP on the latter. Most\nexisting finetuning methods exhibit deterministic nature. 
This makes them\noverlook the many possible interactions across the input modalities and deems\nthem unsafe for high-risk tasks requiring reliable uncertainty estimation. To\naddress these, our work proposes Continual LeArning with Probabilistic\nfinetuning (CLAP) - a probabilistic modeling framework over visual-guided text\nfeatures per task, thus providing more calibrated CL finetuning. Unlike recent\ndata-hungry anti-forgetting CL techniques, CLAP alleviates forgetting by\nexploiting the rich pre-trained knowledge of CLIP for weight initialization and\ndistribution regularization of task-specific parameters. Cooperating with the\ndiverse range of existing prompting methods, CLAP can surpass the predominant\ndeterministic finetuning approaches for CL with CLIP. We conclude with\nout-of-the-box applications of superior uncertainty estimation abilities of\nCLAP including novel data detection and exemplar selection within the existing\nCL setups. Our code is available at\n\\url{https://github.com/srvCodes/clap4clip}.\n","authors":["Saurav Jha","Dong Gong","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2403.19137v3.pdf","comment":"Accepted as a poster at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.14430v3","updated":"2024-10-31T05:14:31Z","published":"2024-05-23T11:00:07Z","title":"PipeFusion: Patch-level Pipeline Parallelism for Diffusion Transformers\n Inference","summary":" This paper presents PipeFusion, an innovative parallel methodology to tackle\nthe high latency issues associated with generating high-resolution images using\ndiffusion transformers (DiTs) models. PipeFusion partitions images into patches\nand the model layers across multiple GPUs. It employs a patch-level pipeline\nparallel strategy to orchestrate communication and computation efficiently. By\ncapitalizing on the high similarity between inputs from successive diffusion\nsteps, PipeFusion reuses one-step stale feature maps to provide context for the\ncurrent pipeline step. This approach notably reduces communication costs\ncompared to existing DiTs inference parallelism, including tensor parallel,\nsequence parallel and DistriFusion. PipeFusion also exhibits superior memory\nefficiency, because it can distribute model parameters across multiple devices,\nmaking it more suitable for DiTs with large parameter sizes, such as Flux.1.\nExperimental results demonstrate that PipeFusion achieves state-of-the-art\nperformance on 8xL40 PCIe GPUs for Pixart, Stable-Diffusion 3 and Flux.1\nmodels.Our Source code is available at https://github.com/xdit-project/xDiT.\n","authors":["Jiarui Fang","Jinzhe Pan","Jiannan Wang","Aoyu Li","Xibo Sun"],"pdf_url":"https://arxiv.org/pdf/2405.14430v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23629v1","updated":"2024-10-31T04:42:43Z","published":"2024-10-31T04:42:43Z","title":"Posture-Informed Muscular Force Learning for Robust Hand Pressure\n Estimation","summary":" We present PiMForce, a novel framework that enhances hand pressure estimation\nby leveraging 3D hand posture information to augment forearm surface\nelectromyography (sEMG) signals. Our approach utilizes detailed spatial\ninformation from 3D hand poses in conjunction with dynamic muscle activity from\nsEMG to enable accurate and robust whole-hand pressure measurements under\ndiverse hand-object interactions. We also developed a multimodal data\ncollection system that combines a pressure glove, an sEMG armband, and a\nmarkerless finger-tracking module. 
We created a comprehensive dataset from 21\nparticipants, capturing synchronized data of hand posture, sEMG signals, and\nexerted hand pressure across various hand postures and hand-object interaction\nscenarios using our collection system. Our framework enables precise hand\npressure estimation in complex and natural interaction scenarios. Our approach\nsubstantially mitigates the limitations of traditional sEMG-based or\nvision-based methods by integrating 3D hand posture information with sEMG\nsignals. Video demos, data, and code are available online.\n","authors":["Kyungjin Seo","Junghoon Seo","Hanseok Jeong","Sangpil Kim","Sang Ho Yoon"],"pdf_url":"https://arxiv.org/pdf/2410.23629v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23628v1","updated":"2024-10-31T04:34:28Z","published":"2024-10-31T04:34:28Z","title":"Cycle-Constrained Adversarial Denoising Convolutional Network for PET\n Image Denoising: Multi-Dimensional Validation on Large Datasets with Reader\n Study and Real Low-Dose Data","summary":" Positron emission tomography (PET) is a critical tool for diagnosing tumors\nand neurological disorders but poses radiation risks to patients, particularly\nto sensitive populations. While reducing injected radiation dose mitigates this\nrisk, it often compromises image quality. To reconstruct full-dose-quality\nimages from low-dose scans, we propose a Cycle-constrained Adversarial\nDenoising Convolutional Network (Cycle-DCN). This model integrates a noise\npredictor, two discriminators, and a consistency network, and is optimized\nusing a combination of supervised loss, adversarial loss, cycle consistency\nloss, identity loss, and neighboring Structural Similarity Index (SSIM) loss.\nExperiments were conducted on a large dataset consisting of raw PET brain data\nfrom 1,224 patients, acquired using a Siemens Biograph Vision PET/CT scanner.\nEach patient underwent a 120-seconds brain scan. To simulate low-dose PET\nconditions, images were reconstructed from shortened scan durations of 30, 12,\nand 5 seconds, corresponding to 1/4, 1/10, and 1/24 of the full-dose\nacquisition, respectively, using a custom-developed GPU-based image\nreconstruction software. The results show that Cycle-DCN significantly improves\naverage Peak Signal-to-Noise Ratio (PSNR), SSIM, and Normalized Root Mean\nSquare Error (NRMSE) across three dose levels, with improvements of up to 56%,\n35%, and 71%, respectively. Additionally, it achieves contrast-to-noise ratio\n(CNR) and Edge Preservation Index (EPI) values that closely align with\nfull-dose images, effectively preserving image details, tumor shape, and\ncontrast, while resolving issues with blurred edges. 
The results of reader\nstudies indicated that the images restored by Cycle-DCN consistently received\nthe highest ratings from nuclear medicine physicians, highlighting their strong\nclinical relevance.\n","authors":["Yucun Hou","Fenglin Zhan","Xin Cheng","Chenxi Li","Ziquan Yuan","Runze Liao","Haihao Wang","Jianlang Hua","Jing Wu","Jianyong Jiang"],"pdf_url":"https://arxiv.org/pdf/2410.23628v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2410.23623v1","updated":"2024-10-31T04:20:47Z","published":"2024-10-31T04:20:47Z","title":"On Learning Multi-Modal Forgery Representation for Diffusion Generated\n Video Detection","summary":" Large numbers of synthesized videos from diffusion models pose threats to\ninformation security and authenticity, leading to an increasing demand for\ngenerated content detection. However, existing video-level detection algorithms\nprimarily focus on detecting facial forgeries and often fail to identify\ndiffusion-generated content with a diverse range of semantics. To advance the\nfield of video forensics, we propose an innovative algorithm named Multi-Modal\nDetection(MM-Det) for detecting diffusion-generated videos. MM-Det utilizes the\nprofound perceptual and comprehensive abilities of Large Multi-modal Models\n(LMMs) by generating a Multi-Modal Forgery Representation (MMFR) from LMM's\nmulti-modal space, enhancing its ability to detect unseen forgery content.\nBesides, MM-Det leverages an In-and-Across Frame Attention (IAFA) mechanism for\nfeature augmentation in the spatio-temporal domain. A dynamic fusion strategy\nhelps refine forgery representations for the fusion. Moreover, we construct a\ncomprehensive diffusion video dataset, called Diffusion Video Forensics (DVF),\nacross a wide range of forgery videos. MM-Det achieves state-of-the-art\nperformance in DVF, demonstrating the effectiveness of our algorithm. Both\nsource code and DVF are available at https://github.com/SparkleXFantasy/MM-Det.\n","authors":["Xiufeng Song","Xiao Guo","Jiache Zhang","Qirui Li","Lei Bai","Xiaoming Liu","Guangtao Zhai","Xiaohong Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23623v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.18406v3","updated":"2024-10-31T23:27:09Z","published":"2024-05-28T17:46:36Z","title":"RACCooN: A Versatile Instructional Video Editing Framework with\n Auto-Generated Narratives","summary":" Recent video generative models primarily rely on carefully written text\nprompts for specific tasks, like inpainting or style editing. They require\nlabor-intensive textual descriptions for input videos, hindering their\nflexibility to adapt personal/raw videos to user specifications. This paper\nproposes RACCooN, a versatile and user-friendly video-to-paragraph-to-video\ngenerative framework that supports multiple video editing capabilities such as\nremoval, addition, and modification, through a unified pipeline. RACCooN\nconsists of two principal stages: Video-to-Paragraph (V2P) and\nParagraph-to-Video (P2V). In the V2P stage, we automatically describe video\nscenes in well-structured natural language, capturing both the holistic context\nand focused object details. Subsequently, in the P2V stage, users can\noptionally refine these descriptions to guide the video diffusion model,\nenabling various modifications to the input video, such as removing, changing\nsubjects, and/or adding new objects. 
The proposed approach stands out from\nother methods through several significant contributions: (1) RACCooN suggests a\nmulti-granular spatiotemporal pooling strategy to generate well-structured\nvideo descriptions, capturing both the broad context and object details without\nrequiring complex human annotations, simplifying precise video content editing\nbased on text for users. (2) Our video generative model incorporates\nauto-generated narratives or instructions to enhance the quality and accuracy\nof the generated content. (3) RACCooN also plans to imagine new objects in a\ngiven video, so users simply prompt the model to receive a detailed video\nediting plan for complex video editing. The proposed framework demonstrates\nimpressive versatile capabilities in video-to-paragraph generation, video\ncontent editing, and can be incorporated into other SoTA video generative\nmodels for further enhancement.\n","authors":["Jaehong Yoon","Shoubin Yu","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2405.18406v3.pdf","comment":"The first two authors contribute equally. Project Page:\n https://raccoon-mllm-gen.github.io/"},{"id":"http://arxiv.org/abs/2407.17453v2","updated":"2024-10-31T23:23:22Z","published":"2024-07-24T17:37:05Z","title":"VILA$^2$: VILA Augmented VILA","summary":" While visual language model architectures and training infrastructures\nadvance rapidly, data curation remains under-explored where quantity and\nquality become a bottleneck. Existing work either crawls extra Internet data\nwith a loose guarantee of quality or distills from black-box proprietary\nmodels, e.g., GPT-4V / Gemini that are API frequency and performance bounded.\nThis work enables a VLM to improve itself via data enhancement, exploiting its\ngenerative nature. We introduce a simple yet effective VLM augmentation scheme\nthat includes a self-augment step and a specialist-augment step to iteratively\nimprove data quality and hence, model performance. In the self-augment step,\nthe instruction-finetuned VLM recaptions its pretraining caption datasets and\nthen retrains from scratch leveraging refined data. Without any expensive\nhuman-in-the-loop annotation, we observe improvements in data quality and\ndownstream accuracy boosts with three self-augmentation rounds -- a viable free\nlunch to the current VLM training recipe. When self-augmentation saturates, we\naugment the caption diversity by leveraging specialty skills picked up from\ninstruction finetuning. We finetune VLM specialists from the self-augmented VLM\nwith domain-specific experts, including spatial, grounding, and OCR, to fuse\ntask-aware synthetic data into the pretraining stage. Data quality improvements\nand hallucination reductions are cross-checked by VLM (GPT-4V, Gemini) and\nhuman judges. 
Combining self-augmentation and specialist-augmented training,\nVILA$^2$ consistently improves the accuracy on a wide range of benchmarks over\nthe prior art, producing a reusable pretraining dataset that is 300x more\ncost-efficient than human labeling.\n","authors":["Yunhao Fang","Ligeng Zhu","Yao Lu","Yan Wang","Pavlo Molchanov","Jan Kautz","Jang Hyun Cho","Marco Pavone","Song Han","Hongxu Yin"],"pdf_url":"https://arxiv.org/pdf/2407.17453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09614v2","updated":"2024-10-31T21:43:04Z","published":"2024-10-12T18:28:56Z","title":"Exploring Behavior-Relevant and Disentangled Neural Dynamics with\n Generative Diffusion Models","summary":" Understanding the neural basis of behavior is a fundamental goal in\nneuroscience. Current research in large-scale neuro-behavioral data analysis\noften relies on decoding models, which quantify behavioral information in\nneural data but lack details on behavior encoding. This raises an intriguing\nscientific question: ``how can we enable in-depth exploration of neural\nrepresentations in behavioral tasks, revealing interpretable neural dynamics\nassociated with behaviors''. However, addressing this issue is challenging due\nto the varied behavioral encoding across different brain regions and mixed\nselectivity at the population level. To tackle this limitation, our approach,\nnamed ``BeNeDiff'', first identifies a fine-grained and disentangled neural\nsubspace using a behavior-informed latent variable model. It then employs\nstate-of-the-art generative diffusion models to synthesize behavior videos that\ninterpret the neural dynamics of each latent factor. We validate the method on\nmulti-session datasets containing widefield calcium imaging recordings across\nthe dorsal cortex. Through guiding the diffusion model to activate individual\nlatent factors, we verify that the neural dynamics of latent factors in the\ndisentangled neural subspace provide interpretable quantifications of the\nbehaviors of interest. At the same time, the neural subspace in BeNeDiff\ndemonstrates high disentanglement and neural reconstruction quality.\n","authors":["Yule Wang","Chengrui Li","Weihan Li","Anqi Wu"],"pdf_url":"https://arxiv.org/pdf/2410.09614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13770v2","updated":"2024-10-31T21:21:26Z","published":"2024-06-19T18:38:11Z","title":"Elliptical Attention","summary":" Pairwise dot-product self-attention is key to the success of transformers\nthat achieve state-of-the-art performance across a variety of applications in\nlanguage and vision. This dot-product self-attention computes attention weights\namong the input tokens using Euclidean distance, which makes the model prone to\nrepresentation collapse and vulnerable to contaminated samples. In this paper,\nwe propose using a Mahalanobis distance metric for computing the attention\nweights to stretch the underlying feature space in directions of high\ncontextual relevance. In particular, we define a hyper-ellipsoidal neighborhood\naround each query to increase the attention weights of the tokens lying in the\ncontextually important directions. We term this novel class of attention\nElliptical Attention. Our Elliptical Attention provides two benefits: 1)\nreducing representation collapse and 2) enhancing the model's robustness as\nElliptical Attention pays more attention to contextually relevant information\nrather than focusing on some small subset of informative features. 
We\nempirically demonstrate the advantages of Elliptical Attention over the\nbaseline dot-product attention and state-of-the-art attention methods on\nvarious practical tasks, including object classification, image segmentation,\nand language modeling across different data modalities.\n","authors":["Stefan K. Nielsen","Laziz U. Abdullaev","Rachel S. Y. Teo","Tan M. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.13770v2.pdf","comment":"10 pages in the main text. Published at NeurIPS 2024. The code is\n available at https://github.com/stefvk/Elliptical-Attention"},{"id":"http://arxiv.org/abs/2410.23109v2","updated":"2024-10-31T20:48:34Z","published":"2024-10-30T15:20:10Z","title":"NASM: Neural Anisotropic Surface Meshing","summary":" This paper introduces a new learning-based method, NASM, for anisotropic\nsurface meshing. Our key idea is to propose a graph neural network to embed an\ninput mesh into a high-dimensional (high-d) Euclidean embedding space to\npreserve curvature-based anisotropic metric by using a dot product loss between\nhigh-d edge vectors. This can dramatically reduce the computational time and\nincrease the scalability. Then, we propose a novel feature-sensitive remeshing\non the generated high-d embedding to automatically capture sharp geometric\nfeatures. We define a high-d normal metric, and then derive an automatic\ndifferentiation on a high-d centroidal Voronoi tessellation (CVT) optimization\nwith the normal metric to simultaneously preserve geometric features and\ncurvature anisotropy that exhibit in the original 3D shapes. To our knowledge,\nthis is the first time that a deep learning framework and a large dataset are\nproposed to construct a high-d Euclidean embedding space for 3D anisotropic\nsurface meshing. Experimental results are evaluated and compared with the\nstate-of-the-art in anisotropic surface meshing on a large number of surface\nmodels from Thingi10K dataset as well as tested on extensive unseen 3D shapes\nfrom Multi-Garment Network dataset and FAUST human dataset.\n","authors":["Hongbo Li","Haikuan Zhu","Sikai Zhong","Ningna Wang","Cheng Lin","Xiaohu Guo","Shiqing Xin","Wenping Wang","Jing Hua","Zichun Zhong"],"pdf_url":"https://arxiv.org/pdf/2410.23109v2.pdf","comment":"SIGGRAPH Asia 2024 (Conference Track)"},{"id":"http://arxiv.org/abs/2410.17514v3","updated":"2024-10-31T20:15:43Z","published":"2024-10-23T02:38:12Z","title":"SRA: A Novel Method to Improve Feature Embedding in Self-supervised\n Learning for Histopathological Images","summary":" Self-supervised learning has become a cornerstone in various areas,\nparticularly histopathological image analysis. Image augmentation plays a\ncrucial role in self-supervised learning, as it generates variations in image\nsamples. However, traditional image augmentation techniques often overlook the\nunique characteristics of histopathological images. In this paper, we propose a\nnew histopathology-specific image augmentation method called stain\nreconstruction augmentation (SRA). We integrate our SRA with MoCo v3, a leading\nmodel in self-supervised contrastive learning, along with our additional\ncontrastive loss terms, and call the new model SRA-MoCo v3. We demonstrate that\nour SRA-MoCo v3 always outperforms the standard MoCo v3 across various\ndownstream tasks and achieves comparable or superior performance to other\nfoundation models pre-trained on significantly larger histopathology datasets.\n","authors":["Hamid Manoochehri","Bodong Zhang","Beatrice S. 
Knudsen","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2410.17514v3.pdf","comment":"Hamid Manoochehri and Bodong Zhang contributed equally to this work"},{"id":"http://arxiv.org/abs/2405.17537v2","updated":"2024-10-31T20:07:53Z","published":"2024-05-27T17:57:48Z","title":"CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale","summary":" Measuring biodiversity is crucial for understanding ecosystem health. While\nprior works have developed machine learning models for taxonomic classification\nof photographic images and DNA separately, in this work, we introduce a\nmultimodal approach combining both, using CLIP-style contrastive learning to\nalign images, barcode DNA, and text-based representations of taxonomic labels\nin a unified embedding space. This allows for accurate classification of both\nknown and unknown insect species without task-specific fine-tuning, leveraging\ncontrastive learning for the first time to fuse DNA and image data. Our method\nsurpasses previous single-modality approaches in accuracy by over 8% on\nzero-shot learning tasks, showcasing its effectiveness in biodiversity studies.\n","authors":["ZeMing Gong","Austin T. Wang","Xiaoliang Huo","Joakim Bruslund Haurum","Scott C. Lowe","Graham W. Taylor","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2405.17537v2.pdf","comment":"25 pages with 11 figures"},{"id":"http://arxiv.org/abs/2410.21302v2","updated":"2024-10-31T19:44:26Z","published":"2024-10-21T22:52:25Z","title":"Domain-Adaptive Pre-training of Self-Supervised Foundation Models for\n Medical Image Classification in Gastrointestinal Endoscopy","summary":" Video capsule endoscopy has transformed gastrointestinal endoscopy (GIE)\ndiagnostics by offering a non-invasive method for capturing detailed images of\nthe gastrointestinal tract, enabling early disease detection. However, its\npotential is limited by the sheer volume of images generated during the imaging\nprocedure, which can take anywhere from 6-8 hours and often produce up to 1\nmillion images, necessitating automated analysis. Additionally, the variability\nof these images, combined with the need for expert annotations and the scarcity\nof large, high-quality labeled datasets, constrains the effectiveness of\ncurrent medical image analysis models. To address this, we introduce a novel\nlarge gastrointestinal endoscopy dataset, called EndoExtend24, created by\nmerging and re-stratifying the train/test splits of ten existing public and\nprivate datasets, ensuring no overlap of patient data across splits.\nEndoExtend24 includes over 226,000 labeled images, as well as dynamic class\nmappings, which allow unified training across datasets with differing labeling\ngranularity, supporting up to 123 distinct pathological findings. Further, we\npropose to leverage domain adaptive pre-training of foundation models in\ncomputer vision trained with self-supervision on generic image data, to adapt\nthem to the task of GIE medical diagnosis. Specifically, the EVA-02 model,\nwhich is based on the vision transformer architecture and was trained on\nImageNet-22k with masked image modeling (using EVA-CLIP as a MIM teacher), is\npre-trained on the novel EndoExtend24 dataset to achieve domain adaptation, and\nfinally trained on the Capsule Endoscopy 2024 Challenge dataset. Experimental\nresults demonstrate strong performance with an F1 score of 0.88, an improvement\nof about 39% over the baseline model's F1 score of 0.49. 
Additionally, the\nmodel achieved a macro AUC score of 0.993 and a balanced accuracy of 89.3%.\n","authors":["Marcel Roth","Micha V. Nowak","Adrian Krenzer","Frank Puppe"],"pdf_url":"https://arxiv.org/pdf/2410.21302v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18923v2","updated":"2024-10-31T19:44:05Z","published":"2024-10-24T17:11:52Z","title":"SegLLM: Multi-round Reasoning Segmentation","summary":" We present SegLLM, a novel multi-round interactive reasoning segmentation\nmodel that enhances LLM-based segmentation by exploiting conversational memory\nof both visual and textual outputs. By leveraging a mask-aware multimodal LLM,\nSegLLM re-integrates previous segmentation results into its input stream,\nenabling it to reason about complex user intentions and segment objects in\nrelation to previously identified entities, including positional,\ninteractional, and hierarchical relationships, across multiple interactions.\nThis capability allows SegLLM to respond to visual and text queries in a\nchat-like manner. Evaluated on the newly curated MRSeg benchmark, SegLLM\noutperforms existing methods in multi-round interactive reasoning segmentation\nby over 20%. Additionally, we observed that training on multi-round reasoning\nsegmentation data enhances performance on standard single-round referring\nsegmentation and localization tasks, resulting in a 5.5% increase in cIoU for\nreferring expression segmentation and a 4.5% improvement in Acc@0.5 for\nreferring expression localization.\n","authors":["XuDong Wang","Shaolun Zhang","Shufan Li","Konstantinos Kallidromitis","Kehan Li","Yusuke Kato","Kazuki Kozuka","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2410.18923v2.pdf","comment":"22 pages, 10 figures, 11 tables"},{"id":"http://arxiv.org/abs/2409.09313v2","updated":"2024-10-31T19:36:24Z","published":"2024-09-14T05:17:04Z","title":"Tensor-Based Synchronization and the Low-Rankness of the Block Trifocal\n Tensor","summary":" The block tensor of trifocal tensors provides crucial geometric information\non the three-view geometry of a scene. The underlying synchronization problem\nseeks to recover camera poses (locations and orientations up to a global\ntransformation) from the block trifocal tensor. We establish an explicit Tucker\nfactorization of this tensor, revealing a low multilinear rank of $(6,4,4)$\nindependent of the number of cameras under appropriate scaling conditions. We\nprove that this rank constraint provides sufficient information for camera\nrecovery in the noiseless case. The constraint motivates a synchronization\nalgorithm based on the higher-order singular value decomposition of the block\ntrifocal tensor. Experimental comparisons with state-of-the-art global\nsynchronization methods on real datasets demonstrate the potential of this\nalgorithm for significantly improving location estimation accuracy. Overall\nthis work suggests that higher-order interactions in synchronization problems\ncan be exploited to improve performance, beyond the usual pairwise-based\napproaches.\n","authors":["Daniel Miao","Gilad Lerman","Joe Kileel"],"pdf_url":"https://arxiv.org/pdf/2409.09313v2.pdf","comment":"33 pages, 3 figures. 
Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.12649v3","updated":"2024-10-31T19:30:46Z","published":"2024-06-18T14:17:57Z","title":"Probabilistic Conceptual Explainers: Trustworthy Conceptual Explanations\n for Vision Foundation Models","summary":" Vision transformers (ViTs) have emerged as a significant area of focus,\nparticularly for their capacity to be jointly trained with large language\nmodels and to serve as robust vision foundation models. Yet, the development of\ntrustworthy explanation methods for ViTs has lagged, particularly in the\ncontext of post-hoc interpretations of ViT predictions. Existing sub-image\nselection approaches, such as feature-attribution and conceptual models, fall\nshort in this regard. This paper proposes five desiderata for explaining ViTs\n-- faithfulness, stability, sparsity, multi-level structure, and parsimony --\nand demonstrates the inadequacy of current methods in meeting these criteria\ncomprehensively. We introduce a variational Bayesian explanation framework,\ndubbed ProbAbilistic Concept Explainers (PACE), which models the distributions\nof patch embeddings to provide trustworthy post-hoc conceptual explanations.\nOur qualitative analysis reveals the distributions of patch-level concepts,\nelucidating the effectiveness of ViTs by modeling the joint distribution of\npatch embeddings and ViT's predictions. Moreover, these patch-level\nexplanations bridge the gap between image-level and dataset-level explanations,\nthus completing the multi-level structure of PACE. Through extensive\nexperiments on both synthetic and real-world datasets, we demonstrate that PACE\nsurpasses state-of-the-art methods in terms of the defined desiderata.\n","authors":["Hengyi Wang","Shiwei Tan","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.12649v3.pdf","comment":"Proceedings of the 41st International Conference on Machine Learning\n (ICML 2024)"},{"id":"http://arxiv.org/abs/2410.22530v2","updated":"2024-10-31T19:25:40Z","published":"2024-10-29T20:53:01Z","title":"Adaptive Aggregation Weights for Federated Segmentation of Pancreas MRI","summary":" Federated learning (FL) enables collaborative model training across\ninstitutions without sharing sensitive data, making it an attractive solution\nfor medical imaging tasks. However, traditional FL methods, such as Federated\nAveraging (FedAvg), face difficulties in generalizing across domains due to\nvariations in imaging protocols and patient demographics across institutions.\nThis challenge is particularly evident in pancreas MRI segmentation, where\nanatomical variability and imaging artifacts significantly impact performance.\nIn this paper, we conduct a comprehensive evaluation of FL algorithms for\npancreas MRI segmentation and introduce a novel approach that incorporates\nadaptive aggregation weights. By dynamically adjusting the contribution of each\nclient during model aggregation, our method accounts for domain-specific\ndifferences and improves generalization across heterogeneous datasets.\nExperimental results demonstrate that our approach enhances segmentation\naccuracy and reduces the impact of domain shift compared to conventional FL\nmethods while maintaining privacy-preserving capabilities. Significant\nperformance improvements are observed across multiple hospitals (centers).\n","authors":["Hongyi Pan","Gorkem Durak","Zheyuan Zhang","Yavuz Taktak","Elif Keles","Halil Ertugrul Aktas","Alpay Medetalibeyoglu","Yury Velichko","Concetto Spampinato","Ivo Schoots","Marco J. Bruno","Rajesh N. 
Keswani","Pallavi Tiwari","Candice Bolan","Tamas Gonda","Michael G. Goggins","Michael B. Wallace","Ziyue Xu","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2410.22530v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14979v3","updated":"2024-10-31T19:08:00Z","published":"2024-07-20T21:06:33Z","title":"RGB2Point: 3D Point Cloud Generation from Single RGB Images","summary":" We introduce RGB2Point, an unposed single-view RGB image to a 3D point cloud\ngeneration based on Transformer. RGB2Point takes an input image of an object\nand generates a dense 3D point cloud. Contrary to prior works based on CNN\nlayers and diffusion denoising approaches, we use pre-trained Transformer\nlayers that are fast and generate high-quality point clouds with consistent\nquality over available categories. Our generated point clouds demonstrate high\nquality on a real-world dataset, as evidenced by improved Chamfer distance\n(51.15%) and Earth Mover's distance (45.96%) metrics compared to the current\nstate-of-the-art. Additionally, our approach shows a better quality on a\nsynthetic dataset, achieving better Chamfer distance (39.26%), Earth Mover's\ndistance (26.95%), and F-score (47.16%). Moreover, our method produces 63.1%\nmore consistent high-quality results across various object categories compared\nto prior works. Furthermore, RGB2Point is computationally efficient, requiring\nonly 2.3GB of VRAM to reconstruct a 3D point cloud from a single RGB image, and\nour implementation generates the results 15,133x faster than a SOTA\ndiffusion-based model.\n","authors":["Jae Joong Lee","Bedrich Benes"],"pdf_url":"https://arxiv.org/pdf/2407.14979v3.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2410.23191v2","updated":"2024-10-31T18:19:02Z","published":"2024-10-30T16:45:59Z","title":"Continuous Spatio-Temporal Memory Networks for 4D Cardiac Cine MRI\n Segmentation","summary":" Current cardiac cine magnetic resonance image (cMR) studies focus on the end\ndiastole (ED) and end systole (ES) phases, while ignoring the abundant temporal\ninformation in the whole image sequence. This is because whole sequence\nsegmentation is currently a tedious process and inaccurate. Conventional whole\nsequence segmentation approaches first estimate the motion field between\nframes, which is then used to propagate the mask along the temporal axis.\nHowever, the mask propagation results could be prone to error, especially for\nthe basal and apex slices, where through-plane motion leads to significant\nmorphology and structural change during the cardiac cycle. Inspired by recent\nadvances in video object segmentation (VOS), based on spatio-temporal memory\n(STM) networks, we propose a continuous STM (CSTM) network for semi-supervised\nwhole heart and whole sequence cMR segmentation. Our CSTM network takes full\nadvantage of the spatial, scale, temporal and through-plane continuity prior of\nthe underlying heart anatomy structures, to achieve accurate and fast 4D\nsegmentation. 
Results of extensive experiments across multiple cMR datasets\nshow that our method can improve the 4D cMR segmentation performance,\nespecially for the hard-to-segment regions.\n","authors":["Meng Ye","Bingyu Xin","Leon Axel","Dimitris Metaxas"],"pdf_url":"https://arxiv.org/pdf/2410.23191v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2407.06192v2","updated":"2024-10-31T18:16:38Z","published":"2024-07-08T17:59:57Z","title":"Multi-Object Hallucination in Vision-Language Models","summary":" Large vision language models (LVLMs) often suffer from object hallucination,\nproducing objects not present in the given images. While current benchmarks for\nobject hallucination primarily concentrate on the presence of a single object\nclass rather than individual entities, this work systematically investigates\nmulti-object hallucination, examining how models misperceive (e.g., invent\nnonexistent objects or become distracted) when tasked with focusing on multiple\nobjects simultaneously. We introduce Recognition-based Object Probing\nEvaluation (ROPE), an automated evaluation protocol that considers the\ndistribution of object classes within a single image during testing and uses\nvisual referring prompts to eliminate ambiguity. With comprehensive empirical\nstudies and analysis of potential factors leading to multi-object\nhallucination, we found that (1). LVLMs suffer more hallucinations when\nfocusing on multiple objects compared to a single object. (2). The tested\nobject class distribution affects hallucination behaviors, indicating that\nLVLMs may follow shortcuts and spurious correlations. (3). Hallucinatory\nbehaviors are influenced by data-specific factors, salience and frequency, and\nmodel intrinsic behaviors. We hope to enable LVLMs to recognize and reason\nabout multiple objects that often occur in realistic visual scenes, provide\ninsights, and quantify our progress towards mitigating the issues.\n","authors":["Xuweiyi Chen","Ziqiao Ma","Xuejun Zhang","Sihan Xu","Shengyi Qian","Jianing Yang","David F. Fouhey","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2407.06192v2.pdf","comment":"Accepted to NeurIPS 2024 | Project page:\n https://multi-object-hallucination.github.io/"},{"id":"http://arxiv.org/abs/2410.07801v3","updated":"2024-10-31T18:06:49Z","published":"2024-10-10T10:40:42Z","title":"LucidGrasp: Robotic Framework for Autonomous Manipulation of Laboratory\n Equipment with Different Degrees of Transparency via 6D Pose Estimation","summary":" Many modern robotic systems operate autonomously, however they often lack the\nability to accurately analyze the environment and adapt to changing external\nconditions, while teleoperation systems often require special operator skills.\nIn the field of laboratory automation, the number of automated processes is\ngrowing, however such systems are usually developed to perform specific tasks.\nIn addition, many of the objects used in this field are transparent, making it\ndifficult to analyze them using visual channels. The contributions of this work\ninclude the development of a robotic framework with autonomous mode for\nmanipulating liquid-filled objects with different degrees of transparency in\ncomplex pose combinations. The conducted experiments demonstrated the\nrobustness of the designed visual perception system to accurately estimate\nobject poses for autonomous manipulation, and confirmed the performance of the\nalgorithms in dexterous operations such as liquid dispensing. 
The proposed\nrobotic framework can be applied for laboratory automation, since it allows\nsolving the problem of performing non-trivial manipulation tasks with the\nanalysis of object poses of varying degrees of transparency and liquid levels,\nrequiring high accuracy and repeatability.\n","authors":["Maria Makarova","Daria Trinitatova","Qian Liu","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2410.07801v3.pdf","comment":"Accepted to the 2024 IEEE International Conference on Robotics and\n Biomimetics (IEEE ROBIO 2024), 6 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.23277v2","updated":"2024-10-31T18:03:51Z","published":"2024-10-30T17:55:52Z","title":"SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video\n Generation","summary":" Human beings are endowed with a complementary learning system, which bridges\nthe slow learning of general world dynamics with fast storage of episodic\nmemory from a new experience. Previous video generation models, however,\nprimarily focus on slow learning by pre-training on vast amounts of data,\noverlooking the fast learning phase crucial for episodic memory storage. This\noversight leads to inconsistencies across temporally distant frames when\ngenerating longer videos, as these frames fall beyond the model's context\nwindow. To this end, we introduce SlowFast-VGen, a novel dual-speed learning\nsystem for action-driven long video generation. Our approach incorporates a\nmasked conditional video diffusion model for the slow learning of world\ndynamics, alongside an inference-time fast learning strategy based on a\ntemporal LoRA module. Specifically, the fast learning process updates its\ntemporal LoRA parameters based on local inputs and outputs, thereby efficiently\nstoring episodic memory in its parameters. We further propose a slow-fast\nlearning loop algorithm that seamlessly integrates the inner fast learning loop\ninto the outer slow learning loop, enabling the recall of prior multi-episode\nexperiences for context-aware skill learning. To facilitate the slow learning\nof an approximate world model, we collect a large-scale dataset of 200k videos\nwith language action annotations, covering a wide range of scenarios. Extensive\nexperiments show that SlowFast-VGen outperforms baselines across various\nmetrics for action-driven video generation, achieving an FVD score of 514\ncompared to 782, and maintaining consistency in longer videos, with an average\nof 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm\nsignificantly enhances performances on long-horizon planning tasks as well.\nProject Website: https://slowfast-vgen.github.io\n","authors":["Yining Hong","Beide Liu","Maxine Wu","Yuanhao Zhai","Kai-Wei Chang","Linjie Li","Kevin Lin","Chung-Ching Lin","Jianfeng Wang","Zhengyuan Yang","Yingnian Wu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12306v2","updated":"2024-10-31T16:49:59Z","published":"2024-09-18T20:33:54Z","title":"Measuring Sound Symbolism in Audio-visual Models","summary":" Audio-visual pre-trained models have gained substantial attention recently\nand demonstrated superior performance on various audio-visual tasks. This study\ninvestigates whether pre-trained audio-visual models demonstrate non-arbitrary\nassociations between sounds and visual representations$\\unicode{x2013}$known as\nsound symbolism$\\unicode{x2013}$which is also observed in humans. 
We developed\na specialized dataset with synthesized images and audio samples and assessed\nthese models using a non-parametric approach in a zero-shot setting. Our\nfindings reveal a significant correlation between the models' outputs and\nestablished patterns of sound symbolism, particularly in models trained on\nspeech data. These results suggest that such models can capture sound-meaning\nconnections akin to human language processing, providing insights into both\ncognitive architectures and machine learning strategies.\n","authors":["Wei-Cheng Tseng","Yi-Jen Shih","David Harwath","Raymond Mooney"],"pdf_url":"https://arxiv.org/pdf/2409.12306v2.pdf","comment":"Errors in the introduction part that might potentially affect the\n integrity of the paper. Withdraw at the point. Will replace with an updated\n version in the future"}]},"2024-11-01T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2404.03570v3","updated":"2024-11-01T14:18:06Z","published":"2024-04-04T16:30:20Z","title":"Embodied AI with Two Arms: Zero-shot Learning, Safety and Modularity","summary":" We present an embodied AI system which receives open-ended natural language\ninstructions from a human, and controls two arms to collaboratively accomplish\npotentially long-horizon tasks over a large workspace. Our system is modular:\nit deploys state of the art Large Language Models for task\nplanning,Vision-Language models for semantic perception, and Point Cloud\ntransformers for grasping. With semantic and physical safety in mind, these\nmodules are interfaced with a real-time trajectory optimizer and a compliant\ntracking controller to enable human-robot proximity. We demonstrate performance\nfor the following tasks: bi-arm sorting, bottle opening, and trash disposal\ntasks. These are done zero-shot where the models used have not been trained\nwith any real world data from this bi-arm robot, scenes or workspace. Composing\nboth learning- and non-learning-based components in a modular fashion with\ninterpretable inputs and outputs allows the user to easily debug points of\nfailures and fragilities. One may also in-place swap modules to improve the\nrobustness of the overall platform, for instance with imitation-learned\npolicies. Please see https://sites.google.com/corp/view/safe-robots .\n","authors":["Jake Varley","Sumeet Singh","Deepali Jain","Krzysztof Choromanski","Andy Zeng","Somnath Basu Roy Chowdhury","Avinava Dubey","Vikas Sindhwani"],"pdf_url":"https://arxiv.org/pdf/2404.03570v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07195v2","updated":"2024-11-01T14:13:27Z","published":"2024-09-11T11:34:43Z","title":"Perceptive Pedipulation with Local Obstacle Avoidance","summary":" Pedipulation leverages the feet of legged robots for mobile manipulation,\neliminating the need for dedicated robotic arms. While previous works have\nshowcased blind and task-specific pedipulation skills, they fail to account for\nstatic and dynamic obstacles in the environment. To address this limitation, we\nintroduce a reinforcement learning-based approach to train a whole-body\nobstacle-aware policy that tracks foot position commands while simultaneously\navoiding obstacles. Despite training the policy in only five different static\nscenarios in simulation, we show that it generalizes to unknown environments\nwith different numbers and types of obstacles. 
We analyze the performance of\nour method through a set of simulation experiments and successfully deploy the\nlearned policy on the ANYmal quadruped, demonstrating its capability to follow\nfoot commands while navigating around static and dynamic obstacles.\n","authors":["Jonas Stolle","Philip Arm","Mayank Mittal","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2409.07195v2.pdf","comment":"Accepted to the IEEE International Conference on Humanoid Robots 2024"},{"id":"http://arxiv.org/abs/2310.16020v3","updated":"2024-11-01T13:59:05Z","published":"2023-10-24T17:30:26Z","title":"ConvBKI: Real-Time Probabilistic Semantic Mapping Network with\n Quantifiable Uncertainty","summary":" In this paper, we develop a modular neural network for real-time\n{\\color{black}(> 10 Hz)} semantic mapping in uncertain environments, which\nexplicitly updates per-voxel probabilistic distributions within a neural\nnetwork layer. Our approach combines the reliability of classical probabilistic\nalgorithms with the performance and efficiency of modern neural networks.\nAlthough robotic perception is often divided between modern differentiable\nmethods and classical explicit methods, a union of both is necessary for\nreal-time and trustworthy performance. We introduce a novel Convolutional\nBayesian Kernel Inference (ConvBKI) layer which incorporates semantic\nsegmentation predictions online into a 3D map through a depthwise convolution\nlayer by leveraging conjugate priors. We compare ConvBKI against\nstate-of-the-art deep learning approaches and probabilistic algorithms for\nmapping to evaluate reliability and performance. We also create a Robot\nOperating System (ROS) package of ConvBKI and test it on real-world\nperceptually challenging off-road driving data.\n","authors":["Joey Wilson","Yuewei Fu","Joshua Friesen","Parker Ewen","Andrew Capodieci","Paramsothy Jayakumar","Kira Barton","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2310.16020v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.10663"},{"id":"http://arxiv.org/abs/2410.23634v2","updated":"2024-11-01T11:37:39Z","published":"2024-10-31T05:01:20Z","title":"Tiny Learning-Based MPC for Multirotors: Solver-Aware Learning for\n Efficient Embedded Predictive Control","summary":" Tiny aerial robots show promise for applications like environmental\nmonitoring and search-and-rescue but face challenges in control due to their\nlimited computing power and complex dynamics. Model Predictive Control (MPC)\ncan achieve agile trajectory tracking and handle constraints. Although current\nlearning-based MPC methods, such as Gaussian Process (GP) MPC, improve control\nperformance by learning residual dynamics, they are computationally demanding,\nlimiting their onboard application on tiny robots. This paper introduces Tiny\nLearning-Based Model Predictive Control (LB MPC), a novel framework for\nresource-constrained micro multirotor platforms. By exploiting multirotor\ndynamics' structure and developing an efficient solver, our approach enables\nhigh-rate control at 100 Hz on a Crazyflie 2.1 with a Teensy 4.0\nmicrocontroller. 
We demonstrate a 23% average improvement in tracking\nperformance over existing embedded MPC methods, achieving the first onboard\nimplementation of learning-based MPC on a tiny multirotor (53 g).\n","authors":["Babak Akbari","Justin Frank","Melissa Greeff"],"pdf_url":"https://arxiv.org/pdf/2410.23634v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04180v2","updated":"2024-11-01T09:03:26Z","published":"2024-02-06T17:38:41Z","title":"Deep-Learning Estimation of Weight Distribution Using Joint Kinematics\n for Lower-Limb Exoskeleton Control","summary":" In the control of lower-limb exoskeletons with feet, the phase in the gait\ncycle can be identified by monitoring the weight distribution at the feet. This\nphase information can be used in the exoskeleton's controller to compensate the\ndynamics of the exoskeleton and to assign impedance parameters. Typically the\nweight distribution is calculated using data from sensors such as treadmill\nforce plates or insole force sensors. However, these solutions increase both\nthe setup complexity and cost. For this reason, we propose a deep-learning\napproach that uses a short time window of joint kinematics to predict the\nweight distribution of an exoskeleton in real time. The model was trained on\ntreadmill walking data from six users wearing a four-degree-of-freedom\nexoskeleton and tested in real time on three different users wearing the same\ndevice. This test set includes two users not present in the training set to\ndemonstrate the model's ability to generalize across individuals. Results show\nthat the proposed method is able to fit the actual weight distribution with\nR2=0.9 and is suitable for real-time control with prediction times less than 1\nms. Experiments in closed-loop exoskeleton control show that\ndeep-learning-based weight distribution estimation can be used to replace force\nsensors in overground and treadmill walking.\n","authors":["Clément Lhoste","Emek Barış Küçüktabak","Lorenzo Vianello","Lorenzo Amato","Matthew R. Short","Kevin Lynch","Jose L. Pons"],"pdf_url":"https://arxiv.org/pdf/2402.04180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06521v2","updated":"2024-11-01T07:15:49Z","published":"2024-09-10T13:56:08Z","title":"Asymptotically Optimal Lazy Lifelong Sampling-based Algorithm for\n Efficient Motion Planning in Dynamic Environments","summary":" The paper introduces an asymptotically optimal lifelong sampling-based path\nplanning algorithm that combines the merits of lifelong planning algorithms and\nlazy search algorithms for rapid replanning in dynamic environments where edge\nevaluation is expensive. By evaluating only sub-path candidates for the optimal\nsolution, the algorithm saves considerable evaluation time and thereby reduces\nthe overall planning cost. 
It employs a novel informed rewiring cascade to\nefficiently repair the search tree when the underlying search graph changes.\nSimulation results demonstrate that the algorithm outperforms various\nstate-of-the-art sampling-based planners in addressing both static and dynamic\nmotion planning problems.\n","authors":["Lu Huang","Xingjian Jing"],"pdf_url":"https://arxiv.org/pdf/2409.06521v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15677v3","updated":"2024-11-01T06:19:24Z","published":"2024-05-24T16:17:35Z","title":"SMART: Scalable Multi-agent Real-time Motion Generation via Next-token\n Prediction","summary":" Data-driven autonomous driving motion generation tasks are frequently\nimpacted by the limitations of dataset size and the domain gap between\ndatasets, which precludes their extensive application in real-world scenarios.\nTo address this issue, we introduce SMART, a novel autonomous driving motion\ngeneration paradigm that models vectorized map and agent trajectory data into\ndiscrete sequence tokens. These tokens are then processed through a\ndecoder-only transformer architecture to train for the next token prediction\ntask across spatial-temporal series. This GPT-style method allows the model to\nlearn the motion distribution in real driving scenarios. SMART achieves\nstate-of-the-art performance across most of the metrics on the generative Sim\nAgents challenge, ranking 1st on the leaderboards of Waymo Open Motion Dataset\n(WOMD), demonstrating remarkable inference speed. Moreover, SMART represents\nthe generative model in the autonomous driving motion domain, exhibiting\nzero-shot generalization capabilities: Using only the NuPlan dataset for\ntraining and WOMD for validation, SMART achieved a competitive score of 0.72 on\nthe Sim Agents challenge. Lastly, we have collected over 1 billion motion\ntokens from multiple datasets, validating the model's scalability. These\nresults suggest that SMART has initially emulated two important properties:\nscalability and zero-shot generalization, and preliminarily meets the needs of\nlarge-scale real-time simulation applications. We have released all the code to\npromote the exploration of models for motion generation in the autonomous\ndriving field. The source code is available at\nhttps://github.com/rainmaker22/SMART.\n","authors":["Wei Wu","Xiaoxin Feng","Ziyan Gao","Yuheng Kan"],"pdf_url":"https://arxiv.org/pdf/2405.15677v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.07701v3","updated":"2024-11-01T03:16:44Z","published":"2024-10-10T08:14:11Z","title":"Autonomous Driving in Unstructured Environments: How Far Have We Come?","summary":" Research on autonomous driving in unstructured outdoor environments is less\nadvanced than in structured urban settings due to challenges like environmental\ndiversities and scene complexity. These environments-such as rural areas and\nrugged terrains-pose unique obstacles that are not common in structured urban\nareas. Despite these difficulties, autonomous driving in unstructured outdoor\nenvironments is crucial for applications in agriculture, mining, and military\noperations. Our survey reviews over 250 papers for autonomous driving in\nunstructured outdoor environments, covering offline mapping, pose estimation,\nenvironmental perception, path planning, end-to-end autonomous driving,\ndatasets, and relevant challenges. We also discuss emerging trends and future\nresearch directions. 
This review aims to consolidate knowledge and encourage\nfurther research for autonomous driving in unstructured environments. To\nsupport ongoing work, we maintain an active repository with up-to-date\nliterature and open-source projects at:\nhttps://github.com/chaytonmin/Survey-Autonomous-Driving-in-Unstructured-Environments.\n","authors":["Chen Min","Shubin Si","Xu Wang","Hanzhang Xue","Weizhong Jiang","Yang Liu","Juan Wang","Qingtian Zhu","Qi Zhu","Lun Luo","Fanjie Kong","Jinyu Miao","Xudong Cai","Shuai An","Wei Li","Jilin Mei","Tong Sun","Heng Zhai","Qifeng Liu","Fangzhou Zhao","Liang Chen","Shuai Wang","Erke Shang","Linzhi Shang","Kunlong Zhao","Fuyang Li","Hao Fu","Lei Jin","Jian Zhao","Fangyuan Mao","Zhipeng Xiao","Chengyang Li","Bin Dai","Dawei Zhao","Liang Xiao","Yiming Nie","Yu Hu","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2410.07701v3.pdf","comment":"Survey paper; 38 pages"},{"id":"http://arxiv.org/abs/2410.16481v2","updated":"2024-11-01T23:51:44Z","published":"2024-10-21T20:12:45Z","title":"Caging in Time: A Framework for Robust Object Manipulation under\n Uncertainties and Limited Robot Perception","summary":" Real-world object manipulation has been commonly challenged by physical\nuncertainties and perception limitations. Being an effective strategy, while\ncaging configuration-based manipulation frameworks have successfully provided\nrobust solutions, they are not broadly applicable due to their strict\nrequirements on the availability of multiple robots, widely distributed\ncontacts, or specific geometries of the robots or the objects. To this end,\nthis work proposes a novel concept, termed Caging in Time, to allow caging\nconfigurations to be formed even if there is just one robot engaged in a task.\nThis novel concept can be explained by an insight that even if a caging\nconfiguration is needed to constrain the motion of an object, only a small\nportion of the cage is actively manipulating at a time. As such, we can switch\nthe configuration of the robot strategically so that by collapsing its\nconfiguration in time, we will see a cage formed and its necessary portion\nactive whenever needed. We instantiate our Caging in Time theory on challenging\nquasistatic and dynamic manipulation tasks, showing that Caging in Time can be\nachieved in general state spaces including geometry-based and energy-based\nspaces. With extensive experiments, we show robust and accurate manipulation,\nin an open-loop manner, without requiring detailed knowledge of the object\ngeometry or physical properties, nor realtime accurate feedback on the\nmanipulation states. In addition to being an effective and robust open-loop\nmanipulation solution, the proposed theory can be a supplementary strategy to\nother manipulation systems affected by uncertain or limited robot perception.\n","authors":["Gaotian Wang","Kejia Ren","Andrew S. Morgan","Kaiyu Hang"],"pdf_url":"https://arxiv.org/pdf/2410.16481v2.pdf","comment":"24 pages, 25 figures, video available at:\n www.youtube.com/watch?v=Ag_jTzazuSM"},{"id":"http://arxiv.org/abs/2402.17904v2","updated":"2024-11-01T22:46:52Z","published":"2024-02-27T21:42:58Z","title":"4CNet: A Diffusion Approach to Map Prediction for Decentralized\n Multi-robot Exploration","summary":" Mobile robots in unknown cluttered environments with irregularly shaped\nobstacles often face sensing, energy, and communication challenges which\ndirectly affect their ability to explore these environments. 
In this paper, we\nintroduce a novel deep learning architecture, Confidence-Aware Contrastive\nConditional Consistency Model (4CNet), for robot map prediction during\ndecentralized, resource-limited multi-robot exploration. 4CNet uniquely\nincorporates: 1) a conditional consistency model for map prediction in\nunstructured unknown regions, 2) a contrastive map-trajectory pretraining\nframework for a trajectory encoder that extracts spatial information from the\ntrajectories of nearby robots during map prediction, and 3) a confidence\nnetwork to measure the uncertainty of map prediction for effective exploration\nunder resource constraints. We incorporate 4CNet within our proposed robot\nexploration with map prediction architecture, 4CNet-E. We then conduct\nextensive comparison studies with 4CNet-E and state-of-the-art heuristic and\nlearning methods to investigate both map prediction and exploration performance\nin environments consisting of irregularly shaped obstacles and uneven terrain.\nResults showed that 4CNet-E obtained statistically significant higher\nprediction accuracy and area coverage with varying environment sizes, number of\nrobots, energy budgets, and communication limitations. Hardware experiments\nwere performed and validated the applicability and generalizability of 4CNet-E\nin both unstructured indoor and real natural outdoor environments.\n","authors":["Aaron Hao Tan","Siddarth Narasimhan","Goldie Nejat"],"pdf_url":"https://arxiv.org/pdf/2402.17904v2.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2208.04883v5","updated":"2024-11-01T21:25:51Z","published":"2022-08-09T16:25:49Z","title":"Neural-Rendezvous: Provably Robust Guidance and Control to Encounter\n Interstellar Objects","summary":" Interstellar objects (ISOs) are likely representatives of primitive materials\ninvaluable in understanding exoplanetary star systems. Due to their poorly\nconstrained orbits with generally high inclinations and relative velocities,\nhowever, exploring ISOs with conventional human-in-the-loop approaches is\nsignificantly challenging. This paper presents Neural-Rendezvous -- a deep\nlearning-based guidance and control framework for encountering fast-moving\nobjects, including ISOs, robustly, accurately, and autonomously in real time.\nIt uses pointwise minimum norm tracking control on top of a guidance policy\nmodeled by a spectrally-normalized deep neural network, where its\nhyperparameters are tuned with a loss function directly penalizing the MPC\nstate trajectory tracking error. We show that Neural-Rendezvous provides a high\nprobability exponential bound on the expected spacecraft delivery error, the\nproof of which leverages stochastic incremental stability analysis. In\nparticular, it is used to construct a non-negative function with a\nsupermartingale property, explicitly accounting for the ISO state uncertainty\nand the local nature of nonlinear state estimation guarantees. In numerical\nsimulations, Neural-Rendezvous is demonstrated to satisfy the expected error\nbound for 100 ISO candidates. 
This performance is also empirically validated\nusing our spacecraft simulator and in high-conflict and distributed UAV swarm\nreconfiguration with up to 20 UAVs.\n","authors":["Hiroyasu Tsukamoto","Soon-Jo Chung","Yashwanth Kumar Nakka","Benjamin Donitz","Declan Mages","Michel Ingham"],"pdf_url":"https://arxiv.org/pdf/2208.04883v5.pdf","comment":"Preprint Version, Accepted: October, 2024 (One-minute YouTube\n summary: https://youtu.be/q3e0LYS2IYQ, DOI:\n https://doi.org/10.2514/1.G007671)"},{"id":"http://arxiv.org/abs/2411.01038v1","updated":"2024-11-01T21:13:11Z","published":"2024-11-01T21:13:11Z","title":"AGISim, An Open Source Airborne Gimbal Mounted IMU Signal Simulator\n Considering Flight Dynamics Model","summary":" In this work we present more comprehensive evaluations on our airborne Gimbal\nmounted inertial measurement unit (IMU) signal simulator which also considers\nflight dynamic model (FDM). A flexible IMU signal simulator is an enabling tool\nin design, development, improvement, test and verification of aided inertial\nnavigation systems (INS). Efforts by other researchers had been concentrated on\nsimulation of the strapdown INS (SINS) with the IMU rigidly attached to the\nmoving body frame. However custom airborne surveying/mapping applications that\nneed pointing and stabilizing camera or any other surveying sensor, require\nmounting the IMU beside the sensor on a Gimbal onboard the airframe. Hence the\nproposed Gimbal mounted IMU signal simulator is of interest whilst itself\nrequires further analysis and verifications. Extended evaluation results in\nterms of both unit tests and functional/integration tests (using aided inertial\nnavigation algorithms with variable/dynamic lever arms), verifies the simulator\nand its applicability for the mentioned tasks. We have further packaged and\npublished our MATLAB code for the proposed simulator as an open source GitHub\nrepository.\n","authors":["Alireza Kazemi","Reza Rohani Sarvestani"],"pdf_url":"https://arxiv.org/pdf/2411.01038v1.pdf","comment":"10 pages, 8 figures, 4 tables, Submitted to Journal of Aerospace\n Science and Technology (JAST)"},{"id":"http://arxiv.org/abs/2411.01014v1","updated":"2024-11-01T20:37:31Z","published":"2024-11-01T20:37:31Z","title":"Mixed Reality Teleoperation Assistance for Direct Control of Humanoids","summary":" Teleoperation plays a crucial role in enabling robot operations in\nchallenging environments, yet existing limitations in effectiveness and\naccuracy necessitate the development of innovative strategies for improving\nteleoperated tasks. This article introduces a novel approach that utilizes\nmixed reality and assistive autonomy to enhance the efficiency and precision of\nhumanoid robot teleoperation. By leveraging Probabilistic Movement Primitives,\nobject detection, and Affordance Templates, the assistance combines user motion\nwith autonomous capabilities, achieving task efficiency while maintaining\nhuman-like robot motion. Experiments and feasibility studies on the Nadia robot\nconfirm the effectiveness of the proposed framework.\n","authors":["Luigi Penco","Kazuhiko Momose","Stephen McCrory","Dexton Anderson","Nicholas Kitchel","Duncan Calvert","Robert J. 
Griffin"],"pdf_url":"https://arxiv.org/pdf/2411.01014v1.pdf","comment":"IEEE Robotics and Automation, Volume: 9, Issue: 2"},{"id":"http://arxiv.org/abs/2411.01011v1","updated":"2024-11-01T20:27:46Z","published":"2024-11-01T20:27:46Z","title":"Active Learning-augmented Intention-aware Obstacle Avoidance of\n Autonomous Surface Vehicles in High-traffic Waters","summary":" This paper enhances the obstacle avoidance of Autonomous Surface Vehicles\n(ASVs) for safe navigation in high-traffic waters with an active state\nestimation of obstacle's passing intention and reducing its uncertainty. We\nintroduce a topological modeling of passing intention of obstacles, which can\nbe applied to varying encounter situations based on the inherent embedding of\ntopological concepts in COLREGs. With a Long Short-Term Memory (LSTM) neural\nnetwork, we classify the passing intention of obstacles. Then, for determining\nthe ASV maneuver, we propose a multi-objective optimization framework including\ninformation gain about the passing obstacle intention and safety. We validate\nthe proposed approach under extensive Monte Carlo simulations (2,400 runs) with\na varying number of obstacles, dynamic properties, encounter situations, and\ndifferent behavioral patterns of obstacles (cooperative, non-cooperative). We\nalso present the results from a real marine accident case study as well as\nreal-world experiments of a real ASV with environmental disturbances, showing\nsuccessful collision avoidance with our strategy in real-time.\n","authors":["Mingi Jeong","Arihant Chadda","Alberto Quattrini Li"],"pdf_url":"https://arxiv.org/pdf/2411.01011v1.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2411.01000v1","updated":"2024-11-01T19:51:37Z","published":"2024-11-01T19:51:37Z","title":"Enhancing Model-Based Step Adaptation for Push Recovery through\n Reinforcement Learning of Step Timing and Region","summary":" This paper introduces a new approach to enhance the robustness of humanoid\nwalking under strong perturbations, such as substantial pushes. Effective\nrecovery from external disturbances requires bipedal robots to dynamically\nadjust their stepping strategies, including footstep positions and timing.\nUnlike most advanced walking controllers that restrict footstep locations to a\npredefined convex region, substantially limiting recoverable disturbances, our\nmethod leverages reinforcement learning to dynamically adjust the permissible\nfootstep region, expanding it to a larger, effectively non-convex area and\nallowing cross-over stepping, which is crucial for counteracting large lateral\npushes. Additionally, our method adapts footstep timing in real time to further\nextend the range of recoverable disturbances. 
Based on these adjustments,\nfeasible footstep positions and DCM trajectory are planned by solving a QP.\nFinally, we employ a DCM controller and an inverse dynamics whole-body control\nframework to ensure the robot effectively follows the trajectory.\n","authors":["Tobias Egle","Yashuai Yan","Dongheui Lee","Christian Ott"],"pdf_url":"https://arxiv.org/pdf/2411.01000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00967v1","updated":"2024-11-01T18:34:26Z","published":"2024-11-01T18:34:26Z","title":"Raspberry PhenoSet: A Phenology-based Dataset for Automated Growth\n Detection and Yield Estimation","summary":" The future of the agriculture industry is intertwined with automation.\nAccurate fruit detection, yield estimation, and harvest time estimation are\ncrucial for optimizing agricultural practices. These tasks can be carried out\nby robots to reduce labour costs and improve the efficiency of the process. To\ndo so, deep learning models should be trained to perform knowledge-based tasks,\nwhich outlines the importance of contributing valuable data to the literature.\nIn this paper, we introduce Raspberry PhenoSet, a phenology-based dataset\ndesigned for detecting and segmenting raspberry fruit across seven\ndevelopmental stages. To the best of our knowledge, Raspberry PhenoSet is the\nfirst fruit dataset to integrate biology-based classification with fruit\ndetection tasks, offering valuable insights for yield estimation and precise\nharvest timing. This dataset contains 1,853 high-resolution images, the highest\nquality in the literature, captured under controlled artificial lighting in a\nvertical farm. The dataset has a total of 6,907 instances of mask annotations,\nmanually labelled to reflect the seven phenology stages. We have also\nbenchmarked Raspberry PhenoSet using several state-of-the-art deep learning\nmodels, including YOLOv8, YOLOv10, RT-DETR, and Mask R-CNN, to provide a\ncomprehensive evaluation of their performance on the dataset. Our results\nhighlight the challenges of distinguishing subtle phenology stages and\nunderscore the potential of Raspberry PhenoSet for both deep learning model\ndevelopment and practical robotic applications in agriculture, particularly in\nyield prediction and supply chain management. The dataset and the trained\nmodels are publicly available for future studies.\n","authors":["Parham Jafary","Anna Bazangeya","Michelle Pham","Lesley G. Campbell","Sajad Saeedi","Kourosh Zareinia","Habiba Bougherara"],"pdf_url":"https://arxiv.org/pdf/2411.00967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00965v1","updated":"2024-11-01T18:32:23Z","published":"2024-11-01T18:32:23Z","title":"SPOT: SE(3) Pose Trajectory Diffusion for Object-Centric Manipulation","summary":" We introduce SPOT, an object-centric imitation learning framework. The key\nidea is to capture each task by an object-centric representation, specifically\nthe SE(3) object pose trajectory relative to the target. This approach\ndecouples embodiment actions from sensory inputs, facilitating learning from\nvarious demonstration types, including both action-based and action-less human\nhand demonstrations, as well as cross-embodiment generalization. Additionally,\nobject pose trajectories inherently capture planning constraints from\ndemonstrations without the need for manually crafted rules. To guide the robot\nin executing the task, the object trajectory is used to condition a diffusion\npolicy. 
We show improvement compared to prior work on RLBench simulated tasks.\nIn real-world evaluation, using only eight demonstrations shot on an iPhone,\nour approach completed all tasks while fully complying with task constraints.\nProject page: https://nvlabs.github.io/object_centric_diffusion\n","authors":["Cheng-Chun Hsu","Bowen Wen","Jie Xu","Yashraj Narang","Xiaolong Wang","Yuke Zhu","Joydeep Biswas","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2411.00965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07166v2","updated":"2024-11-01T18:03:00Z","published":"2024-10-09T17:59:00Z","title":"Embodied Agent Interface: Benchmarking LLMs for Embodied Decision Making","summary":" We aim to evaluate Large Language Models (LLMs) for embodied decision making.\nWhile a significant body of work has been leveraging LLMs for decision making\nin embodied environments, we still lack a systematic understanding of their\nperformance because they are usually applied in different domains, for\ndifferent purposes, and built based on different inputs and outputs.\nFurthermore, existing evaluations tend to rely solely on a final success rate,\nmaking it difficult to pinpoint what ability is missing in LLMs and where the\nproblem lies, which in turn blocks embodied agents from leveraging LLMs\neffectively and selectively. To address these limitations, we propose a\ngeneralized interface (Embodied Agent Interface) that supports the\nformalization of various types of tasks and input-output specifications of\nLLM-based modules. Specifically, it allows us to unify 1) a broad set of\nembodied decision-making tasks involving both state and temporally extended\ngoals, 2) four commonly-used LLM-based modules for decision making: goal\ninterpretation, subgoal decomposition, action sequencing, and transition\nmodeling, and 3) a collection of fine-grained metrics which break down\nevaluation into various types of errors, such as hallucination errors,\naffordance errors, various types of planning errors, etc. Overall, our\nbenchmark offers a comprehensive assessment of LLMs' performance for different\nsubtasks, pinpointing the strengths and weaknesses in LLM-powered embodied AI\nsystems, and providing insights for effective and selective use of LLMs in\nembodied decision making.\n","authors":["Manling Li","Shiyu Zhao","Qineng Wang","Kangrui Wang","Yu Zhou","Sanjana Srivastava","Cem Gokmen","Tony Lee","Li Erran Li","Ruohan Zhang","Weiyu Liu","Percy Liang","Li Fei-Fei","Jiayuan Mao","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2410.07166v2.pdf","comment":"Accepted for oral presentation at NeurIPS 2024 in the Datasets and\n Benchmarks track. Camera-ready version"},{"id":"http://arxiv.org/abs/2411.00741v1","updated":"2024-11-01T17:06:25Z","published":"2024-11-01T17:06:25Z","title":"FG-PE: Factor-graph Approach for Multi-robot Pursuit-Evasion","summary":" With the increasing use of robots in daily life, there is a growing need to\nprovide robust collaboration protocols for robots to tackle more complicated\nand dynamic problems effectively. This paper presents a novel, factor\ngraph-based approach to address the pursuit-evasion problem, enabling accurate\nestimation, planning, and tracking of an evader by multiple pursuers working\ntogether. It is assumed that there are multiple pursuers and only one evader in\nthis scenario. 
The proposed method significantly improves the accuracy of\nevader estimation and tracking, allowing pursuers to capture the evader in the\nshortest possible time and distance compared to existing techniques. In\naddition to these primary objectives, the proposed approach effectively\nminimizes uncertainty while remaining robust, even when communication issues\nlead to some messages being dropped or lost. Through a series of comprehensive\nexperiments, this paper demonstrates that the proposed algorithm consistently\noutperforms traditional pursuit-evasion methods across several key performance\nmetrics, such as the time required to capture the evader and the average\ndistance traveled by the pursuers. Additionally, the proposed method is tested\nin real-world hardware experiments, further validating its effectiveness and\napplicability.\n","authors":["Messiah Abolfazli Esfahani","Ayşe Başar","Sajad Saeedi"],"pdf_url":"https://arxiv.org/pdf/2411.00741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00728v1","updated":"2024-11-01T16:40:12Z","published":"2024-11-01T16:40:12Z","title":"Multi-Agent Deep Q-Network with Layer-based Communication Channel for\n Autonomous Internal Logistics Vehicle Scheduling in Smart Manufacturing","summary":" In smart manufacturing, scheduling autonomous internal logistic vehicles is\ncrucial for optimizing operational efficiency. This paper proposes a\nmulti-agent deep Q-network (MADQN) with a layer-based communication channel\n(LBCC) to address this challenge. The main goals are to minimize total job\ntardiness, reduce the number of tardy jobs, and lower vehicle energy\nconsumption. The method is evaluated against nine well-known scheduling\nheuristics, demonstrating its effectiveness in handling dynamic job shop\nbehaviors like job arrivals and workstation unavailabilities. The approach also\nproves scalable, maintaining performance across different layouts and larger\nproblem instances, highlighting the robustness and adaptability of MADQN with\nLBCC in smart manufacturing.\n","authors":["Mohammad Feizabadi","Arman Hosseini","Zakaria Yahouni"],"pdf_url":"https://arxiv.org/pdf/2411.00728v1.pdf","comment":"Accepted for the 5th IFAC/INSTICC INTERNATIONAL CONFERENCE ON\n INNOVATIVE INTELLIGENT INDUSTRIAL PRODUCTION AND LOGISTICS"},{"id":"http://arxiv.org/abs/2411.00704v1","updated":"2024-11-01T16:07:20Z","published":"2024-11-01T16:07:20Z","title":"Learning to Look Around: Enhancing Teleoperation and Learning with a\n Human-like Actuated Neck","summary":" We introduce a teleoperation system that integrates a 5 DOF actuated neck,\ndesigned to replicate natural human head movements and perception. By enabling\nbehaviors like peeking or tilting, the system provides operators with a more\nintuitive and comprehensive view of the environment, improving task\nperformance, reducing cognitive load, and facilitating complex whole-body\nmanipulation. We demonstrate the benefits of natural perception across seven\nchallenging teleoperation tasks, showing how the actuated neck enhances the\nscope and efficiency of remote operation. Furthermore, we investigate its role\nin training autonomous policies through imitation learning. 
In three distinct\ntasks, the actuated neck supports better spatial awareness, reduces\ndistribution shift, and enables adaptive task-specific adjustments compared to\na static wide-angle camera.\n","authors":["Bipasha Sen","Michelle Wang","Nandini Thakur","Aditya Agarwal","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.00704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00659v1","updated":"2024-11-01T15:24:12Z","published":"2024-11-01T15:24:12Z","title":"Path Integral Control for Hybrid Dynamical Systems","summary":" This work introduces a novel paradigm for solving optimal control problems\nfor hybrid dynamical systems under uncertainties. Robotic systems having\ncontact with the environment can be modeled as hybrid systems. Controller\ndesign for hybrid systems under disturbances is complicated by the\ndiscontinuous jump dynamics, mode changes with inconsistent state dimensions,\nand variations in jumping timing and states caused by noise. We formulate this\nproblem into a stochastic control problem with hybrid transition constraints\nand propose the Hybrid Path Integral (H-PI) framework to obtain the optimal\ncontroller. Despite random mode changes across stochastic path samples, we show\nthat the ratio between hybrid path distributions with varying drift terms\nremains analogous to the smooth path distributions. We then show that the\noptimal controller can be obtained by evaluating a path integral with hybrid\nconstraints. Importance sampling for path distributions with hybrid dynamics\nconstraints is introduced to reduce the variance of the path integral\nevaluation, where we leverage the recently developed Hybrid\niterative-Linear-Quadratic-Regulator (H-iLQR) controller to induce a hybrid\npath distribution proposal with low variance. The proposed method is validated\nthrough numerical experiments on various hybrid systems and extensive ablation\nstudies. All the sampling processes are conducted in parallel on a Graphics\nProcessing Unit (GPU).\n","authors":["Hongzhe Yu","Diana Frias Franco","Aaron M. Johnson","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2411.00659v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2411.00600v1","updated":"2024-11-01T14:01:54Z","published":"2024-11-01T14:01:54Z","title":"On Deep Learning for Geometric and Semantic Scene Understanding Using\n On-Vehicle 3D LiDAR","summary":" 3D LiDAR point cloud data is crucial for scene perception in computer vision,\nrobotics, and autonomous driving. Geometric and semantic scene understanding,\ninvolving 3D point clouds, is essential for advancing autonomous driving\ntechnologies. However, significant challenges remain, particularly in improving\nthe overall accuracy (e.g., segmentation accuracy, depth estimation accuracy,\netc.) and efficiency of these systems. To address the challenge in terms of\naccuracy related to LiDAR-based tasks, we present DurLAR, the first\nhigh-fidelity 128-channel 3D LiDAR dataset featuring panoramic ambient (near\ninfrared) and reflectivity imagery. To improve efficiency in 3D segmentation\nwhile ensuring the accuracy, we propose a novel pipeline that employs a smaller\narchitecture, requiring fewer ground-truth annotations while achieving superior\nsegmentation accuracy compared to contemporary approaches. To improve the\nsegmentation accuracy, we introduce Range-Aware Pointwise Distance Distribution\n(RAPiD) features and the associated RAPiD-Seg architecture. 
All contributions\nhave been accepted by peer-reviewed conferences, underscoring the advancements\nin both accuracy and efficiency in 3D LiDAR applications for autonomous\ndriving. Full abstract: https://etheses.dur.ac.uk/15738/.\n","authors":["Li Li"],"pdf_url":"https://arxiv.org/pdf/2411.00600v1.pdf","comment":"PhD thesis (Durham University, Computer Science), 149 pages (the 2024\n BMVA Sullivan Doctoral Thesis Prize runner-up). Includes published content\n from arXiv:2407.10159 (ECCV 2024 ORAL), arXiv:2303.11203 (CVPR 2023), and\n arXiv:2406.10068 (3DV 2021), with minor revisions to the examined version:\n https://etheses.dur.ac.uk/15738/"},{"id":"http://arxiv.org/abs/2411.00554v1","updated":"2024-11-01T13:04:25Z","published":"2024-11-01T13:04:25Z","title":"Differentiable Physics-based System Identification for Robotic\n Manipulation of Elastoplastic Materials","summary":" Robotic manipulation of volumetric elastoplastic deformable materials, from\nfoods such as dough to construction materials like clay, is in its infancy,\nlargely due to the difficulty of modelling and perception in a high-dimensional\nspace. Simulating the dynamics of such materials is computationally expensive.\nIt tends to suffer from inaccurately estimated physics parameters of the\nmaterials and the environment, impeding high-precision manipulation. Estimating\nsuch parameters from raw point clouds captured by optical cameras suffers\nfurther from heavy occlusions. To address this challenge, this work introduces\na novel Differentiable Physics-based System Identification (DPSI) framework\nthat enables a robot arm to infer the physics parameters of elastoplastic\nmaterials and the environment using simple manipulation motions and incomplete\n3D point clouds, aligning the simulation with the real world. Extensive\nexperiments show that with only a single real-world interaction, the estimated\nparameters, Young's modulus, Poisson's ratio, yield stress and friction\ncoefficients, can accurately simulate visually and physically realistic\ndeformation behaviours induced by unseen and long-horizon manipulation motions.\nAdditionally, the DPSI framework inherently provides physically intuitive\ninterpretations for the parameters in contrast to black-box approaches such as\ndeep neural networks.\n","authors":["Xintong Yang","Ze Ji","Yu-Kun Lai"],"pdf_url":"https://arxiv.org/pdf/2411.00554v1.pdf","comment":"Under review at the International Journal of Robotics Research"},{"id":"http://arxiv.org/abs/2411.00508v1","updated":"2024-11-01T10:48:03Z","published":"2024-11-01T10:48:03Z","title":"CLIP-RT: Learning Language-Conditioned Robotic Policies from Natural\n Language Supervision","summary":" This paper explores how non-experts can teach robots desired skills in their\nenvironments. We argue that natural language is an intuitive and accessible\ninterface for robot learning. To this end, we investigate two key aspects: (1)\nhow non-experts collect robotic data using natural language supervision and (2)\nhow pre-trained vision-language models learn end-to-end policies directly from\nthis supervision. We propose a data collection framework that collects robot\ndemonstrations based on natural language supervision (e.g., \"move forward\") and\nfurther augments these demonstrations. Next, we introduce a model that learns\nlanguage-conditioned policies from natural language supervision called CLIP-RT.\nOur model employs pre-trained CLIP models and learns to predict actions\nrepresented in language via contrastive imitation learning. 
We first train\nCLIP-RT on large-scale robotic data and then enable it to learn desired skills\nusing data collected from our framework. CLIP-RT shows strong capabilities in\nacquiring novel manipulation skills, outperforming the state-of-the-art model,\nOpenVLA (7B parameters), by 17% in average success rates, while using 7x fewer\nparameters (1B).\n","authors":["Gi-Cheon Kang","Junghyun Kim","Kyuhwan Shim","Jun Ki Lee","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.00508v1.pdf","comment":"27 pages, 27 figures"},{"id":"http://arxiv.org/abs/2411.00476v1","updated":"2024-11-01T09:43:49Z","published":"2024-11-01T09:43:49Z","title":"PlanScope: Learning to Plan Within Decision Scope Does Matter","summary":" In the context of autonomous driving, learning-based methods have been\npromising for the development of planning modules. During the training process\nof planning modules, directly minimizing the discrepancy between expert-driving\nlogs and planning output is widely deployed. In general, driving logs consist\nof suddenly appearing obstacles or swiftly changing traffic signals, which\ntypically necessitate swift and nuanced adjustments in driving maneuvers.\nConcurrently, future trajectories of the vehicles exhibit their long-term\ndecisions, such as adhering to a reference lane or circumventing stationary\nobstacles. Due to the unpredictable influence of future events in driving logs,\nreasoning bias could be naturally introduced to learning based planning\nmodules, which leads to a possible degradation of driving performance. To\naddress this issue, we identify the decisions and their corresponding time\nhorizons, and characterize a so-called decision scope by retaining decisions\nwithin derivable horizons only, to mitigate the effect of irrational behaviors\ncaused by unpredictable events. This framework employs wavelet transformation\nbased log preprocessing with an effective loss computation approach, rendering\nthe planning model only sensitive to valuable decisions at the current state.\nSince frequency domain characteristics are extracted in conjunction with time\ndomain features by wavelets, decision information across various frequency\nbands within the corresponding time horizon can be suitably captured.\nFurthermore, to achieve valuable decision learning, this framework leverages a\ntransformer based decoder that incrementally generates the detailed profiles of\nfuture decisions over multiple steps. Our experiments demonstrate that our\nproposed method outperforms baselines in terms of driving scores with\nclosed-loop evaluations on the nuPlan dataset.\n","authors":["Ren Xin","Jie Cheng","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2411.00476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00448v1","updated":"2024-11-01T08:50:04Z","published":"2024-11-01T08:50:04Z","title":"ConceptFactory: Facilitate 3D Object Knowledge Annotation with Object\n Conceptualization","summary":" We present ConceptFactory, a novel scope to facilitate more efficient\nannotation of 3D object knowledge by recognizing 3D objects through generalized\nconcepts (i.e. object conceptualization), aiming at promoting machine\nintelligence to learn comprehensive object knowledge from both vision and\nrobotics aspects. This idea originates from the findings in human cognition\nresearch that the perceptual recognition of objects can be explained as a\nprocess of arranging generalized geometric components (e.g. cuboids and\ncylinders). 
ConceptFactory consists of two critical parts: i) ConceptFactory\nSuite, a unified toolbox that adopts Standard Concept Template Library (STL-C)\nto drive a web-based platform for object conceptualization, and ii)\nConceptFactory Asset, a large collection of conceptualized objects acquired\nusing ConceptFactory suite. Our approach enables researchers to effortlessly\nacquire or customize extensive varieties of object knowledge to comprehensively\nstudy different object understanding tasks. We validate our idea on a wide\nrange of benchmark tasks from both vision and robotics aspects with\nstate-of-the-art algorithms, demonstrating the high quality and versatility of\nannotations provided by our approach. Our website is available at\nhttps://apeirony.github.io/ConceptFactory.\n","authors":["Jianhua Sun","Yuxuan Li","Longfei Xu","Nange Wang","Jiude Wei","Yining Zhang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2411.00448v1.pdf","comment":"NeurIPS 2024 Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2411.00444v1","updated":"2024-11-01T08:34:32Z","published":"2024-11-01T08:34:32Z","title":"Expert-level protocol translation for self-driving labs","summary":" Recent development in Artificial Intelligence (AI) models has propelled their\napplication in scientific discovery, but the validation and exploration of\nthese discoveries require subsequent empirical experimentation. The concept of\nself-driving laboratories promises to automate and thus boost the experimental\nprocess following AI-driven discoveries. However, the transition of\nexperimental protocols, originally crafted for human comprehension, into\nformats interpretable by machines presents significant challenges, which,\nwithin the context of specific expert domain, encompass the necessity for\nstructured as opposed to natural language, the imperative for explicit rather\nthan tacit knowledge, and the preservation of causality and consistency\nthroughout protocol steps. Presently, the task of protocol translation\npredominantly requires the manual and labor-intensive involvement of domain\nexperts and information technology specialists, rendering the process\ntime-intensive. To address these issues, we propose a framework that automates\nthe protocol translation process through a three-stage workflow, which\nincrementally constructs Protocol Dependence Graphs (PDGs) that approach\nstructured on the syntax level, completed on the semantics level, and linked on\nthe execution level. Quantitative and qualitative evaluations have demonstrated\nits performance at par with that of human experts, underscoring its potential\nto significantly expedite and democratize the process of scientific discovery\nby elevating the automation capabilities within self-driving laboratories.\n","authors":["Yu-Zhe Shi","Fanxu Meng","Haofei Hou","Zhangqian Bi","Qiao Xu","Lecheng Ruan","Qining Wang"],"pdf_url":"https://arxiv.org/pdf/2411.00444v1.pdf","comment":"In Advances in Neural Information Processing Systems (NeurIPS'24)"},{"id":"http://arxiv.org/abs/2411.00440v1","updated":"2024-11-01T08:18:15Z","published":"2024-11-01T08:18:15Z","title":"NAMR-RRT: Neural Adaptive Motion Planning for Mobile Robots in Dynamic\n Environments","summary":" Robots are increasingly deployed in dynamic and crowded environments, such as\nurban areas and shopping malls, where efficient and robust navigation is\ncrucial. 
Traditional risk-based motion planning algorithms face challenges in\nsuch scenarios due to the lack of a well-defined search region, leading to\ninefficient exploration in irrelevant areas. While bi-directional and\nmulti-directional search strategies can improve efficiency, they still result\nin significant unnecessary exploration. This article introduces the Neural\nAdaptive Multi-directional Risk-based Rapidly-exploring Random Tree (NAMR-RRT)\nto address these limitations. NAMR-RRT integrates neural network-generated\nheuristic regions to dynamically guide the exploration process, continuously\nrefining the heuristic region and sampling rates during the planning process.\nThis adaptive feature significantly enhances performance compared to\nneural-based methods with fixed heuristic regions and sampling rates. NAMR-RRT\nimproves planning efficiency, reduces trajectory length, and ensures higher\nsuccess by focusing the search on promising areas and continuously adjusting to\nenvironments. The experiment results from both simulations and real-world\napplications demonstrate the robustness and effectiveness of our proposed\nmethod in navigating dynamic environments. A website about this work is\navailable at https://sites.google.com/view/namr-rrt.\n","authors":["Zhirui Sun","Bingyi Xia","Peijia Xie","Xiaoxiao Li","Jiankun Wang"],"pdf_url":"https://arxiv.org/pdf/2411.00440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00417v1","updated":"2024-11-01T07:27:11Z","published":"2024-11-01T07:27:11Z","title":"Closed-Loop Stability of a Lyapunov-Based Switching Attitude Controller\n for Energy-Efficient Torque-Input-Selection During Flight","summary":" We present a new Lyapunov-based switching attitude controller for\nenergy-efficient real-time selection of the torque inputted to an uncrewed\naerial vehicle (UAV) during flight. The proposed method, using quaternions to\ndescribe the attitude of the controlled UAV, interchanges the stability\nproperties of the two fixed points-one locally asymptotically stable and\nanother unstable-of the resulting closed-loop (CL) switching dynamics of the\nsystem. In this approach, the switching events are triggered by the value of a\ncompound energy-based function. To analyze and ensure the stability of the CL\nswitching dynamics, we use classical nonlinear Lyapunov techniques, in\ncombination with switching-systems theory. For this purpose, we introduce a new\ncompound Lyapunov function (LF) that not only enables us to derive the\nconditions for CL asymptotic and exponential stability, but also provides us\nwith an estimate of the CL system's region of attraction. This new estimate is\nconsiderably larger than those previously reported for systems of the type\nconsidered in this paper. To test and demonstrate the functionality,\nsuitability, and performance of the proposed method, we present and discuss\nexperimental data obtained using a 31-g quadrotor during the execution of\nhigh-speed yaw-tracking maneuvers. Also, we provide empirical evidence\nindicating that all the initial conditions chosen for these maneuvers, as\nestimated, lie inside the system's region of attraction. Last, experimental\ndata obtained through these flight tests show that the proposed switching\ncontroller reduces the control effort by about 53%, on average, with respect to\nthat corresponding to a commonly used benchmark control scheme, when executing\na particular type of high-speed yaw-tracking maneuvers.\n","authors":["Francisco M. F. R. Gonçalves","Ryan M. 
Bena","Néstor O. Pérez-Arancibia"],"pdf_url":"https://arxiv.org/pdf/2411.00417v1.pdf","comment":"2024 IEEE International Conference on Robotics and Biomimetics\n (ROBIO)"},{"id":"http://arxiv.org/abs/2411.00413v1","updated":"2024-11-01T07:18:49Z","published":"2024-11-01T07:18:49Z","title":"Multi-Uncertainty Aware Autonomous Cooperative Planning","summary":" Autonomous cooperative planning (ACP) is a promising technique to improve the\nefficiency and safety of multi-vehicle interactions for future intelligent\ntransportation systems. However, realizing robust ACP is a challenge due to the\naggregation of perception, motion, and communication uncertainties. This paper\nproposes a novel multi-uncertainty aware ACP (MUACP) framework that\nsimultaneously accounts for multiple types of uncertainties via regularized\ncooperative model predictive control (RC-MPC). The regularizers and constraints\nfor perception, motion, and communication are constructed according to the\nconfidence levels, weather conditions, and outage probabilities, respectively.\nThe effectiveness of the proposed method is evaluated in the Car Learning to\nAct (CARLA) simulation platform. Results demonstrate that the proposed MUACP\nefficiently performs cooperative formation in real time and outperforms other\nbenchmark approaches in various scenarios under imperfect knowledge of the\nenvironment.\n","authors":["Shiyao Zhang","He Li","Shengyu Zhang","Shuai Wang","Derrick Wing Kwan Ng","Chengzhong Xu"],"pdf_url":"https://arxiv.org/pdf/2411.00413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00400v1","updated":"2024-11-01T06:58:55Z","published":"2024-11-01T06:58:55Z","title":"Capability-aware Task Allocation and Team Formation Analysis for\n Cooperative Exploration of Complex Environments","summary":" To achieve autonomy in complex real-world exploration missions, we consider\ndeployment strategies for a team of robots with heterogeneous autonomy\ncapabilities. In this work, we formulate a multi-robot exploration mission and\ncompute an operation policy to maintain robot team productivity and maximize\nmission rewards. The environment description, robot capability, and mission\noutcome are modeled as a Markov decision process (MDP). We also include\nconstraints in real-world operation, such as sensor failures, limited\ncommunication coverage, and mobility-stressing elements. Then, we study the\nproposed operation model on a real-world scenario in the context of the DARPA\nSubterranean (SubT) Challenge. The computed deployment policy is also compared\nagainst the human-based operation strategy in the final competition of the SubT\nChallenge. Finally, using the proposed model, we discuss the design trade-off\non building a multi-robot team with heterogeneous capabilities.\n","authors":["Muhammad Fadhil Ginting","Kyohei Otsu","Mykel J. Kochenderfer","Ali-akbar Agha-mohammadi"],"pdf_url":"https://arxiv.org/pdf/2411.00400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00357v1","updated":"2024-11-01T04:46:11Z","published":"2024-11-01T04:46:11Z","title":"An Improved Rapidly Exploring Random Tree Algorithm for Path Planning in\n Configuration Spaces with Narrow Channels","summary":" Rapidly-exploring Random Tree (RRT) algorithms have been applied successfully\nto challenging robot motion planning and under-actuated nonlinear control\nproblems. 
However, a fundamental limitation of the RRT approach is the slow\nconvergence in configuration spaces with narrow channels because of the small\nprobability of generating test points inside narrow channels. This paper\npresents an improved RRT algorithm that takes advantage of narrow channels\nbetween the initial and goal states to find shorter paths by improving the\nexploration of narrow regions in the configuration space. The proposed\nalgorithm detects the presence of narrow channels by checking for collision of\nneighborhood points with the infeasible set and attempts to add points within\nnarrow channels with a predetermined bias. This approach is compared with the\nclassical RRT and its variants on a variety of benchmark planning problems.\nSimulation results indicate that the algorithm presented in this paper computes\na significantly shorter path in spaces with narrow channels.\n","authors":["Mathew Mithra Noel","Akshay Chawla"],"pdf_url":"https://arxiv.org/pdf/2411.00357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00347v1","updated":"2024-11-01T04:05:24Z","published":"2024-11-01T04:05:24Z","title":"An Untethered Bioinspired Robotic Tensegrity Dolphin with\n Multi-Flexibility Design for Aquatic Locomotion","summary":" This paper presents the first steps toward a soft dolphin robot using a\nbio-inspired approach to mimic dolphin flexibility. The current dolphin robot\nuses a minimalist approach, with only two actuated cable-driven degrees of\nfreedom actuated by a pair of motors. The actuated tail moves up and down in a\nswimming motion, but this first proof of concept does not permit controlled\nturns of the robot. While existing robotic dolphins typically use revolute\njoints to articulate rigid bodies, our design -- which will be made open-source\n-- incorporates a flexible tail with tunable silicone skin and actuation\nflexibility via a cable-driven system, which mimics muscle dynamics and design\nflexibility with a tunable skeleton structure. The design is also tunable since\nthe backbone can be easily printed in various geometries. The paper provides\ninsights into how a few such variations affect robot motion and efficiency,\nmeasured by speed and cost of transport (COT). This approach demonstrates the\npotential of achieving dolphin-like motion through enhanced flexibility in\nbio-inspired robotics.\n","authors":["Luyang Zhao","Yitao Jiang","Chun-Yi She","Mingi Jeong","Haibo Dong","Alberto Quattrini Li","Muhao Chen","Devin Balkcom"],"pdf_url":"https://arxiv.org/pdf/2411.00347v1.pdf","comment":"7 pages, 13 figures"},{"id":"http://arxiv.org/abs/2411.00345v1","updated":"2024-11-01T04:03:05Z","published":"2024-11-01T04:03:05Z","title":"On the Exploration of LM-Based Soft Modular Robot Design","summary":" Recent large language models (LLMs) have demonstrated promising capabilities\nin modeling real-world knowledge and enhancing knowledge-based generation\ntasks. In this paper, we further explore the potential of using LLMs to aid in\nthe design of soft modular robots, taking into account both user instructions\nand physical laws, to reduce the reliance on extensive trial-and-error\nexperiments typically needed to achieve robot designs that meet specific\nstructural or task requirements. Specifically, we formulate the robot design\nprocess as a sequence generation task and find that LLMs are able to capture\nkey requirements expressed in natural language and reflect them in the\nconstruction sequences of robots. 
To simplify, rather than conducting\nreal-world experiments to assess design quality, we utilize a simulation tool\nto provide feedback to the generative model, allowing for iterative\nimprovements without requiring extensive human annotations. Furthermore, we\nintroduce five evaluation metrics to assess the quality of robot designs from\nmultiple angles including task completion and adherence to instructions,\nsupporting an automatic evaluation process. Our model performs well in\nevaluations for designing soft modular robots with uni- and bi-directional\nlocomotion and stair-descending capabilities, highlighting the potential of\nusing natural language and LLMs for robot design. However, we also observe\ncertain limitations that suggest areas for further improvement.\n","authors":["Weicheng Ma","Luyang Zhao","Chun-Yi She","Yitao Jiang","Alan Sun","Bo Zhu","Devin Balkcom","Soroush Vosoughi"],"pdf_url":"https://arxiv.org/pdf/2411.00345v1.pdf","comment":"8 pages, 7 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.16016v2","updated":"2024-11-01T17:44:34Z","published":"2024-09-24T12:19:31Z","title":"VascX Models: Model Ensembles for Retinal Vascular Analysis from Color\n Fundus Images","summary":" We introduce VascX models, a comprehensive set of model ensembles for\nanalyzing retinal vasculature from color fundus images (CFIs). Annotated CFIs\nwere aggregated from public datasets. Additional CFIs, mainly from the\npopulation-based Rotterdam Study, were annotated by graders for arteries and\nveins at pixel level, resulting in a dataset diverse in patient demographics\nand imaging conditions. VascX models demonstrated superior segmentation\nperformance across datasets, image quality levels, and anatomic regions when\ncompared to existing, publicly available models, likely due to the increased\nsize and variety of our training set. Important improvements were observed in\nartery-vein and disc segmentation performance, particularly in segmentations of\nthese structures on CFIs of intermediate quality, common in large cohorts and\nclinical datasets. Importantly, these improvements translated into\nsignificantly more accurate vascular features when we compared features\nextracted from VascX segmentation masks with features extracted from\nsegmentation masks generated by previous models. With VascX models we provide a\nrobust, ready-to-use set of model ensembles and inference code aimed at\nsimplifying the implementation and enhancing the quality of automated retinal\nvasculature analyses. The precise vessel parameters generated by the model can\nserve as starting points for the identification of disease patterns in and\noutside of the eye.\n","authors":["Jose Vargas Quiros","Bart Liefers","Karin van Garderen","Jeroen Vermeulen","Eyened Reading Center","Sinergia Consortium","Caroline Klaver"],"pdf_url":"https://arxiv.org/pdf/2409.16016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24211v2","updated":"2024-11-01T17:23:01Z","published":"2024-10-31T17:59:01Z","title":"DELTA: Dense Efficient Long-range 3D Tracking for any video","summary":" Tracking dense 3D motion from monocular videos remains challenging,\nparticularly when aiming for pixel-level precision over long sequences. We\nintroduce DELTA, a novel method that efficiently tracks every pixel in 3D\nspace, enabling accurate motion estimation across entire videos. 
Our approach\nleverages a joint global-local attention mechanism for reduced-resolution\ntracking, followed by a transformer-based upsampler to achieve high-resolution\npredictions. Unlike existing methods, which are limited by computational\ninefficiency or sparse tracking, DELTA delivers dense 3D tracking at scale,\nrunning over 8x faster than previous methods while achieving state-of-the-art\naccuracy. Furthermore, we explore the impact of depth representation on\ntracking performance and identify log-depth as the optimal choice. Extensive\nexperiments demonstrate the superiority of DELTA on multiple benchmarks,\nachieving new state-of-the-art results in both 2D and 3D dense tracking tasks.\nOur method provides a robust solution for applications requiring fine-grained,\nlong-term motion tracking in 3D space.\n","authors":["Tuan Duc Ngo","Peiye Zhuang","Chuang Gan","Evangelos Kalogerakis","Sergey Tulyakov","Hsin-Ying Lee","Chaoyang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.24211v2.pdf","comment":"Project Page: https://snap-research.github.io/DELTA/"},{"id":"http://arxiv.org/abs/2402.01335v3","updated":"2024-11-01T16:51:01Z","published":"2024-02-02T11:40:27Z","title":"BehAVE: Behaviour Alignment of Video Game Encodings","summary":" Domain randomisation enhances the transferability of vision models across\nvisually distinct domains with similar content. However, current methods\nheavily depend on intricate simulation engines, hampering feasibility and\nscalability. This paper introduces BehAVE, a video understanding framework that\nutilises existing commercial video games for domain randomisation without\naccessing their simulation engines. BehAVE taps into the visual diversity of\nvideo games for randomisation and uses textual descriptions of player actions\nto align videos with similar content. We evaluate BehAVE across 25 first-person\nshooter (FPS) games using various video and text foundation models,\ndemonstrating its robustness in domain randomisation. BehAVE effectively aligns\nplayer behavioural patterns and achieves zero-shot transfer to multiple unseen\nFPS games when trained on just one game. In a more challenging scenario, BehAVE\nenhances the zero-shot transferability of foundation models to unseen FPS\ngames, even when trained on a game of a different genre, with improvements of\nup to 22%. BehAVE is available online at https://github.com/nrasajski/BehAVE.\n","authors":["Nemanja Rašajski","Chintan Trivedi","Konstantinos Makantasis","Antonios Liapis","Georgios N. Yannakakis"],"pdf_url":"https://arxiv.org/pdf/2402.01335v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07410v2","updated":"2024-11-01T16:34:04Z","published":"2024-10-09T20:21:43Z","title":"Aligning Motion-Blurred Images Using Contrastive Learning on\n Overcomplete Pixels","summary":" We propose a new contrastive objective for learning overcomplete pixel-level\nfeatures that are invariant to motion blur. Other invariances (e.g., pose,\nillumination, or weather) can be learned by applying the corresponding\ntransformations on unlabeled images during self-supervised training. We\nshowcase that a simple U-Net trained with our objective can produce local\nfeatures useful for aligning the frames of an unseen video captured with a\nmoving camera under realistic and challenging conditions. 
Using a carefully\ndesigned toy example, we also show that the overcomplete pixels can encode the\nidentity of objects in an image and the pixel coordinates relative to these\nobjects.\n","authors":["Leonid Pogorelyuk","Stefan T. Radev"],"pdf_url":"https://arxiv.org/pdf/2410.07410v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.24204v2","updated":"2024-11-01T16:31:22Z","published":"2024-10-31T17:57:07Z","title":"GeoSplatting: Towards Geometry Guided Gaussian Splatting for\n Physically-based Inverse Rendering","summary":" We consider the problem of physically-based inverse rendering using 3D\nGaussian Splatting (3DGS) representations. While recent 3DGS methods have\nachieved remarkable results in novel view synthesis (NVS), accurately capturing\nhigh-fidelity geometry, physically interpretable materials and lighting remains\nchallenging, as it requires precise geometry modeling to provide accurate\nsurface normals, along with physically-based rendering (PBR) techniques to\nensure correct material and lighting disentanglement. Previous 3DGS methods\nresort to approximating surface normals, but often struggle with noisy local\ngeometry, leading to inaccurate normal estimation and suboptimal\nmaterial-lighting decomposition. In this paper, we introduce GeoSplatting, a\nnovel hybrid representation that augments 3DGS with explicit geometric guidance\nand differentiable PBR equations. Specifically, we bridge isosurface and 3DGS\ntogether, where we first extract isosurface mesh from a scalar field, then\nconvert it into 3DGS points and formulate PBR equations for them in a fully\ndifferentiable manner. In GeoSplatting, 3DGS is grounded on the mesh geometry,\nenabling precise surface normal modeling, which facilitates the use of PBR\nframeworks for material decomposition. This approach further maintains the\nefficiency and quality of NVS from 3DGS while ensuring accurate geometry from\nthe isosurface. Comprehensive evaluations across diverse datasets demonstrate\nthe superiority of GeoSplatting, consistently outperforming existing methods\nboth quantitatively and qualitatively.\n","authors":["Kai Ye","Chong Gao","Guanbin Li","Wenzheng Chen","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2410.24204v2.pdf","comment":"Project page: https://pku-vcl-geometry.github.io/GeoSplatting/"},{"id":"http://arxiv.org/abs/2406.00307v4","updated":"2024-11-01T16:26:40Z","published":"2024-06-01T05:41:12Z","title":"HENASY: Learning to Assemble Scene-Entities for Egocentric\n Video-Language Model","summary":" Current video-language models (VLMs) rely extensively on instance-level\nalignment between video and language modalities, which presents two major\nlimitations: (1) visual reasoning disobeys the natural perception that humans\ndo in first-person perspective, leading to a lack of reasoning interpretation;\nand (2) learning is limited in capturing inherent fine-grained relationships\nbetween two modalities.\n In this paper, we take an inspiration from human perception and explore a\ncompositional approach for egocentric video representation. We introduce HENASY\n(Hierarchical ENtities ASsemblY), which includes a spatiotemporal token\ngrouping mechanism to explicitly assemble dynamically evolving scene entities\nthrough time and model their relationship for video representation. By\nleveraging compositional structure understanding, HENASY possesses strong\ninterpretability via visual grounding with free-form text queries. 
We further\nexplore a suite of multi-grained contrastive losses to facilitate\nentity-centric understandings. This comprises three alignment types:\nvideo-narration, noun-entity, verb-entities alignments.\n Our method demonstrates strong interpretability in both quantitative and\nqualitative experiments; while maintaining competitive performances on five\ndownstream tasks via zero-shot transfer or as video/text representation,\nincluding video/text retrieval, action recognition, multi-choice query, natural\nlanguage query, and moments query.\n","authors":["Khoa Vo","Thinh Phan","Kashu Yamazaki","Minh Tran","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2406.00307v4.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2312.14556v3","updated":"2024-11-01T16:12:52Z","published":"2023-12-22T09:29:45Z","title":"CaptainCook4D: A Dataset for Understanding Errors in Procedural\n Activities","summary":" Following step-by-step procedures is an essential component of various\nactivities carried out by individuals in their daily lives. These procedures\nserve as a guiding framework that helps to achieve goals efficiently, whether\nit is assembling furniture or preparing a recipe. However, the complexity and\nduration of procedural activities inherently increase the likelihood of making\nerrors. Understanding such procedural activities from a sequence of frames is a\nchallenging task that demands an accurate interpretation of visual information\nand the ability to reason about the structure of the activity. To this end, we\ncollect a new egocentric 4D dataset, CaptainCook4D, comprising 384 recordings\n(94.5 hours) of people performing recipes in real kitchen environments. This\ndataset consists of two distinct types of activity: one in which participants\nadhere to the provided recipe instructions and another in which they deviate\nand induce errors. We provide 5.3K step annotations and 10K fine-grained action\nannotations and benchmark the dataset for the following tasks: supervised error\nrecognition, multistep localization, and procedure learning\n","authors":["Rohith Peddi","Shivvrat Arya","Bharath Challa","Likhitha Pallapothula","Akshay Vyas","Bhavya Gouripeddi","Jikai Wang","Qifan Zhang","Vasundhara Komaragiri","Eric Ragan","Nicholas Ruozzi","Yu Xiang","Vibhav Gogate"],"pdf_url":"https://arxiv.org/pdf/2312.14556v3.pdf","comment":"Accepted to the 2024 Neural Information Processing Systems Datasets\n and Benchmarks Track, Project Page:\n https://captaincook4d.github.io/captain-cook/"},{"id":"http://arxiv.org/abs/2410.19869v2","updated":"2024-11-01T16:02:47Z","published":"2024-10-24T00:12:20Z","title":"Comparing YOLO11 and YOLOv8 for instance segmentation of occluded and\n non-occluded immature green fruits in complex orchard environment","summary":" This study conducted a comprehensive performance evaluation on YOLO11 and\nYOLOv8, the latest in the \"You Only Look Once\" (YOLO) series, focusing on their\ninstance segmentation capabilities for immature green apples in orchard\nenvironments. YOLO11n-seg achieved the highest mask precision across all\ncategories with a notable score of 0.831, highlighting its effectiveness in\nfruit detection. 
YOLO11m-seg and YOLO11l-seg excelled in non-occluded and\noccluded fruitlet segmentation with scores of 0.851 and 0.829, respectively.\nAdditionally, YOLO11x-seg led in mask recall for all categories, achieving a\nscore of 0.815, with YOLO11m-seg performing best for non-occluded immature\ngreen fruitlets at 0.858 and YOLOv8x-seg leading the occluded category with\n0.800. In terms of mean average precision at a 50\\% intersection over union\n(mAP@50), YOLO11m-seg consistently outperformed, registering the highest scores\nfor both box and mask segmentation, at 0.876 and 0.860 for the \"All\" class and\n0.908 and 0.909 for non-occluded immature fruitlets, respectively. YOLO11l-seg\nand YOLOv8l-seg shared the top box mAP@50 for occluded immature fruitlets at\n0.847, while YOLO11m-seg achieved the highest mask mAP@50 of 0.810. Despite the\nadvancements in YOLO11, YOLOv8n surpassed its counterparts in image processing\nspeed, with an impressive inference speed of 3.3 milliseconds, compared to the\nfastest YOLO11 series model at 4.8 milliseconds, underscoring its suitability\nfor real-time agricultural applications related to complex green fruit\nenvironments.\n","authors":["Ranjan Sapkota","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2410.19869v2.pdf","comment":"16 Pages, 10 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2409.00877v2","updated":"2024-11-01T15:41:56Z","published":"2024-09-02T00:11:48Z","title":"Digital Twins in Additive Manufacturing: A Systematic Review","summary":" Digital Twins (DTs) are becoming popular in Additive Manufacturing (AM) due\nto their ability to create virtual replicas of physical components of AM\nmachines, which helps in real-time production monitoring. Advanced techniques\nsuch as Machine Learning (ML), Augmented Reality (AR), and simulation-based\nmodels play key roles in developing intelligent and adaptable DTs in\nmanufacturing processes. However, questions remain regarding scalability, the\nintegration of high-quality data, and the computational power required for\nreal-time applications in developing DTs. Understanding the current state of\nDTs in AM is essential to address these challenges and fully utilize their\npotential in advancing AM processes. Considering this opportunity, this work\naims to provide a comprehensive overview of DTs in AM by addressing the\nfollowing four research questions: (1) What are the key types of DTs used in AM\nand their specific applications? (2) What are the recent developments and\nimplementations of DTs? (3) How are DTs employed in process improvement and\nhybrid manufacturing? (4) How are DTs integrated with Industry 4.0\ntechnologies? By discussing current applications and techniques, we aim to\noffer a better understanding and potential future research directions for\nresearchers and practitioners in AM and DTs.\n","authors":["Md Manjurul Ahsan","Yingtao Liu","Shivakumar Raman","Zahed Siddique"],"pdf_url":"https://arxiv.org/pdf/2409.00877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15615v4","updated":"2024-11-01T15:13:01Z","published":"2023-07-28T15:22:34Z","title":"A survey on deep learning in medical image registration: new\n technologies, uncertainty, evaluation metrics, and beyond","summary":" Deep learning technologies have dramatically reshaped the field of medical\nimage registration over the past decade. The initial developments, such as\nregression-based and U-Net-based networks, established the foundation for deep\nlearning in image registration. 
Subsequent progress has been made in various\naspects of deep learning-based registration, including similarity measures,\ndeformation regularizations, network architectures, and uncertainty estimation.\nThese advancements have not only enriched the field of image registration but\nhave also facilitated its application in a wide range of tasks, including atlas\nconstruction, multi-atlas segmentation, motion estimation, and 2D-3D\nregistration. In this paper, we present a comprehensive overview of the most\nrecent advancements in deep learning-based image registration. We begin with a\nconcise introduction to the core concepts of deep learning-based image\nregistration. Then, we delve into innovative network architectures, loss\nfunctions specific to registration, and methods for estimating registration\nuncertainty. Additionally, this paper explores appropriate evaluation metrics\nfor assessing the performance of deep learning models in registration tasks.\nFinally, we highlight the practical applications of these novel techniques in\nmedical imaging and discuss the future prospects of deep learning-based image\nregistration.\n","authors":["Junyu Chen","Yihao Liu","Shuwen Wei","Zhangxing Bian","Shalini Subramanian","Aaron Carass","Jerry L. Prince","Yong Du"],"pdf_url":"https://arxiv.org/pdf/2307.15615v4.pdf","comment":"Accepted to Medical Image Analysis ((c) MedIA). A list of\n open-sourced code from the papers reviewed has been organized and is\n available at https://bit.ly/3QgFJ9z"},{"id":"http://arxiv.org/abs/2406.08773v3","updated":"2024-11-01T14:55:50Z","published":"2024-06-13T03:05:36Z","title":"DenoiseRep: Denoising Model for Representation Learning","summary":" The denoising model has been proven to be a powerful generative model but has\nseen little exploration in discriminative tasks. Representation learning is\nimportant in discriminative tasks, which is defined as \"learning\nrepresentations (or features) of the data that make it easier to extract useful\ninformation when building classifiers or other predictors\". In this paper, we\npropose a novel Denoising Model for Representation Learning (DenoiseRep) to\nimprove feature discrimination with joint feature extraction and denoising.\nDenoiseRep views each embedding layer in a backbone as a denoising layer,\nprocessing the cascaded embedding layers as if we were recursively denoising\nfeatures step by step. This unifies the frameworks of feature extraction and\ndenoising, where the former progressively embeds features from low-level to\nhigh-level, and the latter recursively denoises features step by step. After\nthat, DenoiseRep fuses the parameters of feature extraction and denoising\nlayers, and theoretically demonstrates its equivalence before and after the\nfusion, thus making feature denoising computation-free. DenoiseRep is a\nlabel-free algorithm that incrementally improves features and is also\ncomplementary to labels when available. Experimental results on various\ndiscriminative vision tasks, including re-identification (Market-1501,\nDukeMTMC-reID, MSMT17, CUHK-03, vehicleID), image classification (ImageNet,\nUB200, Oxford-Pet, Flowers), object detection (COCO), and image segmentation\n(ADE20K), show stability and impressive improvements. 
We also validate its\neffectiveness on the CNN (ResNet) and Transformer (ViT, Swin, Vmamda)\narchitectures.\n","authors":["Zhengrui Xu","Guan'an Wang","Xiaowen Huang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2406.08773v3.pdf","comment":"Accepted by NeurIPS 2024,oral"},{"id":"http://arxiv.org/abs/2312.03701v4","updated":"2024-11-01T14:48:57Z","published":"2023-12-06T18:59:31Z","title":"Return of Unconditional Generation: A Self-supervised Representation\n Generation Method","summary":" Unconditional generation -- the problem of modeling data distribution without\nrelying on human-annotated labels -- is a long-standing and fundamental\nchallenge in generative models, creating a potential of learning from\nlarge-scale unlabeled data. In the literature, the generation quality of an\nunconditional method has been much worse than that of its conditional\ncounterpart. This gap can be attributed to the lack of semantic information\nprovided by labels. In this work, we show that one can close this gap by\ngenerating semantic representations in the representation space produced by a\nself-supervised encoder. These representations can be used to condition the\nimage generator. This framework, called Representation-Conditioned Generation\n(RCG), provides an effective solution to the unconditional generation problem\nwithout using labels. Through comprehensive experiments, we observe that RCG\nsignificantly improves unconditional generation quality: e.g., it achieves a\nnew state-of-the-art FID of 2.15 on ImageNet 256x256, largely reducing the\nprevious best of 5.91 by a relative 64%. Our unconditional results are situated\nin the same tier as the leading class-conditional ones. We hope these\nencouraging observations will attract the community's attention to the\nfundamental problem of unconditional generation. Code is available at\nhttps://github.com/LTH14/rcg.\n","authors":["Tianhong Li","Dina Katabi","Kaiming He"],"pdf_url":"https://arxiv.org/pdf/2312.03701v4.pdf","comment":"Neurips 2024 (Oral)"},{"id":"http://arxiv.org/abs/2312.07955v2","updated":"2024-11-01T14:45:44Z","published":"2023-12-13T08:01:15Z","title":"Erasing Self-Supervised Learning Backdoor by Cluster Activation Masking","summary":" Self-Supervised Learning (SSL) is an effective paradigm for learning\nrepresentations from unlabeled data, such as text, images, and videos. However,\nresearchers have recently found that SSL is vulnerable to backdoor attacks. The\nattacker can embed hidden SSL backdoors via a few poisoned examples in the\ntraining dataset and maliciously manipulate the behavior of downstream models.\nTo defend against SSL backdoor attacks, a feasible route is to detect and\nremove the poisonous samples in the training set. However, the existing SSL\nbackdoor defense method fails to detect the poisonous samples precisely. In\nthis paper, we propose to erase the SSL backdoor by cluster activation masking\nand propose a novel PoisonCAM method. After obtaining the threat model trained\non the poisoned dataset, our method can precisely detect poisonous samples\nbased on the assumption that masking the backdoor trigger can effectively\nchange the activation of a downstream clustering model. In experiments, our\nPoisonCAM achieves 96\\% accuracy for backdoor trigger detection compared to 3\\%\nof the state-of-the-art method on poisoned ImageNet-100. Moreover, our proposed\nPoisonCAM significantly improves the performance of the trained SSL model under\nbackdoor attacks compared to the state-of-the-art method. 
Our code, data, and\ntrained models will be open once this paper is accepted.\n","authors":["Shengsheng Qian","Dizhan Xue","Yifei Wang","Shengjie Zhang","Huaiwen Zhang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11838v3","updated":"2024-11-01T14:45:36Z","published":"2024-06-17T17:59:58Z","title":"Autoregressive Image Generation without Vector Quantization","summary":" Conventional wisdom holds that autoregressive models for image generation are\ntypically accompanied by vector-quantized tokens. We observe that while a\ndiscrete-valued space can facilitate representing a categorical distribution,\nit is not a necessity for autoregressive modeling. In this work, we propose to\nmodel the per-token probability distribution using a diffusion procedure, which\nallows us to apply autoregressive models in a continuous-valued space. Rather\nthan using categorical cross-entropy loss, we define a Diffusion Loss function\nto model the per-token probability. This approach eliminates the need for\ndiscrete-valued tokenizers. We evaluate its effectiveness across a wide range\nof cases, including standard autoregressive models and generalized masked\nautoregressive (MAR) variants. By removing vector quantization, our image\ngenerator achieves strong results while enjoying the speed advantage of\nsequence modeling. We hope this work will motivate the use of autoregressive\ngeneration in other continuous-valued domains and applications. Code is\navailable at: https://github.com/LTH14/mar.\n","authors":["Tianhong Li","Yonglong Tian","He Li","Mingyang Deng","Kaiming He"],"pdf_url":"https://arxiv.org/pdf/2406.11838v3.pdf","comment":"Neurips 2024 (Spotlight). Code: https://github.com/LTH14/mar"},{"id":"http://arxiv.org/abs/2407.15794v4","updated":"2024-11-01T14:19:14Z","published":"2024-07-22T16:52:32Z","title":"Disentangling spatio-temporal knowledge for weakly supervised object\n detection and segmentation in surgical video","summary":" Weakly supervised video object segmentation (WSVOS) enables the\nidentification of segmentation maps without requiring an extensive training\ndataset of object masks, relying instead on coarse video labels indicating\nobject presence. Current state-of-the-art methods either require multiple\nindependent stages of processing that employ motion cues or, in the case of\nend-to-end trainable networks, lack in segmentation accuracy, in part due to\nthe difficulty of learning segmentation maps from videos with transient object\npresence. This limits the application of WSVOS for semantic annotation of\nsurgical videos where multiple surgical tools frequently move in and out of the\nfield of view, a problem that is more difficult than typically encountered in\nWSVOS. This paper introduces Video Spatio-Temporal Disentanglement Networks\n(VDST-Net), a framework to disentangle spatiotemporal information using\nsemi-decoupled knowledge distillation to predict high-quality class activation\nmaps (CAMs). A teacher network designed to resolve temporal conflicts when\nspecifics about object location and timing in the video are not provided works\nwith a student network that integrates information over time by leveraging\ntemporal dependencies. We demonstrate the efficacy of our framework on a public\nreference dataset and on a more challenging surgical video dataset where\nobjects are, on average, present in less than 60\\% of annotated frames. 
Our\nmethod outperforms state-of-the-art techniques and generates superior\nsegmentation masks under video-level weak supervision.\n","authors":["Guiqiu Liao","Matjaz Jogan","Sai Koushik","Eric Eaton","Daniel A. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2407.15794v4.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV)"},{"id":"http://arxiv.org/abs/2310.16020v3","updated":"2024-11-01T13:59:05Z","published":"2023-10-24T17:30:26Z","title":"ConvBKI: Real-Time Probabilistic Semantic Mapping Network with\n Quantifiable Uncertainty","summary":" In this paper, we develop a modular neural network for real-time\n{\\color{black}(> 10 Hz)} semantic mapping in uncertain environments, which\nexplicitly updates per-voxel probabilistic distributions within a neural\nnetwork layer. Our approach combines the reliability of classical probabilistic\nalgorithms with the performance and efficiency of modern neural networks.\nAlthough robotic perception is often divided between modern differentiable\nmethods and classical explicit methods, a union of both is necessary for\nreal-time and trustworthy performance. We introduce a novel Convolutional\nBayesian Kernel Inference (ConvBKI) layer which incorporates semantic\nsegmentation predictions online into a 3D map through a depthwise convolution\nlayer by leveraging conjugate priors. We compare ConvBKI against\nstate-of-the-art deep learning approaches and probabilistic algorithms for\nmapping to evaluate reliability and performance. We also create a Robot\nOperating System (ROS) package of ConvBKI and test it on real-world\nperceptually challenging off-road driving data.\n","authors":["Joey Wilson","Yuewei Fu","Joshua Friesen","Parker Ewen","Andrew Capodieci","Paramsothy Jayakumar","Kira Barton","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2310.16020v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.10663"},{"id":"http://arxiv.org/abs/2311.12056v3","updated":"2024-11-01T12:54:28Z","published":"2023-11-18T13:55:05Z","title":"Kuro Siwo: 33 billion $m^2$ under the water. A global multi-temporal\n satellite dataset for rapid flood mapping","summary":" Global floods, exacerbated by climate change, pose severe threats to human\nlife, infrastructure, and the environment. Recent catastrophic events in\nPakistan and New Zealand underscore the urgent need for precise flood mapping\nto guide restoration efforts, understand vulnerabilities, and prepare for\nfuture occurrences. While Synthetic Aperture Radar (SAR) remote sensing offers\nday-and-night, all-weather imaging capabilities, its application in deep\nlearning for flood segmentation is limited by the lack of large annotated\ndatasets. To address this, we introduce Kuro Siwo, a manually annotated\nmulti-temporal dataset, spanning 43 flood events globally. Our dataset maps\nmore than 338 billion $m^2$ of land, with 33 billion designated as either\nflooded areas or permanent water bodies. Kuro Siwo includes a highly processed\nproduct optimized for flood mapping based on SAR Ground Range Detected, and a\nprimal SAR Single Look Complex product with minimal preprocessing, designed to\npromote research on the exploitation of both the phase and amplitude\ninformation and to offer maximum flexibility for downstream task preprocessing.\nTo leverage advances in large scale self-supervised pretraining methods for\nremote sensing data, we augment Kuro Siwo with a large unlabeled set of SAR\nsamples. 
Finally, we provide an extensive benchmark, namely BlackBench,\noffering strong baselines for a diverse set of flood events from Europe,\nAmerica, Africa, Asia and Australia.\n","authors":["Nikolaos Ioannis Bountos","Maria Sdraka","Angelos Zavras","Ilektra Karasante","Andreas Karavias","Themistocles Herekakis","Angeliki Thanasou","Dimitrios Michail","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2311.12056v3.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2409.15246v3","updated":"2024-11-01T12:49:19Z","published":"2024-09-23T17:42:05Z","title":"On-Air Deep Learning Integrated Semantic Inference Models for Enhanced\n Earth Observation Satellite Networks","summary":" Earth Observation (EO) systems are crucial for cartography, disaster\nsurveillance, and resource administration. Nonetheless, they encounter\nconsiderable obstacles in the processing and transmission of extensive data,\nespecially in specialized domains such as precision agriculture and real-time\ndisaster response. Earth observation satellites, outfitted with remote sensing\ntechnology, gather data from onboard sensors and IoT-enabled terrestrial\nobjects, delivering important information remotely. Domain-adapted Large\nLanguage Models (LLMs) provide a solution by enabling the integration of raw\nand processed EO data. Through domain adaptation, LLMs improve the assimilation\nand analysis of many data sources, tackling the intricacies of specialized\ndatasets in agriculture and disaster response. This data synthesis, directed by\nLLMs, enhances the precision and pertinence of conveyed information. This study\nprovides a thorough examination of using semantic inference and deep learning\nfor sophisticated EO systems. It presents an innovative architecture for\nsemantic communication in EO satellite networks, designed to improve data\ntransmission efficiency using semantic processing methodologies. Recent\nadvancements in onboard processing technologies enable dependable, adaptable,\nand energy-efficient data management in orbit. These improvements guarantee\nreliable performance in adverse space circumstances using radiation-hardened\nand reconfigurable technology. Collectively, these advancements enable\nnext-generation satellite missions with improved processing capabilities,\ncrucial for operational flexibility and real-time decision-making in 6G\nsatellite communication.\n","authors":["Hong-fu Chou","Vu Nguyen Ha","Prabhu Thiruvasagam","Thanh-Dung Le","Geoffrey Eappen","Ti Ti Nguyen","Luis M. Garces-Socarras","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.15246v3.pdf","comment":"17 pages, 7 figures, Journal"},{"id":"http://arxiv.org/abs/2405.14864v2","updated":"2024-11-01T12:46:26Z","published":"2024-05-23T17:59:40Z","title":"Video Diffusion Models are Training-free Motion Interpreter and\n Controller","summary":" Video generation primarily aims to model authentic and customized motion\nacross frames, making understanding and controlling the motion a crucial topic.\nMost diffusion-based studies on video motion focus on motion customization with\ntraining-based paradigms, which, however, demands substantial training\nresources and necessitates retraining for diverse models. 
Crucially, these\napproaches do not explore how video diffusion models encode cross-frame motion\ninformation in their features, lacking interpretability and transparency in\ntheir effectiveness. To answer this question, this paper introduces a novel\nperspective to understand, localize, and manipulate motion-aware features in\nvideo diffusion models. Through analysis using Principal Component Analysis\n(PCA), our work discloses that robust motion-aware feature already exists in\nvideo diffusion models. We present a new MOtion FeaTure (MOFT) by eliminating\ncontent correlation information and filtering motion channels. MOFT provides a\ndistinct set of benefits, including the ability to encode comprehensive motion\ninformation with clear interpretability, extraction without the need for\ntraining, and generalizability across diverse architectures. Leveraging MOFT,\nwe propose a novel training-free video motion control framework. Our method\ndemonstrates competitive performance in generating natural and faithful motion,\nproviding architecture-agnostic insights and applicability in a variety of\ndownstream tasks.\n","authors":["Zeqi Xiao","Yifan Zhou","Shuai Yang","Xingang Pan"],"pdf_url":"https://arxiv.org/pdf/2405.14864v2.pdf","comment":"Project Page: https://xizaoqu.github.io/moft/"},{"id":"http://arxiv.org/abs/2410.20883v2","updated":"2024-11-01T12:42:49Z","published":"2024-10-28T10:04:40Z","title":"Improving Generalization in Visual Reasoning via Self-Ensemble","summary":" The cognitive faculty of visual reasoning necessitates the integration of\nmultimodal perceptual processing and commonsense and external knowledge of the\nworld. In recent years, a plethora of large vision-language models (LVLMs) have\nbeen proposed, demonstrating outstanding power and exceptional proficiency in\ncommonsense reasoning across diverse domains and tasks. Nevertheless, training\nsuch LVLMs requires a lot of costly resources. Recent approaches, instead of\ntraining LVLMs from scratch on various large datasets, focus on exploring ways\nto take advantage of the capabilities of many different LVLMs, such as ensemble\nmethods. In this work, we propose self-ensemble, a novel method that improves\nthe generalization and visual reasoning of the model without updating any\nparameters, a training-free method. Our key insight is that we realized that\nLVLM itself can ensemble without the need for any other LVLMs, which helps to\nunlock their internal capabilities. Extensive experiments on various benchmarks\ndemonstrate the effectiveness of our method in achieving state-of-the-art\n(SOTA) performance on SketchyVQA, Outside Knowledge VQA, and\nout-of-distribution VQA tasks.\n","authors":["Tien-Huy Nguyen","Quang-Khai Tran","Anh-Tuan Quang-Hoang"],"pdf_url":"https://arxiv.org/pdf/2410.20883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23831v2","updated":"2024-11-01T12:11:29Z","published":"2024-10-31T11:21:21Z","title":"FRoundation: Are Foundation Models Ready for Face Recognition?","summary":" Foundation models are predominantly trained in an unsupervised or\nself-supervised manner on highly diverse and large-scale datasets, making them\nbroadly applicable to various downstream tasks. In this work, we investigate\nfor the first time whether such models are suitable for the specific domain of\nface recognition. 
We further propose and demonstrate the adaptation of these\nmodels for face recognition across different levels of data availability.\nExtensive experiments are conducted on multiple foundation models and datasets\nof varying scales for training and fine-tuning, with evaluation on a wide range\nof benchmarks. Our results indicate that, despite their versatility,\npre-trained foundation models underperform in face recognition compared to\nsimilar architectures trained specifically for this task. However, fine-tuning\nfoundation models yields promising results, often surpassing models trained\nfrom scratch when training data is limited. Even with access to large-scale\nface recognition training datasets, fine-tuned foundation models perform\ncomparably to models trained from scratch, but with lower training\ncomputational costs and without relying on the assumption of extensive data\navailability. Our analysis also explores bias in face recognition, with\nslightly higher bias observed in some settings when using foundation models.\n","authors":["Tahar Chettaoui","Naser Damer","Fadi Boutros"],"pdf_url":"https://arxiv.org/pdf/2410.23831v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10188v5","updated":"2024-11-01T10:57:37Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models,\nespecially for long video understanding. We introduce LongVILA, a full-stack\nsolution for long-context visual-language models by co-designing the algorithm\nand system. For model training, we upgrade existing VLMs to support long video\nunderstanding by incorporating two additional stages, i.e., long context\nextension and long video supervised fine-tuning. However, training on long\nvideo is computationally and memory intensive. We introduce the long-context\nMulti-Modal Sequence Parallelism (MM-SP) system that efficiently parallelizes\nlong video training and inference, enabling 2M context length training on 256\nGPUs without any gradient checkpointing. LongVILA efficiently extends the\nnumber of video frames of VILA from 8 to 2048, improving the long video\ncaptioning score from 2.00 to 3.26 (out of 5), achieving 99.8% accuracy in\n6,000-frame (more than 1 million tokens) video needle-in-a-haystack.\nLongVILA-7B demonstrates strong accuracy on the VideoMME benchmark, i.e., 61.8%\nwith subtitle. Besides, MM-SP is 2.1x - 5.7x faster than ring style sequence\nparallelism and 1.1x - 1.4x faster than Megatron with a hybrid context and\ntensor parallelism. Moreover, it seamlessly integrates with Hugging Face\nTransformers.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v5.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2410.20359v2","updated":"2024-11-01T09:33:29Z","published":"2024-10-27T07:25:11Z","title":"Conditional GAN for Enhancing Diffusion Models in Efficient and\n Authentic Global Gesture Generation from Audios","summary":" Audio-driven simultaneous gesture generation is vital for human-computer\ncommunication, AI games, and film production. While previous research has shown\npromise, there are still limitations. 
Methods based on VAEs are accompanied by\nissues of local jitter and global instability, whereas methods based on\ndiffusion models are hampered by low generation efficiency. This is because the\ndenoising process of DDPM in the latter relies on the assumption that the noise\nadded at each step is sampled from a unimodal distribution, and the noise\nvalues are small. DDIM borrows the idea from the Euler method for solving\ndifferential equations, disrupts the Markov chain process, and increases the\nnoise step size to reduce the number of denoising steps, thereby accelerating\ngeneration. However, simply increasing the step size during the step-by-step\ndenoising process causes the results to gradually deviate from the original\ndata distribution, leading to a significant drop in the quality of the\ngenerated actions and the emergence of unnatural artifacts. In this paper, we\nbreak the assumptions of DDPM and achieves breakthrough progress in denoising\nspeed and fidelity. Specifically, we introduce a conditional GAN to capture\naudio control signals and implicitly match the multimodal denoising\ndistribution between the diffusion and denoising steps within the same sampling\nstep, aiming to sample larger noise values and apply fewer denoising steps for\nhigh-speed generation.\n","authors":["Yongkang Cheng","Mingjiang Liang","Shaoli Huang","Gaoge Han","Jifeng Ning","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2410.20359v2.pdf","comment":"Accepted by WACV 2025 (Round 1)"},{"id":"http://arxiv.org/abs/2410.20358v2","updated":"2024-11-01T09:20:53Z","published":"2024-10-27T07:19:39Z","title":"RopeTP: Global Human Motion Recovery via Integrating Robust Pose\n Estimation with Diffusion Trajectory Prior","summary":" We present RopeTP, a novel framework that combines Robust pose estimation\nwith a diffusion Trajectory Prior to reconstruct global human motion from\nvideos. At the heart of RopeTP is a hierarchical attention mechanism that\nsignificantly improves context awareness, which is essential for accurately\ninferring the posture of occluded body parts. This is achieved by exploiting\nthe relationships with visible anatomical structures, enhancing the accuracy of\nlocal pose estimations. The improved robustness of these local estimations\nallows for the reconstruction of precise and stable global trajectories.\nAdditionally, RopeTP incorporates a diffusion trajectory model that predicts\nrealistic human motion from local pose sequences. This model ensures that the\ngenerated trajectories are not only consistent with observed local actions but\nalso unfold naturally over time, thereby improving the realism and stability of\n3D human motion reconstruction. Extensive experimental validation shows that\nRopeTP surpasses current methods on two benchmark datasets, particularly\nexcelling in scenarios with occlusions. 
It also outperforms methods that rely\non SLAM for initial camera estimates and extensive optimization, delivering\nmore accurate and realistic trajectories.\n","authors":["Mingjiang Liang","Yongkang Cheng","Hualin Liang","Shaoli Huang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2410.20358v2.pdf","comment":"Accepted by WACV 2025 (Round 1)"},{"id":"http://arxiv.org/abs/2402.13629v3","updated":"2024-11-01T08:56:48Z","published":"2024-02-21T09:06:04Z","title":"Adversarial Purification and Fine-tuning for Robust UDC Image\n Restoration","summary":" This study delves into the enhancement of Under-Display Camera (UDC) image\nrestoration models, focusing on their robustness against adversarial attacks.\nDespite its innovative approach to seamless display integration, UDC technology\nfaces unique image degradation challenges exacerbated by the susceptibility to\nadversarial perturbations. Our research initially conducts an in-depth\nrobustness evaluation of deep-learning-based UDC image restoration models by\nemploying several white-box and black-box attacking methods. This evaluation is\npivotal in understanding the vulnerabilities of current UDC image restoration\ntechniques. Following the assessment, we introduce a defense framework\nintegrating adversarial purification with subsequent fine-tuning processes.\nFirst, our approach employs diffusion-based adversarial purification,\neffectively neutralizing adversarial perturbations. Then, we apply the\nfine-tuning methodologies to refine the image restoration models further,\nensuring that the quality and fidelity of the restored images are maintained.\nThe effectiveness of our proposed approach is validated through extensive\nexperiments, showing marked improvements in resilience against typical\nadversarial attacks.\n","authors":["Zhenbo Song","Zhenyuan Zhang","Kaihao Zhang","Zhaoxin Fan","Jianfeng Lu"],"pdf_url":"https://arxiv.org/pdf/2402.13629v3.pdf","comment":"Failure to meet expectations"},{"id":"http://arxiv.org/abs/2410.23629v2","updated":"2024-11-01T08:38:21Z","published":"2024-10-31T04:42:43Z","title":"Posture-Informed Muscular Force Learning for Robust Hand Pressure\n Estimation","summary":" We present PiMForce, a novel framework that enhances hand pressure estimation\nby leveraging 3D hand posture information to augment forearm surface\nelectromyography (sEMG) signals. Our approach utilizes detailed spatial\ninformation from 3D hand poses in conjunction with dynamic muscle activity from\nsEMG to enable accurate and robust whole-hand pressure measurements under\ndiverse hand-object interactions. We also developed a multimodal data\ncollection system that combines a pressure glove, an sEMG armband, and a\nmarkerless finger-tracking module. We created a comprehensive dataset from 21\nparticipants, capturing synchronized data of hand posture, sEMG signals, and\nexerted hand pressure across various hand postures and hand-object interaction\nscenarios using our collection system. Our framework enables precise hand\npressure estimation in complex and natural interaction scenarios. Our approach\nsubstantially mitigates the limitations of traditional sEMG-based or\nvision-based methods by integrating 3D hand posture information with sEMG\nsignals. Video demos, data, and code are available online.\n","authors":["Kyungjin Seo","Junghoon Seo","Hanseok Jeong","Sangpil Kim","Sang Ho Yoon"],"pdf_url":"https://arxiv.org/pdf/2410.23629v2.pdf","comment":"Accepted to NeurIPS 2024. 
Project Page Link:\n https://pimforce.hcitech.org/"},{"id":"http://arxiv.org/abs/2408.07832v4","updated":"2024-11-01T07:41:04Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets. The code is\navailable (https://github.com/batmanlab/Ladder).\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Shyam Visweswaran","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23806v2","updated":"2024-11-01T07:25:38Z","published":"2024-10-31T10:46:11Z","title":"Human Action Recognition (HAR) Using Skeleton-based Spatial Temporal\n Relative Transformer Network: ST-RTR","summary":" Human Action Recognition (HAR) is an interesting research area in\nhuman-computer interaction used to monitor the activities of elderly and\ndisabled individuals affected by physical and mental health. In the recent era,\nskeleton-based HAR has received much attention because skeleton data has shown\nthat it can handle changes in striking, body size, camera views, and complex\nbackgrounds. One key characteristic of ST-GCN is automatically learning spatial\nand temporal patterns from skeleton sequences. It has some limitations, as this\nmethod only works for short-range correlation due to its limited receptive\nfield. Consequently, understanding human action requires long-range\ninterconnection. To address this issue, we developed a spatial-temporal\nrelative transformer ST-RTR model. The ST-RTR includes joint and relay nodes,\nwhich allow efficient communication and data transmission within the network.\nThese nodes help to break the inherent spatial and temporal skeleton\ntopologies, which enables the model to understand long-range human action\nbetter. Furthermore, we combine ST-RTR with a fusion model for further\nperformance improvements. To assess the performance of the ST-RTR method, we\nconducted experiments on three skeleton-based HAR benchmarks: NTU RGB+D 60, NTU\nRGB+D 120, and UAV-Human. It boosted CS and CV by 2.11 % and 1.45% on NTU RGB+D\n60, 1.25% and 1.05% on NTU RGB+D 120. 
On UAV-Human datasets, accuracy improved\nby 2.54%. The experimental outcomes explain that the proposed ST-RTR model\nsignificantly improves action recognition associated with the standard ST-GCN\nmethod.\n","authors":["Faisal Mehmood","Enqing Chen","Touqeer Abbas","Samah M. Alzanin"],"pdf_url":"https://arxiv.org/pdf/2410.23806v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03507v6","updated":"2024-11-01T07:04:10Z","published":"2024-04-04T15:10:24Z","title":"DQ-DETR: DETR with Dynamic Query for Tiny Object Detection","summary":" Despite previous DETR-like methods having performed successfully in generic\nobject detection, tiny object detection is still a challenging task for them\nsince the positional information of object queries is not customized for\ndetecting tiny objects, whose scale is extraordinarily smaller than general\nobjects. Also, DETR-like methods using a fixed number of queries make them\nunsuitable for aerial datasets, which only contain tiny objects, and the\nnumbers of instances are imbalanced between different images. Thus, we present\na simple yet effective model, named DQ-DETR, which consists of three different\ncomponents: categorical counting module, counting-guided feature enhancement,\nand dynamic query selection to solve the above-mentioned problems. DQ-DETR uses\nthe prediction and density maps from the categorical counting module to\ndynamically adjust the number of object queries and improve the positional\ninformation of queries. Our model DQ-DETR outperforms previous CNN-based and\nDETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2\ndataset, which mostly consists of tiny objects. Our code will be available at\nhttps://github.com/hoiliu-0801/DQ-DETR.\n","authors":["Yi-Xin Huang","Hou-I Liu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03507v6.pdf","comment":"Accepted by ECCV 2024. Our code will be available at\n https://github.com/hoiliu-0801/DQ-DETR"},{"id":"http://arxiv.org/abs/2405.17673v2","updated":"2024-11-01T06:22:30Z","published":"2024-05-27T21:50:16Z","title":"Fast Samplers for Inverse Problems in Iterative Refinement Models","summary":" Constructing fast samplers for unconditional diffusion and flow-matching\nmodels has received much attention recently; however, existing methods for\nsolving inverse problems, such as super-resolution, inpainting, or deblurring,\nstill require hundreds to thousands of iterative steps to obtain high-quality\nresults. We propose a plug-and-play framework for constructing efficient\nsamplers for inverse problems, requiring only pre-trained diffusion or\nflow-matching models. We present Conditional Conjugate Integrators, which\nleverage the specific form of the inverse problem to project the respective\nconditional diffusion/flow dynamics into a more amenable space for sampling.\nOur method complements popular posterior approximation methods for solving\ninverse problems using diffusion/flow models. We evaluate the proposed method's\nperformance on various linear image restoration tasks across multiple datasets,\nemploying diffusion and flow-matching models. Notably, on challenging inverse\nproblems like 4x super-resolution on the ImageNet dataset, our method can\ngenerate high-quality samples in as few as 5 conditional sampling steps and\noutperforms competing baselines requiring 20-1000 steps. 
Our code will be\npublicly available at https://github.com/mandt-lab/c-pigdm\n","authors":["Kushagra Pandey","Ruihan Yang","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2405.17673v2.pdf","comment":"43 pages, NeurIPS'24 Camera Ready"},{"id":"http://arxiv.org/abs/2405.15677v3","updated":"2024-11-01T06:19:24Z","published":"2024-05-24T16:17:35Z","title":"SMART: Scalable Multi-agent Real-time Motion Generation via Next-token\n Prediction","summary":" Data-driven autonomous driving motion generation tasks are frequently\nimpacted by the limitations of dataset size and the domain gap between\ndatasets, which precludes their extensive application in real-world scenarios.\nTo address this issue, we introduce SMART, a novel autonomous driving motion\ngeneration paradigm that models vectorized map and agent trajectory data into\ndiscrete sequence tokens. These tokens are then processed through a\ndecoder-only transformer architecture to train for the next token prediction\ntask across spatial-temporal series. This GPT-style method allows the model to\nlearn the motion distribution in real driving scenarios. SMART achieves\nstate-of-the-art performance across most of the metrics on the generative Sim\nAgents challenge, ranking 1st on the leaderboards of Waymo Open Motion Dataset\n(WOMD), demonstrating remarkable inference speed. Moreover, SMART represents\nthe generative model in the autonomous driving motion domain, exhibiting\nzero-shot generalization capabilities: Using only the NuPlan dataset for\ntraining and WOMD for validation, SMART achieved a competitive score of 0.72 on\nthe Sim Agents challenge. Lastly, we have collected over 1 billion motion\ntokens from multiple datasets, validating the model's scalability. These\nresults suggest that SMART has initially emulated two important properties:\nscalability and zero-shot generalization, and preliminarily meets the needs of\nlarge-scale real-time simulation applications. We have released all the code to\npromote the exploration of models for motion generation in the autonomous\ndriving field. The source code is available at\nhttps://github.com/rainmaker22/SMART.\n","authors":["Wei Wu","Xiaoxin Feng","Ziyan Gao","Yuheng Kan"],"pdf_url":"https://arxiv.org/pdf/2405.15677v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2401.08140v3","updated":"2024-11-01T06:12:07Z","published":"2024-01-16T06:19:18Z","title":"ProvNeRF: Modeling per Point Provenance in NeRFs as a Stochastic Field","summary":" Neural radiance fields (NeRFs) have gained popularity with multiple works\nshowing promising results across various applications. However, to the best of\nour knowledge, existing works do not explicitly model the distribution of\ntraining camera poses, or consequently the triangulation quality, a key factor\naffecting reconstruction quality dating back to classical vision literature. We\nclose this gap with ProvNeRF, an approach that models the \\textbf{provenance}\nfor each point -- i.e., the locations where it is likely visible -- of NeRFs as\na stochastic field. We achieve this by extending implicit maximum likelihood\nestimation (IMLE) to functional space with an optimizable objective. 
We show\nthat modeling per-point provenance during the NeRF optimization enriches the\nmodel with information on triangulation leading to improvements in novel view\nsynthesis and uncertainty estimation under the challenging sparse,\nunconstrained view setting against competitive baselines.\n","authors":["Kiyohiro Nakayama","Mikaela Angelina Uy","Yang You","Ke Li","Leonidas J. Guibas"],"pdf_url":"https://arxiv.org/pdf/2401.08140v3.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2310.01636v4","updated":"2024-11-01T05:29:34Z","published":"2023-10-02T21:02:23Z","title":"Adaptive Visual Scene Understanding: Incremental Scene Graph Generation","summary":" Scene graph generation (SGG) analyzes images to extract meaningful\ninformation about objects and their relationships. In the dynamic visual world,\nit is crucial for AI systems to continuously detect new objects and establish\ntheir relationships with existing ones. Recently, numerous studies have focused\non continual learning within the domains of object detection and image\nrecognition. However, a limited amount of research focuses on a more\nchallenging continual learning problem in SGG. This increased difficulty arises\nfrom the intricate interactions and dynamic relationships among objects, and\ntheir associated contexts. Thus, in continual learning, SGG models are often\nrequired to expand, modify, retain, and reason scene graphs within the process\nof adaptive visual scene understanding. To systematically explore Continual\nScene Graph Generation (CSEGG), we present a comprehensive benchmark comprising\nthree learning regimes: relationship incremental, scene incremental, and\nrelationship generalization. Moreover, we introduce a ``Replays via Analysis by\nSynthesis\" method named RAS. This approach leverages the scene graphs,\ndecomposes and re-composes them to represent different scenes, and replays the\nsynthesized scenes based on these compositional scene graphs. The replayed\nsynthesized scenes act as a means to practice and refine proficiency in SGG in\nknown and unknown environments. Our experimental results not only highlight the\nchallenges of directly combining existing continual learning methods with SGG\nbackbones but also demonstrate the effectiveness of our proposed approach,\nenhancing CSEGG efficiency while simultaneously preserving privacy and memory\nusage. All data and source code are publicly available online.\n","authors":["Naitik Khandelwal","Xiao Liu","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.01636v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03918v2","updated":"2024-11-01T05:23:35Z","published":"2024-10-04T20:45:33Z","title":"STONE: A Submodular Optimization Framework for Active 3D Object\n Detection","summary":" 3D object detection is fundamentally important for various emerging\napplications, including autonomous driving and robotics. A key requirement for\ntraining an accurate 3D object detector is the availability of a large amount\nof LiDAR-based point cloud data. Unfortunately, labeling point cloud data is\nextremely challenging, as accurate 3D bounding boxes and semantic labels are\nrequired for each potential object. This paper proposes a unified active 3D\nobject detection framework, for greatly reducing the labeling cost of training\n3D object detectors. Our framework is based on a novel formulation of\nsubmodular optimization, specifically tailored to the problem of active 3D\nobject detection. 
In particular, we address two fundamental challenges\nassociated with active 3D object detection: data imbalance and the need to\ncover the distribution of the data, including LiDAR-based point cloud data of\nvarying difficulty levels. Extensive experiments demonstrate that our method\nachieves state-of-the-art performance with high computational efficiency\ncompared to existing active learning methods. The code is available at\nhttps://github.com/RuiyuM/STONE.\n","authors":["Ruiyu Mao","Sarthak Kumar Maharana","Rishabh K Iyer","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2410.03918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00986v2","updated":"2024-11-01T05:03:19Z","published":"2024-04-01T08:18:38Z","title":"Make Continual Learning Stronger via C-Flat","summary":" Model generalization ability upon incrementally acquiring dynamically\nupdating knowledge from sequentially arriving tasks is crucial to tackle the\nsensitivity-stability dilemma in Continual Learning (CL). Weight loss landscape\nsharpness minimization seeking for flat minima lying in neighborhoods with\nuniform low loss or smooth gradient is proven to be a strong training regime\nimproving model generalization compared with loss minimization based optimizer\nlike SGD. Yet only a few works have discussed this training regime for CL,\nproving that dedicated designed zeroth-order sharpness optimizer can improve CL\nperformance. In this work, we propose a Continual Flatness (C-Flat) method\nfeaturing a flatter loss landscape tailored for CL. C-Flat could be easily\ncalled with only one line of code and is plug-and-play to any CL methods. A\ngeneral framework of C-Flat applied to all CL categories and a thorough\ncomparison with loss minima optimizer and flat minima based CL approaches is\npresented in this paper, showing that our method can boost CL performance in\nalmost all cases. Code is available at https://github.com/WanNaa/C-Flat.\n","authors":["Ang Bian","Wei Li","Hangjie Yuan","Chengrong Yu","Mang Wang","Zixiang Zhao","Aojun Lu","Pengliang Ji","Tao Feng"],"pdf_url":"https://arxiv.org/pdf/2404.00986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12470v2","updated":"2024-11-01T04:59:31Z","published":"2024-09-19T05:17:44Z","title":"HSIGene: A Foundation Model For Hyperspectral Image Generation","summary":" Hyperspectral image (HSI) plays a vital role in various fields such as\nagriculture and environmental monitoring. However, due to the expensive\nacquisition cost, the number of hyperspectral images is limited, degenerating\nthe performance of downstream tasks. Although some recent studies have\nattempted to employ diffusion models to synthesize HSIs, they still struggle\nwith the scarcity of HSIs, affecting the reliability and diversity of the\ngenerated images. Some studies propose to incorporate multi-modal data to\nenhance spatial diversity, but the spectral fidelity cannot be ensured. In\naddition, existing HSI synthesis models are typically uncontrollable or only\nsupport single-condition control, limiting their ability to generate accurate\nand reliable HSIs. 
To alleviate these issues, we propose HSIGene, a novel HSI\ngeneration foundation model which is based on latent diffusion and supports\nmulti-condition control, allowing for more precise and reliable HSI generation.\nTo enhance the spatial diversity of the training data while preserving spectral\nfidelity, we propose a new data augmentation method based on spatial\nsuper-resolution, in which HSIs are upscaled first, and thus abundant training\npatches could be obtained by cropping the high-resolution HSIs. In addition, to\nimprove the perceptual quality of the augmented data, we introduce a novel\ntwo-stage HSI super-resolution framework, which first applies RGB bands\nsuper-resolution and then utilizes our proposed Rectangular Guided Attention\nNetwork (RGAN) for guided HSI super-resolution. Experiments demonstrate that\nthe proposed model is capable of generating a vast quantity of realistic HSIs\nfor downstream tasks such as denoising and super-resolution. The code and\nmodels are available at https://github.com/LiPang/HSIGene.\n","authors":["Li Pang","Xiangyong Cao","Datao Tang","Shuang Xu","Xueru Bai","Feng Zhou","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2409.12470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20474v2","updated":"2024-11-01T04:33:52Z","published":"2024-10-27T15:30:45Z","title":"GrounDiT: Grounding Diffusion Transformers via Noisy Patch\n Transplantation","summary":" We introduce GrounDiT, a novel training-free spatial grounding technique for\ntext-to-image generation using Diffusion Transformers (DiT). Spatial grounding\nwith bounding boxes has gained attention for its simplicity and versatility,\nallowing for enhanced user control in image generation. However, prior\ntraining-free approaches often rely on updating the noisy image during the\nreverse diffusion process via backpropagation from custom loss functions, which\nfrequently struggle to provide precise control over individual bounding boxes.\nIn this work, we leverage the flexibility of the Transformer architecture,\ndemonstrating that DiT can generate noisy patches corresponding to each\nbounding box, fully encoding the target object and allowing for fine-grained\ncontrol over each region. Our approach builds on an intriguing property of DiT,\nwhich we refer to as semantic sharing. Due to semantic sharing, when a smaller\npatch is jointly denoised alongside a generatable-size image, the two become\nsemantic clones. Each patch is denoised in its own branch of the generation\nprocess and then transplanted into the corresponding region of the original\nnoisy image at each timestep, resulting in robust spatial grounding for each\nbounding box. In our experiments on the HRS and DrawBench benchmarks, we\nachieve state-of-the-art performance compared to previous training-free\napproaches.\n","authors":["Phillip Y. Lee","Taehoon Yoon","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2410.20474v2.pdf","comment":"Accepted to NeurIPS 2024. Project Page:\n https://groundit-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2410.23775v2","updated":"2024-11-01T03:15:02Z","published":"2024-10-31T09:45:00Z","title":"In-Context LoRA for Diffusion Transformers","summary":" Recent research arXiv:2410.15027 has explored the use of diffusion\ntransformers (DiTs) for task-agnostic image generation by simply concatenating\nattention tokens across images. However, despite substantial computational\nresources, the fidelity of the generated images remains suboptimal. 
In this\nstudy, we reevaluate and streamline this framework by hypothesizing that\ntext-to-image DiTs inherently possess in-context generation capabilities,\nrequiring only minimal tuning to activate them. Through diverse task\nexperiments, we qualitatively demonstrate that existing text-to-image DiTs can\neffectively perform in-context generation without any tuning. Building on this\ninsight, we propose a remarkably simple pipeline to leverage the in-context\nabilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint\ncaptioning of multiple images, and (3) apply task-specific LoRA tuning using\nsmall datasets (e.g., $20\\sim 100$ samples) instead of full-parameter tuning\nwith large datasets. We name our models In-Context LoRA (IC-LoRA). This\napproach requires no modifications to the original DiT models, only changes to\nthe training data. Remarkably, our pipeline generates high-fidelity image sets\nthat better adhere to prompts. While task-specific in terms of tuning data, our\nframework remains task-agnostic in architecture and pipeline, offering a\npowerful tool for the community and providing valuable insights for further\nresearch on product-level task-agnostic generation systems. We release our\ncode, data, and models at https://github.com/ali-vilab/In-Context-LoRA\n","authors":["Lianghua Huang","Wei Wang","Zhi-Fan Wu","Yupeng Shi","Huanzhang Dou","Chen Liang","Yutong Feng","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.23775v2.pdf","comment":"Tech report. Project page:\n https://ali-vilab.github.io/In-Context-LoRA-Page/"},{"id":"http://arxiv.org/abs/2409.17508v2","updated":"2024-11-01T02:38:53Z","published":"2024-09-26T03:33:26Z","title":"Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task\n Learning Via Connector-MoE","summary":" Multi-modal large language models (MLLMs) have shown impressive capabilities\nas a general-purpose interface for various visual and linguistic tasks.\nHowever, building a unified MLLM for multi-task learning in the medical field\nremains a thorny challenge. To mitigate the tug-of-war problem of multi-modal\nmulti-task optimization in MLLMs, recent advances primarily focus on improving\nthe LLM components, while neglecting the connector that bridges the gap between\nmodalities. In this paper, we introduce Uni-Med, a novel medical generalist\nfoundation model which consists of a universal visual feature extraction\nmodule, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting\nfrom the proposed CMoE that leverages a well-designed router with a mixture of\nprojection experts at the connector, Uni-Med achieves efficient solution to the\ntug-of-war problem and can perform six different medical tasks including\nquestion answering, visual question answering, report generation, referring\nexpression comprehension, referring expression generation and image\nclassification. To the best of our knowledge, Uni-Med is the first effort to\ntackle multi-task interference at the connector in MLLMs. Extensive ablation\nexperiments validate the effectiveness of introducing CMoE under any\nconfiguration, with up to an average 8% performance gains. We further provide\ninterpretation analysis of the tug-of-war problem from the perspective of\ngradient optimization and parameter statistics. Compared to previous\nstate-of-the-art medical MLLMs, Uni-Med achieves competitive or superior\nevaluation metrics on diverse tasks. 
Code and resources are available at\nhttps://github.com/tsinghua-msiip/Uni-Med.\n","authors":["Xun Zhu","Ying Hu","Fanbin Mo","Miao Li","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2409.17508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09553v4","updated":"2024-11-01T02:25:50Z","published":"2024-06-28T08:21:49Z","title":"DPEC: Dual-Path Error Compensation Method for Enhanced Low-Light Image\n Clarity","summary":" For the task of low-light image enhancement, deep learning-based algorithms\nhave demonstrated superiority and effectiveness compared to traditional\nmethods. However, these methods, primarily based on Retinex theory, tend to\noverlook the noise and color distortions in input images, leading to\nsignificant noise amplification and local color distortions in enhanced\nresults. To address these issues, we propose the Dual-Path Error Compensation\n(DPEC) method, designed to improve image quality under low-light conditions by\npreserving local texture details while restoring global image brightness\nwithout amplifying noise. DPEC incorporates precise pixel-level error\nestimation to capture subtle differences and an independent denoising mechanism\nto prevent noise amplification. We introduce the HIS-Retinex loss to guide\nDPEC's training, ensuring the brightness distribution of enhanced images\nclosely aligns with real-world conditions. To balance computational speed and\nresource efficiency while training DPEC for a comprehensive understanding of\nthe global context, we integrated the VMamba architecture into its backbone.\nComprehensive quantitative and qualitative experimental results demonstrate\nthat our algorithm significantly outperforms state-of-the-art methods in\nlow-light image enhancement. The code is publicly available online at\nhttps://github.com/wangshuang233/DPEC.\n","authors":["Shuang Wang","Qianwen Lu","Boxing Peng","Yihe Nie","Qingchuan Tao"],"pdf_url":"https://arxiv.org/pdf/2407.09553v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14135v2","updated":"2024-11-01T02:20:06Z","published":"2024-08-26T09:32:16Z","title":"Foodfusion: A Novel Approach for Food Image Composition via Diffusion\n Models","summary":" Food image composition requires the use of existing dish images and\nbackground images to synthesize a natural new image, while diffusion models\nhave made significant advancements in image generation, enabling the\nconstruction of end-to-end architectures that yield promising results. However,\nexisting diffusion models face challenges in processing and fusing information\nfrom multiple images and lack access to high-quality publicly available\ndatasets, which prevents the application of diffusion models in food image\ncomposition. In this paper, we introduce a large-scale, high-quality food image\ncomposite dataset, FC22k, which comprises 22,000 foreground, background, and\nground truth ternary image pairs. Additionally, we propose a novel food image\ncomposition method, Foodfusion, which leverages the capabilities of the\npre-trained diffusion models and incorporates a Fusion Module for processing\nand integrating foreground and background information. This fused information\naligns the foreground features with the background structure by merging the\nglobal structural information at the cross-attention layer of the denoising\nUNet. To further enhance the content and structure of the background, we also\nintegrate a Content-Structure Control Module. 
Extensive experiments demonstrate\nthe effectiveness and scalability of our proposed method.\n","authors":["Chaohua Shi","Xuan Wang","Si Shi","Xule Wang","Mingrui Zhu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2408.14135v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2406.18451v3","updated":"2024-11-01T02:13:59Z","published":"2024-06-26T16:00:35Z","title":"Detecting Brittle Decisions for Free: Leveraging Margin Consistency in\n Deep Robust Classifiers","summary":" Despite extensive research on adversarial training strategies to improve\nrobustness, the decisions of even the most robust deep learning models can\nstill be quite sensitive to imperceptible perturbations, creating serious risks\nwhen deploying them for high-stakes real-world applications. While detecting\nsuch cases may be critical, evaluating a model's vulnerability at a\nper-instance level using adversarial attacks is computationally too intensive\nand unsuitable for real-time deployment scenarios. The input space margin is\nthe exact score to detect non-robust samples and is intractable for deep neural\nnetworks. This paper introduces the concept of margin consistency -- a property\nthat links the input space margins and the logit margins in robust models --\nfor efficient detection of vulnerable samples. First, we establish that margin\nconsistency is a necessary and sufficient condition to use a model's logit\nmargin as a score for identifying non-robust samples. Next, through\ncomprehensive empirical analysis of various robustly trained models on CIFAR10\nand CIFAR100 datasets, we show that they indicate high margin consistency with\na strong correlation between their input space margins and the logit margins.\nThen, we show that we can effectively and confidently use the logit margin to\ndetect brittle decisions with such models. Finally, we address cases where the\nmodel is not sufficiently margin-consistent by learning a pseudo-margin from\nthe feature representation. Our findings highlight the potential of leveraging\ndeep representations to assess adversarial vulnerability in deployment\nscenarios efficiently.\n","authors":["Jonas Ngnawé","Sabyasachi Sahoo","Yann Pequignot","Frédéric Precioso","Christian Gagné"],"pdf_url":"https://arxiv.org/pdf/2406.18451v3.pdf","comment":"10 pages, 6 figures, 2 tables. Version Update: Neurips Camera Ready"},{"id":"http://arxiv.org/abs/2410.20595v2","updated":"2024-11-01T01:27:10Z","published":"2024-10-27T21:02:37Z","title":"A Framework for Real-Time Volcano-Seismic Event Recognition Based on\n Multi-Station Seismograms and Semantic Segmentation Models","summary":" In volcano monitoring, effective recognition of seismic events is essential\nfor understanding volcanic activity and raising timely warning alerts.\nTraditional methods rely on manual analysis, which can be subjective and\nlabor-intensive. Furthermore, current automatic approaches often tackle\ndetection and classification separately, mostly rely on single station\ninformation and generally require tailored preprocessing and representations to\nperform predictions. These limitations often hinder their application to\nreal-time monitoring and utilization across different volcano conditions. This\nstudy introduces a novel approach that utilizes Semantic Segmentation models to\nautomate seismic event recognition by applying a straight forward\ntransformation of multi-channel 1D signals into 2D representations, enabling\ntheir use as images. 
Our framework employs a data-driven, end-to-end design\nthat integrates multi-station seismic data with minimal preprocessing,\nperforming both detection and classification simultaneously for five seismic\nevent classes. We evaluated four state-of-the-art segmentation models (UNet,\nUNet++, DeepLabV3+ and SwinUNet) on approximately 25.000 seismic events\nrecorded at four different Chilean volcanoes: Nevados del Chill\\'an Volcanic\nComplex, Laguna del Maule, Villarrica and Puyehue-Cord\\'on Caulle. Among these\nmodels, the UNet architecture was identified as the most effective model,\nachieving mean F1 and Intersection over Union (IoU) scores of up to 0.91 and\n0.88, respectively, and demonstrating superior noise robustness and model\nflexibility to unseen volcano datasets.\n","authors":["Camilo Espinosa-Curilem","Millaray Curilem","Daniel Basualto"],"pdf_url":"https://arxiv.org/pdf/2410.20595v2.pdf","comment":"10 pages, 9 figures. This is a pre-print, it is currently under\n review for publication"},{"id":"http://arxiv.org/abs/2310.05341v5","updated":"2024-11-01T00:37:44Z","published":"2023-10-09T01:59:49Z","title":"From Question to Exploration: Test-Time Adaptation in Semantic\n Segmentation?","summary":" Test-time adaptation (TTA) aims to adapt a model, initially trained on\ntraining data, to test data with potential distribution shifts. Most existing\nTTA methods focus on classification problems. The pronounced success of\nclassification might lead numerous newcomers and engineers to assume that\nclassic TTA techniques can be directly applied to the more challenging task of\nsemantic segmentation. However, this belief is still an open question. In this\npaper, we investigate the applicability of existing classic TTA strategies in\nsemantic segmentation. Our comprehensive results have led to three key\nobservations. First, the classic normalization updating strategy only brings\nslight performance improvement, and in some cases, it might even adversely\naffect the results. Even with the application of advanced distribution\nestimation techniques like batch renormalization, the problem remains\nunresolved. Second, although the teacher-student scheme does enhance the\ntraining stability for segmentation TTA in the presence of noisy pseudo-labels\nand temporal correlation, it cannot directly result in performance improvement\ncompared to the original model without TTA under complex data distribution.\nThird, segmentation TTA suffers a severe long-tailed class-imbalance problem,\nwhich is substantially more complex than that in TTA for classification. This\nlong-tailed challenge negatively affects segmentation TTA performance, even\nwhen the accuracy of pseudo-labels is high. Besides those observations, we find\nthat visual prompt tuning (VisPT) is promising in segmentation TTA and propose\na novel method named TTAP. The outstanding performance of TTAP has also been\nverified. We hope the community can give more attention to this challenging,\nyet important, segmentation TTA task in the future. 
The source code is\navailable at: \\textit{https://github.com/ycarobot/TTAP\n","authors":["Chang'an Yi","Haotian Chen","Yifan Zhang","Yonghui Xu","Yan Zhou","Lizhen Cui"],"pdf_url":"https://arxiv.org/pdf/2310.05341v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09371v2","updated":"2024-11-01T00:22:26Z","published":"2024-06-13T17:51:00Z","title":"LRM-Zero: Training Large Reconstruction Models with Synthesized Data","summary":" We present LRM-Zero, a Large Reconstruction Model (LRM) trained entirely on\nsynthesized 3D data, achieving high-quality sparse-view 3D reconstruction. The\ncore of LRM-Zero is our procedural 3D dataset, Zeroverse, which is\nautomatically synthesized from simple primitive shapes with random texturing\nand augmentations (e.g., height fields, boolean differences, and wireframes).\nUnlike previous 3D datasets (e.g., Objaverse) which are often captured or\ncrafted by humans to approximate real 3D data, Zeroverse completely ignores\nrealistic global semantics but is rich in complex geometric and texture details\nthat are locally similar to or even more intricate than real objects. We\ndemonstrate that our LRM-Zero, trained with our fully synthesized Zeroverse,\ncan achieve high visual quality in the reconstruction of real-world objects,\ncompetitive with models trained on Objaverse. We also analyze several critical\ndesign choices of Zeroverse that contribute to LRM-Zero's capability and\ntraining stability. Our work demonstrates that 3D reconstruction, one of the\ncore tasks in 3D vision, can potentially be addressed without the semantics of\nreal-world objects. The Zeroverse's procedural synthesis code and interactive\nvisualization are available at: https://desaixie.github.io/lrm-zero/.\n","authors":["Desai Xie","Sai Bi","Zhixin Shu","Kai Zhang","Zexiang Xu","Yi Zhou","Sören Pirk","Arie Kaufman","Xin Sun","Hao Tan"],"pdf_url":"https://arxiv.org/pdf/2406.09371v2.pdf","comment":"23 pages, 8 figures. Our code and interactive visualization are\n available at: https://desaixie.github.io/lrm-zero/. v2: NeurIPS 2024 Camera\n Ready version"},{"id":"http://arxiv.org/abs/2406.17763v2","updated":"2024-11-01T00:08:54Z","published":"2024-06-25T17:48:24Z","title":"DiffusionPDE: Generative PDE-Solving Under Partial Observation","summary":" We introduce a general framework for solving partial differential equations\n(PDEs) using generative diffusion models. In particular, we focus on the\nscenarios where we do not have the full knowledge of the scene necessary to\napply classical solvers. Most existing forward or inverse PDE approaches\nperform poorly when the observations on the data or the underlying coefficients\nare incomplete, which is a common assumption for real-world measurements. In\nthis work, we propose DiffusionPDE that can simultaneously fill in the missing\ninformation and solve a PDE by modeling the joint distribution of the solution\nand coefficient spaces. We show that the learned generative priors lead to a\nversatile framework for accurately solving a wide range of PDEs under partial\nobservation, significantly outperforming the state-of-the-art methods for both\nforward and inverse directions.\n","authors":["Jiahe Huang","Guandao Yang","Zichen Wang","Jeong Joon Park"],"pdf_url":"https://arxiv.org/pdf/2406.17763v2.pdf","comment":"NeurIPS 2024. 
Project page:\n https://jhhuangchloe.github.io/Diffusion-PDE/"}]},"2024-11-04T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.02393v1","updated":"2024-11-04T18:58:01Z","published":"2024-11-04T18:58:01Z","title":"Adaptive Length Image Tokenization via Recurrent Allocation","summary":" Current vision systems typically assign fixed-length representations to\nimages, regardless of the information content. This contrasts with human\nintelligence - and even large language models - which allocate varying\nrepresentational capacities based on entropy, context and familiarity. Inspired\nby this, we propose an approach to learn variable-length token representations\nfor 2D images. Our encoder-decoder architecture recursively processes 2D image\ntokens, distilling them into 1D latent tokens over multiple iterations of\nrecurrent rollouts. Each iteration refines the 2D tokens, updates the existing\n1D latent tokens, and adaptively increases representational capacity by adding\nnew tokens. This enables compression of images into a variable number of\ntokens, ranging from 32 to 256. We validate our tokenizer using reconstruction\nloss and FID metrics, demonstrating that token count aligns with image entropy,\nfamiliarity and downstream task requirements. Recurrent token processing with\nincreasing representational capacity in each iteration shows signs of token\nspecialization, revealing potential for object / part discovery.\n","authors":["Shivam Duggal","Phillip Isola","Antonio Torralba","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2411.02393v1.pdf","comment":"Code at: https://github.com/ShivamDuggal4/adaptive-length-tokenizer"},{"id":"http://arxiv.org/abs/2407.16677v2","updated":"2024-11-04T18:54:23Z","published":"2024-07-23T17:44:54Z","title":"From Imitation to Refinement -- Residual RL for Precise Assembly","summary":" Recent advances in behavior cloning (BC), like action-chunking and diffusion,\nhave led to impressive progress. Still, imitation alone remains insufficient\nfor tasks requiring reliable and precise movements, such as aligning and\ninserting objects. Our key insight is that chunked BC policies function as\ntrajectory planners, enabling long-horizon tasks. Conversely, as they execute\naction chunks open-loop, they lack the fine-grained reactivity necessary for\nreliable execution. Further, we find that the performance of BC policies\nsaturates despite increasing data. Reinforcement learning (RL) is a natural way\nto overcome this, but it is not straightforward to apply directly to\naction-chunked models like diffusion policies. We present a simple yet\neffective method, ResiP (Residual for Precise Manipulation), that sidesteps\nthese challenges by augmenting a frozen, chunked BC model with a fully\nclosed-loop residual policy trained with RL. The residual policy is trained via\non-policy RL, addressing distribution shifts and introducing reactivity without\naltering the BC trajectory planner. Evaluation on high-precision manipulation\ntasks demonstrates strong performance of ResiP over BC methods and direct RL\nfine-tuning. 
Videos, code, and data are available at\n\\url{https://residual-assembly.github.io}.\n","authors":["Lars Ankile","Anthony Simeonov","Idan Shenfeld","Marcel Torne","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2407.16677v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23262v2","updated":"2024-11-04T18:44:20Z","published":"2024-10-30T17:46:31Z","title":"EMMA: End-to-End Multimodal Model for Autonomous Driving","summary":" We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving.\nBuilt on a multi-modal large language model foundation, EMMA directly maps raw\ncamera sensor data into various driving-specific outputs, including planner\ntrajectories, perception objects, and road graph elements. EMMA maximizes the\nutility of world knowledge from the pre-trained large language models, by\nrepresenting all non-sensor inputs (e.g. navigation instructions and ego\nvehicle status) and outputs (e.g. trajectories and 3D locations) as natural\nlanguage text. This approach allows EMMA to jointly process various driving\ntasks in a unified language space, and generate the outputs for each task using\ntask-specific prompts. Empirically, we demonstrate EMMA's effectiveness by\nachieving state-of-the-art performance in motion planning on nuScenes as well\nas competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also\nyields competitive results for camera-primary 3D object detection on the Waymo\nOpen Dataset (WOD). We show that co-training EMMA with planner trajectories,\nobject detection, and road graph tasks yields improvements across all three\ndomains, highlighting EMMA's potential as a generalist model for autonomous\ndriving applications. However, EMMA also exhibits certain limitations: it can\nprocess only a small amount of image frames, does not incorporate accurate 3D\nsensing modalities like LiDAR or radar and is computationally expensive. We\nhope that our results will inspire further research to mitigate these issues\nand to further evolve the state of the art in autonomous driving model\narchitectures.\n","authors":["Jyh-Jing Hwang","Runsheng Xu","Hubert Lin","Wei-Chih Hung","Jingwei Ji","Kristy Choi","Di Huang","Tong He","Paul Covington","Benjamin Sapp","Yin Zhou","James Guo","Dragomir Anguelov","Mingxing Tan"],"pdf_url":"https://arxiv.org/pdf/2410.23262v2.pdf","comment":"Blog post: https://waymo.com/blog/2024/10/introducing-emma/"},{"id":"http://arxiv.org/abs/2411.02359v1","updated":"2024-11-04T18:26:08Z","published":"2024-11-04T18:26:08Z","title":"DeeR-VLA: Dynamic Inference of Multimodal Large Language Models for\n Efficient Robot Execution","summary":" MLLMs have demonstrated remarkable comprehension and reasoning capabilities\nwith complex language and visual data. These advances have spurred the vision\nof establishing a generalist robotic MLLM proficient in understanding complex\nhuman instructions and accomplishing various embodied tasks. However,\ndeveloping MLLMs for real-world robots is challenging due to the typically\nlimited computation and memory capacities available on robotic platforms. In\ncontrast, the inference of MLLMs involves storing billions of parameters and\nperforming tremendous computation, imposing significant hardware demands. In\nour paper, we propose a Dynamic Early-Exit Framework for Robotic\nVision-Language-Action Model (DeeR-VLA, or simply DeeR) that automatically\nadjusts the size of the activated MLLM based on each situation at hand. 
The\napproach leverages a multi-exit architecture in MLLMs, which allows the model\nto terminate processing once a proper size of the model has been activated for\na specific situation, thus avoiding further redundant computation.\nAdditionally, we develop novel algorithms that establish early-termination\ncriteria for DeeR, conditioned on predefined demands such as average\ncomputational cost (i.e., power consumption), as well as peak computational\nconsumption (i.e., latency) and GPU memory usage. These enhancements ensure\nthat DeeR operates efficiently under varying resource constraints while\nmaintaining competitive performance. On the CALVIN robot manipulation\nbenchmark, DeeR demonstrates significant reductions in computational costs of\nLLM by 5.2-6.5x and GPU memory of LLM by 2-6x without compromising performance.\nCode and checkpoints are available at https://github.com/yueyang130/DeeR-VLA.\n","authors":["Yang Yue","Yulin Wang","Bingyi Kang","Yizeng Han","Shenzhi Wang","Shiji Song","Jiashi Feng","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2411.02359v1.pdf","comment":"25 pages, 6 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.02345v1","updated":"2024-11-04T18:16:40Z","published":"2024-11-04T18:16:40Z","title":"Simulation of Nanorobots with Artificial Intelligence and Reinforcement\n Learning for Advanced Cancer Cell Detection and Tracking","summary":" Nanorobots are a promising development in targeted drug delivery and the\ntreatment of neurological disorders, with potential for crossing the\nblood-brain barrier (BBB). These small devices leverage advancements in\nnanotechnology and bioengineering for precise navigation and targeted payload\ndelivery, particularly for conditions like brain tumors, Alzheimer's disease,\nand Parkinson's disease. Recent progress in artificial intelligence (AI) and\nmachine learning (ML) has improved the navigation and effectiveness of\nnanorobots, allowing them to detect and interact with cancer cells through\nbiomarker analysis. This study presents a new reinforcement learning (RL)\nframework for optimizing nanorobot navigation in complex biological\nenvironments, focusing on cancer cell detection by analyzing the concentration\ngradients of surrounding biomarkers. We utilize a computer simulation model to\nexplore the behavior of nanorobots in a three-dimensional space with cancer\ncells and biological barriers. The proposed method uses Q-learning to refine\nmovement strategies based on real-time biomarker concentration data, enabling\nnanorobots to autonomously navigate to cancerous tissues for targeted drug\ndelivery. This research lays the groundwork for future laboratory experiments\nand clinical applications, with implications for personalized medicine and less\ninvasive cancer treatments. The integration of intelligent nanorobots could\nrevolutionize therapeutic strategies, reducing side effects and enhancing\ntreatment effectiveness for cancer patients. 
Further research will investigate\nthe practical deployment of these technologies in medical settings, aiming to\nunlock the full potential of nanorobotics in healthcare.\n","authors":["Shahab Kavousinejad"],"pdf_url":"https://arxiv.org/pdf/2411.02345v1.pdf","comment":"The source code for this simulation is available on GitHub:\n https://github.com/SHAHAB-K93/cancer-and-smart-nanorobot"},{"id":"http://arxiv.org/abs/2407.18145v2","updated":"2024-11-04T17:31:40Z","published":"2024-07-25T15:49:26Z","title":"Taxonomy-Aware Continual Semantic Segmentation in Hyperbolic Spaces for\n Open-World Perception","summary":" Semantic segmentation models are typically trained on a fixed set of classes,\nlimiting their applicability in open-world scenarios. Class-incremental\nsemantic segmentation aims to update models with emerging new classes while\npreventing catastrophic forgetting of previously learned ones. However,\nexisting methods impose strict rigidity on old classes, reducing their\neffectiveness in learning new incremental classes. In this work, we propose\nTaxonomy-Oriented Poincar\\'e-regularized Incremental-Class Segmentation\n(TOPICS) that learns feature embeddings in hyperbolic space following explicit\ntaxonomy-tree structures. This supervision provides plasticity for old classes,\nupdating ancestors based on new classes while integrating new classes at\nfitting positions. Additionally, we maintain implicit class relational\nconstraints on the geometric basis of the Poincar\\'e ball. This ensures that\nthe latent space can continuously adapt to new constraints while maintaining a\nrobust structure to combat catastrophic forgetting. We also establish eight\nrealistic incremental learning protocols for autonomous driving scenarios,\nwhere novel classes can originate from known classes or the background.\nExtensive evaluations of TOPICS on the Cityscapes and Mapillary Vistas 2.0\nbenchmarks demonstrate that it achieves state-of-the-art performance. We make\nthe code and trained models publicly available at\nhttp://topics.cs.uni-freiburg.de.\n","authors":["Julia Hindel","Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2407.18145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07745v2","updated":"2024-11-04T17:26:07Z","published":"2023-12-12T21:27:31Z","title":"High-density Electromyography for Effective Gesture-based Control of\n Physically Assistive Mobile Manipulators","summary":" High-density electromyography (HDEMG) can detect myoelectric activity as\ncontrol inputs to a variety of electronically-controlled devices. Furthermore,\nHDEMG sensors may be built into a variety of clothing, allowing for a\nnon-intrusive myoelectric interface that is integrated into a user's routine.\nIn our work, we introduce an easily-producible HDEMG device that interfaces\nwith the control of a mobile manipulator to perform a range of household and\nphysically assistive tasks. Mobile manipulators can operate throughout the home\nand are applicable for a spectrum of assistive and daily tasks in the home. We\nevaluate the use of real-time myoelectric gesture recognition using our device\nto enable precise control over the intricate mobility and manipulation\nfunctionalities of an 8 degree-of-freedom mobile manipulator. 
Our evaluation,\ninvolving 13 participants engaging in challenging self-care and household\nactivities, demonstrates the potential of our wearable HDEMG system to control\na mobile manipulator in the home.\n","authors":["Jehan Yang","Kent Shibata","Douglas Weber","Zackory Erickson"],"pdf_url":"https://arxiv.org/pdf/2312.07745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02295v1","updated":"2024-11-04T17:21:58Z","published":"2024-11-04T17:21:58Z","title":"Kilovolt Pyroelectric Voltage Generation and Electrostatic Actuation\n With Fluidic Heating","summary":" Integrated micro power generators are crucial components for micro robotic\nplatforms to demonstrate untethered operation and to achieve autonomy. Current\nmicro robotic electrostatic actuators typically require hundreds to thousands\nof volts to output sufficient work. Pyroelectricity is one such source of\nhigh voltages that can be scaled to small form factors. This paper demonstrates\na distributed pyroelectric high voltage generation mechanism to power kV\nactuators using alternating exposure of crystals to hot and cold water (30°C to\n90°C water temperature). Using this fluidic temperature control, a\npyroelectrically generated voltage of 2470 V was delivered to a 2 pF storage\ncapacitor yielding a 6.10 {\mu}J stored energy. A maximum energy of 17.46\n{\mu}J was delivered to a 47 pF capacitor at 861 V. The recirculating water can\nbe used to heat a distributed array of converters to generate electricity in\ndistant robotic actuator sections. The development of this distributed system\nwould enable untethered micro-robots to be operated with a flexible body and\nfree of battery recharging, which advances its applications in the real world.\n","authors":["Di Ni","Ved Gund","Landon Ivy","Amit Lal"],"pdf_url":"https://arxiv.org/pdf/2411.02295v1.pdf","comment":"Accepted and published at Hilton Head Workshop 2022: A Solid-State\n Sensors, Actuators and Microsystems Workshop"},{"id":"http://arxiv.org/abs/2410.14468v4","updated":"2024-11-04T17:09:25Z","published":"2024-10-18T13:52:28Z","title":"Knowledge Transfer from Simple to Complex: A Safe and Efficient\n Reinforcement Learning Framework for Autonomous Driving Decision-Making","summary":" A safe and efficient decision-making system is crucial for autonomous\nvehicles. However, the complexity of driving environments limits the\neffectiveness of many rule-based and machine learning approaches. Reinforcement\nLearning (RL), with its robust self-learning capabilities and environmental\nadaptability, offers a promising solution to these challenges. Nevertheless,\nsafety and efficiency concerns during training hinder its widespread\napplication. To address these concerns, we propose a novel RL framework, Simple\nto Complex Collaborative Decision (S2CD). First, we rapidly train the teacher\nmodel in a lightweight simulation environment. In the more complex and\nrealistic environment, the teacher intervenes when the student agent exhibits\nsuboptimal behavior by assessing actions' value to avert dangers. We also\nintroduce an RL algorithm called Adaptive Clipping Proximal Policy Optimization\nPlus, which combines samples from both teacher and student policies and employs\ndynamic clipping strategies based on sample importance. This approach improves\nsample efficiency while effectively alleviating data imbalance. 
Additionally,\nwe employ the Kullback-Leibler divergence as a policy constraint, transforming\nit into an unconstrained problem with the Lagrangian method to accelerate the\nstudent's learning. Finally, a gradual weaning strategy ensures that the\nstudent learns to explore independently over time, overcoming the teacher's\nlimitations and maximizing performance. Simulation experiments in highway\nlane-change scenarios show that the S2CD framework enhances learning\nefficiency, reduces training costs, and significantly improves safety compared\nto state-of-the-art algorithms. This framework also ensures effective knowledge\ntransfer between teacher and student models, even with suboptimal teachers, the\nstudent achieves superior performance, demonstrating the robustness and\neffectiveness of S2CD.\n","authors":["Rongliang Zhou","Jiakun Huang","Mingjun Li","Hepeng Li","Haotian Cao","Xiaolin Song"],"pdf_url":"https://arxiv.org/pdf/2410.14468v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06454v3","updated":"2024-11-04T16:25:05Z","published":"2024-07-08T23:35:36Z","title":"Simplification of Robotic System Model Analysis by Petri Net Meta-Model\n Property Transfer","summary":" This paper presents a simplification of robotic system model analysis due to\nthe transfer of Robotic System Hierarchical Petri Net (RSHPN) meta-model\nproperties onto the model of a designed system. Key contributions include: 1)\nanalysis of RSHPN meta-model properties; 2) decomposition of RSHPN analysis\ninto analysis of individual Petri nets, thus the reduction of state space\nexplosion; and 3) transfer of RSHPN meta-model properties onto the produced\nmodels, hence elimination of the need for full re-analysis of the RSHPN model\nwhen creating new robotic systems. Only task-dependent parts of the model need\nto be analyzed. This approach streamlines the analysis thus reducing the design\ntime. Moreover, it produces a specification which is a solid foundation for the\nimplementation of the system. The obtained results highlight the potential of\nPetri nets as a valuable formal framework for analyzing robotic system\nproperties.\n","authors":["Maksym Figat","Cezary Zieliński"],"pdf_url":"https://arxiv.org/pdf/2407.06454v3.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2411.02230v1","updated":"2024-11-04T16:22:32Z","published":"2024-11-04T16:22:32Z","title":"Energy-Aware Coverage Planning for Heterogeneous Multi-Robot System","summary":" We propose a distributed control law for a heterogeneous multi-robot coverage\nproblem, where the robots could have different energy characteristics, such as\ncapacity and depletion rates, due to their varying sizes, speeds, capabilities,\nand payloads. Existing energy-aware coverage control laws consider capacity\ndifferences but assume the battery depletion rate to be the same for all\nrobots. In realistic scenarios, however, some robots can consume energy much\nfaster than other robots; for instance, UAVs hover at different altitudes, and\nthese changes could be dynamically updated based on their assigned tasks.\nRobots' energy capacities and depletion rates need to be considered to maximize\nthe performance of a multi-robot system. To this end, we propose a new\nenergy-aware controller based on Lloyd's algorithm to adapt the weights of the\nrobots based on their energy dynamics and divide the area of interest among the\nrobots accordingly. 
The controller is theoretically analyzed and extensively\nevaluated through simulations and real-world demonstrations in multiple\nrealistic scenarios and compared with three baseline control laws to validate\nits performance and efficacy.\n","authors":["Aiman Munir","Ayan Dutta","Ramviyas Parasuraman"],"pdf_url":"https://arxiv.org/pdf/2411.02230v1.pdf","comment":"Presented at DARS 2024"},{"id":"http://arxiv.org/abs/2411.02214v1","updated":"2024-11-04T16:11:33Z","published":"2024-11-04T16:11:33Z","title":"DexHub and DART: Towards Internet Scale Robot Data Collection","summary":" The quest to build a generalist robotic system is impeded by the scarcity of\ndiverse and high-quality data. While real-world data collection effort exist,\nrequirements for robot hardware, physical environment setups, and frequent\nresets significantly impede the scalability needed for modern learning\nframeworks. We introduce DART, a teleoperation platform designed for\ncrowdsourcing that reimagines robotic data collection by leveraging cloud-based\nsimulation and augmented reality (AR) to address many limitations of prior data\ncollection efforts. Our user studies highlight that DART enables higher data\ncollection throughput and lower physical fatigue compared to real-world\nteleoperation. We also demonstrate that policies trained using DART-collected\ndatasets successfully transfer to reality and are robust to unseen visual\ndisturbances. All data collected through DART is automatically stored in our\ncloud-hosted database, DexHub, which will be made publicly available upon\ncuration, paving the path for DexHub to become an ever-growing data hub for\nrobot learning. Videos are available at: https://dexhub.ai/project\n","authors":["Younghyo Park","Jagdeep Singh Bhatia","Lars Ankile","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.02214v1.pdf","comment":"Visit https://dexhub.ai/project for more details"},{"id":"http://arxiv.org/abs/2310.11590v2","updated":"2024-11-04T15:49:59Z","published":"2023-10-17T21:12:32Z","title":"Predicting Human Impressions of Robot Performance During Navigation\n Tasks","summary":" Human impressions of robot performance are often measured through surveys. As\na more scalable and cost-effective alternative, we investigate the possibility\nof predicting people's impressions of robot behavior using non-verbal\nbehavioral cues and machine learning techniques. To this end, we first\ncontribute the SEAN TOGETHER Dataset consisting of observations of an\ninteraction between a person and a mobile robot in a VR simulation, together\nwith impressions of robot performance provided by users on a 5-point scale.\nSecond, we contribute analyses of how well humans and supervised learning\ntechniques can predict perceived robot performance based on different\nobservation types (like facial expression features, and features that describe\nthe navigation behavior of the robot and pedestrians). Our results suggest that\nfacial expressions alone provide useful information about human impressions of\nrobot performance; but in the navigation scenarios that we considered,\nreasoning about spatial features in context is critical for the prediction\ntask. Also, supervised learning techniques showed promise because they\noutperformed humans' predictions of robot performance in most cases. Further,\nwhen predicting robot performance as a binary classification task on unseen\nusers' data, the F1 Score of machine learning models more than doubled in\ncomparison to predicting performance on a 5-point scale. 
This suggested that\nthe models can have good generalization capabilities, although they are better\nat telling the directionality of robot performance than predicting exact\nperformance ratings. Based on our findings in simulation, we conducted a\nreal-world demonstration in which a mobile robot uses a machine learning model\nto predict how a human that follows it perceives it. Finally, we discuss the\nimplications of our results for implementing such supervised learning models in\nreal-world navigation scenarios.\n","authors":["Qiping Zhang","Nathan Tsoi","Mofeed Nagib","Booyeon Choi","Jie Tan","Hao-Tien Lewis Chiang","Marynel Vázquez"],"pdf_url":"https://arxiv.org/pdf/2310.11590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02189v1","updated":"2024-11-04T15:43:57Z","published":"2024-11-04T15:43:57Z","title":"DiffSim2Real: Deploying Quadrupedal Locomotion Policies Purely Trained\n in Differentiable Simulation","summary":" Differentiable simulators provide analytic gradients, enabling more\nsample-efficient learning algorithms and paving the way for data intensive\nlearning tasks such as learning from images. In this work, we demonstrate that\nlocomotion policies trained with analytic gradients from a differentiable\nsimulator can be successfully transferred to the real world. Typically,\nsimulators that offer informative gradients lack the physical accuracy needed\nfor sim-to-real transfer, and vice-versa. A key factor in our success is a\nsmooth contact model that combines informative gradients with physical\naccuracy, ensuring effective transfer of learned behaviors. To the best of our\nknowledge, this is the first time a real quadrupedal robot is able to locomote\nafter training exclusively in a differentiable simulation.\n","authors":["Joshua Bagajo","Clemens Schwarke","Victor Klemm","Ignat Georgiev","Jean-Pierre Sleiman","Jesus Tordesillas","Animesh Garg","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2411.02189v1.pdf","comment":"Presented at the CoRL 2024 Workshop 'Differentiable Optimization\n Everywhere'"},{"id":"http://arxiv.org/abs/2411.02187v1","updated":"2024-11-04T15:41:45Z","published":"2024-11-04T15:41:45Z","title":"Touch-to-Touch Translation -- Learning the Mapping Between Heterogeneous\n Tactile Sensing Technologies","summary":" The use of data-driven techniques for tactile data processing and\nclassification has recently increased. However, collecting tactile data is a\ntime-expensive and sensor-specific procedure. Indeed, due to the lack of\nhardware standards in tactile sensing, data is required to be collected for\neach different sensor. This paper considers the problem of learning the mapping\nbetween two tactile sensor outputs with respect to the same physical stimulus\n-- we refer to this problem as touch-to-touch translation. In this respect, we\nproposed two data-driven approaches to address this task and we compared their\nperformance. The first one exploits a generative model developed for\nimage-to-image translation and adapted for this context. The second one uses a\nResNet model trained to perform a regression task. We validated both methods\nusing two completely different tactile sensors -- a camera-based, Digit and a\ncapacitance-based, CySkin. In particular, we used Digit images to generate the\ncorresponding CySkin data. We trained the models on a set of tactile features\nthat can be found in common larger objects and we performed the testing on a\npreviously unseen set of data. 
Experimental results show the possibility of\ntranslating Digit images into the CySkin output by preserving the contact shape\nand with an error of 15.18% in the magnitude of the sensor responses.\n","authors":["Francesco Grella","Alessandro Albini","Giorgio Cannata","Perla Maiolino"],"pdf_url":"https://arxiv.org/pdf/2411.02187v1.pdf","comment":"This paper was initially submitted at the International Conference on\n Intelligent Robots and Systems (IROS) 2023"},{"id":"http://arxiv.org/abs/2411.02186v1","updated":"2024-11-04T15:39:59Z","published":"2024-11-04T15:39:59Z","title":"Limiting Kinetic Energy through Control Barrier Functions: Analysis and\n Experimental Validation","summary":" In the context of safety-critical control, we propose and analyse the use of\nControl Barrier Functions (CBFs) to limit the kinetic energy of\ntorque-controlled robots. The proposed scheme is able to modify a nominal\ncontrol action in a minimally invasive manner to achieve the desired kinetic\nenergy limit. We show how this safety condition is achieved by appropriately\ninjecting damping in the underlying robot dynamics independently of the nominal\ncontroller structure. We present an extensive experimental validation of the\napproach on a 7-Degree of Freedom (DoF) Franka Emika Panda robot. The results\ndemonstrate that this approach provides an effective, minimally invasive safety\nlayer that is straightforward to implement and is robust in real experiments.\n","authors":["Federico Califano","Daniel Logmans","Wesley Roozing"],"pdf_url":"https://arxiv.org/pdf/2411.02186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02169v1","updated":"2024-11-04T15:28:54Z","published":"2024-11-04T15:28:54Z","title":"Diffusion-based Virtual Fixtures","summary":" Virtual fixtures assist human operators in teleoperation settings by\nconstraining their actions. This extended abstract introduces a novel virtual\nfixture formulation \\emph{on surfaces} for tactile robotics tasks. Unlike\nexisting methods, our approach constrains the behavior based on the position on\nthe surface and generalizes it over the surface by considering the distance\n(metric) on the surface. Our method works directly on possibly noisy and\npartial point clouds collected via a camera. Given a set of regions on the\nsurface together with their desired behaviors, our method diffuses the\nbehaviors across the entire surface by taking into account the surface\ngeometry. We demonstrate our method's ability in two simulated experiments (i)\nto regulate contact force magnitude or tangential speed based on surface\nposition and (ii) to guide the robot to targets while avoiding restricted\nregions defined on the surface. All source codes, experimental data, and videos\nare available as open access at\nhttps://sites.google.com/view/diffusion-virtual-fixtures\n","authors":["Cem Bilaloglu","Tobias Löw","Sylvain Calinon"],"pdf_url":"https://arxiv.org/pdf/2411.02169v1.pdf","comment":"Presented at ICRA@40"},{"id":"http://arxiv.org/abs/2403.07312v3","updated":"2024-11-04T15:26:27Z","published":"2024-03-12T04:49:59Z","title":"RoLD: Robot Latent Diffusion for Multi-task Policy Modeling","summary":" Modeling generalized robot control policies poses ongoing challenges for\nlanguage-guided robot manipulation tasks. Existing methods often struggle to\nefficiently utilize cross-dataset resources or rely on resource-intensive\nvision-language models, thus limiting their multi-task performance and\npractical applications. 
In this study, we propose a novel approach that\ndecouples robot action trajectory encoding and control policy generation by\nleveraging latent action trajectory spaces, enhancing the generalization\nability of policy generation on multi-task manipulation tasks. First, we\npre-train a task-agnostic auto-encoder to project an action trajectory of\nseveral frames accompanied with observations into a latent action trajectory\nspace on large-scale datasets collected with multiple embodiments in various\nenvironments. Then we propose learning a diffusion model based on the latent\naction trajectory space to generate actions of next steps. Through experiments\non two widely used benchmarks, results demonstrate that our proposed method\noutperforms baselines by 7%-29% in terms of average success rate across eight\ntasks. Our method can consistently benefit from pre-training while baselines\ncannot. Our method is more than two times faster than our baseline.\n","authors":["Wenhui Tan","Bei Liu","Junbo Zhang","Ruihua Song","Jianlong Fu"],"pdf_url":"https://arxiv.org/pdf/2403.07312v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02158v1","updated":"2024-11-04T15:17:19Z","published":"2024-11-04T15:17:19Z","title":"Learning Multiple Initial Solutions to Optimization Problems","summary":" Sequentially solving similar optimization problems under strict runtime\nconstraints is essential for many applications, such as robot control,\nautonomous driving, and portfolio management. The performance of local\noptimization methods in these settings is sensitive to the initial solution:\npoor initialization can lead to slow convergence or suboptimal solutions. To\naddress this challenge, we propose learning to predict \\emph{multiple} diverse\ninitial solutions given parameters that define the problem instance. We\nintroduce two strategies for utilizing multiple initial solutions: (i) a\nsingle-optimizer approach, where the most promising initial solution is chosen\nusing a selection function, and (ii) a multiple-optimizers approach, where\nseveral optimizers, potentially run in parallel, are each initialized with a\ndifferent solution, with the best solution chosen afterward. We validate our\nmethod on three optimal control benchmark tasks: cart-pole, reacher, and\nautonomous driving, using different optimizers: DDP, MPPI, and iLQR. We find\nsignificant and consistent improvement with our method across all evaluation\nsettings and demonstrate that it efficiently scales with the number of initial\nsolutions required. The code is available at\n$\\href{https://github.com/EladSharony/miso}{\\tt{https://github.com/EladSharony/miso}}$.\n","authors":["Elad Sharony","Heng Yang","Tong Che","Marco Pavone","Shie Mannor","Peter Karkus"],"pdf_url":"https://arxiv.org/pdf/2411.02158v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2402.04862v2","updated":"2024-11-04T15:04:40Z","published":"2024-02-07T13:57:52Z","title":"Tactile Ergodic Coverage on Curved Surfaces","summary":" In this article, we present a feedback control method for tactile coverage\ntasks, such as cleaning or surface inspection. These tasks are challenging to\nplan due to complex continuous physical interactions. In these tasks, the\ncoverage target and progress can be easily measured using a camera and encoded\nin a point cloud. We propose an ergodic coverage method that operates directly\non point clouds, guiding the robot to spend more time on regions requiring more\ncoverage. 
For robot control and contact behavior, we use geometric algebra to\nformulate a task-space impedance controller that tracks a line while\nsimultaneously exerting a desired force along that line. We evaluate the\nperformance of our method in kinematic simulations and demonstrate its\napplicability in real-world experiments on kitchenware. Our source codes,\nexperimental data, and videos are available as open access at\nhttps://sites.google.com/view/tactile-ergodic-control/\n","authors":["Cem Bilaloglu","Tobias Löw","Sylvain Calinon"],"pdf_url":"https://arxiv.org/pdf/2402.04862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05884v2","updated":"2024-11-04T14:43:41Z","published":"2024-10-08T10:29:17Z","title":"A Robust Quadruped Robot with Twisting Waist for Flexible Motions","summary":" The waist plays a crucial role in the agile movement of many animals in\nnature. It provides the torso with additional degrees of freedom and\nflexibility, inspiring researchers to incorporate this biological feature into\nrobotic structures to enhance robot locomotion. This paper presents a\ncost-effective and low-complexity waist mechanism integrated into the structure\nof the open-source robot solo8, adding a new degree of freedom (DOF) to its\ntorso. We refer to this novel robot as solo9. Additionally, we propose a\nfull-body control method for the waist-equipped quadruped robot based on\ngenerative adversarial imitation learning (GAIL). During training, the\ndiscriminator is used as input for iterative optimization of the policy and\ndataset, enabling solo9 to achieve flexible steering maneuvers across various\ngaits. Extensive tests of solo9's steering capabilities, terrain adaptability,\nand robustness are conducted in both simulation and real-world scenarios, with\ndetailed comparisons to solo8 and solo12, demonstrating the effectiveness of\nthe control algorithm and the advantages of the waist mechanism.\n","authors":["Quancheng Qian","Xiaoyi Wei","Zonghao Zhang","Jiaxin Tu","Yueqi Zhang","Taixian Hou","Xiaofei Gao","Peng Zhai","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.05884v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11562v2","updated":"2024-11-04T14:32:26Z","published":"2024-07-16T10:15:35Z","title":"RobotKeyframing: Learning Locomotion with High-Level Objectives via\n Mixture of Dense and Sparse Rewards","summary":" This paper presents a novel learning-based control framework that uses\nkeyframing to incorporate high-level objectives in natural locomotion for\nlegged robots. These high-level objectives are specified as a variable number\nof partial or complete pose targets that are spaced arbitrarily in time. Our\nproposed framework utilizes a multi-critic reinforcement learning algorithm to\neffectively handle the mixture of dense and sparse rewards. Additionally, it\nemploys a transformer-based encoder to accommodate a variable number of input\ntargets, each associated with specific time-to-arrivals. Throughout simulation\nand hardware experiments, we demonstrate that our framework can effectively\nsatisfy the target keyframe sequence at the required times. In the experiments,\nthe multi-critic method significantly reduces the effort of hyperparameter\ntuning compared to the standard single-critic alternative. 
Moreover, the\nproposed transformer-based architecture enables robots to anticipate future\ngoals, which results in quantitative improvements in their ability to reach\ntheir targets.\n","authors":["Fatemeh Zargarbashi","Jin Cheng","Dongho Kang","Robert Sumner","Stelian Coros"],"pdf_url":"https://arxiv.org/pdf/2407.11562v2.pdf","comment":"This paper has been accepted to 8th Conference on Robot Learning\n (CoRL 2024). Project website: https://sites.google.com/view/robot-keyframing"},{"id":"http://arxiv.org/abs/2303.06753v3","updated":"2024-11-04T14:32:02Z","published":"2023-03-12T21:01:54Z","title":"Modular Quantization-Aware Training for 6D Object Pose Estimation","summary":" Edge applications, such as collaborative robotics and spacecraft rendezvous,\ndemand efficient 6D object pose estimation on resource-constrained embedded\nplatforms. Existing 6D pose estimation networks are often too large for such\ndeployments, necessitating compression while maintaining reliable performance.\nTo address this challenge, we introduce Modular Quantization-Aware Training\n(MQAT), an adaptive and mixed-precision quantization-aware training strategy\nthat exploits the modular structure of modern 6D pose estimation architectures.\nMQAT guides a systematic gradated modular quantization sequence and determines\nmodule-specific bit precisions, leading to quantized models that outperform\nthose produced by state-of-the-art uniform and mixed-precision quantization\ntechniques. Our experiments showcase the generality of MQAT across datasets,\narchitectures, and quantization algorithms. Remarkably, MQAT-trained quantized\nmodels achieve a significant accuracy boost (>7%) over the baseline\nfull-precision network while reducing model size by a factor of 4x or more. Our\nproject website is at: https://saqibjaved1.github.io/MQAT_/\n","authors":["Saqib Javed","Chengkun Li","Andrew Price","Yinlin Hu","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2303.06753v3.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR), 2024"},{"id":"http://arxiv.org/abs/2407.02876v2","updated":"2024-11-04T14:15:47Z","published":"2024-07-03T07:44:09Z","title":"Prävention und Beseitigung von Fehlerursachen im Kontext von\n unbemannten Fahrzeugen","summary":" Mobile robots, becoming increasingly autonomous, are capable of operating in\ndiverse and unknown environments. This flexibility allows them to fulfill goals\nindependently and adapting their actions dynamically without rigidly predefined\ncontrol codes. However, their autonomous behavior complicates guaranteeing\nsafety and reliability due to the limited influence of a human operator to\naccurately supervise and verify each robot's actions. To ensure autonomous\nmobile robot's safety and reliability, which are aspects of dependability,\nmethods are needed both in the planning and execution of missions for\nautonomous mobile robots. In this article, a twofold approach is presented that\nensures fault removal in the context of mission planning and fault prevention\nduring mission execution for autonomous mobile robots. First, the approach\nconsists of a concept based on formal verification applied during the planning\nphase of missions. Second, the approach consists of a rule-based concept\napplied during mission execution. 
A use case applying the approach is\npresented, discussing how the two concepts complement each other and what\ncontribution they make to certain aspects of dependability.\n","authors":["Aron Schnakenbeck","Christoph Sieber","Luis Miguel Vieira da Silva","Felix Gehlhoff","Alexander Fay"],"pdf_url":"https://arxiv.org/pdf/2407.02876v2.pdf","comment":"Language: German. Dieser Beitrag wird eingereicht in:\n \"dtec.bw-Beitr\\\"age der Helmut-Schmidt-Universit\\\"at/Universit\\\"at der\n Bundeswehr Hamburg: Forschungsaktivit\\\"aten im Zentrum f\\\"ur\n Digitalisierungs- und Technologieforschung der Bundeswehr dtec.bw\""},{"id":"http://arxiv.org/abs/2407.17502v2","updated":"2024-11-04T14:14:01Z","published":"2024-07-05T14:31:51Z","title":"MetaLoco: Universal Quadrupedal Locomotion with Meta-Reinforcement\n Learning and Motion Imitation","summary":" This work presents a meta-reinforcement learning approach to develop a\nuniversal locomotion control policy capable of zero-shot generalization across\ndiverse quadrupedal platforms. The proposed method trains an RL agent equipped\nwith a memory unit to imitate reference motions using a small set of\nprocedurally generated quadruped robots. Through comprehensive simulation and\nreal-world hardware experiments, we demonstrate the efficacy of our approach in\nachieving locomotion across various robots without requiring robot-specific\nfine-tuning. Furthermore, we highlight the critical role of the memory unit in\nenabling generalization, facilitating rapid adaptation to changes in the robot\nproperties, and improving sample efficiency.\n","authors":["Fatemeh Zargarbashi","Fabrizio Di Giuro","Jin Cheng","Dongho Kang","Bhavya Sukhija","Stelian Coros"],"pdf_url":"https://arxiv.org/pdf/2407.17502v2.pdf","comment":"The supplementary video is available at\n https://youtu.be/PaFRUDOrh_U?si=hfdbng3Wxo_GnxIA"},{"id":"http://arxiv.org/abs/2411.02102v1","updated":"2024-11-04T14:10:46Z","published":"2024-11-04T14:10:46Z","title":"Toward Realistic Cinema: The State of the Art in Mechatronics for Modern\n Animatronic","summary":" The pursuit of realism in cinema has driven significant advancements in\nanimatronics, where the integration of mechatronics, a multidisciplinary field\nthat combines mechanical engineering, electronics, and computer science, plays\na pivotal role in enhancing the functionality and realism of animatronics. This\ninterdisciplinary approach facilitates smoother characters movements and\nenhances the sophistication of behaviors in animatronic creatures, thereby\nincreasing their realism. This article examines the most recent developments in\nmechatronic technology and their significant impact on the art and engineering\nof animatronics in the filmmaking. It explores the sophisticated integration of\nsystem components and analyzes how these enhancements foster complexity and\nintegration, crucial for achieving unprecedented levels of realism in modern\ncinema. Further, the article delves into in-depth case studies of well-known\nmovie characters, demonstrating the practical applicability of these\nstate-of-the-art mechatronic solutions in creating compelling, lifelike\ncinematic experiences. This paper aims to bridge the gap between the technical\naspects of mechatronics and the creative demands of the film industry,\nultimately contributing to the ongoing evolution of cinematic realism.\n","authors":["Riham M. Hilal","Haitham El-Hussieny","Ayman A. 
Nada"],"pdf_url":"https://arxiv.org/pdf/2411.02102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15840v3","updated":"2024-11-04T14:04:56Z","published":"2024-09-24T08:07:25Z","title":"Distance-based Multiple Non-cooperative Ground Target Encirclement for\n Complex Environments","summary":" This paper proposes a comprehensive strategy for complex\nmulti-target-multi-drone encirclement in an obstacle-rich and GPS-denied\nenvironment, motivated by practical scenarios such as pursuing vehicles or\nhumans in urban canyons. The drones have omnidirectional range sensors that can\nrobustly detect ground targets and obtain noisy relative distances. After each\ndrone task is assigned, a novel distance-based target state estimator (DTSE) is\nproposed by estimating the measurement output noise variance and utilizing the\nKalman filter. By integrating anti-synchronization techniques and pseudo-force\nfunctions, an acceleration controller enables two tasking drones to\ncooperatively encircle a target from opposing positions while navigating\nobstacles. The algorithms effectiveness for the discrete-time double-integrator\nsystem is established theoretically, particularly regarding observability.\nMoreover, the versatility of the algorithm is showcased in aerial-to-ground\nscenarios, supported by compelling simulation results. Experimental validation\ndemonstrates the effectiveness of the proposed approach.\n","authors":["Fen Liu","Shenghai Yuan","Kun Cao","Wei Meng","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2409.15840v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09875v2","updated":"2024-11-04T13:17:00Z","published":"2023-09-18T15:37:01Z","title":"RaLF: Flow-based Global and Metric Radar Localization in LiDAR Maps","summary":" Localization is paramount for autonomous robots. While camera and LiDAR-based\napproaches have been extensively investigated, they are affected by adverse\nillumination and weather conditions. Therefore, radar sensors have recently\ngained attention due to their intrinsic robustness to such conditions. In this\npaper, we propose RaLF, a novel deep neural network-based approach for\nlocalizing radar scans in a LiDAR map of the environment, by jointly learning\nto address both place recognition and metric localization. RaLF is composed of\nradar and LiDAR feature encoders, a place recognition head that generates\nglobal descriptors, and a metric localization head that predicts the 3-DoF\ntransformation between the radar scan and the map. We tackle the place\nrecognition task by learning a shared embedding space between the two\nmodalities via cross-modal metric learning. Additionally, we perform metric\nlocalization by predicting pixel-level flow vectors that align the query radar\nscan with the LiDAR map. We extensively evaluate our approach on multiple\nreal-world driving datasets and show that RaLF achieves state-of-the-art\nperformance for both place recognition and metric localization. Moreover, we\ndemonstrate that our approach can effectively generalize to different cities\nand sensor setups than the ones used during training. 
We make the code and\ntrained models publicly available at http://ralf.cs.uni-freiburg.de.\n","authors":["Abhijeet Nayak","Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2309.09875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02062v1","updated":"2024-11-04T13:05:11Z","published":"2024-11-04T13:05:11Z","title":"Heterogeneous Multi-robot Task Allocation for Long-Endurance Missions in\n Dynamic Scenarios","summary":" We present a framework for Multi-Robot Task Allocation (MRTA) in\nheterogeneous teams performing long-endurance missions in dynamic scenarios.\nGiven the limited battery of robots, especially in the case of aerial vehicles,\nwe allow for robot recharges and the possibility of fragmenting and/or relaying\ncertain tasks. We also address tasks that must be performed by a coalition of\nrobots in a coordinated manner. Given these features, we introduce a new class\nof heterogeneous MRTA problems which we analyze theoretically and optimally\nformulate as a Mixed-Integer Linear Program. We then contribute a heuristic\nalgorithm to compute approximate solutions and integrate it into a mission\nplanning and execution architecture capable of reacting to unexpected events by\nrepairing or recomputing plans online. Our experimental results show the\nrelevance of our newly formulated problem in a realistic use case for\ninspection with aerial robots. We assess the performance of our heuristic\nsolver in comparison with other variants and with exact optimal solutions in\nsmall-scale scenarios. In addition, we evaluate the ability of our replanning\nframework to repair plans online.\n","authors":["Alvaro Calvo","Jesus Capitan"],"pdf_url":"https://arxiv.org/pdf/2411.02062v1.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.02028v1","updated":"2024-11-04T12:25:20Z","published":"2024-11-04T12:25:20Z","title":"An Immediate Update Strategy of Multi-State Constraint Kalman Filter","summary":" The lightweight Multi-state Constraint Kalman Filter (MSCKF) has been\nwell-known for its high efficiency, in which the delayed update has been\nusually adopted since its proposal. This work investigates the immediate update\nstrategy of MSCKF based on timely reconstructed 3D feature points and\nmeasurement constraints. The differences between the delayed update and the\nimmediate update are theoretically analyzed in detail. It is found that the\nimmediate update helps construct more observation constraints and employ more\nfiltering updates than the delayed update, which improves the linearization\npoint of the measurement model and therefore enhances the estimation accuracy.\nNumerical simulations and experiments show that the immediate update strategy\nsignificantly enhances MSCKF even with a small amount of feature observations.\n","authors":["Qingchao Zhang","Wei Ouyang","Jiale Han","Qi Cai","Maoran Zhu","Yuanxin Wu"],"pdf_url":"https://arxiv.org/pdf/2411.02028v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.17298v3","updated":"2024-11-04T12:24:21Z","published":"2024-04-26T10:06:58Z","title":"Automatic Target-Less Camera-LiDAR Calibration From Motion and Deep\n Point Correspondences","summary":" Sensor setups of robotic platforms commonly include both camera and LiDAR as\nthey provide complementary information. However, fusing these two modalities\ntypically requires a highly accurate calibration between them. 
In this paper,\nwe propose MDPCalib which is a novel method for camera-LiDAR calibration that\nrequires neither human supervision nor any specific target objects. Instead, we\nutilize sensor motion estimates from visual and LiDAR odometry as well as deep\nlearning-based 2D-pixel-to-3D-point correspondences that are obtained without\nin-domain retraining. We represent camera-LiDAR calibration as an optimization\nproblem and minimize the costs induced by constraints from sensor motion and\npoint correspondences. In extensive experiments, we demonstrate that our\napproach yields highly accurate extrinsic calibration parameters and is robust\nto random initialization. Additionally, our approach generalizes to a wide\nrange of sensor setups, which we demonstrate by employing it on various robotic\nplatforms including a self-driving perception car, a quadruped robot, and a\nUAV. To make our calibration method publicly accessible, we release the code on\nour project website at http://calibration.cs.uni-freiburg.de.\n","authors":["Kürsat Petek","Niclas Vödisch","Johannes Meyer","Daniele Cattaneo","Abhinav Valada","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2404.17298v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03078v2","updated":"2024-11-04T12:19:57Z","published":"2024-08-06T10:13:57Z","title":"BodySLAM: A Generalized Monocular Visual SLAM Framework for Surgical\n Applications","summary":" Endoscopic surgery relies on two-dimensional views, posing challenges for\nsurgeons in depth perception and instrument manipulation. While Monocular\nVisual Simultaneous Localization and Mapping (MVSLAM) has emerged as a\npromising solution, its implementation in endoscopic procedures faces\nsignificant challenges due to hardware limitations, such as the use of a\nmonocular camera and the absence of odometry sensors. This study presents\nBodySLAM, a robust deep learning-based MVSLAM approach that addresses these\nchallenges through three key components: CycleVO, a novel unsupervised\nmonocular pose estimation module; the integration of the state-of-the-art Zoe\narchitecture for monocular depth estimation; and a 3D reconstruction module\ncreating a coherent surgical map. The approach is rigorously evaluated using\nthree publicly available datasets (Hamlyn, EndoSLAM, and SCARED) spanning\nlaparoscopy, gastroscopy, and colonoscopy scenarios, and benchmarked against\nfour state-of-the-art methods. Results demonstrate that CycleVO exhibited\ncompetitive performance with the lowest inference time among pose estimation\nmethods, while maintaining robust generalization capabilities, whereas Zoe\nsignificantly outperformed existing algorithms for depth estimation in\nendoscopy. BodySLAM's strong performance across diverse endoscopic scenarios\ndemonstrates its potential as a viable MVSLAM solution for endoscopic\napplications.\n","authors":["G. Manni","C. Lauretti","F. Prata","R. Papalia","L. Zollo","P. Soda"],"pdf_url":"https://arxiv.org/pdf/2408.03078v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.01985v1","updated":"2024-11-04T11:13:38Z","published":"2024-11-04T11:13:38Z","title":"Reshaping UAV-Enabled Communications with Omnidirectional Multi-Rotor\n Aerial Vehicles","summary":" A new class of Multi-Rotor Aerial Vehicles (MRAVs), known as omnidirectional\nMRAVs (o-MRAVs), has attracted significant interest in the robotics community.\nThese MRAVs have the unique capability of independently controlling their 3D\nposition and 3D orientation. 
In the context of aerial communication networks,\nthis translates into the ability to control the position and orientation of the\nantenna mounted on the MRAV without any additional devices tasked for antenna\norientation. This additional Degrees of Freedom (DoF) adds a new dimension to\naerial communication systems, creating various research opportunities in\ncommunications-aware trajectory planning and positioning. This paper presents\nthis new class of MRAVs and discusses use cases in areas such as physical layer\nsecurity and optical communications. Furthermore, the benefits of these MRAVs\nare illustrated with realistic simulation scenarios. Finally, new research\nproblems and opportunities introduced by this advanced robotics technology are\ndiscussed.\n","authors":["Daniel Bonilla Licea","Giuseppe Silano","Hajar El Hammouti","Mounir Ghogho","Martin Saska"],"pdf_url":"https://arxiv.org/pdf/2411.01985v1.pdf","comment":"Accepted for IEEE Communications Magazine. \\c{opyright}2024 IEEE.\n Personal use of this material is permitted. Permission from IEEE must be\n obtained for all other uses, in any current or future media"},{"id":"http://arxiv.org/abs/2410.18825v2","updated":"2024-11-04T10:59:17Z","published":"2024-10-24T15:17:09Z","title":"A generic approach for reactive stateful mitigation of application\n failures in distributed robotics systems deployed with Kubernetes","summary":" Offloading computationally expensive algorithms to the edge or even cloud\noffers an attractive option to tackle limitations regarding on-board\ncomputational and energy resources of robotic systems. In cloud-native\napplications deployed with the container management system Kubernetes (K8s),\none key problem is ensuring resilience against various types of failures.\nHowever, complex robotic systems interacting with the physical world pose a\nvery specific set of challenges and requirements that are not yet covered by\nfailure mitigation approaches from the cloud-native domain. In this paper, we\ntherefore propose a novel approach for robotic system monitoring and stateful,\nreactive failure mitigation for distributed robotic systems deployed using\nKubernetes (K8s) and the Robot Operating System (ROS2). By employing the\ngeneric substrate of Behaviour Trees, our approach can be applied to any\nrobotic workload and supports arbitrarily complex monitoring and failure\nmitigation strategies. We demonstrate the effectiveness and\napplication-agnosticism of our approach on two example applications, namely\nAutonomous Mobile Robot (AMR) navigation and robotic manipulation in a\nsimulated environment.\n","authors":["Florian Mirus","Frederik Pasch","Nikhil Singhal","Kay-Ulrich Scholl"],"pdf_url":"https://arxiv.org/pdf/2410.18825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01963v1","updated":"2024-11-04T10:39:15Z","published":"2024-11-04T10:39:15Z","title":"V-CAS: A Realtime Vehicle Anti Collision System Using Vision Transformer\n on Multi-Camera Streams","summary":" This paper introduces a real-time Vehicle Collision Avoidance System (V-CAS)\ndesigned to enhance vehicle safety through adaptive braking based on\nenvironmental perception. V-CAS leverages the advanced vision-based transformer\nmodel RT-DETR, DeepSORT tracking, speed estimation, brake light detection, and\nan adaptive braking mechanism. 
It computes a composite collision risk score\nbased on vehicles' relative accelerations, distances, and detected braking\nactions, using brake light signals and trajectory data from multiple camera\nstreams to improve scene perception. Implemented on the Jetson Orin Nano, V-CAS\nenables real-time collision risk assessment and proactive mitigation through\nadaptive braking. A comprehensive training process was conducted on various\ndatasets for comparative analysis, followed by fine-tuning the selected object\ndetection model using transfer learning. The system's effectiveness was\nrigorously evaluated on the Car Crash Dataset (CCD) from YouTube and through\nreal-time experiments, achieving over 98% accuracy with an average proactive\nalert time of 1.13 seconds. Results indicate significant improvements in object\ndetection and tracking, enhancing collision avoidance compared to traditional\nsingle-camera methods. This research demonstrates the potential of low-cost,\nmulti-camera embedded vision transformer systems to advance automotive safety\nthrough enhanced environmental perception and proactive collision avoidance\nmechanisms.\n","authors":["Muhammad Waqas Ashraf","Ali Hassan","Imad Ali Shah"],"pdf_url":"https://arxiv.org/pdf/2411.01963v1.pdf","comment":"Accepted at ICMLA 2024"},{"id":"http://arxiv.org/abs/2411.00543v2","updated":"2024-11-04T10:21:57Z","published":"2024-11-01T12:50:38Z","title":"3D Equivariant Pose Regression via Direct Wigner-D Harmonics Prediction","summary":" Determining the 3D orientations of an object in an image, known as\nsingle-image pose estimation, is a crucial task in 3D vision applications.\nExisting methods typically learn 3D rotations parametrized in the spatial\ndomain using Euler angles or quaternions, but these representations often\nintroduce discontinuities and singularities. SO(3)-equivariant networks enable\nthe structured capture of pose patterns with data-efficient learning, but the\nparametrizations in spatial domain are incompatible with their architecture,\nparticularly spherical CNNs, which operate in the frequency domain to enhance\ncomputational efficiency. To overcome these issues, we propose a\nfrequency-domain approach that directly predicts Wigner-D coefficients for 3D\nrotation regression, aligning with the operations of spherical CNNs. Our\nSO(3)-equivariant pose harmonics predictor overcomes the limitations of spatial\nparameterizations, ensuring consistent pose estimation under arbitrary\nrotations. Trained with a frequency-domain regression loss, our method achieves\nstate-of-the-art results on benchmarks such as ModelNet10-SO(3) and PASCAL3D+,\nwith significant improvements in accuracy, robustness, and data efficiency.\n","authors":["Jongmin Lee","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2411.00543v2.pdf","comment":"Accepted to NeurIPS 2024, Project webpage at\n http://cvlab.postech.ac.kr/research/3D_EquiPose"},{"id":"http://arxiv.org/abs/2409.07195v3","updated":"2024-11-04T10:19:37Z","published":"2024-09-11T11:34:43Z","title":"Perceptive Pedipulation with Local Obstacle Avoidance","summary":" Pedipulation leverages the feet of legged robots for mobile manipulation,\neliminating the need for dedicated robotic arms. While previous works have\nshowcased blind and task-specific pedipulation skills, they fail to account for\nstatic and dynamic obstacles in the environment. 
To address this limitation, we\nintroduce a reinforcement learning-based approach to train a whole-body\nobstacle-aware policy that tracks foot position commands while simultaneously\navoiding obstacles. Despite training the policy in only five different static\nscenarios in simulation, we show that it generalizes to unknown environments\nwith different numbers and types of obstacles. We analyze the performance of\nour method through a set of simulation experiments and successfully deploy the\nlearned policy on the ANYmal quadruped, demonstrating its capability to follow\nfoot commands while navigating around static and dynamic obstacles. Videos of\nthe experiments are available at\nsites.google.com/leggedrobotics.com/perceptive-pedipulation.\n","authors":["Jonas Stolle","Philip Arm","Mayank Mittal","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2409.07195v3.pdf","comment":"Accepted to the IEEE International Conference on Humanoid Robots 2024\n Videos available at\n sites.google.com/leggedrobotics.com/perceptive-pedipulation"},{"id":"http://arxiv.org/abs/2411.01943v1","updated":"2024-11-04T10:09:18Z","published":"2024-11-04T10:09:18Z","title":"Brainbots as smart autonomous active particles with programmable motion","summary":" We present an innovative robotic device designed to provide controlled motion\nfor studying active matter. Motion is driven by an internal vibrator powered by\na small rechargeable battery. The system integrates acoustic and magnetic\nsensors along with a programmable microcontroller. Unlike conventional\nvibrobots, the motor induces horizontal vibrations, resulting in cycloidal\ntrajectories that have been characterized and optimized. Portions of these\norbits can be utilized to create specific motion patterns. As a proof of\nconcept, we demonstrate how this versatile system can be exploited to develop\nactive particles with varying dynamics, ranging from ballistic motion to\nrun-and-tumble diffusive behavior.\n","authors":["M. Noirhomme","I. Mammadli","N. Vanesse","J. Pande","A. -S. Smith","N. Vandewalle"],"pdf_url":"https://arxiv.org/pdf/2411.01943v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.23085v2","updated":"2024-11-04T10:01:38Z","published":"2024-10-30T15:00:06Z","title":"S3PT: Scene Semantics and Structure Guided Clustering to Boost\n Self-Supervised Pre-Training for Autonomous Driving","summary":" Recent self-supervised clustering-based pre-training techniques like DINO and\nCribo have shown impressive results for downstream detection and segmentation\ntasks. However, real-world applications such as autonomous driving face\nchallenges with imbalanced object class and size distributions and complex\nscene geometries. In this paper, we propose S3PT a novel scene semantics and\nstructure guided clustering to provide more scene-consistent objectives for\nself-supervised training. Specifically, our contributions are threefold: First,\nwe incorporate semantic distribution consistent clustering to encourage better\nrepresentation of rare classes such as motorcycles or animals. Second, we\nintroduce object diversity consistent spatial clustering, to handle imbalanced\nand diverse object sizes, ranging from large background areas to small objects\nsuch as pedestrians and traffic signs. Third, we propose a depth-guided spatial\nclustering to regularize learning based on geometric information of the scene,\nthus further refining region separation on the feature level. 
Our learned\nrepresentations significantly improve performance in downstream semantic\nsegmentation and 3D object detection tasks on the nuScenes, nuImages, and\nCityscapes datasets and show promising domain translation properties.\n","authors":["Maciej K. Wozniak","Hariprasath Govindarajan","Marvin Klingner","Camille Maurice","B Ravi Kiran","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2410.23085v2.pdf","comment":"Accepted for WACV 2025"},{"id":"http://arxiv.org/abs/2403.17565v2","updated":"2024-11-04T09:54:44Z","published":"2024-03-26T10:19:04Z","title":"Aerial Robots Carrying Flexible Cables: Dynamic Shape Optimal Control\n via Spectral Method Model","summary":" In this work, we present a model-based optimal boundary control design for an\naerial robotic system composed of a quadrotor carrying a flexible cable. The\nwhole system is modeled by partial differential equations (PDEs) combined with\nboundary conditions described by ordinary differential equations (ODEs). The\nproper orthogonal decomposition (POD) method is adopted to project the original\ninfinite-dimensional system on a finite low-dimensional space spanned by\northogonal basis functions. Based on such a reduced order model, nonlinear\nmodel predictive control (NMPC) is implemented online to realize both position\nand shape trajectory tracking of the flexible cable in an optimal predictive\nfashion. The proposed POD-based reduced modeling and optimal control paradigms\nare verified in simulation using an accurate high-dimensional FDM-based model\nand experimentally using a real quadrotor and a cable. The results show the\nviability of the POD-based predictive control approach (allowing closing the\ncontrol loop on the full system state) and its superior performance compared to\nan optimally tuned PID controller (allowing closing the control loop on the\nquadrotor state only).\n","authors":["Yaolei Shen","Antonio Franchi","Chiara Gabellieri"],"pdf_url":"https://arxiv.org/pdf/2403.17565v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01919v1","updated":"2024-11-04T09:34:55Z","published":"2024-11-04T09:34:55Z","title":"Real-Time Polygonal Semantic Mapping for Humanoid Robot Stair Climbing","summary":" We present a novel algorithm for real-time planar semantic mapping tailored\nfor humanoid robots navigating complex terrains such as staircases. Our method\nis adaptable to any odometry input and leverages GPU-accelerated processes for\nplanar extraction, enabling the rapid generation of globally consistent\nsemantic maps. We utilize an anisotropic diffusion filter on depth images to\neffectively minimize noise from gradient jumps while preserving essential edge\ndetails, enhancing normal vector images' accuracy and smoothness. Both the\nanisotropic diffusion and the RANSAC-based plane extraction processes are\noptimized for parallel processing on GPUs, significantly enhancing\ncomputational efficiency. Our approach achieves real-time performance,\nprocessing single frames at rates exceeding $30~Hz$, which facilitates detailed\nplane extraction and map management swiftly and efficiently. 
Extensive testing\nunderscores the algorithm's capabilities in real-time scenarios and\ndemonstrates its practical application in humanoid robot gait planning,\nsignificantly improving its ability to navigate dynamic environments.\n","authors":["Teng Bin","Jianming Yao","Tin Lun Lam","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.01919v1.pdf","comment":"Accepted by The 2024 IEEE-RAS International Conference on Humanoid\n Robots. The code: https://github.com/BTFrontier/polygon_mapping"},{"id":"http://arxiv.org/abs/2411.01915v1","updated":"2024-11-04T09:27:36Z","published":"2024-11-04T09:27:36Z","title":"RoboCrowd: Scaling Robot Data Collection through Crowdsourcing","summary":" In recent years, imitation learning from large-scale human demonstrations has\nemerged as a promising paradigm for training robot policies. However, the\nburden of collecting large quantities of human demonstrations is significant in\nterms of collection time and the need for access to expert operators. We\nintroduce a new data collection paradigm, RoboCrowd, which distributes the\nworkload by utilizing crowdsourcing principles and incentive design. RoboCrowd\nhelps enable scalable data collection and facilitates more efficient learning\nof robot policies. We build RoboCrowd on top of ALOHA (Zhao et al. 2023) -- a\nbimanual platform that supports data collection via puppeteering -- to explore\nthe design space for crowdsourcing in-person demonstrations in a public\nenvironment. We propose three classes of incentive mechanisms to appeal to\nusers' varying sources of motivation for interacting with the system: material\nrewards, intrinsic interest, and social comparison. We instantiate these\nincentives through tasks that include physical rewards, engaging or challenging\nmanipulations, as well as gamification elements such as a leaderboard. We\nconduct a large-scale, two-week field experiment in which the platform is\nsituated in a university cafe. We observe significant engagement with the\nsystem -- over 200 individuals independently volunteered to provide a total of\nover 800 interaction episodes. Our findings validate the proposed incentives as\nmechanisms for shaping users' data quantity and quality. Further, we\ndemonstrate that the crowdsourced data can serve as useful pre-training data\nfor policies fine-tuned on expert demonstrations -- boosting performance up to\n20% compared to when this data is not available. These results suggest the\npotential for RoboCrowd to reduce the burden of robot data collection by\ncarefully implementing crowdsourcing and incentive design principles.\n","authors":["Suvir Mirchandani","David D. Yuan","Kaylee Burns","Md Sazzad Islam","Tony Z. Zhao","Chelsea Finn","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2411.01915v1.pdf","comment":"21 pages, 25 figures"},{"id":"http://arxiv.org/abs/2411.01909v1","updated":"2024-11-04T09:21:00Z","published":"2024-11-04T09:21:00Z","title":"Traffic and Safety Rule Compliance of Humans in Diverse Driving\n Situations","summary":" The increasing interest in autonomous driving systems has highlighted the\nneed for an in-depth analysis of human driving behavior in diverse scenarios.\nAnalyzing human data is crucial for developing autonomous systems that\nreplicate safe driving practices and ensure seamless integration into\nhuman-dominated environments. 
This paper presents a comparative evaluation of\nhuman compliance with traffic and safety rules across multiple trajectory\nprediction datasets, including Argoverse 2, nuPlan, Lyft, and DeepUrban. By\ndefining and leveraging existing safety and behavior-related metrics, such as\ntime to collision, adherence to speed limits, and interactions with other\ntraffic participants, we aim to provide a comprehensive understanding of each\ndataset's strengths and limitations. Our analysis focuses on the distribution of\ndata samples, identifying noise, outliers, and undesirable behaviors exhibited\nby human drivers in both the training and validation sets. The results\nunderscore the need for applying robust filtering techniques to certain\ndatasets due to high levels of noise and the presence of such undesirable\nbehaviors.\n","authors":["Michael Kurenkov","Sajad Marvi","Julian Schmidt","Christoph B. Rist","Alessandro Canevaro","Hang Yu","Julian Jordan","Georg Schildbach","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2411.01909v1.pdf","comment":"8 pages, CoRL 2024 Workshop SAFE-ROL"},{"id":"http://arxiv.org/abs/2411.01866v1","updated":"2024-11-04T07:46:24Z","published":"2024-11-04T07:46:24Z","title":"Improving Trust Estimation in Human-Robot Collaboration Using Beta\n Reputation at Fine-grained Timescales","summary":" When interacting with each other, humans adjust their behavior based on\nperceived trust. However, to achieve similar adaptability, robots must\naccurately estimate human trust at sufficiently granular timescales during the\nhuman-robot collaboration task. A beta reputation is a popular way to formalize\na mathematical estimation of human trust. However, it relies on binary\nperformance, which updates trust estimations only after each task concludes.\nAdditionally, manually crafting a reward function is the usual method of\nbuilding a performance indicator, which is labor-intensive and time-consuming.\nThese limitations prevent efficiently capturing continuous changes in trust at\nmore granular timescales throughout the collaboration task. Therefore, this\npaper presents a new framework for the estimation of human trust using a beta\nreputation at fine-grained timescales. To achieve granularity in beta\nreputation, we utilize continuous reward values to update trust estimations at\neach timestep of a task. We construct a continuous reward function using\nmaximum entropy optimization to eliminate the need for the laborious\nspecification of a performance indicator. The proposed framework improves trust\nestimations by increasing accuracy, eliminating the need for manually crafting\na reward function, and advancing toward developing more intelligent robots. The\nsource code is publicly available.\nhttps://github.com/resuldagdanov/robot-learning-human-trust\n","authors":["Resul Dagdanov","Milan Andrejevic","Dikai Liu","Chin-Teng Lin"],"pdf_url":"https://arxiv.org/pdf/2411.01866v1.pdf","comment":"8 pages, 7 figures, 1 table. This work has been submitted to the IEEE\n for possible publication"},{"id":"http://arxiv.org/abs/2411.01850v1","updated":"2024-11-04T07:05:02Z","published":"2024-11-04T07:05:02Z","title":"ManiBox: Enhancing Spatial Grasping Generalization via Scalable\n Simulation Data Generation","summary":" Learning a precise robotic grasping policy is crucial for embodied agents\noperating in complex real-world manipulation tasks. Despite significant\nadvancements, most models still struggle with accurate spatial positioning of\nobjects to be grasped. 
We first show that this spatial generalization challenge\nstems primarily from the extensive data requirements for adequate spatial\nunderstanding. However, collecting such data with real robots is prohibitively\nexpensive, and relying on simulation data often leads to visual generalization\ngaps upon deployment. To overcome these challenges, we then focus on\nstate-based policy generalization and present \\textbf{ManiBox}, a novel\nbounding-box-guided manipulation method built on a simulation-based\nteacher-student framework. The teacher policy efficiently generates scalable\nsimulation data using bounding boxes, which are proven to uniquely determine\nthe objects' spatial positions. The student policy then utilizes these\nlow-dimensional spatial states to enable zero-shot transfer to real robots.\nThrough comprehensive evaluations in simulated and real-world environments,\nManiBox demonstrates a marked improvement in spatial grasping generalization\nand adaptability to diverse objects and backgrounds. Further, our empirical\nstudy into scaling laws for policy performance indicates that spatial volume\ngeneralization scales positively with data volume. For a certain level of\nspatial volume, the success rate of grasping empirically follows\nMichaelis-Menten kinetics relative to data volume, showing a saturation effect\nas data increases. Our videos and code are available in\nhttps://thkkk.github.io/manibox.\n","authors":["Hengkai Tan","Xuezhou Xu","Chengyang Ying","Xinyi Mao","Songming Liu","Xingxing Zhang","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.01850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01816v1","updated":"2024-11-04T05:38:05Z","published":"2024-11-04T05:38:05Z","title":"Toward Integrating Semantic-aware Path Planning and Reliable\n Localization for UAV Operations","summary":" Localization is one of the most crucial tasks for Unmanned Aerial Vehicle\nsystems (UAVs) directly impacting overall performance, which can be achieved\nwith various sensors and applied to numerous tasks related to search and rescue\noperations, object tracking, construction, etc. However, due to the negative\neffects of challenging environments, UAVs may lose signals for localization. In\nthis paper, we present an effective path-planning system leveraging semantic\nsegmentation information to navigate around texture-less and problematic areas\nlike lakes, oceans, and high-rise buildings using a monocular camera. We\nintroduce a real-time semantic segmentation architecture and a novel keyframe\ndecision pipeline to optimize image inputs based on pixel distribution,\nreducing processing time. A hierarchical planner based on the Dynamic Window\nApproach (DWA) algorithm, integrated with a cost map, is designed to facilitate\nefficient path planning. The system is implemented in a photo-realistic\nsimulation environment using Unity, aligning with segmentation model\nparameters. 
Comprehensive qualitative and quantitative evaluations validate the\neffectiveness of our approach, showing significant improvements in the\nreliability and efficiency of UAV localization in challenging environments.\n","authors":["Thanh Nguyen Canh","Huy-Hoang Ngo","Xiem HoangVan","Nak Young Chong"],"pdf_url":"https://arxiv.org/pdf/2411.01816v1.pdf","comment":"In The 24th International Conference on Control, Automation, and\n Systems (ICCAS 2024), Jeju, Korea"},{"id":"http://arxiv.org/abs/2411.01814v1","updated":"2024-11-04T05:34:30Z","published":"2024-11-04T05:34:30Z","title":"Enhancing Social Robot Navigation with Integrated Motion Prediction and\n Trajectory Planning in Dynamic Human Environments","summary":" Navigating safely in dynamic human environments is crucial for mobile service\nrobots, and social navigation is a key aspect of this process. In this paper,\nwe proposed an integrative approach that combines motion prediction and\ntrajectory planning to enable safe and socially-aware robot navigation. The\nmain idea of the proposed method is to leverage the advantages of Socially\nAcceptable trajectory prediction and Timed Elastic Band (TEB) by incorporating\nhuman interactive information including position, orientation, and motion into\nthe objective function of the TEB algorithms. In addition, we designed social\nconstraints to ensure the safety of robot navigation. The proposed system is\nevaluated through physical simulation using both quantitative and qualitative\nmetrics, demonstrating its superior performance in avoiding human and dynamic\nobstacles, thereby ensuring safe navigation. The implementations are open\nsource at: \\url{https://github.com/thanhnguyencanh/SGan-TEB.git}\n","authors":["Thanh Nguyen Canh","Xiem HoangVan","Nak Young Chong"],"pdf_url":"https://arxiv.org/pdf/2411.01814v1.pdf","comment":"In the 24th International Conference on Control, Automation, and\n Systems (ICCAS 2024), Jeju, Korea"},{"id":"http://arxiv.org/abs/2411.01813v1","updated":"2024-11-04T05:31:35Z","published":"2024-11-04T05:31:35Z","title":"So You Think You Can Scale Up Autonomous Robot Data Collection?","summary":" A long-standing goal in robot learning is to develop methods for robots to\nacquire new skills autonomously. While reinforcement learning (RL) comes with\nthe promise of enabling autonomous data collection, it remains challenging to\nscale in the real-world partly due to the significant effort required for\nenvironment design and instrumentation, including the need for designing reset\nfunctions or accurate success detectors. On the other hand, imitation learning\n(IL) methods require little to no environment design effort, but instead\nrequire significant human supervision in the form of collected demonstrations.\nTo address these shortcomings, recent works in autonomous IL start with an\ninitial seed dataset of human demonstrations that an autonomous policy can\nbootstrap from. While autonomous IL approaches come with the promise of\naddressing the challenges of autonomous RL as well as pure IL strategies, in\nthis work, we posit that such techniques do not deliver on this promise and are\nstill unable to scale up autonomous data collection in the real world. Through\na series of real-world experiments, we demonstrate that these approaches, when\nscaled up to realistic settings, face much of the same scaling challenges as\nprior attempts in RL in terms of environment design. 
Further, we perform a\nrigorous study of autonomous IL methods across different data scales and 7\nsimulation and real-world tasks, and demonstrate that while autonomous data\ncollection can modestly improve performance, simply collecting more human data\noften provides significantly more improvement. Our work suggests a negative\nresult: that scaling up autonomous data collection for learning robot policies\nfor real-world tasks is more challenging and impractical than what is suggested\nin prior work. We hope these insights about the core challenges of scaling up\ndata collection help inform future efforts in autonomous learning.\n","authors":["Suvir Mirchandani","Suneel Belkhale","Joey Hejna","Evelyn Choi","Md Sazzad Islam","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2411.01813v1.pdf","comment":"21 pages, 25 figures. Conference on Robot Learning (CoRL) 2024"},{"id":"http://arxiv.org/abs/2411.01804v1","updated":"2024-11-04T05:13:22Z","published":"2024-11-04T05:13:22Z","title":"Semantic Masking and Visual Feature Matching for Robust Localization","summary":" We are interested in long-term deployments of autonomous robots to aid\nastronauts with maintenance and monitoring operations in settings such as the\nInternational Space Station. Unfortunately, such environments tend to be highly\ndynamic and unstructured, and their frequent reconfiguration poses a challenge\nfor robust long-term localization of robots. Many state-of-the-art visual\nfeature-based localization algorithms are not robust towards spatial scene\nchanges, and SLAM algorithms, while promising, cannot run within the\nlow-compute budget available to space robots. To address this gap, we present a\ncomputationally efficient semantic masking approach for visual feature matching\nthat improves the accuracy and robustness of visual localization systems during\nlong-term deployment in changing environments. Our method introduces a\nlightweight check that enforces matches to be within long-term static objects\nand have consistent semantic classes. We evaluate this approach using both\nmap-based relocalization and relative pose estimation and show that it improves\nAbsolute Trajectory Error (ATE) and correct match ratios on the publicly\navailable Astrobee dataset. While this approach was originally developed for\nmicrogravity robotic freeflyers, it can be applied to any visual feature\nmatching pipeline to improve robustness.\n","authors":["Luisa Mao","Ryan Soussan","Brian Coltin","Trey Smith","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2411.01804v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2411.01796v1","updated":"2024-11-04T04:41:12Z","published":"2024-11-04T04:41:12Z","title":"Constrained Human-AI Cooperation: An Inclusive Embodied Social\n Intelligence Challenge","summary":" We introduce Constrained Human-AI Cooperation (CHAIC), an inclusive embodied\nsocial intelligence challenge designed to test social perception and\ncooperation in embodied agents. In CHAIC, the goal is for an embodied agent\nequipped with egocentric observations to assist a human who may be operating\nunder physical constraints -- e.g., unable to reach high places or confined to\na wheelchair -- in performing common household or outdoor tasks as efficiently\nas possible. 
To achieve this, a successful helper must: (1) infer the human's\nintents and constraints by following the human and observing their behaviors\n(social perception), and (2) make a cooperative plan tailored to the human\npartner to solve the task as quickly as possible, working together as a team\n(cooperative planning). To benchmark this challenge, we create four new agents\nwith real physical constraints and eight long-horizon tasks featuring both\nindoor and outdoor scenes with various constraints, emergency events, and\npotential risks. We benchmark planning- and learning-based baselines on the\nchallenge and introduce a new method that leverages large language models and\nbehavior modeling. Empirical evaluations demonstrate the effectiveness of our\nbenchmark in enabling systematic assessment of key aspects of machine social\nintelligence. Our benchmark and code are publicly available at this URL:\nhttps://github.com/UMass-Foundation-Model/CHAIC.\n","authors":["Weihua Du","Qiushi Lyu","Jiaming Shan","Zhenting Qi","Hongxin Zhang","Sunli Chen","Andi Peng","Tianmin Shu","Kwonjoon Lee","Behzad Dariush","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2411.01796v1.pdf","comment":"NeurIPS 2024 Dataset and Benchmark Track. Project at this URL:\n https://github.com/UMass-Foundation-Model/CHAIC"},{"id":"http://arxiv.org/abs/2411.01775v1","updated":"2024-11-04T03:54:00Z","published":"2024-11-04T03:54:00Z","title":"Eurekaverse: Environment Curriculum Generation via Large Language Models","summary":" Recent work has demonstrated that a promising strategy for teaching robots a\nwide range of complex skills is by training them on a curriculum of\nprogressively more challenging environments. However, developing an effective\ncurriculum of environment distributions currently requires significant\nexpertise, which must be repeated for every new domain. Our key insight is that\nenvironments are often naturally represented as code. Thus, we probe whether\neffective environment curriculum design can be achieved and automated via code\ngeneration by large language models (LLM). In this paper, we introduce\nEurekaverse, an unsupervised environment design algorithm that uses LLMs to\nsample progressively more challenging, diverse, and learnable environments for\nskill training. We validate Eurekaverse's effectiveness in the domain of\nquadrupedal parkour learning, in which a quadruped robot must traverse through\na variety of obstacle courses. The automatic curriculum designed by Eurekaverse\nenables gradual learning of complex parkour skills in simulation and can\nsuccessfully transfer to the real-world, outperforming manual training courses\ndesigned by humans.\n","authors":["William Liang","Sam Wang","Hung-Ju Wang","Osbert Bastani","Dinesh Jayaraman","Yecheng Jason Ma"],"pdf_url":"https://arxiv.org/pdf/2411.01775v1.pdf","comment":"Conference on Robot Learning (CoRL), 2024. Project website and code:\n https://eureka-research.github.io/eurekaverse"},{"id":"http://arxiv.org/abs/2403.01537v5","updated":"2024-11-04T03:49:50Z","published":"2024-03-03T15:30:59Z","title":"Mixed Strategy Nash Equilibrium for Crowd Navigation","summary":" Robots navigating in crowded areas should negotiate free space with humans\nrather than fully controlling collision avoidance, as this can lead to freezing\nbehavior. 
Game theory provides a framework for the robot to reason about\npotential cooperation from humans for collision avoidance during path planning.\nIn particular, the mixed strategy Nash equilibrium captures the negotiation\nbehavior under uncertainty, making it well suited for crowd navigation.\nHowever, computing the mixed strategy Nash equilibrium is often prohibitively\nexpensive for real-time decision-making. In this paper, we propose an iterative\nBayesian update scheme over probability distributions of trajectories. The\nalgorithm simultaneously generates a stochastic plan for the robot and\nprobabilistic predictions of other pedestrians' paths. We prove that the\nproposed algorithm is equivalent to solving a mixed strategy game for crowd\nnavigation, and the algorithm guarantees the recovery of the global Nash\nequilibrium of the game. We name our algorithm Bayesian Recursive Nash\nEquilibrium (BRNE) and develop a real-time model prediction crowd navigation\nframework. Since BRNE is not solving a general-purpose mixed strategy Nash\nequilibrium but a tailored formula specifically for crowd navigation, it can\ncompute the solution in real-time on a low-power embedded computer. We evaluate\nBRNE in both simulated environments and real-world pedestrian datasets. BRNE\nconsistently outperforms non-learning and learning-based methods regarding\nsafety and navigation efficiency. It also reaches human-level crowd navigation\nperformance in the pedestrian dataset benchmark. Lastly, we demonstrate the\npracticality of our algorithm with real humans on an untethered quadruped robot\nwith fully onboard perception and computation.\n","authors":["Max M. Sun","Francesca Baldini","Katie Hughes","Peter Trautman","Todd Murphey"],"pdf_url":"https://arxiv.org/pdf/2403.01537v5.pdf","comment":"Accepted to The International Journal of Robotics Research (IJRR)"},{"id":"http://arxiv.org/abs/2410.22031v2","updated":"2024-11-04T02:59:15Z","published":"2024-10-29T13:29:29Z","title":"A Degree of Flowability for Virtual Tubes","summary":" With the rapid development of robotics swarm technology, there are more tasks\nthat require the swarm to pass through complicated environments safely and\nefficiently. Virtual tube technology is a novel way to achieve this goal.\nVirtual tubes are free spaces connecting two places that provide safety\nboundaries and direction of motion for swarm robotics. How to determine the\ndesign quality of a virtual tube is a fundamental problem. For such a purpose,\nthis paper presents a degree of flowability (DOF) for two-dimensional virtual\ntubes according to a minimum energy principle. After that, methods to calculate\nDOF are proposed with a feasibility analysis. Simulations of swarm robotics in\ndifferent kinds of two-dimensional virtual tubes are performed to demonstrate\nthe effectiveness of the proposed method of calculating DOF.\n","authors":["Quan Quan","Shuhan Huang","Kai-Yuan Cai"],"pdf_url":"https://arxiv.org/pdf/2410.22031v2.pdf","comment":"22 pages, 16 figures. This is a preprint, currently under review for\n publication in Robotics and Autonomous Systems, Elsevier. 
Version 2 is\n submitted to fix the rendering fault in HTML and correct spelling mistakes in\n the abstract and the references"},{"id":"http://arxiv.org/abs/2411.00241v2","updated":"2024-11-04T02:54:31Z","published":"2024-10-31T22:45:10Z","title":"A Fast and Model Based Approach for Evaluating Task-Competence of\n Antagonistic Continuum Arms","summary":" Soft robot arms have made significant progress towards completing human-scale\ntasks, but designing arms for tasks with specific load and workspace\nrequirements remains difficult. A key challenge is the lack of model-based\ndesign tools, forcing advancement to occur through empirical iteration and\nobservation. Existing models are focused on control and rely on parameter fits,\nwhich means they cannot provide general conclusions about the mapping between\ndesign and performance or the influence of factors outside the fitting data. As\na first step toward model-based design tools, we introduce a novel method of\nanalyzing whether a proposed arm design can complete desired tasks. Our method\nis informative, interpretable, and fast; it provides novel metrics for\nquantifying a proposed arm design's ability to perform a task, it yields a\ngraphical interpretation of performance through segment forces, and computing\nit is over 80x faster than optimization based methods. Our formulation focuses\non antagonistic, pneumatically-driven soft arms. We demonstrate our approach\nthrough example analysis, and also through consideration of antagonistic vs\nnon-antagonistic designs. Our method enables fast, direct and task-specific\ncomparison of these two architectures, and provides a new visualization of the\ncomparative mechanics. While only a first step, the proposed approach will\nsupport advancement of model-based design tools, leading to highly capable soft\narms.\n","authors":["Bill Fan","Jacob Roulier","Gina Olson"],"pdf_url":"https://arxiv.org/pdf/2411.00241v2.pdf","comment":"8 pages, 7 figures. Submission for the 8th IEEE-RAS International\n Conference on Soft Robotics (RoboSoft 2025). For code, proofs, and other\n supplementary information, see\n https://github.com/wfan19/antagonistic-task-competency"},{"id":"http://arxiv.org/abs/2406.19464v2","updated":"2024-11-04T02:21:30Z","published":"2024-06-27T18:06:38Z","title":"ManiWAV: Learning Robot Manipulation from In-the-Wild Audio-Visual Data","summary":" Audio signals provide rich information for the robot interaction and object\nproperties through contact. This information can surprisingly ease the learning\nof contact-rich robot manipulation skills, especially when the visual\ninformation alone is ambiguous or incomplete. However, the usage of audio data\nin robot manipulation has been constrained to teleoperated demonstrations\ncollected by either attaching a microphone to the robot or object, which\nsignificantly limits its usage in robot learning pipelines. In this work, we\nintroduce ManiWAV: an 'ear-in-hand' data collection device to collect\nin-the-wild human demonstrations with synchronous audio and visual feedback,\nand a corresponding policy interface to learn robot manipulation policy\ndirectly from the demonstrations. We demonstrate the capabilities of our system\nthrough four contact-rich manipulation tasks that require either passively\nsensing the contact events and modes, or actively sensing the object surface\nmaterials and states. 
In addition, we show that our system can generalize to\nunseen in-the-wild environments by learning from diverse in-the-wild human\ndemonstrations.\n","authors":["Zeyi Liu","Cheng Chi","Eric Cousineau","Naveen Kuppuswamy","Benjamin Burchfiel","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2406.19464v2.pdf","comment":"Conference on Robot Learning (CoRL) 2024; Project website:\n https://maniwav.github.io/"},{"id":"http://arxiv.org/abs/2411.01725v1","updated":"2024-11-04T00:49:47Z","published":"2024-11-04T00:49:47Z","title":"A Probabilistic Formulation of LiDAR Mapping with Neural Radiance Fields","summary":" In this paper we reexamine the process through which a Neural Radiance Field\n(NeRF) can be trained to produce novel LiDAR views of a scene. Unlike image\napplications where camera pixels integrate light over time, LiDAR pulses arrive\nat specific times. As such, multiple LiDAR returns are possible for any given\ndetector and the classification of these returns is inherently probabilistic.\nApplying a traditional NeRF training routine can result in the network learning\nphantom surfaces in free space between conflicting range measurements, similar\nto how floater aberrations may be produced by an image model. We show that by\nformulating loss as an integral of probability (rather than as an integral of\noptical density) the network can learn multiple peaks for a given ray, allowing\nthe sampling of first, nth, or strongest returns from a single output channel.\nCode is available at https://github.com/mcdermatt/PLINK\n","authors":["Matthew McDermott","Jason Rife"],"pdf_url":"https://arxiv.org/pdf/2411.01725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02673v1","updated":"2024-11-04T23:15:21Z","published":"2024-11-04T23:15:21Z","title":"Multi-Transmotion: Pre-trained Model for Human Motion Prediction","summary":" The ability of intelligent systems to predict human behaviors is crucial,\nparticularly in fields such as autonomous vehicle navigation and social\nrobotics. However, the complexity of human motion has prevented the\ndevelopment of a standardized dataset for human motion prediction, thereby\nhindering the establishment of pre-trained models. In this paper, we address\nthese limitations by integrating multiple datasets, encompassing both\ntrajectory and 3D pose keypoints, to propose a pre-trained model for human\nmotion prediction. We merge seven distinct datasets across varying modalities\nand standardize their formats. To facilitate multimodal pre-training, we\nintroduce Multi-Transmotion, an innovative transformer-based model designed for\ncross-modality pre-training. Additionally, we present a novel masking strategy\nto capture rich representations. Our methodology demonstrates competitive\nperformance across various datasets on several downstream tasks, including\ntrajectory prediction in the NBA and JTA datasets, as well as pose prediction\nin the AMASS and 3DPW datasets. 
The code is publicly available:\nhttps://github.com/vita-epfl/multi-transmotion\n","authors":["Yang Gao","Po-Chien Luan","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2411.02673v1.pdf","comment":"CoRL 2024"},{"id":"http://arxiv.org/abs/2411.02651v1","updated":"2024-11-04T22:27:25Z","published":"2024-11-04T22:27:25Z","title":"Intelligent Magnetic Inspection Robot for Enhanced Structural Health\n Monitoring of Ferromagnetic Infrastructure","summary":" This paper presents an innovative solution to the issue of infrastructure\ndeterioration in the U.S., where a significant portion of facilities are in\npoor condition, and over 130,000 steel bridges have exceeded their lifespan.\nAging steel structures face corrosion and hidden defects, posing major safety\nrisks. The Silver Bridge collapse, resulting from an undetected flaw,\nhighlights the limitations of manual inspection methods, which often miss\nsubtle or concealed defects. Addressing the need for improved inspection\ntechnology, this work introduces an AI-powered magnetic inspection robot.\nEquipped with magnetic wheels, the robot adheres to and navigates complex\nferromagnetic surfaces, including challenging areas like vertical inclines and\ninternal corners, enabling thorough, large-scale inspections. Utilizing\nMobileNetV2, a deep learning model trained on steel surface defects, the system\nachieved an 85% precision rate across six defect types. This AI-driven\ninspection process enhances accuracy and reliability, outperforming traditional\nmethods in defect detection and efficiency. The findings suggest that combining\nrobotic mobility with AI-based image analysis offers a scalable, automated\napproach to infrastructure inspection, reducing human labor while improving\ndetection precision and the safety of critical assets.\n","authors":["Angelina Tseng","Sean Kalaycioglu"],"pdf_url":"https://arxiv.org/pdf/2411.02651v1.pdf","comment":"10 pages, 17 figures"},{"id":"http://arxiv.org/abs/2410.22527v2","updated":"2024-11-04T21:41:19Z","published":"2024-10-29T20:43:56Z","title":"Intelligent Mobility System with Integrated Motion Planning and Control\n Utilizing Infrastructure Sensor Nodes","summary":" This paper introduces a framework for an indoor autonomous mobility system\nthat can perform patient transfers and materials handling. Unlike traditional\nsystems that rely on onboard perception sensors, the proposed approach\nleverages a global perception and localization (PL) through Infrastructure\nSensor Nodes (ISNs) and cloud computing technology. Using the global PL, an\nintegrated Model Predictive Control (MPC)-based local planning and tracking\ncontroller augmented with Artificial Potential Field (APF) is developed,\nenabling reliable and efficient motion planning and obstacle avoidance ability\nwhile tracking predefined reference motions. Simulation results demonstrate the\neffectiveness of the proposed MPC controller in smoothly navigating around both\nstatic and dynamic obstacles. 
The proposed system has the potential to extend\nto intelligent connected autonomous vehicles, such as electric or cargo\ntransport vehicles with four-wheel independent drive/steering (4WID-4WIS)\nconfigurations.\n","authors":["Yufeng Yang","Minghao Ning","Shucheng Huang","Ehsan Hashemi","Amir Khajepour"],"pdf_url":"https://arxiv.org/pdf/2410.22527v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02624v1","updated":"2024-11-04T21:31:45Z","published":"2024-11-04T21:31:45Z","title":"Enhancing Indoor Mobility with Connected Sensor Nodes: A Real-Time,\n Delay-Aware Cooperative Perception Approach","summary":" This paper presents a novel real-time, delay-aware cooperative perception\nsystem designed for intelligent mobility platforms operating in dynamic indoor\nenvironments. The system contains a network of multi-modal sensor nodes and a\ncentral node that collectively provide perception services to mobility\nplatforms. The proposed Hierarchical Clustering Considering the Scanning\nPattern and Ground Contacting Feature based Lidar Camera Fusion improves\nintra-node perception in crowded environments. The system also features\ndelay-aware global perception to synchronize and aggregate data across nodes.\nTo validate our approach, we introduced the Indoor Pedestrian Tracking dataset,\ncompiled from data captured by two indoor sensor nodes. Our experiments,\ncompared to baselines, demonstrate significant improvements in detection\naccuracy and robustness against delays. The dataset is available in the\nrepository: https://github.com/NingMingHao/MVSLab-IndoorCooperativePerception\n","authors":["Minghao Ning","Yaodong Cui","Yufeng Yang","Shucheng Huang","Zhenan Liu","Ahmad Reza Alghooneh","Ehsan Hashemi","Amir Khajepour"],"pdf_url":"https://arxiv.org/pdf/2411.02624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02619v1","updated":"2024-11-04T21:15:17Z","published":"2024-11-04T21:15:17Z","title":"Tracking Tumors under Deformation from Partial Point Clouds using\n Occupancy Networks","summary":" To track tumors during surgery, information from preoperative CT scans is\nused to determine their position. However, as the surgeon operates, the tumor\nmay be deformed which presents a major hurdle for accurately resecting the\ntumor, and can lead to surgical inaccuracy, increased operation time, and\nexcessive margins. This issue is particularly pronounced in robot-assisted\npartial nephrectomy (RAPN), where the kidney undergoes significant deformations\nduring operation. Toward addressing this, we introduce an occupancy\nnetwork-based method for the localization of tumors within kidney phantoms\nundergoing deformations at interactive speeds. We validate our method by\nintroducing a 3D hydrogel kidney phantom embedded with exophytic and endophytic\nrenal tumors. It closely mimics real tissue mechanics to simulate kidney\ndeformation during in vivo surgery, providing excellent contrast and clear\ndelineation of tumor margins to enable automatic threshold-based segmentation.\nOur findings indicate that the proposed method can localize tumors in\nmoderately deforming kidneys with a margin of 6mm to 10mm, while providing\nessential volumetric 3D information at over 60Hz. 
This capability directly\nenables downstream tasks such as robotic resection.\n","authors":["Pit Henrich","Jiawei Liu","Jiawei Ge","Samuel Schmidgall","Lauren Shepard","Ahmed Ezzat Ghazi","Franziska Mathis-Ullrich","Axel Krieger"],"pdf_url":"https://arxiv.org/pdf/2411.02619v1.pdf","comment":"Accepted at IROS 2024"},{"id":"http://arxiv.org/abs/2411.02611v1","updated":"2024-11-04T21:05:40Z","published":"2024-11-04T21:05:40Z","title":"Advanced XR-Based 6-DOF Catheter Tracking System for Immersive Cardiac\n Intervention Training","summary":" Extended Reality (XR) technologies are gaining traction as effective tools\nfor medical training and procedural guidance, particularly in complex cardiac\ninterventions. This paper presents a novel system for real-time 3D tracking and\nvisualization of intracardiac echocardiography (ICE) catheters, with precise\nmeasurement of the roll angle. A custom 3D-printed setup, featuring orthogonal\ncameras, captures biplane video of the catheter, while a specialized computer\nvision algorithm reconstructs its 3D trajectory, localizing the tip with\nsub-millimeter accuracy and tracking the roll angle in real-time. The system's\ndata is integrated into an interactive Unity-based environment, rendered\nthrough the Meta Quest 3 XR headset, combining a dynamically tracked catheter\nwith a patient-specific 3D heart model. This immersive environment allows the\ntesting of the importance of 3D depth perception, in comparison to 2D\nprojections, as a form of visualization in XR. Our experimental study,\nconducted using the ICE catheter with six participants, suggests that 3D\nvisualization is not necessarily beneficial over 2D views offered by the XR\nsystem; although all cardiologists saw its utility for pre-operative training,\nplanning, and intra-operative guidance. The proposed system qualitatively shows\ngreat promise in transforming catheter-based interventions, particularly ICE\nprocedures, by improving visualization, interactivity, and skill development.\n","authors":["Mohsen Annabestani","Sandhya Sriram","S. Chiu Wong","Alexandros Sigaras","Bobak Mosadegh"],"pdf_url":"https://arxiv.org/pdf/2411.02611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18796v2","updated":"2024-11-04T20:58:13Z","published":"2024-02-29T01:56:41Z","title":"MOSAIC: A Modular System for Assistive and Interactive Cooking","summary":" We present MOSAIC, a modular architecture for home robots to perform complex\ncollaborative tasks, such as cooking with everyday users. MOSAIC tightly\ncollaborates with humans, interacts with users using natural language,\ncoordinates multiple robots, and manages an open vocabulary of everyday\nobjects. At its core, MOSAIC employs modularity: it leverages multiple\nlarge-scale pre-trained models for general tasks like language and image\nrecognition, while using streamlined modules designed for task-specific\ncontrol. We extensively evaluate MOSAIC on 60 end-to-end trials where two\nrobots collaborate with a human user to cook a combination of 6 recipes. We\nalso extensively test individual modules with 180 episodes of visuomotor\npicking, 60 episodes of human motion forecasting, and 46 online user\nevaluations of the task planner. We show that MOSAIC is able to efficiently\ncollaborate with humans by running the overall system end-to-end with a real\nhuman user, completing 68.3% (41/60) collaborative cooking trials of 6\ndifferent recipes with a subtask completion rate of 91.6%. 
Finally, we discuss\nthe limitations of the current system and exciting open challenges in this\ndomain. The project's website is at https://portal-cornell.github.io/MOSAIC/\n","authors":["Huaxiaoyue Wang","Kushal Kedia","Juntao Ren","Rahma Abdullah","Atiksh Bhardwaj","Angela Chao","Kelly Y Chen","Nathaniel Chin","Prithwish Dan","Xinyi Fan","Gonzalo Gonzalez-Pumariega","Aditya Kompella","Maximus Adrian Pace","Yash Sharma","Xiangwan Sun","Neha Sunkara","Sanjiban Choudhury"],"pdf_url":"https://arxiv.org/pdf/2402.18796v2.pdf","comment":"22 pages, 13 figures; CoRL 2024"},{"id":"http://arxiv.org/abs/2411.02599v1","updated":"2024-11-04T20:44:40Z","published":"2024-11-04T20:44:40Z","title":"Vocal Sandbox: Continual Learning and Adaptation for Situated\n Human-Robot Collaboration","summary":" We introduce Vocal Sandbox, a framework for enabling seamless human-robot\ncollaboration in situated environments. Systems in our framework are\ncharacterized by their ability to adapt and continually learn at multiple\nlevels of abstraction from diverse teaching modalities such as spoken dialogue,\nobject keypoints, and kinesthetic demonstrations. To enable such adaptation, we\ndesign lightweight and interpretable learning algorithms that allow users to\nbuild an understanding and co-adapt to a robot's capabilities in real-time, as\nthey teach new behaviors. For example, after demonstrating a new low-level\nskill for \"tracking around\" an object, users are provided with trajectory\nvisualizations of the robot's intended motion when asked to track a new object.\nSimilarly, users teach high-level planning behaviors through spoken dialogue,\nusing pretrained language models to synthesize behaviors such as \"packing an\nobject away\" as compositions of low-level skills $-$ concepts that can be\nreused and built upon. We evaluate Vocal Sandbox in two settings: collaborative\ngift bag assembly and LEGO stop-motion animation. In the first setting, we run\nsystematic ablations and user studies with 8 non-expert participants,\nhighlighting the impact of multi-level teaching. Across 23 hours of total robot\ninteraction time, users teach 17 new high-level behaviors with an average of 16\nnovel low-level skills, requiring 22.1% less active supervision compared to\nbaselines and yielding more complex autonomous performance (+19.7%) with fewer\nfailures (-67.1%). Qualitatively, users strongly prefer Vocal Sandbox systems\ndue to their ease of use (+20.6%) and overall performance (+13.9%). Finally, we\npair an experienced system-user with a robot to film a stop-motion animation;\nover two hours of continuous collaboration, the user teaches progressively more\ncomplex motion skills to shoot a 52 second (232 frame) movie.\n","authors":["Jennifer Grannen","Siddharth Karamcheti","Suvir Mirchandani","Percy Liang","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2411.02599v1.pdf","comment":"Published at CoRL 2024. 24 pages, 8 figures. Project Page:\n https://vocal-sandbox.github.io"},{"id":"http://arxiv.org/abs/2408.01333v2","updated":"2024-11-04T19:35:49Z","published":"2024-08-02T15:30:51Z","title":"Incorporating Control Inputs in Continuous-Time Gaussian Process State\n Estimation for Robotics","summary":" Continuous-time batch state estimation using Gaussian processes is an\nefficient approach to estimate the trajectories of robots over time. In the\npast, relatively simple physics-motivated priors have been considered for such\napproaches, using assumptions such as constant velocity or acceleration. 
This\npaper presents an approach to incorporating exogenous control inputs, such as\nvelocity or acceleration commands, into the continuous Gaussian process\nstate-estimation framework. It is shown that this approach generalizes across\ndifferent domains in robotics, making it applicable to both the estimation of\ncontinuous-time trajectories for mobile robots and the estimation of\nquasi-static continuum robot shapes. Results show that incorporating control\ninputs leads to more informed priors, potentially requiring less measurements\nand estimation nodes to obtain accurate estimates. This makes the approach\nparticularly useful in situations in which limited sensing is available.\n","authors":["Sven Lilge","Timothy D. Barfoot"],"pdf_url":"https://arxiv.org/pdf/2408.01333v2.pdf","comment":"17 pages, 5 figures, submitted to Robotica"},{"id":"http://arxiv.org/abs/2411.02553v1","updated":"2024-11-04T19:35:46Z","published":"2024-11-04T19:35:46Z","title":"Map++: Towards User-Participatory Visual SLAM Systems with Efficient Map\n Expansion and Sharing","summary":" Constructing precise 3D maps is crucial for the development of future\nmap-based systems such as self-driving and navigation. However, generating\nthese maps in complex environments, such as multi-level parking garages or\nshopping malls, remains a formidable challenge. In this paper, we introduce a\nparticipatory sensing approach that delegates map-building tasks to map users,\nthereby enabling cost-effective and continuous data collection. The proposed\nmethod harnesses the collective efforts of users, facilitating the expansion\nand ongoing update of the maps as the environment evolves.\n We realized this approach by developing Map++, an efficient system that\nfunctions as a plug-and-play extension, supporting participatory map-building\nbased on existing SLAM algorithms. Map++ addresses a plethora of scalability\nissues in this participatory map-building system by proposing a set of\nlightweight, application-layer protocols. We evaluated Map++ in four\nrepresentative settings: an indoor garage, an outdoor plaza, a public SLAM\nbenchmark, and a simulated environment. The results demonstrate that Map++ can\nreduce traffic volume by approximately 46% with negligible degradation in\nmapping accuracy, i.e., less than 0.03m compared to the baseline system. It can\nsupport approximately $2 \\times$ as many concurrent users as the baseline under\nthe same network bandwidth. Additionally, for users who travel on\nalready-mapped trajectories, they can directly utilize the existing maps for\nlocalization and save 47% of the CPU usage.\n","authors":["Xinran Zhang","Hanqi Zhu","Yifan Duan","Wuyang Zhang","Longfei Shangguan","Yu Zhang","Jianmin Ji","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.02553v1.pdf","comment":"15 pages, 15 figures. Accepted by MobiCom 2024"},{"id":"http://arxiv.org/abs/2411.02547v1","updated":"2024-11-04T19:31:03Z","published":"2024-11-04T19:31:03Z","title":"Modeling Uncertainty in 3D Gaussian Splatting through Continuous\n Semantic Splatting","summary":" In this paper, we present a novel algorithm for probabilistically updating\nand rasterizing semantic maps within 3D Gaussian Splatting (3D-GS). Although\nprevious methods have introduced algorithms which learn to rasterize features\nin 3D-GS for enhanced scene understanding, 3D-GS can fail without warning which\npresents a challenge for safety-critical robotic applications. 
To address this\ngap, we propose a method which advances the literature of continuous semantic\nmapping from voxels to ellipsoids, combining the precise structure of 3D-GS\nwith the ability to quantify uncertainty of probabilistic robotic maps. Given a\nset of images, our algorithm performs a probabilistic semantic update directly\non the 3D ellipsoids to obtain an expectation and variance through the use of\nconjugate priors. We also propose a probabilistic rasterization which returns\nper-pixel segmentation predictions with quantifiable uncertainty. We compare\nour method with similar probabilistic voxel-based methods to verify our\nextension to 3D ellipsoids, and perform ablation studies on uncertainty\nquantification and temporal smoothing.\n","authors":["Joey Wilson","Marcelino Almeida","Min Sun","Sachit Mahajan","Maani Ghaffari","Parker Ewen","Omid Ghasemalizadeh","Cheng-Hao Kuo","Arnie Sen"],"pdf_url":"https://arxiv.org/pdf/2411.02547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02524v1","updated":"2024-11-04T19:04:09Z","published":"2024-11-04T19:04:09Z","title":"SPACE: 3D Spatial Co-operation and Exploration Framework for Robust\n Mapping and Coverage with Multi-Robot Systems","summary":" In indoor environments, multi-robot visual (RGB-D) mapping and exploration\nhold immense potential for application in domains such as domestic service and\nlogistics, where deploying multiple robots in the same environment can\nsignificantly enhance efficiency. However, there are two primary challenges:\n(1) the \"ghosting trail\" effect, which occurs due to overlapping views of\nrobots impacting the accuracy and quality of point cloud reconstruction, and\n(2) the oversight of visual reconstructions in selecting the most effective\nfrontiers for exploration. Given these challenges are interrelated, we address\nthem together by proposing a new semi-distributed framework (SPACE) for spatial\ncooperation in indoor environments that enables enhanced coverage and 3D\nmapping. SPACE leverages geometric techniques, including \"mutual awareness\" and\na \"dynamic robot filter,\" to overcome spatial mapping constraints.\nAdditionally, we introduce a novel spatial frontier detection system and map\nmerger, integrated with an adaptive frontier assigner for optimal coverage\nbalancing the exploration and reconstruction objectives. In extensive\nROS-Gazebo simulations, SPACE demonstrated superior performance over\nstate-of-the-art approaches in both exploration and mapping metrics.\n","authors":["Sai Krishna Ghanta","Ramviyas Parasuraman"],"pdf_url":"https://arxiv.org/pdf/2411.02524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02482v1","updated":"2024-11-04T18:59:36Z","published":"2024-11-04T18:59:36Z","title":"NeRF-Aug: Data Augmentation for Robotics with Neural Radiance Fields","summary":" Training a policy that can generalize to unknown objects is a long standing\nchallenge within the field of robotics. The performance of a policy often drops\nsignificantly in situations where an object in the scene was not seen during\ntraining. To solve this problem, we present NeRF-Aug, a novel method that is\ncapable of teaching a policy to interact with objects that are not present in\nthe dataset. This approach differs from existing approaches by leveraging the\nspeed and photorealism of a neural radiance field for augmentation. NeRF- Aug\nboth creates more photorealistic data and runs 3.83 times faster than existing\nmethods. 
We demonstrate the effectiveness of our method on 4 tasks with 11\nnovel objects that have no expert demonstration data. We achieve an average\n69.1% success rate increase over existing methods. See video results at\nhttps://nerf-aug.github.io.\n","authors":["Eric Zhu","Mara Levy","Matthew Gwilliam","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2411.02482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02479v1","updated":"2024-11-04T18:38:50Z","published":"2024-11-04T18:38:50Z","title":"Digitizing Touch with an Artificial Multimodal Fingertip","summary":" Touch is a crucial sensing modality that provides rich information about\nobject properties and interactions with the physical environment. Humans and\nrobots both benefit from using touch to perceive and interact with the\nsurrounding environment (Johansson and Flanagan, 2009; Li et al., 2020;\nCalandra et al., 2017). However, no existing systems provide rich, multi-modal\ndigital touch-sensing capabilities through a hemispherical compliant\nembodiment. Here, we describe several conceptual and technological innovations\nto improve the digitization of touch. These advances are embodied in an\nartificial finger-shaped sensor with advanced sensing capabilities.\nSignificantly, this fingertip contains high-resolution sensors (~8.3 million\ntaxels) that respond to omnidirectional touch, capture multi-modal signals, and\nuse on-device artificial intelligence to process the data in real time.\nEvaluations show that the artificial fingertip can resolve spatial features as\nsmall as 7 um, sense normal and shear forces with a resolution of 1.01 mN and\n1.27 mN, respectively, perceive vibrations up to 10 kHz, sense heat, and even\nsense odor. Furthermore, it embeds an on-device AI neural network accelerator\nthat acts as a peripheral nervous system on a robot and mimics the reflex arc\nfound in humans. These results demonstrate the possibility of digitizing touch\nwith superhuman performance. The implications are profound, and we anticipate\npotential applications in robotics (industrial, medical, agricultural, and\nconsumer-level), virtual reality and telepresence, prosthetics, and e-commerce.\nToward digitizing touch at scale, we open-source a modular platform to\nfacilitate future research on the nature of touch.\n","authors":["Mike Lambeta","Tingfan Wu","Ali Sengul","Victoria Rose Most","Nolan Black","Kevin Sawyer","Romeo Mercado","Haozhi Qi","Alexander Sohn","Byron Taylor","Norb Tydingco","Gregg Kammerer","Dave Stroud","Jake Khatha","Kurt Jenkins","Kyle Most","Neal Stein","Ricardo Chavira","Thomas Craven-Bartle","Eric Sanchez","Yitian Ding","Jitendra Malik","Roberto Calandra"],"pdf_url":"https://arxiv.org/pdf/2411.02479v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2205.10933v3","updated":"2024-11-04T14:20:51Z","published":"2022-05-22T21:18:40Z","title":"AutoJoin: Efficient Adversarial Training against Gradient-Free\n Perturbations for Robust Maneuvering via Denoising Autoencoder and Joint\n Learning","summary":" With the growing use of machine learning algorithms and ubiquitous sensors,\nmany `perception-to-control' systems are being developed and deployed. To\nensure their trustworthiness, improving their robustness through adversarial\ntraining is one potential approach. We propose a gradient-free adversarial\ntraining technique, named AutoJoin, to effectively and efficiently produce\nrobust models for image-based maneuvering. 
Compared to other state-of-the-art\nmethods with testing on over 5M images, AutoJoin achieves significant\nperformance increases up to the 40% range against perturbations while improving\non clean performance up to 300%. AutoJoin is also highly efficient, saving up\nto 86% time per training epoch and 90% training data over other\nstate-of-the-art techniques. The core idea of AutoJoin is to use a decoder\nattachment to the original regression model creating a denoising autoencoder\nwithin the architecture. This architecture allows the tasks `maneuvering' and\n`denoising sensor input' to be jointly learnt and reinforce each other's\nperformance.\n","authors":["Michael Villarreal","Bibek Poudel","Ryan Wickman","Yu Shen","Weizi Li"],"pdf_url":"https://arxiv.org/pdf/2205.10933v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02468v1","updated":"2024-11-04T14:00:43Z","published":"2024-11-04T14:00:43Z","title":"Modeling and Simulation of a Multi Robot System Architecture","summary":" A Multi Robot System (MRS) is the infrastructure of an intelligent\ncyberphysical system, where the robots understand the need of the human, and\nhence cooperate together to fulfill this need. Modeling an MRS is a crucial\naspect of designing the proper system architecture, because this model can be\nused to simulate and measure the performance of the proposed architecture.\nHowever, an MRS solution architecture modeling is a very difficult problem, as\nit contains many dependent behaviors that dynamically change due to the current\nstatus of the overall system. In this paper, we introduce a general purpose MRS\ncase study, where the humans initiate requests that are achieved by the\navailable robots. These requests require different plans that use the current\ncapabilities of the available robots. After proposing an architecture that\ndefines the solution components, three steps are followed. First is modeling\nthese components via Business Process Model and Notation (BPMN) language. BPMN\nprovides a graphical notation to precisely represent the behaviors of every\ncomponent, which is an essential need to model the solution. Second is to\nsimulate these components behaviors and interaction in form of software agents.\nJava Agent DEvelopment (JADE) middleware has been used to develop and simulate\nthe proposed model. JADE is based on a reactive agent approach, therefore it\ncan dynamically represent the interaction among the solution components.\nFinally is to analyze the performance of the solution by defining a number of\nquantitative measurements, which can be obtained while simulating the system\nmodel in JADE middleware, therefore the solution can be analyzed and compared\nto another architecture.\n","authors":["Ahmed R. Sadik","Christian Goerick","Manuel Muehlig"],"pdf_url":"https://arxiv.org/pdf/2411.02468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06415v3","updated":"2024-11-04T12:57:25Z","published":"2022-12-13T07:48:24Z","title":"Collision probability reduction method for tracking control in automatic\n docking / berthing using reinforcement learning","summary":" Automation of berthing maneuvers in shipping is a pressing issue as the\nberthing maneuver is one of the most stressful tasks seafarers undertake.\nBerthing control problems are often tackled via tracking a predefined\ntrajectory or path. Maintaining a tracking error of zero under an uncertain\nenvironment is impossible; the tracking controller is nonetheless required to\nbring vessels close to desired berths. 
The tracking controller must prioritize\nthe avoidance of tracking errors that may cause collisions with obstacles. This\npaper proposes a training method based on reinforcement learning for a\ntrajectory tracking controller that reduces the probability of collisions with\nstatic obstacles. Via numerical simulations, we show that the proposed method\nreduces the probability of collisions during berthing maneuvers. Furthermore,\nthis paper shows the tracking performance in a model experiment.\n","authors":["Kouki Wakita","Youhei Akimoto","Dimas M. Rachman","Yoshiki Miyauchi","Umeda Naoya","Atsuo Maki"],"pdf_url":"https://arxiv.org/pdf/2212.06415v3.pdf","comment":"14 pages, 15 figures, Published by Journal of Marine Science and\n Technology"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.02397v1","updated":"2024-11-04T18:59:44Z","published":"2024-11-04T18:59:44Z","title":"Adaptive Caching for Faster Video Generation with Diffusion Transformers","summary":" Generating temporally-consistent high-fidelity videos can be computationally\nexpensive, especially over longer temporal spans. More-recent Diffusion\nTransformers (DiTs) -- despite making significant headway in this context --\nhave only heightened such challenges as they rely on larger models and heavier\nattention mechanisms, resulting in slower inference speeds. In this paper, we\nintroduce a training-free method to accelerate video DiTs, termed Adaptive\nCaching (AdaCache), which is motivated by the fact that \"not all videos are\ncreated equal\": meaning, some videos require fewer denoising steps to attain a\nreasonable quality than others. Building on this, we not only cache\ncomputations through the diffusion process, but also devise a caching schedule\ntailored to each video generation, maximizing the quality-latency trade-off. We\nfurther introduce a Motion Regularization (MoReg) scheme to utilize video\ninformation within AdaCache, essentially controlling the compute allocation\nbased on motion content. Altogether, our plug-and-play contributions grant\nsignificant inference speedups (e.g. up to 4.7x on Open-Sora 720p - 2s video\ngeneration) without sacrificing the generation quality, across multiple video\nDiT baselines.\n","authors":["Kumara Kahatapitiya","Haozhe Liu","Sen He","Ding Liu","Menglin Jia","Michael S. Ryoo","Tian Xie"],"pdf_url":"https://arxiv.org/pdf/2411.02397v1.pdf","comment":"Project-page is available at https://adacache-dit.github.io"},{"id":"http://arxiv.org/abs/2411.02394v1","updated":"2024-11-04T18:59:05Z","published":"2024-11-04T18:59:05Z","title":"AutoVFX: Physically Realistic Video Editing from Natural Language\n Instructions","summary":" Modern visual effects (VFX) software has made it possible for skilled artists\nto create imagery of virtually anything. However, the creation process remains\nlaborious, complex, and largely inaccessible to everyday users. In this work,\nwe present AutoVFX, a framework that automatically creates realistic and\ndynamic VFX videos from a single video and natural language instructions. By\ncarefully integrating neural scene modeling, LLM-based code generation, and\nphysical simulation, AutoVFX is able to provide physically-grounded,\nphotorealistic editing effects that can be controlled directly using natural\nlanguage instructions. We conduct extensive experiments to validate AutoVFX's\nefficacy across a diverse spectrum of videos and instructions. 
Quantitative and\nqualitative results suggest that AutoVFX outperforms all competing methods by a\nlarge margin in generative quality, instruction alignment, editing versatility,\nand physical plausibility.\n","authors":["Hao-Yu Hsu","Zhi-Hao Lin","Albert Zhai","Hongchi Xia","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.02394v1.pdf","comment":"Project page: https://haoyuhsu.github.io/autovfx-website/"},{"id":"http://arxiv.org/abs/2411.02395v1","updated":"2024-11-04T18:59:05Z","published":"2024-11-04T18:59:05Z","title":"Training-free Regional Prompting for Diffusion Transformers","summary":" Diffusion models have demonstrated excellent capabilities in text-to-image\ngeneration. Their semantic understanding (i.e., prompt following) ability has\nalso been greatly improved with large language models (e.g., T5, Llama).\nHowever, existing models cannot perfectly handle long and complex text prompts,\nespecially when the text prompts contain various objects with numerous\nattributes and interrelated spatial relationships. While many regional\nprompting methods have been proposed for UNet-based models (SD1.5, SDXL),\nthere are still no implementations based on the recent Diffusion Transformer\n(DiT) architecture, such as SD3 and FLUX.1. In this report, we propose and\nimplement regional prompting for FLUX.1 based on attention manipulation, which\nenables DiT with fine-grained compositional text-to-image generation\ncapability in a training-free manner. Code is available at\nhttps://github.com/antonioo-c/Regional-Prompting-FLUX.\n","authors":["Anthony Chen","Jianjin Xu","Wenzhao Zheng","Gaole Dai","Yida Wang","Renrui Zhang","Haofan Wang","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.02395v1.pdf","comment":"Code is available at\n https://github.com/antonioo-c/Regional-Prompting-FLUX"},{"id":"http://arxiv.org/abs/2411.02393v1","updated":"2024-11-04T18:58:01Z","published":"2024-11-04T18:58:01Z","title":"Adaptive Length Image Tokenization via Recurrent Allocation","summary":" Current vision systems typically assign fixed-length representations to\nimages, regardless of the information content. This contrasts with human\nintelligence - and even large language models - which allocate varying\nrepresentational capacities based on entropy, context and familiarity. Inspired\nby this, we propose an approach to learn variable-length token representations\nfor 2D images. Our encoder-decoder architecture recursively processes 2D image\ntokens, distilling them into 1D latent tokens over multiple iterations of\nrecurrent rollouts. Each iteration refines the 2D tokens, updates the existing\n1D latent tokens, and adaptively increases representational capacity by adding\nnew tokens. This enables compression of images into a variable number of\ntokens, ranging from 32 to 256. We validate our tokenizer using reconstruction\nloss and FID metrics, demonstrating that token count aligns with image entropy,\nfamiliarity and downstream task requirements. Recurrent token processing with\nincreasing representational capacity in each iteration shows signs of token\nspecialization, revealing potential for object / part discovery.\n","authors":["Shivam Duggal","Phillip Isola","Antonio Torralba","William T. 
Freeman"],"pdf_url":"https://arxiv.org/pdf/2411.02393v1.pdf","comment":"Code at: https://github.com/ShivamDuggal4/adaptive-length-tokenizer"},{"id":"http://arxiv.org/abs/2411.02385v1","updated":"2024-11-04T18:53:05Z","published":"2024-11-04T18:53:05Z","title":"How Far is Video Generation from World Model: A Physical Law Perspective","summary":" OpenAI's Sora highlights the potential of video generation for developing\nworld models that adhere to fundamental physical laws. However, the ability of\nvideo generation models to discover such laws purely from visual data without\nhuman priors can be questioned. A world model learning the true law should give\npredictions robust to nuances and correctly extrapolate on unseen scenarios. In\nthis work, we evaluate across three key scenarios: in-distribution,\nout-of-distribution, and combinatorial generalization. We developed a 2D\nsimulation testbed for object movement and collisions to generate videos\ndeterministically governed by one or more classical mechanics laws. This\nprovides an unlimited supply of data for large-scale experimentation and\nenables quantitative evaluation of whether the generated videos adhere to\nphysical laws. We trained diffusion-based video generation models to predict\nobject movements based on initial frames. Our scaling experiments show perfect\ngeneralization within the distribution, measurable scaling behavior for\ncombinatorial generalization, but failure in out-of-distribution scenarios.\nFurther experiments reveal two key insights about the generalization mechanisms\nof these models: (1) the models fail to abstract general physical rules and\ninstead exhibit \"case-based\" generalization behavior, i.e., mimicking the\nclosest training example; (2) when generalizing to new cases, models are\nobserved to prioritize different factors when referencing training data: color\n> size > velocity > shape. Our study suggests that scaling alone is\ninsufficient for video generation models to uncover fundamental physical laws,\ndespite its role in Sora's broader success. See our project page at\nhttps://phyworld.github.io\n","authors":["Bingyi Kang","Yang Yue","Rui Lu","Zhijie Lin","Yang Zhao","Kaixin Wang","Gao Huang","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2411.02385v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2410.23262v2","updated":"2024-11-04T18:44:20Z","published":"2024-10-30T17:46:31Z","title":"EMMA: End-to-End Multimodal Model for Autonomous Driving","summary":" We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving.\nBuilt on a multi-modal large language model foundation, EMMA directly maps raw\ncamera sensor data into various driving-specific outputs, including planner\ntrajectories, perception objects, and road graph elements. EMMA maximizes the\nutility of world knowledge from the pre-trained large language models, by\nrepresenting all non-sensor inputs (e.g. navigation instructions and ego\nvehicle status) and outputs (e.g. trajectories and 3D locations) as natural\nlanguage text. This approach allows EMMA to jointly process various driving\ntasks in a unified language space, and generate the outputs for each task using\ntask-specific prompts. Empirically, we demonstrate EMMA's effectiveness by\nachieving state-of-the-art performance in motion planning on nuScenes as well\nas competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also\nyields competitive results for camera-primary 3D object detection on the Waymo\nOpen Dataset (WOD). 
We show that co-training EMMA with planner trajectories,\nobject detection, and road graph tasks yields improvements across all three\ndomains, highlighting EMMA's potential as a generalist model for autonomous\ndriving applications. However, EMMA also exhibits certain limitations: it can\nprocess only a small amount of image frames, does not incorporate accurate 3D\nsensing modalities like LiDAR or radar and is computationally expensive. We\nhope that our results will inspire further research to mitigate these issues\nand to further evolve the state of the art in autonomous driving model\narchitectures.\n","authors":["Jyh-Jing Hwang","Runsheng Xu","Hubert Lin","Wei-Chih Hung","Jingwei Ji","Kristy Choi","Di Huang","Tong He","Paul Covington","Benjamin Sapp","Yin Zhou","James Guo","Dragomir Anguelov","Mingxing Tan"],"pdf_url":"https://arxiv.org/pdf/2410.23262v2.pdf","comment":"Blog post: https://waymo.com/blog/2024/10/introducing-emma/"},{"id":"http://arxiv.org/abs/2411.02372v1","updated":"2024-11-04T18:40:46Z","published":"2024-11-04T18:40:46Z","title":"Learning General-Purpose Biomedical Volume Representations using\n Randomized Synthesis","summary":" Current volumetric biomedical foundation models struggle to generalize as\npublic 3D datasets are small and do not cover the broad diversity of medical\nprocedures, conditions, anatomical regions, and imaging protocols. We address\nthis by creating a representation learning method that instead anticipates\nstrong domain shifts at training time itself. We first propose a data engine\nthat synthesizes highly variable training samples that enable generalization to\nnew biomedical contexts. To then train a single 3D network for any voxel-level\ntask, we develop a contrastive learning method that pretrains the network to be\nstable against nuisance imaging variation simulated by the data engine, a key\ninductive bias for generalization. This network's features can be used as\nrobust representations of input images for downstream tasks and its weights\nprovide a strong, dataset-agnostic initialization for finetuning on new\ndatasets. As a result, we set new standards across both multimodality\nregistration and few-shot segmentation, a first for any 3D biomedical vision\nmodel, all without (pre-)training on any existing dataset of real images.\n","authors":["Neel Dey","Benjamin Billot","Hallee E. Wong","Clinton J. Wang","Mengwei Ren","P. Ellen Grant","Adrian V. Dalca","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2411.02372v1.pdf","comment":"Code and model weights available at\n https://github.com/neel-dey/anatomix"},{"id":"http://arxiv.org/abs/2411.02354v1","updated":"2024-11-04T18:21:56Z","published":"2024-11-04T18:21:56Z","title":"Machine learning identification of maternal inflammatory response and\n histologic choroamnionitis from placental membrane whole slide images","summary":" The placenta forms a critical barrier to infection through pregnancy, labor\nand, delivery. Inflammatory processes in the placenta have short-term, and\nlong-term consequences for offspring health. Digital pathology and machine\nlearning can play an important role in understanding placental inflammation,\nand there have been very few investigations into methods for predicting and\nunderstanding Maternal Inflammatory Response (MIR). This work intends to\ninvestigate the potential of using machine learning to understand MIR based on\nwhole slide images (WSI), and establish early benchmarks. 
To that end, we use\nMultiple Instance Learning framework with 3 feature extractors: ImageNet-based\nEfficientNet-v2s, and 2 histopathology foundation models, UNI and Phikon to\ninvestigate predictability of MIR stage from histopathology WSIs. We also\ninterpret predictions from these models using the learned attention maps from\nthese models. We also use the MIL framework for predicting white blood cells\ncount (WBC) and maximum fever temperature ($T_{max}$). Attention-based MIL\nmodels are able to classify MIR with a balanced accuracy of up to 88.5% with a\nCohen's Kappa ($\\kappa$) of up to 0.772. Furthermore, we found that the\npathology foundation models (UNI and Phikon) are both able to achieve higher\nperformance with balanced accuracy and $\\kappa$, compared to ImageNet-based\nfeature extractor (EfficientNet-v2s). For WBC and $T_{max}$ prediction, we\nfound mild correlation between actual values and those predicted from\nhistopathology WSIs. We used MIL framework for predicting MIR stage from WSIs,\nand compared effectiveness of foundation models as feature extractors, with\nthat of an ImageNet-based model. We further investigated model failure cases\nand found them to be either edge cases prone to interobserver variability,\nexamples of pathologist's overreach, or mislabeled due to processing errors.\n","authors":["Abhishek Sharma","Ramin Nateghi","Marina Ayad","Lee A. D. Cooper","Jeffery A. Goldstein"],"pdf_url":"https://arxiv.org/pdf/2411.02354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02347v1","updated":"2024-11-04T18:17:44Z","published":"2024-11-04T18:17:44Z","title":"Physically Based Neural Bidirectional Reflectance Distribution Function","summary":" We introduce the physically based neural bidirectional reflectance\ndistribution function (PBNBRDF), a novel, continuous representation for\nmaterial appearance based on neural fields. Our model accurately reconstructs\nreal-world materials while uniquely enforcing physical properties for realistic\nBRDFs, specifically Helmholtz reciprocity via reparametrization and energy\npassivity via efficient analytical integration. We conduct a systematic\nanalysis demonstrating the benefits of adhering to these physical laws on the\nvisual quality of reconstructed materials. Additionally, we enhance the color\naccuracy of neural BRDFs by introducing chromaticity enforcement supervising\nthe norms of RGB channels. Through both qualitative and quantitative\nexperiments on multiple databases of measured real-world BRDFs, we show that\nadhering to these physical constraints enables neural fields to more faithfully\nand stably represent the original data and achieve higher rendering quality.\n","authors":["Chenliang Zhou","Alejandro Sztrajman","Gilles Rainer","Fangcheng Zhong","Fazilet Gokbudak","Zhilin Guo","Weihao Xia","Rafal Mantiuk","Cengiz Oztireli"],"pdf_url":"https://arxiv.org/pdf/2411.02347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02336v1","updated":"2024-11-04T17:59:39Z","published":"2024-11-04T17:59:39Z","title":"MVPaint: Synchronized Multi-View Diffusion for Painting Anything 3D","summary":" Texturing is a crucial step in the 3D asset production workflow, which\nenhances the visual appeal and diversity of 3D assets. Despite recent\nadvancements in Text-to-Texture (T2T) generation, existing methods often yield\nsubpar results, primarily due to local discontinuities, inconsistencies across\nmultiple views, and their heavy dependence on UV unwrapping outcomes. 
To tackle\nthese challenges, we propose a novel generation-refinement 3D texturing\nframework called MVPaint, which can generate high-resolution, seamless textures\nwhile emphasizing multi-view consistency. MVPaint mainly consists of three key\nmodules. 1) Synchronized Multi-view Generation (SMG). Given a 3D mesh model,\nMVPaint first simultaneously generates multi-view images by employing an SMG\nmodel, which leads to coarse texturing results with unpainted parts due to\nmissing observations. 2) Spatial-aware 3D Inpainting (S3I). To ensure complete\n3D texturing, we introduce the S3I method, specifically designed to effectively\ntexture previously unobserved areas. 3) UV Refinement (UVR). Furthermore,\nMVPaint employs a UVR module to improve the texture quality in the UV space,\nwhich first performs a UV-space Super-Resolution, followed by a Spatial-aware\nSeam-Smoothing algorithm for revising spatial texturing discontinuities caused\nby UV unwrapping. Moreover, we establish two T2T evaluation benchmarks: the\nObjaverse T2T benchmark and the GSO T2T benchmark, based on selected\nhigh-quality 3D meshes from the Objaverse dataset and the entire GSO dataset,\nrespectively. Extensive experimental results demonstrate that MVPaint surpasses\nexisting state-of-the-art methods. Notably, MVPaint could generate\nhigh-fidelity textures with minimal Janus issues and highly enhanced cross-view\nconsistency.\n","authors":["Wei Cheng","Juncheng Mu","Xianfang Zeng","Xin Chen","Anqi Pang","Chi Zhang","Zhibin Wang","Bin Fu","Gang Yu","Ziwei Liu","Liang Pan"],"pdf_url":"https://arxiv.org/pdf/2411.02336v1.pdf","comment":"Project Page: https://mvpaint.github.io"},{"id":"http://arxiv.org/abs/2411.02334v1","updated":"2024-11-04T17:58:54Z","published":"2024-11-04T17:58:54Z","title":"Diffusion-based Generative Multicasting with Intent-aware Semantic\n Decomposition","summary":" Generative diffusion models (GDMs) have recently shown great success in\nsynthesizing multimedia signals with high perceptual quality enabling highly\nefficient semantic communications in future wireless networks. In this paper,\nwe develop an intent-aware generative semantic multicasting framework utilizing\npre-trained diffusion models. In the proposed framework, the transmitter\ndecomposes the source signal to multiple semantic classes based on the\nmulti-user intent, i.e. each user is assumed to be interested in details of\nonly a subset of the semantic classes. The transmitter then sends to each user\nonly its intended classes, and multicasts a highly compressed semantic map to\nall users over shared wireless resources that allows them to locally synthesize\nthe other classes, i.e. non-intended classes, utilizing pre-trained diffusion\nmodels. The signal retrieved at each user is thereby partially reconstructed\nand partially synthesized utilizing the received semantic map. This improves\nutilization of the wireless resources, with better preserving privacy of the\nnon-intended classes. We design a communication/computation-aware scheme for\nper-class adaptation of the communication parameters, such as the transmission\npower and compression rate to minimize the total latency of retrieving signals\nat multiple receivers, tailored to the prevailing channel conditions as well as\nthe users reconstruction/synthesis distortion/perception requirements. 
The\nsimulation results demonstrate significantly reduced per-user latency compared\nwith non-generative and intent-unaware multicasting benchmarks while\nmaintaining high perceptual quality of the signals retrieved at the users.\n","authors":["Xinkai Liu","Mahdi Boloursaz Mashhadi","Li Qiao","Yi Ma","Rahim Tafazolli","Mehdi Bennis"],"pdf_url":"https://arxiv.org/pdf/2411.02334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02327v1","updated":"2024-11-04T17:50:36Z","published":"2024-11-04T17:50:36Z","title":"PPLLaVA: Varied Video Sequence Understanding With Prompt Guidance","summary":" The past year has witnessed the significant advancement of video-based large\nlanguage models. However, the challenge of developing a unified model for both\nshort and long video understanding remains unresolved. Most existing video LLMs\ncannot handle hour-long videos, while methods custom for long videos tend to be\nineffective for shorter videos and images. In this paper, we identify the key\nissue as the redundant content in videos. To address this, we propose a novel\npooling strategy that simultaneously achieves token compression and\ninstruction-aware visual feature aggregation. Our model is termed Prompt-guided\nPooling LLaVA, or PPLLaVA for short. Specifically, PPLLaVA consists of three\ncore components: the CLIP-based visual-prompt alignment that extracts visual\ninformation relevant to the user's instructions, the prompt-guided pooling that\ncompresses the visual sequence to arbitrary scales using convolution-style\npooling, and the clip context extension designed for lengthy prompt common in\nvisual dialogue. Moreover, our codebase also integrates the most advanced video\nDirect Preference Optimization (DPO) and visual interleave training. Extensive\nexperiments have validated the performance of our model. With superior\nthroughput and only 1024 visual context, PPLLaVA achieves better results on\nimage benchmarks as a video LLM, while achieving state-of-the-art performance\nacross various video benchmarks, excelling in tasks ranging from caption\ngeneration to multiple-choice questions, and handling video lengths from\nseconds to hours. Codes have been available at\nhttps://github.com/farewellthree/PPLLaVA.\n","authors":["Ruyang Liu","Haoran Tang","Haibo Liu","Yixiao Ge","Ying Shan","Chen Li","Jiankun Yang"],"pdf_url":"https://arxiv.org/pdf/2411.02327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02319v1","updated":"2024-11-04T17:45:44Z","published":"2024-11-04T17:45:44Z","title":"GenXD: Generating Any 3D and 4D Scenes","summary":" Recent developments in 2D visual generation have been remarkably successful.\nHowever, 3D and 4D generation remain challenging in real-world applications due\nto the lack of large-scale 4D data and effective model design. In this paper,\nwe propose to jointly investigate general 3D and 4D generation by leveraging\ncamera and object movements commonly observed in daily life. Due to the lack of\nreal-world 4D data in the community, we first propose a data curation pipeline\nto obtain camera poses and object motion strength from videos. Based on this\npipeline, we introduce a large-scale real-world 4D scene dataset: CamVid-30K.\nBy leveraging all the 3D and 4D data, we develop our framework, GenXD, which\nallows us to produce any 3D or 4D scene. We propose multiview-temporal modules,\nwhich disentangle camera and object movements, to seamlessly learn from both 3D\nand 4D data. 
Additionally, GenXD employs masked latent conditions to support a\nvariety of conditioning views. GenXD can generate videos that follow the camera\ntrajectory as well as consistent 3D views that can be lifted into 3D\nrepresentations. We perform extensive evaluations across various real-world and\nsynthetic datasets, demonstrating GenXD's effectiveness and versatility\ncompared to previous methods in 3D and 4D generation.\n","authors":["Yuyang Zhao","Chung-Ching Lin","Kevin Lin","Zhiwen Yan","Linjie Li","Zhengyuan Yang","Jianfeng Wang","Gim Hee Lee","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.02319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18145v2","updated":"2024-11-04T17:31:40Z","published":"2024-07-25T15:49:26Z","title":"Taxonomy-Aware Continual Semantic Segmentation in Hyperbolic Spaces for\n Open-World Perception","summary":" Semantic segmentation models are typically trained on a fixed set of classes,\nlimiting their applicability in open-world scenarios. Class-incremental\nsemantic segmentation aims to update models with emerging new classes while\npreventing catastrophic forgetting of previously learned ones. However,\nexisting methods impose strict rigidity on old classes, reducing their\neffectiveness in learning new incremental classes. In this work, we propose\nTaxonomy-Oriented Poincar\\'e-regularized Incremental-Class Segmentation\n(TOPICS) that learns feature embeddings in hyperbolic space following explicit\ntaxonomy-tree structures. This supervision provides plasticity for old classes,\nupdating ancestors based on new classes while integrating new classes at\nfitting positions. Additionally, we maintain implicit class relational\nconstraints on the geometric basis of the Poincar\\'e ball. This ensures that\nthe latent space can continuously adapt to new constraints while maintaining a\nrobust structure to combat catastrophic forgetting. We also establish eight\nrealistic incremental learning protocols for autonomous driving scenarios,\nwhere novel classes can originate from known classes or the background.\nExtensive evaluations of TOPICS on the Cityscapes and Mapillary Vistas 2.0\nbenchmarks demonstrate that it achieves state-of-the-art performance. We make\nthe code and trained models publicly available at\nhttp://topics.cs.uni-freiburg.de.\n","authors":["Julia Hindel","Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2407.18145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10376v2","updated":"2024-11-04T17:28:54Z","published":"2024-02-16T00:04:36Z","title":"Interpreting CLIP with Sparse Linear Concept Embeddings (SpLiCE)","summary":" CLIP embeddings have demonstrated remarkable performance across a wide range\nof multimodal applications. However, these high-dimensional, dense vector\nrepresentations are not easily interpretable, limiting our understanding of the\nrich structure of CLIP and its use in downstream applications that require\ntransparency. In this work, we show that the semantic structure of CLIP's\nlatent space can be leveraged to provide interpretability, allowing for the\ndecomposition of representations into semantic concepts. We formulate this\nproblem as one of sparse recovery and propose a novel method, Sparse Linear\nConcept Embeddings, for transforming CLIP representations into sparse linear\ncombinations of human-interpretable concepts. 
Distinct from previous work,\nSpLiCE is task-agnostic and can be used, without training, to explain and even\nreplace traditional dense CLIP representations, maintaining high downstream\nperformance while significantly improving their interpretability. We also\ndemonstrate significant use cases of SpLiCE representations including detecting\nspurious correlations and model editing.\n","authors":["Usha Bhalla","Alex Oesterling","Suraj Srinivas","Flavio P. Calmon","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2402.10376v2.pdf","comment":"25 pages, 15 figures, NeurIPS 2024. Code is provided at\n https://github.com/AI4LIFE-GROUP/SpLiCE"},{"id":"http://arxiv.org/abs/2411.02299v1","updated":"2024-11-04T17:25:10Z","published":"2024-11-04T17:25:10Z","title":"Grouped Discrete Representation for Object-Centric Learning","summary":" Object-Centric Learning (OCL) can discover objects in images or videos by\nsimply reconstructing the input. For better object discovery, representative\nOCL methods reconstruct the input as its Variational Autoencoder (VAE)\nintermediate representation, which suppresses pixel noises and promotes object\nseparability by discretizing continuous super-pixels with template features.\nHowever, treating features as units overlooks their composing attributes, thus\nimpeding model generalization; indexing features with scalar numbers loses\nattribute-level similarities and differences, thus hindering model convergence.\nWe propose \\textit{Grouped Discrete Representation} (GDR) for OCL. We decompose\nfeatures into combinatorial attributes via organized channel grouping, and\ncompose these attributes into discrete representation via tuple indexes.\nExperiments show that our GDR improves both Transformer- and Diffusion-based\nOCL methods consistently on various datasets. Visualizations show that our GDR\ncaptures better object separability.\n","authors":["Rongzhen Zhao","Vivienne Wang","Juho Kannala","Joni Pajarinen"],"pdf_url":"https://arxiv.org/pdf/2411.02299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02293v1","updated":"2024-11-04T17:21:42Z","published":"2024-11-04T17:21:42Z","title":"Hunyuan3D-1.0: A Unified Framework for Text-to-3D and Image-to-3D\n Generation","summary":" While 3D generative models have greatly improved artists' workflows, the\nexisting diffusion models for 3D generation suffer from slow generation and\npoor generalization. To address this issue, we propose a two-stage approach\nnamed Hunyuan3D-1.0 including a lite version and a standard version, that both\nsupport text- and image-conditioned generation. In the first stage, we employ a\nmulti-view diffusion model that efficiently generates multi-view RGB in\napproximately 4 seconds. These multi-view images capture rich details of the 3D\nasset from different viewpoints, relaxing the tasks from single-view to\nmulti-view reconstruction. In the second stage, we introduce a feed-forward\nreconstruction model that rapidly and faithfully reconstructs the 3D asset\ngiven the generated multi-view images in approximately 7 seconds. The\nreconstruction network learns to handle noises and in-consistency introduced by\nthe multi-view diffusion and leverages the available information from the\ncondition image to efficiently recover the 3D structure. % Extensive\nexperimental results demonstrate the effectiveness of Hunyuan3D-1.0 in\ngenerating high-quality 3D assets. 
Our framework involves the text-to-image\nmodel ~\\ie, Hunyuan-DiT, making it a unified framework to support both text-\nand image-conditioned 3D generation. Our standard version has $10\\times$ more\nparameters than our lite and other existing model. Our Hunyuan3D-1.0 achieves\nan impressive balance between speed and quality, significantly reducing\ngeneration time while maintaining the quality and diversity of the produced\nassets.\n","authors":["Xianghui Yang","Huiwen Shi","Bowen Zhang","Fan Yang","Jiacheng Wang","Hongxu Zhao","Xinhai Liu","Xinzhou Wang","Qingxiang Lin","Jiaao Yu","Lifu Wang","Zhuo Chen","Sicong Liu","Yuhong Liu","Yong Yang","Di Wang","Jie Jiang","Chunchao Guo"],"pdf_url":"https://arxiv.org/pdf/2411.02293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02281v1","updated":"2024-11-04T17:09:58Z","published":"2024-11-04T17:09:58Z","title":"Conformal-in-the-Loop for Learning with Imbalanced Noisy Data","summary":" Class imbalance and label noise are pervasive in large-scale datasets, yet\nmuch of machine learning research assumes well-labeled, balanced data, which\nrarely reflects real world conditions. Existing approaches typically address\neither label noise or class imbalance in isolation, leading to suboptimal\nresults when both issues coexist. In this work, we propose\nConformal-in-the-Loop (CitL), a novel training framework that addresses both\nchallenges with a conformal prediction-based approach. CitL evaluates sample\nuncertainty to adjust weights and prune unreliable examples, enhancing model\nresilience and accuracy with minimal computational cost. Our extensive\nexperiments include a detailed analysis showing how CitL effectively emphasizes\nimpactful data in noisy, imbalanced datasets. Our results show that CitL\nconsistently boosts model performance, achieving up to a 6.1% increase in\nclassification accuracy and a 5.0 mIoU improvement in segmentation. Our code is\npublicly available: CitL.\n","authors":["John Brandon Graham-Knight","Jamil Fayyad","Nourhan Bayasi","Patricia Lasserre","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2411.02281v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2406.04280v2","updated":"2024-11-04T17:02:45Z","published":"2024-06-06T17:26:40Z","title":"xMIL: Insightful Explanations for Multiple Instance Learning in\n Histopathology","summary":" Multiple instance learning (MIL) is an effective and widely used approach for\nweakly supervised machine learning. In histopathology, MIL models have achieved\nremarkable success in tasks like tumor detection, biomarker prediction, and\noutcome prognostication. However, MIL explanation methods are still lagging\nbehind, as they are limited to small bag sizes or disregard instance\ninteractions. We revisit MIL through the lens of explainable AI (XAI) and\nintroduce xMIL, a refined framework with more general assumptions. We\ndemonstrate how to obtain improved MIL explanations using layer-wise relevance\npropagation (LRP) and conduct extensive evaluation experiments on three toy\nsettings and four real-world histopathology datasets. Our approach consistently\noutperforms previous explanation attempts with particularly improved\nfaithfulness scores on challenging biomarker prediction tasks. Finally, we\nshowcase how xMIL explanations enable pathologists to extract insights from MIL\nmodels, representing a significant advance for knowledge discovery and model\ndebugging in digital histopathology. 
Codes are available at:\nhttps://github.com/tubml-pathology/xMIL.\n","authors":["Julius Hense","Mina Jamshidi Idaji","Oliver Eberle","Thomas Schnake","Jonas Dippel","Laure Ciernik","Oliver Buchstab","Andreas Mock","Frederick Klauschen","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04280v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02256v1","updated":"2024-11-04T16:46:53Z","published":"2024-11-04T16:46:53Z","title":"Unified Speech Recognition: A Single Model for Auditory, Visual, and\n Audiovisual Inputs","summary":" Research in auditory, visual, and audiovisual speech recognition (ASR, VSR,\nand AVSR, respectively) has traditionally been conducted independently. Even\nrecent self-supervised studies addressing two or all three tasks simultaneously\ntend to yield separate models, leading to disjoint inference pipelines with\nincreased memory requirements and redundancies. This paper proposes unified\ntraining strategies for these systems. We demonstrate that training a single\nmodel for all three tasks enhances VSR and AVSR performance, overcoming typical\noptimisation challenges when training from scratch. Moreover, we introduce a\ngreedy pseudo-labelling approach to more effectively leverage unlabelled\nsamples, addressing shortcomings in related self-supervised methods. Finally,\nwe develop a self-supervised pre-training method within our framework, proving\nits effectiveness alongside our semi-supervised approach. Despite using a\nsingle model for all tasks, our unified approach achieves state-of-the-art\nperformance compared to recent methods on LRS3 and LRS2 for ASR, VSR, and AVSR,\nas well as on the newly released WildVSR dataset. Code and models are available\nat https://github.com/ahaliassos/usr.\n","authors":["Alexandros Haliassos","Rodrigo Mira","Honglie Chen","Zoe Landgraf","Stavros Petridis","Maja Pantic"],"pdf_url":"https://arxiv.org/pdf/2411.02256v1.pdf","comment":"NeurIPS 2024. Code: https://github.com/ahaliassos/usr"},{"id":"http://arxiv.org/abs/2411.00225v2","updated":"2024-11-04T16:46:01Z","published":"2024-10-31T21:52:33Z","title":"Fashion-VDM: Video Diffusion Model for Virtual Try-On","summary":" We present Fashion-VDM, a video diffusion model (VDM) for generating virtual\ntry-on videos. Given an input garment image and person video, our method aims\nto generate a high-quality try-on video of the person wearing the given\ngarment, while preserving the person's identity and motion. Image-based virtual\ntry-on has shown impressive results; however, existing video virtual try-on\n(VVT) methods are still lacking garment details and temporal consistency. To\naddress these issues, we propose a diffusion-based architecture for video\nvirtual try-on, split classifier-free guidance for increased control over the\nconditioning inputs, and a progressive temporal training strategy for\nsingle-pass 64-frame, 512px video generation. We also demonstrate the\neffectiveness of joint image-video training for video try-on, especially when\nvideo data is limited. Our qualitative and quantitative experiments show that\nour approach sets the new state-of-the-art for video virtual try-on. 
For\nadditional results, visit our project page:\nhttps://johannakarras.github.io/Fashion-VDM.\n","authors":["Johanna Karras","Yingwei Li","Nan Liu","Luyang Zhu","Innfarn Yoo","Andreas Lugmayr","Chris Lee","Ira Kemelmacher-Shlizerman"],"pdf_url":"https://arxiv.org/pdf/2411.00225v2.pdf","comment":"Accepted to SIGGRAPH Asia 2024"},{"id":"http://arxiv.org/abs/2405.07257v3","updated":"2024-11-04T16:42:38Z","published":"2024-05-12T11:41:44Z","title":"SPEAK: Speech-Driven Pose and Emotion-Adjustable Talking Head Generation","summary":" Most earlier researches on talking face generation have focused on the\nsynchronization of lip motion and speech content. However, head pose and facial\nemotions are equally important characteristics of natural faces. While\naudio-driven talking face generation has seen notable advancements, existing\nmethods either overlook facial emotions or are limited to specific individuals\nand cannot be applied to arbitrary subjects. In this paper, we propose a novel\none-shot Talking Head Generation framework (SPEAK) that distinguishes itself\nfrom the general Talking Face Generation by enabling emotional and postural\ncontrol. Specifically, we introduce Inter-Reconstructed Feature Disentanglement\n(IRFD) module to decouple facial features into three latent spaces. Then we\ndesign a face editing module that modifies speech content and facial latent\ncodes into a single latent space. Subsequently, we present a novel generator\nthat employs modified latent codes derived from the editing module to regulate\nemotional expression, head poses, and speech content in synthesizing facial\nanimations. Extensive trials demonstrate that our method ensures lip\nsynchronization with the audio while enabling decoupled control of facial\nfeatures, it can generate realistic talking head with coordinated lip motions,\nauthentic facial emotions, and smooth head movements. The demo video is\navailable: https://anonymous.4open.science/r/SPEAK-8A22\n","authors":["Changpeng Cai","Guinan Guo","Jiao Li","Junhao Su","Fei Shen","Chenghao He","Jing Xiao","Yuanxu Chen","Lei Dai","Feiyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.07257v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02236v1","updated":"2024-11-04T16:30:14Z","published":"2024-11-04T16:30:14Z","title":"3D Audio-Visual Segmentation","summary":" Recognizing the sounding objects in scenes is a longstanding objective in\nembodied AI, with diverse applications in robotics and AR/VR/MR. To that end,\nAudio-Visual Segmentation (AVS), taking as condition an audio signal to\nidentify the masks of the target sounding objects in an input image with\nsynchronous camera and microphone sensors, has been recently advanced. However,\nthis paradigm is still insufficient for real-world operation, as the mapping\nfrom 2D images to 3D scenes is missing. To address this fundamental limitation,\nwe introduce a novel research problem, 3D Audio-Visual Segmentation, extending\nthe existing AVS to the 3D output space. This problem poses more challenges due\nto variations in camera extrinsics, audio scattering, occlusions, and diverse\nacoustics across sounding object categories. To facilitate this research, we\ncreate the very first simulation based benchmark, 3DAVS-S34-O7, providing\nphotorealistic 3D scene environments with grounded spatial audio under\nsingle-instance and multi-instance settings, across 34 scenes and 7 object\ncategories. 
This is made possible by re-purposing the Habitat simulator to\ngenerate comprehensive annotations of sounding object locations and\ncorresponding 3D masks. Subsequently, we propose a new approach, EchoSegnet,\ncharacterized by integrating the ready-to-use knowledge from pretrained 2D\naudio-visual foundation models synergistically with 3D visual scene\nrepresentation through spatial audio-aware mask alignment and refinement.\nExtensive experiments demonstrate that EchoSegnet can effectively segment\nsounding objects in 3D space on our new benchmark, representing a significant\nadvancement in the field of embodied AI. Project page:\nhttps://surrey-uplab.github.io/research/3d-audio-visual-segmentation/\n","authors":["Artem Sokolov","Swapnil Bhosale","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.02236v1.pdf","comment":"Accepted at the NeurIPS 2024 Workshop on Audio Imagination"},{"id":"http://arxiv.org/abs/2411.02229v1","updated":"2024-11-04T16:21:00Z","published":"2024-11-04T16:21:00Z","title":"FewViewGS: Gaussian Splatting with Few View Matching and Multi-stage\n Training","summary":" The field of novel view synthesis from images has seen rapid advancements\nwith the introduction of Neural Radiance Fields (NeRF) and more recently with\n3D Gaussian Splatting. Gaussian Splatting became widely adopted due to its\nefficiency and ability to render novel views accurately. While Gaussian\nSplatting performs well when a sufficient amount of training images are\navailable, its unstructured explicit representation tends to overfit in\nscenarios with sparse input images, resulting in poor rendering performance. To\naddress this, we present a 3D Gaussian-based novel view synthesis method using\nsparse input images that can accurately render the scene from the viewpoints\nnot covered by the training images. We propose a multi-stage training scheme\nwith matching-based consistency constraints imposed on the novel views without\nrelying on pre-trained depth estimation or diffusion models. This is achieved\nby using the matches of the available training images to supervise the\ngeneration of the novel views sampled between the training frames with color,\ngeometry, and semantic losses. In addition, we introduce a locality preserving\nregularization for 3D Gaussians which removes rendering artifacts by preserving\nthe local color structure of the scene. Evaluation on synthetic and real-world\ndatasets demonstrates competitive or superior performance of our method in\nfew-shot novel view synthesis compared to existing state-of-the-art methods.\n","authors":["Ruihong Yin","Vladimir Yugay","Yue Li","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2411.02229v1.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2411.02220v1","updated":"2024-11-04T16:14:35Z","published":"2024-11-04T16:14:35Z","title":"SIRA: Scalable Inter-frame Relation and Association for Radar Perception","summary":" Conventional radar feature extraction faces limitations due to low spatial\nresolution, noise, multipath reflection, the presence of ghost targets, and\nmotion blur. Such limitations can be exacerbated by nonlinear object motion,\nparticularly from an ego-centric viewpoint. It becomes evident that to address\nthese challenges, the key lies in exploiting temporal feature relation over an\nextended horizon and enforcing spatial motion consistency for effective\nassociation. To this end, this paper proposes SIRA (Scalable Inter-frame\nRelation and Association) with two designs. 
First, inspired by Swin\nTransformer, we introduce extended temporal relation, generalizing the existing\ntemporal relation layer from two consecutive frames to multiple inter-frames\nwith temporally regrouped window attention for scalability. Second, we propose\nmotion consistency track with the concept of a pseudo-tracklet generated from\nobservational data for better trajectory prediction and subsequent object\nassociation. Our approach achieves 58.11 mAP@0.5 for oriented object detection\nand 47.79 MOTA for multiple object tracking on the Radiate dataset, surpassing\nprevious state-of-the-art by a margin of +4.11 mAP@0.5 and +9.94 MOTA,\nrespectively.\n","authors":["Ryoma Yataka","Pu Perry Wang","Petros Boufounos","Ryuhei Takahashi"],"pdf_url":"https://arxiv.org/pdf/2411.02220v1.pdf","comment":"25 pages, Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2411.02210v1","updated":"2024-11-04T16:04:59Z","published":"2024-11-04T16:04:59Z","title":"One VLM to Keep it Learning: Generation and Balancing for Data-free\n Continual Visual Question Answering","summary":" Vision-Language Models (VLMs) have shown significant promise in Visual\nQuestion Answering (VQA) tasks by leveraging web-scale multimodal datasets.\nHowever, these models often struggle with continual learning due to\ncatastrophic forgetting when adapting to new tasks. As an effective remedy to\nmitigate catastrophic forgetting, rehearsal strategy uses the data of past\ntasks upon learning new task. However, such strategy incurs the need of storing\npast data, which might not be feasible due to hardware constraints or privacy\nconcerns. In this work, we propose the first data-free method that leverages\nthe language generation capability of a VLM, instead of relying on external\nmodels, to produce pseudo-rehearsal data for addressing continual VQA. Our\nproposal, named as GaB, generates pseudo-rehearsal data by posing previous task\nquestions on new task data. Yet, despite being effective, the distribution of\ngenerated questions skews towards the most frequently posed questions due to\nthe limited and task-specific training data. To mitigate this issue, we\nintroduce a pseudo-rehearsal balancing module that aligns the generated data\ntowards the ground-truth data distribution using either the question\nmeta-statistics or an unsupervised clustering method. We evaluate our proposed\nmethod on two recent benchmarks, \\ie VQACL-VQAv2 and CLOVE-function benchmarks.\nGaB outperforms all the data-free baselines with substantial improvement in\nmaintaining VQA performance across evolving tasks, while being on-par with\nmethods with access to the past data.\n","authors":["Deepayan Das","Davide Talon","Massimiliano Mancini","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2411.02210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15127v2","updated":"2024-11-04T15:54:21Z","published":"2024-04-23T15:27:19Z","title":"GSCo: Towards Generalizable AI in Medicine via Generalist-Specialist\n Collaboration","summary":" Generalist foundation models (GFMs) are renowned for their exceptional\ncapability and flexibility in effectively generalizing across diverse tasks and\nmodalities. In the field of medicine, while GFMs exhibit superior\ngeneralizability based on their extensive intrinsic knowledge as well as\nproficiency in instruction following and in-context learning, specialist models\nexcel in precision due to their domain knowledge. 
In this work, for the first\ntime, we explore the synergy between the GFM and specialist models, to enable\nprecise medical image analysis on a broader scope. Specifically, we propose a\ncooperative framework, Generalist-Specialist Collaboration (GSCo), which\nconsists of two stages, namely the construction of GFM and specialists, and\ncollaborative inference on downstream tasks. In the construction stage, we\ndevelop MedDr, the largest open-source GFM tailored for medicine, showcasing\nexceptional instruction-following and in-context learning capabilities.\nMeanwhile, a series of lightweight specialists are crafted for downstream tasks\nwith low computational cost. In the collaborative inference stage, we introduce\ntwo cooperative mechanisms, Mixture-of-Expert Diagnosis and Retrieval-Augmented\nDiagnosis, to harvest the generalist's in-context learning abilities alongside\nthe specialists' domain expertise. For a comprehensive evaluation, we curate a\nlarge-scale benchmark featuring 28 datasets and about 250,000 images. Extensive\nresults demonstrate that MedDr consistently outperforms state-of-the-art GFMs\non downstream datasets. Furthermore, GSCo exceeds both GFMs and specialists\nacross all out-of-domain disease diagnosis datasets. These findings indicate a\nsignificant paradigm shift in the application of GFMs, transitioning from\nseparate models for specific tasks to a collaborative approach between GFMs and\nspecialists, thereby advancing the frontiers of generalizable AI in medicine.\n","authors":["Sunan He","Yuxiang Nie","Hongmei Wang","Shu Yang","Yihui Wang","Zhiyuan Cai","Zhixuan Chen","Yingxue Xu","Luyang Luo","Huiling Xiang","Xi Lin","Mingxiang Wu","Yifan Peng","George Shih","Ziyang Xu","Xian Wu","Qiong Wang","Ronald Cheong Kin Chan","Varut Vardhanabhuti","Winnie Chiu Wing Chu","Yefeng Zheng","Pranav Rajpurkar","Kang Zhang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15127v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20915v2","updated":"2024-11-04T15:48:10Z","published":"2024-05-31T15:21:44Z","title":"Fast yet Safe: Early-Exiting with Risk Control","summary":" Scaling machine learning models significantly improves their performance.\nHowever, such gains come at the cost of inference being slow and\nresource-intensive. Early-exit neural networks (EENNs) offer a promising\nsolution: they accelerate inference by allowing intermediate layers to exit and\nproduce a prediction early. Yet a fundamental issue with EENNs is how to\ndetermine when to exit without severely degrading performance. In other words,\nwhen is it 'safe' for an EENN to go 'fast'? To address this issue, we\ninvestigate how to adapt frameworks of risk control to EENNs. Risk control\noffers a distribution-free, post-hoc solution that tunes the EENN's exiting\nmechanism so that exits only occur when the output is of sufficient quality. We\nempirically validate our insights on a range of vision and language tasks,\ndemonstrating that risk control can produce substantial computational savings,\nall the while preserving user-specified performance goals.\n","authors":["Metod Jazbec","Alexander Timans","Tin Hadži Veljković","Kaspar Sakmann","Dan Zhang","Christian A. Naesseth","Eric Nalisnick"],"pdf_url":"https://arxiv.org/pdf/2405.20915v2.pdf","comment":"27 pages, 13 figures, 4 tables (incl. 
appendix)"},{"id":"http://arxiv.org/abs/2411.02188v1","updated":"2024-11-04T15:42:22Z","published":"2024-11-04T15:42:22Z","title":"Digi2Real: Bridging the Realism Gap in Synthetic Data Face Recognition\n via Foundation Models","summary":" The accuracy of face recognition systems has improved significantly in the\npast few years, thanks to the large amount of data collected and the\nadvancement in neural network architectures. However, these large-scale\ndatasets are often collected without explicit consent, raising ethical and\nprivacy concerns. To address this, there have been proposals to use synthetic\ndatasets for training face recognition models. Yet, such models still rely on\nreal data to train the generative models and generally exhibit inferior\nperformance compared to those trained on real datasets. One of these datasets,\nDigiFace, uses a graphics pipeline to generate different identities and\ndifferent intra-class variations without using real data in training the\nmodels. However, the performance of this approach is poor on face recognition\nbenchmarks, possibly due to the lack of realism in the images generated from\nthe graphics pipeline. In this work, we introduce a novel framework for realism\ntransfer aimed at enhancing the realism of synthetically generated face images.\nOur method leverages the large-scale face foundation model, and we adapt the\npipeline for realism enhancement. By integrating the controllable aspects of\nthe graphics pipeline with our realism enhancement technique, we generate a\nlarge amount of realistic variations-combining the advantages of both\napproaches. Our empirical evaluations demonstrate that models trained using our\nenhanced dataset significantly improve the performance of face recognition\nsystems over the baseline. The source code and datasets will be made available\npublicly.\n","authors":["Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2411.02188v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.02184v1","updated":"2024-11-04T15:39:12Z","published":"2024-11-04T15:39:12Z","title":"Double Descent Meets Out-of-Distribution Detection: Theoretical Insights\n and Empirical Analysis on the role of model complexity","summary":" While overparameterization is known to benefit generalization, its impact on\nOut-Of-Distribution (OOD) detection is less understood. This paper investigates\nthe influence of model complexity in OOD detection. We propose an expected OOD\nrisk metric to evaluate classifiers confidence on both training and OOD\nsamples. Leveraging Random Matrix Theory, we derive bounds for the expected OOD\nrisk of binary least-squares classifiers applied to Gaussian data. We show that\nthe OOD risk depicts an infinite peak, when the number of parameters is equal\nto the number of samples, which we associate with the double descent\nphenomenon. Our experimental study on different OOD detection methods across\nmultiple neural architectures extends our theoretical insights and highlights a\ndouble descent curve. Our observations suggest that overparameterization does\nnot necessarily lead to better OOD detection. Using the Neural Collapse\nframework, we provide insights to better understand this behavior. 
To\nfacilitate reproducibility, our code will be made publicly available upon\npublication.\n","authors":["Mouïn Ben Ammar","David Brellmann","Arturo Mendoza","Antoine Manzanera","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2411.02184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02181v1","updated":"2024-11-04T15:38:32Z","published":"2024-11-04T15:38:32Z","title":"Detect an Object At Once without Fine-tuning","summary":" When presented with one or a few photos of a previously unseen object, humans\ncan instantly recognize it in different scenes. Although the human brain\nmechanism behind this phenomenon is still not fully understood, this work\nintroduces a novel technical realization of this task. It consists of two\nphases: (1) generating a Similarity Density Map (SDM) by convolving the scene\nimage with the given object image patch(es) so that the highlight areas in the\nSDM indicate the possible locations; (2) obtaining the object occupied areas in\nthe scene through a Region Alignment Network (RAN). The RAN is constructed on a\nbackbone of Deep Siamese Network (DSN), and different from the traditional\nDSNs, it aims to obtain the object accurate regions by regressing the location\nand area differences between the ground truths and the predicted ones indicated\nby the highlight areas in SDM. By pre-learning from labels annotated in\ntraditional datasets, the SDM-RAN can detect previously unknown objects without\nfine-tuning. Experiments were conducted on the MS COCO, PASCAL VOC datasets.\nThe results indicate that the proposed method outperforms state-of-the-art\nmethods on the same task.\n","authors":["Junyu Hao","Jianheng Liu","Yongjia Zhao","Zuofan Chen","Qi Sun","Jinlong Chen","Jianguo Wei","Minghao Yang"],"pdf_url":"https://arxiv.org/pdf/2411.02181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02179v1","updated":"2024-11-04T15:37:18Z","published":"2024-11-04T15:37:18Z","title":"CleAR: Robust Context-Guided Generative Lighting Estimation for Mobile\n Augmented Reality","summary":" High-quality environment lighting is the foundation of creating immersive\nuser experiences in mobile augmented reality (AR) applications. However,\nachieving visually coherent environment lighting estimation for Mobile AR is\nchallenging due to several key limitations associated with AR device sensing\ncapabilities, including limitations in device camera FoV and pixel dynamic\nranges. Recent advancements in generative AI, which can generate high-quality\nimages from different types of prompts, including texts and images, present a\npotential solution for high-quality lighting estimation. Still, to effectively\nuse generative image diffusion models, we must address their key limitations of\ngeneration hallucination and slow inference process. To do so, in this work, we\ndesign and implement a generative lighting estimation system called CleAR that\ncan produce high-quality and diverse environment maps in the format of\n360$^\\circ$ images. Specifically, we design a two-step generation pipeline\nguided by AR environment context data to ensure the results follow physical\nenvironment visual context and color appearances. To improve the estimation\nrobustness under different lighting conditions, we design a real-time\nrefinement component to adjust lighting estimation results on AR devices. To\ntrain and test our generative models, we curate a large-scale environment\nlighting estimation dataset with diverse lighting conditions. 
Through\nquantitative evaluation and user study, we show that CleAR outperforms\nstate-of-the-art lighting estimation methods on both estimation accuracy and\nrobustness. Moreover, CleAR supports real-time refinement of lighting\nestimation results, ensuring robust and timely environment lighting updates for\nAR applications. Our end-to-end generative estimation takes as fast as 3.2\nseconds, outperforming state-of-the-art methods by 110x.\n","authors":["Yiqin Zhao","Mallesham Dasari","Tian Guo"],"pdf_url":"https://arxiv.org/pdf/2411.02179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02175v1","updated":"2024-11-04T15:34:30Z","published":"2024-11-04T15:34:30Z","title":"SAFE: Slow and Fast Parameter-Efficient Tuning for Continual Learning\n with Pre-Trained Models","summary":" Continual learning aims to incrementally acquire new concepts in data streams\nwhile resisting forgetting previous knowledge. With the rise of powerful\npre-trained models (PTMs), there is a growing interest in training incremental\nlearning systems using these foundation models, rather than learning from\nscratch. Existing works often view PTMs as a strong initial point and directly\napply parameter-efficient tuning (PET) in the first session for adapting to\ndownstream tasks. In the following sessions, most methods freeze model\nparameters for tackling forgetting issues. However, applying PET directly to\ndownstream data cannot fully explore the inherent knowledge in PTMs.\nAdditionally, freezing the parameters in incremental sessions hinders models'\nplasticity to novel concepts not covered in the first session. To solve the\nabove issues, we propose a Slow And Fast parameter-Efficient tuning (SAFE)\nframework. In particular, to inherit general knowledge from foundation models,\nwe include a transfer loss function by measuring the correlation between the\nPTM and the PET-applied model. After calibrating in the first session, the slow\nefficient tuning parameters can capture more informative features, improving\ngeneralization to incoming classes. Moreover, to further incorporate novel\nconcepts, we strike a balance between stability and plasticity by fixing slow\nefficient tuning parameters and continuously updating the fast ones.\nSpecifically, a cross-classification loss with feature alignment is proposed to\ncircumvent catastrophic forgetting. During inference, we introduce an\nentropy-based aggregation strategy to dynamically utilize the complementarity\nin the slow and fast learners. Extensive experiments on seven benchmark\ndatasets verify the effectiveness of our method by significantly surpassing the\nstate-of-the-art.\n","authors":["Linglan Zhao","Xuerui Zhang","Ke Yan","Shouhong Ding","Weiran Huang"],"pdf_url":"https://arxiv.org/pdf/2411.02175v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.02149v1","updated":"2024-11-04T15:06:57Z","published":"2024-11-04T15:06:57Z","title":"Improving Domain Generalization in Self-supervised Monocular Depth\n Estimation via Stabilized Adversarial Training","summary":" Learning a self-supervised Monocular Depth Estimation (MDE) model with great\ngeneralization remains significantly challenging. Despite the success of\nadversarial augmentation in the supervised learning generalization, naively\nincorporating it into self-supervised MDE models potentially causes\nover-regularization, suffering from severe performance degradation. 
In this\npaper, we conduct qualitative analysis and illuminate the main causes: (i)\ninherent sensitivity in the UNet-alike depth network and (ii) dual optimization\nconflict caused by over-regularization. To tackle these issues, we propose a\ngeneral adversarial training framework, named Stabilized Conflict-optimization\nAdversarial Training (SCAT), integrating adversarial data augmentation into\nself-supervised MDE methods to achieve a balance between stability and\ngeneralization. Specifically, we devise an effective scaling depth network that\ntunes the coefficients of long skip connection and effectively stabilizes the\ntraining process. Then, we propose a conflict gradient surgery strategy, which\nprogressively integrates the adversarial gradient and optimizes the model\ntoward a conflict-free direction. Extensive experiments on five benchmarks\ndemonstrate that SCAT can achieve state-of-the-art performance and\nsignificantly improve the generalization capability of existing self-supervised\nMDE methods.\n","authors":["Yuanqi Yao","Gang Wu","Kui Jiang","Siao Liu","Jian Kuai","Xianming Liu","Junjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.02149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02136v1","updated":"2024-11-04T14:49:01Z","published":"2024-11-04T14:49:01Z","title":"Advanced computer vision for extracting georeferenced vehicle\n trajectories from drone imagery","summary":" This paper presents a framework for extracting georeferenced vehicle\ntrajectories from high-altitude drone footage, addressing key challenges in\nurban traffic monitoring and limitations of traditional ground-based systems.\nWe employ state-of-the-art computer vision and deep learning to create an\nend-to-end pipeline that enhances vehicle detection, tracking, and trajectory\nstabilization. Conducted in the Songdo International Business District, South\nKorea, the study used a multi-drone experiment over 20 intersections, capturing\napproximately 12TB of 4K video data over four days. We developed a novel track\nstabilization method that uses detected vehicle bounding boxes as exclusion\nmasks during image registration, which, combined with advanced georeferencing\ntechniques, accurately transforms vehicle coordinates into real-world\ngeographical data. Additionally, our framework includes robust vehicle\ndimension estimation and detailed road segmentation for in-depth traffic\nanalysis. The framework produced two high-quality datasets: the Songdo Traffic\ndataset, comprising nearly 1 million unique vehicle trajectories, and the\nSongdo Vision dataset, containing over 5,000 human-annotated frames with about\n300,000 vehicle instances in four classes. Comparisons between drone-derived\ndata and high-precision sensor data from an instrumented probe vehicle\nhighlight the accuracy and consistency of our framework's extraction in dense\nurban settings. By publicly releasing these datasets and the pipeline source\ncode, this work sets new benchmarks for data quality, reproducibility, and\nscalability in traffic research. 
Results demonstrate the potential of\nintegrating drone technology with advanced computer vision for precise,\ncost-effective urban traffic monitoring, providing valuable resources for the\nresearch community to develop intelligent transportation systems and improve\ntraffic management strategies.\n","authors":["Robert Fonod","Haechan Cho","Hwasoo Yeo","Nikolas Geroliminis"],"pdf_url":"https://arxiv.org/pdf/2411.02136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13174v2","updated":"2024-11-04T14:48:23Z","published":"2024-09-20T03:02:05Z","title":"Manipulation Facing Threats: Evaluating Physical Vulnerabilities in\n End-to-End Vision Language Action Models","summary":" Recently, driven by advancements in Multimodal Large Language Models (MLLMs),\nVision Language Action Models (VLAMs) are being proposed to achieve better\nperformance in open-vocabulary scenarios for robotic manipulation tasks. Since\nmanipulation tasks involve direct interaction with the physical world, ensuring\nrobustness and safety during the execution of this task is always a very\ncritical issue. In this paper, by synthesizing current safety research on MLLMs\nand the specific application scenarios of the manipulation task in the physical\nworld, we comprehensively evaluate VLAMs in the face of potential physical\nthreats. Specifically, we propose the Physical Vulnerability Evaluating\nPipeline (PVEP) that can incorporate as many visual modal physical threats as\npossible for evaluating the physical robustness of VLAMs. The physical threats\nin PVEP specifically include Out-of-Distribution, Typography-based Visual\nPrompts, and Adversarial Patch Attacks. By comparing the performance\nfluctuations of VLAMs before and after being attacked, we provide generalizable\nAnalyses of how VLAMs respond to different physical security threats. Our\nproject page is in this link:\nhttps://chaducheng.github.io/Manipulat-Facing-Threats/.\n","authors":["Hao Cheng","Erjia Xiao","Chengyuan Yu","Zhao Yao","Jiahang Cao","Qiang Zhang","Jiaxu Wang","Mengshu Sun","Kaidi Xu","Jindong Gu","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2409.13174v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17360v2","updated":"2024-11-04T14:47:12Z","published":"2024-04-26T12:21:57Z","title":"UniRGB-IR: A Unified Framework for RGB-Infrared Semantic Tasks via\n Adapter Tuning","summary":" Semantic analysis on visible (RGB) and infrared (IR) images has gained\nattention for its ability to be more accurate and robust under low-illumination\nand complex weather conditions. Due to the lack of pre-trained foundation\nmodels on the large-scale infrared image datasets, existing methods prefer to\ndesign task-specific frameworks and directly fine-tune them with pre-trained\nfoundation models on their RGB-IR semantic relevance datasets, which results in\npoor scalability and limited generalization. In this work, we propose a general\nand efficient framework called UniRGB-IR to unify RGB-IR semantic tasks, in\nwhich a novel adapter is developed to efficiently introduce richer RGB-IR\nfeatures into the pre-trained RGB-based foundation model. Specifically, our\nframework consists of a RGB-based foundation model, a Multi-modal Feature Pool\n(MFP) module and a Supplementary Feature Injector (SFI) module. The MFP and SFI\nmodules cooperate with each other as an adapter to effectively complement the\nRGB-based features with the rich RGB-IR features. 
During training process, we\nfreeze the entire foundation model to inherit prior knowledge and only optimize\nthe proposed adapter. Furthermore, to verify the effectiveness of our\nframework, we utilize the vanilla vision transformer (ViT-Base) as the\npre-trained foundation model to perform extensive experiments. Experimental\nresults on various RGB-IR downstream tasks demonstrate that our method can\nachieve state-of-the-art performance. The source code and results are available\nat https://github.com/PoTsui99/UniRGB-IR.git.\n","authors":["Maoxun Yuan","Bo Cui","Tianyi Zhao","Jiayi Wang","Shan Fu","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2404.17360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06753v3","updated":"2024-11-04T14:32:02Z","published":"2023-03-12T21:01:54Z","title":"Modular Quantization-Aware Training for 6D Object Pose Estimation","summary":" Edge applications, such as collaborative robotics and spacecraft rendezvous,\ndemand efficient 6D object pose estimation on resource-constrained embedded\nplatforms. Existing 6D pose estimation networks are often too large for such\ndeployments, necessitating compression while maintaining reliable performance.\nTo address this challenge, we introduce Modular Quantization-Aware Training\n(MQAT), an adaptive and mixed-precision quantization-aware training strategy\nthat exploits the modular structure of modern 6D pose estimation architectures.\nMQAT guides a systematic gradated modular quantization sequence and determines\nmodule-specific bit precisions, leading to quantized models that outperform\nthose produced by state-of-the-art uniform and mixed-precision quantization\ntechniques. Our experiments showcase the generality of MQAT across datasets,\narchitectures, and quantization algorithms. Remarkably, MQAT-trained quantized\nmodels achieve a significant accuracy boost (>7%) over the baseline\nfull-precision network while reducing model size by a factor of 4x or more. Our\nproject website is at: https://saqibjaved1.github.io/MQAT_/\n","authors":["Saqib Javed","Chengkun Li","Andrew Price","Yinlin Hu","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2303.06753v3.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR), 2024"},{"id":"http://arxiv.org/abs/2408.05500v2","updated":"2024-11-04T14:30:03Z","published":"2024-08-10T09:31:58Z","title":"PointNCBW: Towards Dataset Ownership Verification for Point Clouds via\n Negative Clean-label Backdoor Watermark","summary":" Recently, point clouds have been widely used in computer vision, whereas\ntheir collection is time-consuming and expensive. As such, point cloud datasets\nare the valuable intellectual property of their owners and deserve protection.\nTo detect and prevent unauthorized use of these datasets, especially for\ncommercial or open-sourced ones that cannot be sold again or used commercially\nwithout permission, we intend to identify whether a suspicious third-party\nmodel is trained on our protected dataset under the black-box setting. We\nachieve this goal by designing a scalable clean-label backdoor-based dataset\nwatermark for point clouds that ensures both effectiveness and stealthiness.\nUnlike existing clean-label watermark schemes, which are susceptible to the\nnumber of categories, our method could watermark samples from all classes\ninstead of only from the target one. Accordingly, it can still preserve high\neffectiveness even on large-scale datasets with many classes. 
Specifically, we\nperturb selected point clouds with non-target categories in both shape-wise and\npoint-wise manners before inserting trigger patterns without changing their\nlabels. The features of perturbed samples are similar to those of benign\nsamples from the target class. As such, models trained on the watermarked\ndataset will have a distinctive yet stealthy backdoor behavior, i.e.,\nmisclassifying samples from the target class whenever triggers appear, since\nthe trained DNNs will treat the inserted trigger pattern as a signal to deny\npredicting the target label. We also design a hypothesis-test-guided dataset\nownership verification based on the proposed watermark. Extensive experiments\non benchmark datasets are conducted, verifying the effectiveness of our method\nand its resistance to potential removal methods.\n","authors":["Cheng Wei","Yang Wang","Kuofeng Gao","Shuo Shao","Yiming Li","Zhibo Wang","Zhan Qin"],"pdf_url":"https://arxiv.org/pdf/2408.05500v2.pdf","comment":"This paper was accepted by IEEE Transactions on Information Forensics\n and Security (TIFS), 2024. 16 pages"},{"id":"http://arxiv.org/abs/2411.02116v1","updated":"2024-11-04T14:29:28Z","published":"2024-11-04T14:29:28Z","title":"Advancements and limitations of LLMs in replicating human color-word\n associations","summary":" Color-word associations play a fundamental role in human cognition and design\napplications. Large Language Models (LLMs) have become widely available and\ndemonstrated intelligent behaviors in various benchmarks with natural\nconversation skills. However, their ability to replicate human color-word\nassociations remains understudied. We compared multiple generations of LLMs\n(from GPT-3 to GPT- 4o) against human color-word associations using data\ncollected from over 10,000 Japanese participants, involving 17 colors and words\nfrom eight categories in Japanese. Our findings reveal a clear progression in\nLLM performance across generations, with GPT-4o achieving the highest accuracy\nin predicting the best voted word for each color and category, particularly\nwhen using visual inputs rather than text-based color codes. However, the\nhighest median performance was approximately 50% even for GPT4-o with visual\ninputs (chance level is 10%), and the performance levels varied significantly\nacross word categories and colors, indicating a failure to fully replicate\nhuman color-word associations. On the other hand, color discrimination ability\nestimated from our color-word association data showed that LLMs demonstrated\nhigh correlation with human color discrimination patterns, similarly to\nprevious studies. Our study highlights both the advancements in LLM\ncapabilities and their persistent limitations, suggesting differences in\nsemantic memory structures between humans and LLMs in representing color-word\nassociations.\n","authors":["Makoto Fukushima","Shusuke Eshita","Hiroshige Fukuhara"],"pdf_url":"https://arxiv.org/pdf/2411.02116v1.pdf","comment":"20 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2411.02112v1","updated":"2024-11-04T14:27:10Z","published":"2024-11-04T14:27:10Z","title":"Multi-modal biometric authentication: Leveraging shared layer\n architectures for enhanced security","summary":" In this study, we introduce a novel multi-modal biometric authentication\nsystem that integrates facial, vocal, and signature data to enhance security\nmeasures. 
Utilizing a combination of Convolutional Neural Networks (CNNs) and\nRecurrent Neural Networks (RNNs), our model architecture uniquely incorporates\ndual shared layers alongside modality-specific enhancements for comprehensive\nfeature extraction. The system undergoes rigorous training with a joint loss\nfunction, optimizing for accuracy across diverse biometric inputs.\nFeature-level fusion via Principal Component Analysis (PCA) and classification\nthrough Gradient Boosting Machines (GBM) further refine the authentication\nprocess. Our approach demonstrates significant improvements in authentication\naccuracy and robustness, paving the way for advanced secure identity\nverification solutions.\n","authors":["Vatchala S","Yogesh C","Yeshwanth Govindarajan","Krithik Raja M","Vishal Pramav Amirtha Ganesan","Aashish Vinod A","Dharun Ramesh"],"pdf_url":"https://arxiv.org/pdf/2411.02112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02104v1","updated":"2024-11-04T14:15:26Z","published":"2024-11-04T14:15:26Z","title":"Deep Learning on 3D Semantic Segmentation: A Detailed Review","summary":" In this paper an exhaustive review and comprehensive analysis of recent and\nformer deep learning methods in 3D Semantic Segmentation (3DSS) is presented.\nIn the related literature, the taxonomy scheme used for the classification of\nthe 3DSS deep learning methods is ambiguous. Based on the taxonomy schemes of 9\nexisting review papers, a new taxonomy scheme of the 3DSS deep learning methods\nis proposed, aiming to standardize it and improve the comparability and clarity\nacross related studies. Furthermore, an extensive overview of the available\n3DSS indoor and outdoor datasets is provided along with their links. The core\npart of the review is the detailed presentation of recent and former 3DSS deep\nlearning methods and their classification using the proposed taxonomy scheme\nalong with their GitHub repositories. Additionally, a brief but informative\nanalysis of the evaluation metrics and loss functions used in 3DSS is included.\nFinally, a fruitful discussion of the examined 3DSS methods and datasets, is\npresented to foster new research directions and applications in the field of\n3DSS. Supplementary, to this review a GitHub repository is provided\n(https://github.com/thobet/Deep-Learning-on-3D-Semantic-Segmentation-a-\nDetailed-Review) including a quick classification of over 400 3DSS methods,\nusing the proposed taxonomy scheme.\n","authors":["Thodoris Betsas","Andreas Georgopoulos","Anastasios Doulamis","Pierre Grussenmeyer"],"pdf_url":"https://arxiv.org/pdf/2411.02104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02099v1","updated":"2024-11-04T14:08:26Z","published":"2024-11-04T14:08:26Z","title":"Differentially Private Integrated Decision Gradients (IDG-DP) for\n Radar-based Human Activity Recognition","summary":" Human motion analysis offers significant potential for healthcare monitoring\nand early detection of diseases. The advent of radar-based sensing systems has\ncaptured the spotlight for they are able to operate without physical contact\nand they can integrate with pre-existing Wi-Fi networks. They are also seen as\nless privacy-invasive compared to camera-based systems. However, recent\nresearch has shown high accuracy in recognizing subjects or gender from radar\ngait patterns, raising privacy concerns. 
This study addresses these issues by\ninvestigating privacy vulnerabilities in radar-based Human Activity Recognition\n(HAR) systems and proposing a novel method for privacy preservation using\nDifferential Privacy (DP) driven by attributions derived with Integrated\nDecision Gradient (IDG) algorithm. We investigate Black-box Membership\nInference Attack (MIA) Models in HAR settings across various levels of\nattacker-accessible information. We extensively evaluated the effectiveness of\nthe proposed IDG-DP method by designing a CNN-based HAR model and rigorously\nassessing its resilience against MIAs. Experimental results demonstrate the\npotential of IDG-DP in mitigating privacy attacks while maintaining utility\nacross all settings, particularly excelling against label-only and shadow model\nblack-box MIA attacks. This work represents a crucial step towards balancing\nthe need for effective radar-based HAR with robust privacy protection in\nhealthcare environments.\n","authors":["Idris Zakariyya","Linda Tran","Kaushik Bhargav Sivangi","Paul Henderson","Fani Deligianni"],"pdf_url":"https://arxiv.org/pdf/2411.02099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22709v2","updated":"2024-11-04T14:06:19Z","published":"2024-10-30T05:38:03Z","title":"FilterViT and DropoutViT: Lightweight Vision Transformer Models for\n Efficient Attention Mechanisms","summary":" In this study, we introduce FilterViT, an enhanced version of MobileViT,\nwhich leverages an attention-based mechanism for early-stage downsampling.\nTraditional QKV operations on high-resolution feature maps are computationally\nintensive due to the abundance of tokens. To address this, we propose a filter\nattention mechanism using a convolutional neural network (CNN) to generate an\nimportance mask, focusing attention on key image regions. The method\nsignificantly reduces computational complexity while maintaining\ninterpretability, as it highlights essential image areas. Experimental results\nshow that FilterViT achieves substantial gains in both efficiency and accuracy\ncompared to other models. We also introduce DropoutViT, a variant that uses a\nstochastic approach for pixel selection, further enhancing robustness.\n","authors":["Bohang Sun"],"pdf_url":"https://arxiv.org/pdf/2410.22709v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02095v1","updated":"2024-11-04T13:59:01Z","published":"2024-11-04T13:59:01Z","title":"The evolution of volumetric video: A survey of smart transcoding and\n compression approaches","summary":" Volumetric video, the capture and display of three-dimensional (3D) imagery,\nhas emerged as a revolutionary technology poised to transform the media\nlandscape, enabling immersive experiences that transcend the limitations of\ntraditional 2D video. One of the key challenges in this domain is the efficient\ndelivery of these high-bandwidth, data-intensive volumetric video streams,\nwhich requires innovative transcoding and compression techniques. 
This research\npaper explores the state-of-the-art in volumetric video compression and\ndelivery, with a focus on the potential of AI-driven solutions to address the\nunique challenges posed by this emerging medium.\n","authors":["Preetish Kakkar","Hariharan Ragothaman"],"pdf_url":"https://arxiv.org/pdf/2411.02095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06457v2","updated":"2024-11-04T13:43:15Z","published":"2024-08-12T19:17:57Z","title":"Advanced Vision Transformers and Open-Set Learning for Robust Mosquito\n Classification: A Novel Approach to Entomological Studies","summary":" Mosquito-related diseases pose a significant threat to global public health,\nnecessitating efficient and accurate mosquito classification for effective\nsurveillance and control. This work presents an innovative approach to mosquito\nclassification by leveraging state-of-the-art vision transformers and open-set\nlearning techniques. A novel framework has been introduced that integrates\nTransformer-based deep learning models with comprehensive data augmentation and\npreprocessing methods, enabling robust and precise identification of ten\nmosquito species. The Swin Transformer model achieves the best performance for\ntraditional closed-set learning with 99.80% accuracy and 0.998 F1 score. The\nlightweight MobileViT technique attains an almost similar accuracy of 98.90%\nwith significantly reduced parameters and model complexities. Next, the applied\ndeep learning models' adaptability and generalizability in a static environment\nhave been enhanced by using new classes of data samples during the inference\nstage that have not been included in the training set. The proposed framework's\nability to handle unseen classes like insects similar to mosquitoes, even\nhumans, through open-set learning further enhances its practical applicability\nby employing the OpenMax technique and Weibull distribution. The traditional\nCNN model, Xception, outperforms the latest transformer with higher accuracy\nand F1 score for open-set learning. The study's findings highlight the\ntransformative potential of advanced deep-learning architectures in entomology,\nproviding a strong groundwork for future research and development in mosquito\nsurveillance and vector control. The implications of this work extend beyond\nmosquito classification, offering valuable insights for broader ecological and\nenvironmental monitoring applications.\n","authors":["Ahmed Akib Jawad Karim","Muhammad Zawad Mahmud","Riasat Khan"],"pdf_url":"https://arxiv.org/pdf/2408.06457v2.pdf","comment":"23 pages, 15 figures"},{"id":"http://arxiv.org/abs/2402.18718v2","updated":"2024-11-04T13:40:24Z","published":"2024-02-28T21:29:16Z","title":"Model Pairing Using Embedding Translation for Backdoor Attack Detection\n on Open-Set Classification Tasks","summary":" Backdoor attacks allow an attacker to embed a specific vulnerability in a\nmachine learning algorithm, activated when an attacker-chosen pattern is\npresented, causing a specific misprediction. The need to identify backdoors in\nbiometric scenarios has led us to propose a novel technique with different\ntrade-offs. In this paper we propose to use model pairs on open-set\nclassification tasks for detecting backdoors. Using a simple linear operation\nto project embeddings from a probe model's embedding space to a reference\nmodel's embedding space, we can compare both embeddings and compute a\nsimilarity score. 
We show that this score, can be an indicator for the presence\nof a backdoor despite models being of different architectures, having been\ntrained independently and on different datasets. This technique allows for the\ndetection of backdoors on models designed for open-set classification tasks,\nwhich is little studied in the literature. Additionally, we show that backdoors\ncan be detected even when both models are backdoored. The source code is made\navailable for reproducibility purposes.\n","authors":["Alexander Unnervik","Hatef Otroshi Shahreza","Anjith George","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2402.18718v2.pdf","comment":"Accepted in NeurIPS 2024 Safe Generative AI Workshop (oral\n presentation)"},{"id":"http://arxiv.org/abs/2410.18978v2","updated":"2024-11-04T13:37:31Z","published":"2024-10-24T17:59:51Z","title":"Framer: Interactive Frame Interpolation","summary":" We propose Framer for interactive frame interpolation, which targets\nproducing smoothly transitioning frames between two images as per user\ncreativity. Concretely, besides taking the start and end frames as inputs, our\napproach supports customizing the transition process by tailoring the\ntrajectory of some selected keypoints. Such a design enjoys two clear benefits.\nFirst, incorporating human interaction mitigates the issue arising from\nnumerous possibilities of transforming one image to another, and in turn\nenables finer control of local motions. Second, as the most basic form of\ninteraction, keypoints help establish the correspondence across frames,\nenhancing the model to handle challenging cases (e.g., objects on the start and\nend frames are of different shapes and styles). It is noteworthy that our\nsystem also offers an \"autopilot\" mode, where we introduce a module to estimate\nthe keypoints and refine the trajectory automatically, to simplify the usage in\npractice. Extensive experimental results demonstrate the appealing performance\nof Framer on various applications, such as image morphing, time-lapse video\ngeneration, cartoon interpolation, etc. The code, the model, and the interface\nwill be released to facilitate further research.\n","authors":["Wen Wang","Qiuyu Wang","Kecheng Zheng","Hao Ouyang","Zhekai Chen","Biao Gong","Hao Chen","Yujun Shen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2410.18978v2.pdf","comment":"Project page: https://aim-uofa.github.io/Framer/"},{"id":"http://arxiv.org/abs/2410.01768v2","updated":"2024-11-04T13:33:16Z","published":"2024-10-02T17:25:31Z","title":"SegEarth-OV: Towards Training-Free Open-Vocabulary Segmentation for\n Remote Sensing Images","summary":" Remote sensing image plays an irreplaceable role in fields such as\nagriculture, water resources, military, and disaster relief. Pixel-level\ninterpretation is a critical aspect of remote sensing image applications;\nhowever, a prevalent limitation remains the need for extensive manual\nannotation. For this, we try to introduce open-vocabulary semantic segmentation\n(OVSS) into the remote sensing context. However, due to the sensitivity of\nremote sensing images to low-resolution features, distorted target shapes and\nill-fitting boundaries are exhibited in the prediction mask. To tackle this\nissue, we propose a simple and general upsampler, SimFeatUp, to restore lost\nspatial information in deep features in a training-free style. 
Further, based\non the observation of the abnormal response of local patch tokens to [CLS]\ntoken in CLIP, we propose to execute a straightforward subtraction operation to\nalleviate the global bias in patch tokens. Extensive experiments are conducted\non 17 remote sensing datasets spanning semantic segmentation, building\nextraction, road detection, and flood detection tasks. Our method achieves an\naverage of 5.8%, 8.2%, 4.0%, and 15.3% improvement over state-of-the-art\nmethods on 4 tasks. All codes are released.\n\\url{https://earth-insights.github.io/SegEarth-OV}\n","authors":["Kaiyu Li","Ruixun Liu","Xiangyong Cao","Xueru Bai","Feng Zhou","Deyu Meng","Zhi Wang"],"pdf_url":"https://arxiv.org/pdf/2410.01768v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11944v4","updated":"2024-11-04T13:28:48Z","published":"2024-01-22T13:34:34Z","title":"CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding\n Benchmark","summary":" As the capabilities of large multimodal models (LMMs) continue to advance,\nevaluating the performance of LMMs emerges as an increasing need. Additionally,\nthere is an even larger gap in evaluating the advanced knowledge and reasoning\nabilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU,\na new Chinese Massive Multi-discipline Multimodal Understanding benchmark\ndesigned to evaluate LMMs on tasks demanding college-level subject knowledge\nand deliberate reasoning in a Chinese context. CMMMU is inspired by and\nstrictly follows the annotation and analysis pattern of MMMU. CMMMU includes\n12k manually collected multimodal questions from college exams, quizzes, and\ntextbooks, covering six core disciplines: Art & Design, Business, Science,\nHealth & Medicine, Humanities & Social Science, and Tech & Engineering, like\nits companion, MMMU. These questions span 30 subjects and comprise 39 highly\nheterogeneous image types, such as charts, diagrams, maps, tables, music\nsheets, and chemical structures. CMMMU focuses on complex perception and\nreasoning with domain-specific knowledge in the Chinese context. We evaluate 11\nopen-source LLMs and one proprietary GPT-4V(ision). Even GPT-4V only achieves\naccuracies of 42%, indicating a large space for improvement. CMMMU will boost\nthe community to build the next-generation LMMs towards expert artificial\nintelligence and promote the democratization of LMMs by providing diverse\nlanguage contexts.\n","authors":["Ge Zhang","Xinrun Du","Bei Chen","Yiming Liang","Tongxu Luo","Tianyu Zheng","Kang Zhu","Yuyang Cheng","Chunpu Xu","Shuyue Guo","Haoran Zhang","Xingwei Qu","Junjie Wang","Ruibin Yuan","Yizhi Li","Zekun Wang","Yudong Liu","Yu-Hsuan Tsai","Fengji Zhang","Chenghua Lin","Wenhao Huang","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11944v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02074v1","updated":"2024-11-04T13:26:15Z","published":"2024-11-04T13:26:15Z","title":"GraphVL: Graph-Enhanced Semantic Modeling via Vision-Language Models for\n Generalized Class Discovery","summary":" Generalized Category Discovery (GCD) aims to cluster unlabeled images into\nknown and novel categories using labeled images from known classes. To address\nthe challenge of transferring features from known to unknown classes while\nmitigating model bias, we introduce GraphVL, a novel approach for\nvision-language modeling in GCD, leveraging CLIP. Our method integrates a graph\nconvolutional network (GCN) with CLIP's text encoder to preserve class\nneighborhood structure. 
We also employ a lightweight visual projector for image\ndata, ensuring discriminative features through margin-based contrastive losses\nfor image-text mapping. This neighborhood preservation criterion effectively\nregulates the semantic space, making it less sensitive to known classes.\nAdditionally, we learn textual prompts from known classes and align them to\ncreate a more contextually meaningful semantic feature space for the GCN layer\nusing a contextual similarity loss. Finally, we represent unlabeled samples\nbased on their semantic distance to class prompts from the GCN, enabling\nsemi-supervised clustering for class discovery and minimizing errors. Our\nexperiments on seven benchmark datasets consistently demonstrate the\nsuperiority of GraphVL when integrated with the CLIP backbone.\n","authors":["Bhupendra Solanki","Ashwin Nair","Mainak Singha","Souradeep Mukhopadhyay","Ankit Jha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2411.02074v1.pdf","comment":"Accepted in ACM ICVGIP 2024"},{"id":"http://arxiv.org/abs/2406.09952v2","updated":"2024-11-04T13:26:07Z","published":"2024-06-14T11:58:49Z","title":"BiVLC: Extending Vision-Language Compositionality Evaluation with\n Text-to-Image Retrieval","summary":" Existing Vision-Language Compositionality (VLC) benchmarks like SugarCrepe\nare formulated as image-to-text retrieval problems, where, given an image, the\nmodels need to select between the correct textual description and a synthetic\nhard negative text. In this work, we present the Bidirectional Vision-Language\nCompositionality (BiVLC) dataset. The novelty of BiVLC is to add a synthetic\nhard negative image generated from the synthetic text, resulting in two\nimage-to-text retrieval examples (one for each image) and, more importantly,\ntwo text-to-image retrieval examples (one for each text). Human annotators\nfilter out ill-formed examples ensuring the validity of the benchmark. The\nexperiments on BiVLC uncover a weakness of current multimodal models, as they\nperform poorly in the text-to-image direction. In fact, when considering both\nretrieval directions, the conclusions obtained in previous works change\nsignificantly. In addition to the benchmark, we show that a contrastive model\ntrained using synthetic images and texts significantly improves over the base\nmodel in SugarCrepe and in BiVLC for both retrieval directions. The gap to\nhuman performance in BiVLC confirms that Vision-Language Compositionality is\nstill a challenging problem. BiVLC and code are available at\nhttps://imirandam.github.io/BiVLC_project_page.\n","authors":["Imanol Miranda","Ander Salaberria","Eneko Agirre","Gorka Azkune"],"pdf_url":"https://arxiv.org/pdf/2406.09952v2.pdf","comment":"Accepted to NeurIPS 24 Datasets and Benchmarks Track; Project page\n at: https://imirandam.github.io/BiVLC_project_page/"},{"id":"http://arxiv.org/abs/2404.13686v3","updated":"2024-11-04T13:24:18Z","published":"2024-04-21T15:16:05Z","title":"Hyper-SD: Trajectory Segmented Consistency Model for Efficient Image\n Synthesis","summary":" Recently, a series of diffusion-aware distillation algorithms have emerged to\nalleviate the computational overhead associated with the multi-step inference\nprocess of Diffusion Models (DMs). Current distillation techniques often\ndichotomize into two distinct aspects: i) ODE Trajectory Preservation; and ii)\nODE Trajectory Reformulation. However, these approaches suffer from severe\nperformance degradation or domain shifts. 
To address these limitations, we\npropose Hyper-SD, a novel framework that synergistically amalgamates the\nadvantages of ODE Trajectory Preservation and Reformulation, while maintaining\nnear-lossless performance during step compression. Firstly, we introduce\nTrajectory Segmented Consistency Distillation to progressively perform\nconsistent distillation within pre-defined time-step segments, which\nfacilitates the preservation of the original ODE trajectory from a higher-order\nperspective. Secondly, we incorporate human feedback learning to boost the\nperformance of the model in a low-step regime and mitigate the performance loss\nincurred by the distillation process. Thirdly, we integrate score distillation\nto further improve the low-step generation capability of the model and offer\nthe first attempt to leverage a unified LoRA to support the inference process\nat all steps. Extensive experiments and user studies demonstrate that Hyper-SD\nachieves SOTA performance from 1 to 8 inference steps for both SDXL and SD1.5.\nFor example, Hyper-SDXL surpasses SDXL-Lightning by +0.68 in CLIP Score and\n+0.51 in Aes Score in the 1-step inference.\n","authors":["Yuxi Ren","Xin Xia","Yanzuo Lu","Jiacheng Zhang","Jie Wu","Pan Xie","Xing Wang","Xuefeng Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.13686v3.pdf","comment":"Accepted by NeurIPS 2024 (Camera-Ready Version). Project Page:\n https://hyper-sd.github.io/"},{"id":"http://arxiv.org/abs/2309.09875v2","updated":"2024-11-04T13:17:00Z","published":"2023-09-18T15:37:01Z","title":"RaLF: Flow-based Global and Metric Radar Localization in LiDAR Maps","summary":" Localization is paramount for autonomous robots. While camera and LiDAR-based\napproaches have been extensively investigated, they are affected by adverse\nillumination and weather conditions. Therefore, radar sensors have recently\ngained attention due to their intrinsic robustness to such conditions. In this\npaper, we propose RaLF, a novel deep neural network-based approach for\nlocalizing radar scans in a LiDAR map of the environment, by jointly learning\nto address both place recognition and metric localization. RaLF is composed of\nradar and LiDAR feature encoders, a place recognition head that generates\nglobal descriptors, and a metric localization head that predicts the 3-DoF\ntransformation between the radar scan and the map. We tackle the place\nrecognition task by learning a shared embedding space between the two\nmodalities via cross-modal metric learning. Additionally, we perform metric\nlocalization by predicting pixel-level flow vectors that align the query radar\nscan with the LiDAR map. We extensively evaluate our approach on multiple\nreal-world driving datasets and show that RaLF achieves state-of-the-art\nperformance for both place recognition and metric localization. Moreover, we\ndemonstrate that our approach can effectively generalize to different cities\nand sensor setups than the ones used during training. We make the code and\ntrained models publicly available at http://ralf.cs.uni-freiburg.de.\n","authors":["Abhijeet Nayak","Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2309.09875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02068v1","updated":"2024-11-04T13:15:28Z","published":"2024-11-04T13:15:28Z","title":"Model Integrity when Unlearning with T2I Diffusion Models","summary":" The rapid advancement of text-to-image Diffusion Models has led to their\nwidespread public accessibility. 
However these models, trained on large\ninternet datasets, can sometimes generate undesirable outputs. To mitigate\nthis, approximate Machine Unlearning algorithms have been proposed to modify\nmodel weights to reduce the generation of specific types of images,\ncharacterized by samples from a ``forget distribution'', while preserving the\nmodel's ability to generate other images, characterized by samples from a\n``retain distribution''. While these methods aim to minimize the influence of\ntraining data in the forget distribution without extensive additional\ncomputation, we point out that they can compromise the model's integrity by\ninadvertently affecting generation for images in the retain distribution.\nRecognizing the limitations of FID and CLIPScore in capturing these effects, we\nintroduce a novel retention metric that directly assesses the perceptual\ndifference between outputs generated by the original and the unlearned models.\nWe then propose unlearning algorithms that demonstrate superior effectiveness\nin preserving model integrity compared to existing baselines. Given their\nstraightforward implementation, these algorithms serve as valuable benchmarks\nfor future advancements in approximate Machine Unlearning for Diffusion Models.\n","authors":["Andrea Schioppa","Emiel Hoogeboom","Jonathan Heek"],"pdf_url":"https://arxiv.org/pdf/2411.02068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02065v1","updated":"2024-11-04T13:07:22Z","published":"2024-11-04T13:07:22Z","title":"AM Flow: Adapters for Temporal Processing in Action Recognition","summary":" Deep learning models, in particular \\textit{image} models, have recently\ngained generalisability and robustness. %are becoming more general and robust\nby the day. In this work, we propose to exploit such advances in the realm of\n\\textit{video} classification. Video foundation models suffer from the\nrequirement of extensive pretraining and a large training time. Towards\nmitigating such limitations, we propose \"\\textit{Attention Map (AM) Flow}\" for\nimage models, a method for identifying pixels relevant to motion in each input\nvideo frame. In this context, we propose two methods to compute AM flow,\ndepending on camera motion. AM flow allows the separation of spatial and\ntemporal processing, while providing improved results over combined\nspatio-temporal processing (as in video models). Adapters, one of the popular\ntechniques in parameter efficient transfer learning, facilitate the\nincorporation of AM flow into pretrained image models, mitigating the need for\nfull-finetuning. We extend adapters to \"\\textit{temporal processing adapters}\"\nby incorporating a temporal processing unit into the adapters. Our work\nachieves faster convergence, therefore reducing the number of epochs needed for\ntraining. Moreover, we endow an image model with the ability to achieve\nstate-of-the-art results on popular action recognition datasets. This reduces\ntraining time and simplifies pretraining. 
We present experiments on\nKinetics-400, Something-Something v2, and Toyota Smarthome datasets, showcasing\nstate-of-the-art or comparable results.\n","authors":["Tanay Agrawal","Abid Ali","Antitza Dantcheva","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2411.02065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02057v1","updated":"2024-11-04T12:59:13Z","published":"2024-11-04T12:59:13Z","title":"Exploiting Unlabeled Data with Multiple Expert Teachers for Open\n Vocabulary Aerial Object Detection and Its Orientation Adaptation","summary":" In recent years, aerial object detection has been increasingly pivotal in\nvarious earth observation applications. However, current algorithms are limited\nto detecting a set of pre-defined object categories, demanding sufficient\nannotated training samples, and fail to detect novel object categories. In this\npaper, we put forth a novel formulation of the aerial object detection problem,\nnamely open-vocabulary aerial object detection (OVAD), which can detect objects\nbeyond training categories without costly collecting new labeled data. We\npropose CastDet, a CLIP-activated student-teacher detection framework that\nserves as the first OVAD detector specifically designed for the challenging\naerial scenario, where objects often exhibit weak appearance features and\narbitrary orientations. Our framework integrates a robust localization teacher\nalong with several box selection strategies to generate high-quality proposals\nfor novel objects. Additionally, the RemoteCLIP model is adopted as an\nomniscient teacher, which provides rich knowledge to enhance classification\ncapabilities for novel categories. A dynamic label queue is devised to maintain\nhigh-quality pseudo-labels during training. By doing so, the proposed CastDet\nboosts not only novel object proposals but also classification. Furthermore, we\nextend our approach from horizontal OVAD to oriented OVAD with tailored\nalgorithm designs to effectively manage bounding box representation and\npseudo-label generation. Extensive experiments for both tasks on multiple\nexisting aerial object detection datasets demonstrate the effectiveness of our\napproach. The code is available at https://github.com/lizzy8587/CastDet.\n","authors":["Yan Li","Weiwei Guo","Xue Yang","Ning Liao","Shaofeng Zhang","Yi Yu","Wenxian Yu","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2411.02057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14435v2","updated":"2024-11-04T12:48:38Z","published":"2023-11-24T12:22:00Z","title":"Local Concept Embeddings for Analysis of Concept Distributions in DNN\n Feature Spaces","summary":" Insights into the learned latent representations are imperative for verifying\ndeep neural networks (DNNs) in critical computer vision (CV) tasks. Therefore,\nstate-of-the-art supervised Concept-based eXplainable Artificial Intelligence\n(C-XAI) methods associate user-defined concepts like ``car'' each with a single\nvector in the DNN latent space (concept embedding vector). In the case of\nconcept segmentation, these linearly separate between activation map pixels\nbelonging to a concept and those belonging to background. Existing methods for\nconcept segmentation, however, fall short of capturing sub-concepts (e.g.,\n``proximate car'' and ``distant car''), and concept overlap (e.g., between\n``bus'' and ``truck''). In other words, they do not capture the full\ndistribution of concept representatives in latent space. 
For the first time,\nthis work shows that these simplifications are frequently broken and that\ndistribution information can be particularly useful for understanding\nDNN-learned notions of sub-concepts, concept confusion, and concept outliers.\nTo allow exploration of learned concept distributions, we propose a novel local\nconcept analysis framework. Instead of optimizing a single global concept\nvector on the complete dataset, it generates a local concept embedding (LoCE)\nvector for each individual sample. We use the distribution formed by LoCEs to\nexplore the latent concept distribution by fitting Gaussian mixture models\n(GMMs), hierarchical clustering, and concept-level information retrieval and\noutlier detection. Despite its context sensitivity, our method's concept\nsegmentation performance is competitive to global baselines. Analysis results\nare obtained on two datasets and five diverse vision DNN architectures,\nincluding vision transformers (ViTs).\n","authors":["Georgii Mikriukov","Gesina Schwalbe","Korinna Bade"],"pdf_url":"https://arxiv.org/pdf/2311.14435v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02038v1","updated":"2024-11-04T12:40:18Z","published":"2024-11-04T12:40:18Z","title":"Addressing Representation Collapse in Vector Quantized Models with One\n Linear Layer","summary":" Vector Quantization (VQ) is a widely used method for converting continuous\nrepresentations into discrete codes, which has become fundamental in\nunsupervised representation learning and latent generative models. However, VQ\nmodels are often hindered by the problem of representation collapse in the\nlatent space, which leads to low codebook utilization and limits the\nscalability of the codebook for large-scale training. Existing methods designed\nto mitigate representation collapse typically reduce the dimensionality of\nlatent space at the expense of model capacity, which do not fully resolve the\ncore issue. In this study, we conduct a theoretical analysis of representation\ncollapse in VQ models and identify its primary cause as the disjoint\noptimization of the codebook, where only a small subset of code vectors are\nupdated through gradient descent. To address this issue, we propose\n\\textbf{SimVQ}, a novel method which reparameterizes the code vectors through a\nlinear transformation layer based on a learnable latent basis. This\ntransformation optimizes the \\textit{entire linear space} spanned by the\ncodebook, rather than merely updating \\textit{the code vector} selected by the\nnearest-neighbor search in vanilla VQ models. Although it is commonly\nunderstood that the multiplication of two linear matrices is equivalent to\napplying a single linear layer, our approach works surprisingly well in\nresolving the collapse issue in VQ models with just one linear layer. We\nvalidate the efficacy of SimVQ through extensive experiments across various\nmodalities, including image and audio data with different model architectures.\nOur code is available at \\url{https://github.com/youngsheen/SimVQ}.\n","authors":["Yongxin Zhu","Bocheng Li","Yifei Xin","Linli Xu"],"pdf_url":"https://arxiv.org/pdf/2411.02038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03078v2","updated":"2024-11-04T12:19:57Z","published":"2024-08-06T10:13:57Z","title":"BodySLAM: A Generalized Monocular Visual SLAM Framework for Surgical\n Applications","summary":" Endoscopic surgery relies on two-dimensional views, posing challenges for\nsurgeons in depth perception and instrument manipulation. 
While Monocular\nVisual Simultaneous Localization and Mapping (MVSLAM) has emerged as a\npromising solution, its implementation in endoscopic procedures faces\nsignificant challenges due to hardware limitations, such as the use of a\nmonocular camera and the absence of odometry sensors. This study presents\nBodySLAM, a robust deep learning-based MVSLAM approach that addresses these\nchallenges through three key components: CycleVO, a novel unsupervised\nmonocular pose estimation module; the integration of the state-of-the-art Zoe\narchitecture for monocular depth estimation; and a 3D reconstruction module\ncreating a coherent surgical map. The approach is rigorously evaluated using\nthree publicly available datasets (Hamlyn, EndoSLAM, and SCARED) spanning\nlaparoscopy, gastroscopy, and colonoscopy scenarios, and benchmarked against\nfour state-of-the-art methods. Results demonstrate that CycleVO exhibited\ncompetitive performance with the lowest inference time among pose estimation\nmethods, while maintaining robust generalization capabilities, whereas Zoe\nsignificantly outperformed existing algorithms for depth estimation in\nendoscopy. BodySLAM's strong performance across diverse endoscopic scenarios\ndemonstrates its potential as a viable MVSLAM solution for endoscopic\napplications.\n","authors":["G. Manni","C. Lauretti","F. Prata","R. Papalia","L. Zollo","P. Soda"],"pdf_url":"https://arxiv.org/pdf/2408.03078v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.02009v1","updated":"2024-11-04T11:54:31Z","published":"2024-11-04T11:54:31Z","title":"Tree level change detection over Ahmedabad city using very high\n resolution satellite images and Deep Learning","summary":" In this study, 0.5m high resolution satellite datasets over Indian urban\nregion was used to demonstrate the applicability of deep learning models over\nAhmedabad, India. Here, YOLOv7 instance segmentation model was trained on well\ncurated trees canopy dataset (6500 images) in order to carry out the change\ndetection. During training, evaluation metrics such as bounding box regression\nand mask regression loss, mean average precision (mAP) and stochastic gradient\ndescent algorithm were used for evaluating and optimizing the performance of\nmodel. After the 500 epochs, the mAP of 0.715 and 0.699 for individual tree\ndetection and tree canopy mask segmentation were obtained. However, by further\ntuning hyper parameters of the model, maximum accuracy of 80 % of trees\ndetection with false segmentation rate of 2% on data was obtained.\n","authors":["Jai G Singla","Gautam Jaiswal"],"pdf_url":"https://arxiv.org/pdf/2411.02009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16998v2","updated":"2024-11-04T11:44:29Z","published":"2024-09-25T15:03:22Z","title":"PitRSDNet: Predicting Intra-operative Remaining Surgery Duration in\n Endoscopic Pituitary Surgery","summary":" Accurate intra-operative Remaining Surgery Duration (RSD) predictions allow\nfor anaesthetists to more accurately decide when to administer anaesthetic\nagents and drugs, as well as to notify hospital staff to send in the next\npatient. Therefore RSD plays an important role in improving patient care and\nminimising surgical theatre costs via efficient scheduling. In endoscopic\npituitary surgery, it is uniquely challenging due to variable workflow\nsequences with a selection of optional steps contributing to high variability\nin surgery duration. 
This paper presents PitRSDNet for predicting RSD during\npituitary surgery, a spatio-temporal neural network model that learns from\nhistorical data focusing on workflow sequences. PitRSDNet integrates workflow\nknowledge into RSD prediction in two forms: 1) multi-task learning for\nconcurrently predicting step and RSD; and 2) incorporating prior steps as\ncontext in temporal learning and inference. PitRSDNet is trained and evaluated\non a new endoscopic pituitary surgery dataset with 88 videos to show\ncompetitive performance improvements over previous statistical and machine\nlearning methods. The findings also highlight how PitRSDNet improve RSD\nprecision on outlier cases utilising the knowledge of prior steps.\n","authors":["Anjana Wijekoon","Adrito Das","Roxana R. Herrera","Danyal Z. Khan","John Hanrahan","Eleanor Carter","Valpuri Luoma","Danail Stoyanov","Hani J. Marcus","Sophia Bano"],"pdf_url":"https://arxiv.org/pdf/2409.16998v2.pdf","comment":"Accepted to the Augmented Environments for Computer-Assisted\n Interventions (AE-CAI) Workshop at the Medical Image Computing and\n Computer-Assisted Interventions (MICCAI) Conference 2024"},{"id":"http://arxiv.org/abs/2409.18783v2","updated":"2024-11-04T11:39:40Z","published":"2024-09-27T14:30:24Z","title":"DualDn: Dual-domain Denoising via Differentiable ISP","summary":" Image denoising is a critical component in a camera's Image Signal Processing\n(ISP) pipeline. There are two typical ways to inject a denoiser into the ISP\npipeline: applying a denoiser directly to captured raw frames (raw domain) or\nto the ISP's output sRGB images (sRGB domain). However, both approaches have\ntheir limitations. Residual noise from raw-domain denoising can be amplified by\nthe subsequent ISP processing, and the sRGB domain struggles to handle\nspatially varying noise since it only sees noise distorted by the ISP.\nConsequently, most raw or sRGB domain denoising works only for specific noise\ndistributions and ISP configurations. To address these challenges, we propose\nDualDn, a novel learning-based dual-domain denoising. Unlike previous\nsingle-domain denoising, DualDn consists of two denoising networks: one in the\nraw domain and one in the sRGB domain. The raw domain denoising adapts to\nsensor-specific noise as well as spatially varying noise levels, while the sRGB\ndomain denoising adapts to ISP variations and removes residual noise amplified\nby the ISP. Both denoising networks are connected with a differentiable ISP,\nwhich is trained end-to-end and discarded during the inference stage. With this\ndesign, DualDn achieves greater generalizability compared to most\nlearning-based denoising methods, as it can adapt to different unseen noises,\nISP parameters, and even novel ISP pipelines. Experiments show that DualDn\nachieves state-of-the-art performance and can adapt to different denoising\narchitectures. Moreover, DualDn can be used as a plug-and-play denoising module\nwith real cameras without retraining, and still demonstrate better performance\nthan commercial on-camera denoising. 
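DualDn, summarized above, connects a raw-domain denoiser and an sRGB-domain denoiser through a differentiable ISP so that an sRGB-space loss can train both networks end to end. The toy pipeline below only illustrates why that works (gradients flow through the ISP); the white-balance/color-matrix/gamma ISP and the single-convolution "denoisers" are deliberately simplified stand-ins, not the paper's components:

import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyDifferentiableISP(nn.Module):
    """Deliberately simplified raw->sRGB pipeline (white balance -> color
    matrix -> gamma). The real DualDn ISP is richer; this only shows how an
    sRGB-space loss can back-propagate into a raw-domain denoiser."""
    def __init__(self):
        super().__init__()
        self.wb_gains = nn.Parameter(torch.tensor([2.0, 1.0, 1.5]))  # R, G, B gains
        self.ccm = nn.Parameter(torch.eye(3))                        # 3x3 color matrix
        self.gamma = nn.Parameter(torch.tensor(1.0 / 2.2))

    def forward(self, raw_rgb):                  # raw_rgb: (B, 3, H, W), demosaicked
        x = raw_rgb * self.wb_gains.view(1, 3, 1, 1)
        x = torch.einsum("ij,bjhw->bihw", self.ccm, x)
        return x.clamp(min=1e-6) ** self.gamma   # clamp keeps the power well-defined

raw_denoiser = nn.Conv2d(3, 3, 3, padding=1)     # stand-ins for the two denoising networks
srgb_denoiser = nn.Conv2d(3, 3, 3, padding=1)
isp = ToyDifferentiableISP()

noisy_raw, clean_srgb = torch.rand(2, 3, 64, 64), torch.rand(2, 3, 64, 64)
loss = F.mse_loss(srgb_denoiser(isp(raw_denoiser(noisy_raw))), clean_srgb)
loss.backward()                                  # gradients reach both denoisers through the ISP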
The project website is available at:\nhttps://openimaginglab.github.io/DualDn/\n","authors":["Ruikang Li","Yujin Wang","Shiqi Chen","Fan Zhang","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2409.18783v2.pdf","comment":"Accepted at ECCV 2024, Project page:\n https://openimaginglab.github.io/DualDn/"},{"id":"http://arxiv.org/abs/2403.00174v4","updated":"2024-11-04T11:38:49Z","published":"2024-02-29T22:58:13Z","title":"A citizen science toolkit to collect human perceptions of urban\n environments using open street view images","summary":" Street View Imagery (SVI) is a valuable data source for studies (e.g.,\nenvironmental assessments, green space identification or land cover\nclassification). While commercial SVI is available, such providers commonly\nrestrict copying or reuse in ways necessary for research. Open SVI datasets are\nreadily available from less restrictive sources, such as Mapillary, but due to\nthe heterogeneity of the images, these require substantial preprocessing,\nfiltering, and careful quality checks. We present an efficient method for\nautomated downloading, processing, cropping, and filtering open SVI, to be used\nin a survey of human perceptions of the streets portrayed in these images. We\ndemonstrate our open-source reusable SVI preparation and smartphone-friendly\nperception-survey software with Amsterdam (Netherlands) as the case study.\nUsing a citizen science approach, we collected from 331 people 22,637 ratings\nabout their perceptions for various criteria. We have published our software in\na public repository for future re-use and reproducibility.\n","authors":["Matthew Danish","SM Labib","Britta Ricker","Marco Helbich"],"pdf_url":"https://arxiv.org/pdf/2403.00174v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10853v2","updated":"2024-11-04T11:37:51Z","published":"2024-06-16T08:54:38Z","title":"MV2Cyl: Reconstructing 3D Extrusion Cylinders from Multi-View Images","summary":" We present MV2Cyl, a novel method for reconstructing 3D from 2D multi-view\nimages, not merely as a field or raw geometry but as a sketch-extrude CAD\nmodel. Extracting extrusion cylinders from raw 3D geometry has been extensively\nresearched in computer vision, while the processing of 3D data through neural\nnetworks has remained a bottleneck. Since 3D scans are generally accompanied by\nmulti-view images, leveraging 2D convolutional neural networks allows these\nimages to be exploited as a rich source for extracting extrusion cylinder\ninformation. However, we observe that extracting only the surface information\nof the extrudes and utilizing it results in suboptimal outcomes due to the\nchallenges in the occlusion and surface segmentation. By synergizing with the\nextracted base curve information, we achieve the optimal reconstruction result\nwith the best accuracy in 2D sketch and extrude parameter estimation. 
Our\nexperiments, comparing our method with previous work that takes a raw 3D point\ncloud as input, demonstrate the effectiveness of our approach by taking\nadvantage of multi-view images.\n","authors":["Eunji Hong","Minh Hieu Nguyen","Mikaela Angelina Uy","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2406.10853v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.01988v1","updated":"2024-11-04T11:20:17Z","published":"2024-11-04T11:20:17Z","title":"QCS:Feature Refining from Quadruplet Cross Similarity for Facial\n Expression Recognition","summary":" On facial expression datasets with complex and numerous feature types, where\nthe significance and dominance of labeled features are difficult to predict,\nfacial expression recognition(FER) encounters the challenges of inter-class\nsimilarity and intra-class variances, making it difficult to mine effective\nfeatures. We aim to solely leverage the feature similarity among facial samples\nto address this. We introduce the Cross Similarity Attention (CSA), an\ninput-output position-sensitive attention mechanism that harnesses feature\nsimilarity across different images to compute the corresponding global spatial\nattention. Based on this, we propose a four-branch circular framework, called\nQuadruplet Cross Similarity (QCS), to extract discriminative features from the\nsame class and eliminate redundant ones from different classes synchronously to\nrefine cleaner features. The symmetry of the network ensures balanced and\nstable training and reduces the amount of CSA interaction matrix. Contrastive\nresidual distillation is utilized to transfer the information learned in the\ncross module back to the base network. The cross-attention module exists during\ntraining, and only one base branch is retained during inference. our proposed\nQCS model outperforms state-of-the-art methods on several popular FER datasets,\nwithout requiring additional landmark information or other extra training data.\nThe code is available at https://github.com/birdwcp/QCS.\n","authors":["Chengpeng Wang","Li Chen","Lili Wang","Zhaofan Li","Xuebin Lv"],"pdf_url":"https://arxiv.org/pdf/2411.01988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10618v2","updated":"2024-11-04T11:11:49Z","published":"2024-04-16T14:42:49Z","title":"Private Attribute Inference from Images with Vision-Language Models","summary":" As large language models (LLMs) become ubiquitous in our daily tasks and\ndigital interactions, associated privacy risks are increasingly in focus. While\nLLM privacy research has primarily focused on the leakage of model training\ndata, it has recently been shown that LLMs can make accurate privacy-infringing\ninferences from previously unseen texts. With the rise of vision-language\nmodels (VLMs), capable of understanding both images and text, a key question is\nwhether this concern transfers to the previously unexplored domain of benign\nimages posted online. To answer this question, we compile an image dataset with\nhuman-annotated labels of the image owner's personal attributes. In order to\nunderstand the privacy risks posed by VLMs beyond traditional human attribute\nrecognition, our dataset consists of images where the inferable private\nattributes do not stem from direct depictions of humans. On this dataset, we\nevaluate 7 state-of-the-art VLMs, finding that they can infer various personal\nattributes at up to 77.6% accuracy. 
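The QCS abstract above introduces a Cross Similarity Attention that turns feature similarity between two different images into spatial attention. One plausible reading, sketched below, is scaled dot-product attention with queries taken from one image and keys/values from the other; the exact layout, scaling, and position sensitivity used in the paper are not reproduced here:

import torch

def cross_similarity_attention(feat_a, feat_b):
    """Attend over image B's spatial positions using image A's features.
    feat_a, feat_b: (B, C, H, W) feature maps from two facial images."""
    b, c, h, w = feat_a.shape
    q = feat_a.flatten(2).transpose(1, 2)                    # (B, HW, C) queries from image A
    k = feat_b.flatten(2).transpose(1, 2)                    # (B, HW, C) keys/values from image B
    attn = torch.softmax(q @ k.transpose(1, 2) / c ** 0.5, dim=-1)  # (B, HW, HW)
    out = attn @ k                                           # similarity-weighted image-B features
    return out.transpose(1, 2).reshape(b, c, h, w)

fa, fb = torch.randn(2, 64, 7, 7), torch.randn(2, 64, 7, 7)
print(cross_similarity_attention(fa, fb).shape)              # torch.Size([2, 64, 7, 7])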
Concerningly, we observe that accuracy\nscales with the general capabilities of the models, implying that future models\ncan be misused as stronger inferential adversaries, establishing an imperative\nfor the development of adequate defenses.\n","authors":["Batuhan Tömekçe","Mark Vero","Robin Staab","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2404.10618v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01981v1","updated":"2024-11-04T11:09:47Z","published":"2024-11-04T11:09:47Z","title":"Typicalness-Aware Learning for Failure Detection","summary":" Deep neural networks (DNNs) often suffer from the overconfidence issue, where\nincorrect predictions are made with high confidence scores, hindering the\napplications in critical systems. In this paper, we propose a novel approach\ncalled Typicalness-Aware Learning (TAL) to address this issue and improve\nfailure detection performance. We observe that, with the cross-entropy loss,\nmodel predictions are optimized to align with the corresponding labels via\nincreasing logit magnitude or refining logit direction. However, regarding\natypical samples, the image content and their labels may exhibit disparities.\nThis discrepancy can lead to overfitting on atypical samples, ultimately\nresulting in the overconfidence issue that we aim to address. To tackle the\nproblem, we have devised a metric that quantifies the typicalness of each\nsample, enabling the dynamic adjustment of the logit magnitude during the\ntraining process. By allowing atypical samples to be adequately fitted while\npreserving reliable logit direction, the problem of overconfidence can be\nmitigated. TAL has been extensively evaluated on benchmark datasets, and the\nresults demonstrate its superiority over existing failure detection methods.\nSpecifically, TAL achieves a more than 5% improvement on CIFAR100 in terms of\nthe Area Under the Risk-Coverage Curve (AURC) compared to the state-of-the-art.\nCode is available at https://github.com/liuyijungoon/TAL.\n","authors":["Yijun Liu","Jiequan Cui","Zhuotao Tian","Senqiao Yang","Qingdong He","Xiaoling Wang","Jingyong Su"],"pdf_url":"https://arxiv.org/pdf/2411.01981v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2310.08421v4","updated":"2024-11-04T11:06:25Z","published":"2023-10-12T15:42:17Z","title":"Visual Self-supervised Learning Scheme for Dense Prediction Tasks on\n X-ray Images","summary":" Recently, significant advancements in artificial intelligence have been\nattributed to the integration of self-supervised learning (SSL) scheme. While\nSSL has shown impressive achievements in natural language processing (NLP), its\nprogress in computer vision has comparatively lagged behind. However, the\nincorporation of contrastive learning into existing visual SSL models has led\nto considerable progress, often surpassing supervised counterparts.\nNonetheless, these improvements have been mostly limited to classification\ntasks. Moreover, few studies have evaluated visual SSL models in real-world\nscenarios, as most have focused on datasets with class-wise portrait images,\nnotably ImageNet. Here, we focus on dense prediction tasks using security\ninspection x-ray images to evaluate our proposed model, Segment Localization\n(SegLoc). Based upon the Instance Localization (InsLoc) model, SegLoc addresses\none of the key challenges of contrastive learning, i.e., false negative pairs\nof query embeddings. 
Our pre-training dataset is synthesized by cutting,\ntransforming, and pasting labeled segments from an existing labeled dataset\n(PIDray) as foregrounds onto instances from an unlabeled dataset (SIXray) as\nbackgrounds. Furthermore, we fully leverage the labeled data by incorporating\nthe concept, one queue per class, into the MoCo-v2 memory bank, thereby\navoiding false negative pairs. In our experiments, SegLoc outperformed random\ninitialization by 3% to 6% while underperformed supervised initialization, in\nterms of AR and AP metrics across different IoU values over 20 to 30\npre-training epochs.\n","authors":["Shervin Halat","Mohammad Rahmati","Ehsan Nazerfard"],"pdf_url":"https://arxiv.org/pdf/2310.08421v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01975v1","updated":"2024-11-04T10:51:47Z","published":"2024-11-04T10:51:47Z","title":"SPECTRUM: Semantic Processing and Emotion-informed video-Captioning\n Through Retrieval and Understanding Modalities","summary":" Capturing a video's meaning and critical concepts by analyzing the subtle\ndetails is a fundamental yet challenging task in video captioning. Identifying\nthe dominant emotional tone in a video significantly enhances the perception of\nits context. Despite a strong emphasis on video captioning, existing models\noften need to adequately address emotional themes, resulting in suboptimal\ncaptioning results. To address these limitations, this paper proposes a novel\nSemantic Processing and Emotion-informed video-Captioning Through Retrieval and\nUnderstanding Modalities (SPECTRUM) framework to empower the generation of\nemotionally and semantically credible captions. Leveraging our pioneering\nstructure, SPECTRUM discerns multimodal semantics and emotional themes using\nVisual Text Attribute Investigation (VTAI) and determines the orientation of\ndescriptive captions through a Holistic Concept-Oriented Theme (HCOT),\nexpressing emotionally-informed and field-acquainted references. They exploit\nvideo-to-text retrieval capabilities and the multifaceted nature of video\ncontent to estimate the emotional probabilities of candidate captions. Then,\nthe dominant theme of the video is determined by appropriately weighting\nembedded attribute vectors and applying coarse- and fine-grained emotional\nconcepts, which define the video's contextual alignment. Furthermore, using two\nloss functions, SPECTRUM is optimized to integrate emotional information and\nminimize prediction errors. Extensive experiments on the EmVidCap, MSVD, and\nMSRVTT video captioning datasets demonstrate that our model significantly\nsurpasses state-of-the-art methods. Quantitative and qualitative evaluations\nhighlight the model's ability to accurately capture and convey video emotions\nand multimodal attributes.\n","authors":["Ehsan Faghihi","Mohammedreza Zarenejad","Ali-Asghar Beheshti Shirazi"],"pdf_url":"https://arxiv.org/pdf/2411.01975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11914v3","updated":"2024-11-04T10:48:54Z","published":"2024-05-20T09:49:13Z","title":"PT43D: A Probabilistic Transformer for Generating 3D Shapes from Single\n Highly-Ambiguous RGB Images","summary":" Generating 3D shapes from single RGB images is essential in various\napplications such as robotics. Current approaches typically target images\ncontaining clear and complete visual descriptions of the object, without\nconsidering common realistic cases where observations of objects that are\nlargely occluded or truncated. 
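SegLoc, described above, avoids false negative pairs by keeping one queue per class in a MoCo-v2-style memory bank, so that negatives for a query are drawn only from other classes. A schematic version of that bookkeeping follows; the queue length, feature dimension, and the enqueue/negative-sampling interface are assumptions, not the paper's implementation:

import torch
import torch.nn.functional as F

class PerClassQueue:
    """One FIFO of key embeddings per class; negatives for a query come only
    from the queues of *other* classes, so same-class keys are never treated
    as negatives."""
    def __init__(self, num_classes, dim=128, size=1024):
        self.bank = F.normalize(torch.randn(num_classes, size, dim), dim=-1)
        self.ptr = torch.zeros(num_classes, dtype=torch.long)
        self.size = size

    @torch.no_grad()
    def enqueue(self, keys, labels):             # keys: (N, dim), labels: (N,)
        for k, c in zip(keys, labels.tolist()):
            i = int(self.ptr[c])
            self.bank[c, i] = k
            self.ptr[c] = (i + 1) % self.size

    def negatives_for(self, label):              # keys of every class except `label`
        keep = torch.arange(self.bank.size(0)) != label
        return self.bank[keep].reshape(-1, self.bank.size(-1))

queue = PerClassQueue(num_classes=5)
queue.enqueue(F.normalize(torch.randn(8, 128), dim=-1), torch.randint(0, 5, (8,)))
print(queue.negatives_for(3).shape)              # torch.Size([4096, 128])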
We thus propose a transformer-based\nautoregressive model to generate the probabilistic distribution of 3D shapes\nconditioned on an RGB image containing potentially highly ambiguous\nobservations of the object. To handle realistic scenarios such as occlusion or\nfield-of-view truncation, we create simulated image-to-shape training pairs\nthat enable improved fine-tuning for real-world scenarios. We then adopt\ncross-attention to effectively identify the most relevant region of interest\nfrom the input image for shape generation. This enables inference of sampled\nshapes with reasonable diversity and strong alignment with the input image. We\ntrain and test our model on our synthetic data then fine-tune and test it on\nreal-world data. Experiments demonstrate that our model outperforms state of\nthe art in both scenarios.\n","authors":["Yiheng Xiong","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2405.11914v3.pdf","comment":"10 pages, 6 figures. Accepted to BMVC 2024"},{"id":"http://arxiv.org/abs/2411.01969v1","updated":"2024-11-04T10:44:46Z","published":"2024-11-04T10:44:46Z","title":"Active Gaze Behavior Boosts Self-Supervised Object Learning","summary":" Due to significant variations in the projection of the same object from\ndifferent viewpoints, machine learning algorithms struggle to recognize the\nsame object across various perspectives. In contrast, toddlers quickly learn to\nrecognize objects from different viewpoints with almost no supervision. Recent\nworks argue that toddlers develop this ability by mapping close-in-time visual\ninputs to similar representations while interacting with objects. High acuity\nvision is only available in the central visual field, which may explain why\ntoddlers (much like adults) constantly move their gaze around during such\ninteractions. It is unclear whether/how much toddlers curate their visual\nexperience through these eye movements to support learning object\nrepresentations. In this work, we explore whether a bio inspired visual\nlearning model can harness toddlers' gaze behavior during a play session to\ndevelop view-invariant object recognition. Exploiting head-mounted eye tracking\nduring dyadic play, we simulate toddlers' central visual field experience by\ncropping image regions centered on the gaze location. This visual stream feeds\na time-based self-supervised learning algorithm. Our experiments demonstrate\nthat toddlers' gaze strategy supports the learning of invariant object\nrepresentations. Our analysis also reveals that the limited size of the central\nvisual field where acuity is high is crucial for this. We further find that\ntoddlers' visual experience elicits more robust representations compared to\nadults' mostly because toddlers look at objects they hold themselves for longer\nbouts. Overall, our work reveals how toddlers' gaze behavior supports\nself-supervised learning of view-invariant object recognition.\n","authors":["Zhengyang Yu","Arthur Aubret","Marcel C. 
Raabe","Jane Yang","Chen Yu","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2411.01969v1.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.01966v1","updated":"2024-11-04T10:42:21Z","published":"2024-11-04T10:42:21Z","title":"UnSegMedGAT: Unsupervised Medical Image Segmentation using Graph\n Attention Networks Clustering","summary":" The data-intensive nature of supervised classification drives the interest of\nthe researchers towards unsupervised approaches, especially for problems such\nas medical image segmentation, where labeled data is scarce. Building on the\nrecent advancements of Vision transformers (ViT) in computer vision, we propose\nan unsupervised segmentation framework using a pre-trained Dino-ViT. In the\nproposed method, we leverage the inherent graph structure within the image to\nrealize a significant performance gain for segmentation in medical images. For\nthis, we introduce a modularity-based loss function coupled with a Graph\nAttention Network (GAT) to effectively capture the inherent graph topology\nwithin the image. Our method achieves state-of-the-art performance, even\nsignificantly surpassing or matching that of existing (semi)supervised\ntechnique such as MedSAM which is a Segment Anything Model in medical images.\nWe demonstrate this using two challenging medical image datasets ISIC-2018 and\nCVC-ColonDB. This work underscores the potential of unsupervised approaches in\nadvancing medical image analysis in scenarios where labeled data is scarce. The\ngithub repository of the code is available on\n[https://github.com/mudit-adityaja/UnSegMedGAT].\n","authors":["A. Mudit Adityaja","Saurabh J. Shigwan","Nitin Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.01966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01962v1","updated":"2024-11-04T10:38:33Z","published":"2024-11-04T10:38:33Z","title":"Deep Learning for Leopard Individual Identification: An Adaptive Angular\n Margin Approach","summary":" Accurate identification of individual leopards across camera trap images is\ncritical for population monitoring and ecological studies. This paper\nintroduces a deep learning framework to distinguish between individual leopards\nbased on their unique spot patterns. This approach employs a novel adaptive\nangular margin method in the form of a modified CosFace architecture. In\naddition, I propose a preprocessing pipeline that combines RGB channels with an\nedge detection channel to underscore the critical features learned by the\nmodel.\n This approach significantly outperforms the Triplet Network baseline,\nachieving a Dynamic Top-5 Average Precision of 0.8814 and a Top-5 Rank Match\nDetection of 0.9533, demonstrating its potential for open-set learning in\nwildlife identification. 
While not surpassing the performance of the SIFT-based\nHotspotter algorithm, this method represents a substantial advancement in\napplying deep learning to patterned wildlife identification.\n This research contributes to the field of computer vision and provides a\nvaluable tool for biologists aiming to study and protect leopard populations.\nIt also serves as a stepping stone for applying the power of deep learning in\nCapture-Recapture studies for other patterned species.\n","authors":["David Colomer Matachana"],"pdf_url":"https://arxiv.org/pdf/2411.01962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01955v1","updated":"2024-11-04T10:27:57Z","published":"2024-11-04T10:27:57Z","title":"Robust plug-and-play methods for highly accelerated non-Cartesian MRI\n reconstruction","summary":" Achieving high-quality Magnetic Resonance Imaging (MRI) reconstruction at\naccelerated acquisition rates remains challenging due to the inherent ill-posed\nnature of the inverse problem. Traditional Compressed Sensing (CS) methods,\nwhile robust across varying acquisition settings, struggle to maintain good\nreconstruction quality at high acceleration factors ($\\ge$ 8). Recent advances\nin deep learning have improved reconstruction quality, but purely data-driven\nmethods are prone to overfitting and hallucination effects, notably when the\nacquisition setting is varying. Plug-and-Play (PnP) approaches have been\nproposed to mitigate the pitfalls of both frameworks. In a nutshell, PnP\nalgorithms amount to replacing suboptimal handcrafted CS priors with powerful\ndenoising deep neural network (DNNs). However, in MRI reconstruction, existing\nPnP methods often yield suboptimal results due to instabilities in the proximal\ngradient descent (PGD) schemes and the lack of curated, noiseless datasets for\ntraining robust denoisers. In this work, we propose a fully unsupervised\npreprocessing pipeline to generate clean, noiseless complex MRI signals from\nmulticoil data, enabling training of a high-performance denoising DNN.\nFurthermore, we introduce an annealed Half-Quadratic Splitting (HQS) algorithm\nto address the instability issues, leading to significant improvements over\nexisting PnP algorithms. When combined with preconditioning techniques, our\napproach achieves state-of-the-art results, providing a robust and efficient\nsolution for high-quality MRI reconstruction.\n","authors":["Pierre-Antoine Comby","Benjamin Lapostolle","Matthieu Terris","Philippe Ciuciu"],"pdf_url":"https://arxiv.org/pdf/2411.01955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00543v2","updated":"2024-11-04T10:21:57Z","published":"2024-11-01T12:50:38Z","title":"3D Equivariant Pose Regression via Direct Wigner-D Harmonics Prediction","summary":" Determining the 3D orientations of an object in an image, known as\nsingle-image pose estimation, is a crucial task in 3D vision applications.\nExisting methods typically learn 3D rotations parametrized in the spatial\ndomain using Euler angles or quaternions, but these representations often\nintroduce discontinuities and singularities. SO(3)-equivariant networks enable\nthe structured capture of pose patterns with data-efficient learning, but the\nparametrizations in spatial domain are incompatible with their architecture,\nparticularly spherical CNNs, which operate in the frequency domain to enhance\ncomputational efficiency. 
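The PnP MRI abstract above replaces handcrafted priors with a denoising network inside an annealed Half-Quadratic Splitting loop. A generic toy HQS iteration for a small linear inverse problem y = A x + n conveys the structure: a closed-form data-fidelity solve alternates with a plug-in denoiser run at a decreasing noise level. The MRI-specific operators, preconditioning, and trained DNN denoiser are assumed to live elsewhere; the box-filter "denoiser" here is only a stand-in:

import numpy as np

def hqs_pnp(y, A, denoiser, mu=1.0, sigmas=np.linspace(0.5, 0.05, 30)):
    """Annealed HQS for y = A x + n: alternate a closed-form data-fidelity
    solve with a plug-in denoiser whose strength decreases over iterations."""
    n = A.shape[1]
    AtA, Aty = A.T @ A, A.T @ y
    x = Aty.copy()                                # crude initialization
    for sigma in sigmas:
        z = denoiser(x, sigma)                    # prior / denoising step
        x = np.linalg.solve(AtA + mu * np.eye(n), Aty + mu * z)  # data-fidelity step
    return x

# toy demo with a smoothing "denoiser" standing in for a trained network
rng = np.random.default_rng(0)
A = rng.standard_normal((40, 64))
x_true = np.convolve(rng.standard_normal(64), np.ones(5) / 5, mode="same")
y = A @ x_true + 0.01 * rng.standard_normal(40)
box_denoiser = lambda v, sigma: np.convolve(v, np.ones(3) / 3, mode="same")
x_hat = hqs_pnp(y, A, box_denoiser)
print(np.linalg.norm(x_hat - x_true) / np.linalg.norm(x_true))   # relative error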
To overcome these issues, we propose a\nfrequency-domain approach that directly predicts Wigner-D coefficients for 3D\nrotation regression, aligning with the operations of spherical CNNs. Our\nSO(3)-equivariant pose harmonics predictor overcomes the limitations of spatial\nparameterizations, ensuring consistent pose estimation under arbitrary\nrotations. Trained with a frequency-domain regression loss, our method achieves\nstate-of-the-art results on benchmarks such as ModelNet10-SO(3) and PASCAL3D+,\nwith significant improvements in accuracy, robustness, and data efficiency.\n","authors":["Jongmin Lee","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2411.00543v2.pdf","comment":"Accepted to NeurIPS 2024, Project webpage at\n http://cvlab.postech.ac.kr/research/3D_EquiPose"},{"id":"http://arxiv.org/abs/2402.14695v2","updated":"2024-11-04T10:20:25Z","published":"2024-02-22T16:49:58Z","title":"QIS : Interactive Segmentation via Quasi-Conformal Mappings","summary":" Image segmentation plays a crucial role in extracting important objects of\ninterest from images, enabling various applications. While existing methods\nhave shown success in segmenting clean images, they often struggle to produce\naccurate segmentation results when dealing with degraded images, such as those\ncontaining noise or occlusions. To address this challenge, interactive\nsegmentation has emerged as a promising approach, allowing users to provide\nmeaningful input to guide the segmentation process. However, an important\nproblem in interactive segmentation lies in determining how to incorporate\nminimal yet meaningful user guidance into the segmentation model. In this\npaper, we propose the quasi-conformal interactive segmentation (QIS) model,\nwhich incorporates user input in the form of positive and negative clicks.\nUsers mark a few pixels belonging to the object region as positive clicks,\nindicating that the segmentation model should include a region around these\nclicks. Conversely, negative clicks are provided on pixels belonging to the\nbackground, instructing the model to exclude the region near these clicks from\nthe segmentation mask. Additionally, the segmentation mask is obtained by\ndeforming a template mask with the same topology as the object of interest\nusing an orientation-preserving quasiconformal mapping. This approach helps to\navoid topological errors in the segmentation results. We provide a thorough\nanalysis of the proposed model, including theoretical support for the ability\nof QIS to include or exclude regions of interest or disinterest based on the\nuser's indication. To evaluate the performance of QIS, we conduct experiments\non synthesized images, medical images, natural images and noisy natural images.\nThe results demonstrate the efficacy of our proposed method.\n","authors":["Han Zhang","Daoping Zhang","Lok Ming Lui"],"pdf_url":"https://arxiv.org/pdf/2402.14695v2.pdf","comment":"34 pages, 14 figures"},{"id":"http://arxiv.org/abs/2411.01948v1","updated":"2024-11-04T10:17:40Z","published":"2024-11-04T10:17:40Z","title":"Learning Where to Edit Vision Transformers","summary":" Model editing aims to data-efficiently correct predictive errors of large\npre-trained models while ensuring generalization to neighboring failures and\nlocality to minimize unintended effects on unrelated examples. While\nsignificant progress has been made in editing Transformer-based large language\nmodels, effective strategies for editing vision Transformers (ViTs) in computer\nvision remain largely untapped. 
In this paper, we take initial steps towards\ncorrecting predictive errors of ViTs, particularly those arising from\nsubpopulation shifts. Taking a locate-then-edit approach, we first address the\nwhere-to-edit challenge by meta-learning a hypernetwork on CutMix-augmented\ndata generated for editing reliability. This trained hypernetwork produces\ngeneralizable binary masks that identify a sparse subset of structured model\nparameters, responsive to real-world failure samples. Afterward, we solve the\nhow-to-edit problem by simply fine-tuning the identified parameters using a\nvariant of gradient descent to achieve successful edits. To validate our\nmethod, we construct an editing benchmark that introduces subpopulation shifts\ntowards natural underrepresented images and AI-generated images, thereby\nrevealing the limitations of pre-trained ViTs for object recognition. Our\napproach not only achieves superior performance on the proposed benchmark but\nalso allows for adjustable trade-offs between generalization and locality. Our\ncode is available at https://github.com/hustyyq/Where-to-Edit.\n","authors":["Yunqiao Yang","Long-Kai Huang","Shengzhuang Chen","Kede Ma","Ying Wei"],"pdf_url":"https://arxiv.org/pdf/2411.01948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05964v2","updated":"2024-11-04T10:14:22Z","published":"2024-08-12T07:33:11Z","title":"Target Detection of Safety Protective Gear Using the Improved YOLOv5","summary":" In high-risk railway construction, personal protective equipment monitoring\nis critical but challenging due to small and frequently obstructed targets. We\npropose YOLO-EA, an innovative model that enhances safety measure detection by\nintegrating ECA into its backbone's convolutional layers, improving discernment\nof minuscule objects like hardhats. YOLO-EA further refines target recognition\nunder occlusion by replacing GIoU with EIoU loss. YOLO-EA's effectiveness was\nempirically substantiated using a dataset derived from real-world railway\nconstruction site surveillance footage. It outperforms YOLOv5, achieving 98.9%\nprecision and 94.7% recall, up 2.5% and 0.5% respectively, while maintaining\nreal-time performance at 70.774 fps. This highly efficient and precise YOLO-EA\nholds great promise for practical application in intricate construction\nscenarios, enforcing stringent safety compliance during complex railway\nconstruction projects.\n","authors":["Hao Liu","Xue Qin"],"pdf_url":"https://arxiv.org/pdf/2408.05964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23085v2","updated":"2024-11-04T10:01:38Z","published":"2024-10-30T15:00:06Z","title":"S3PT: Scene Semantics and Structure Guided Clustering to Boost\n Self-Supervised Pre-Training for Autonomous Driving","summary":" Recent self-supervised clustering-based pre-training techniques like DINO and\nCribo have shown impressive results for downstream detection and segmentation\ntasks. However, real-world applications such as autonomous driving face\nchallenges with imbalanced object class and size distributions and complex\nscene geometries. In this paper, we propose S3PT a novel scene semantics and\nstructure guided clustering to provide more scene-consistent objectives for\nself-supervised training. Specifically, our contributions are threefold: First,\nwe incorporate semantic distribution consistent clustering to encourage better\nrepresentation of rare classes such as motorcycles or animals. 
Second, we\nintroduce object diversity consistent spatial clustering, to handle imbalanced\nand diverse object sizes, ranging from large background areas to small objects\nsuch as pedestrians and traffic signs. Third, we propose a depth-guided spatial\nclustering to regularize learning based on geometric information of the scene,\nthus further refining region separation on the feature level. Our learned\nrepresentations significantly improve performance in downstream semantic\nsegmentation and 3D object detection tasks on the nuScenes, nuImages, and\nCityscapes datasets and show promising domain translation properties.\n","authors":["Maciej K. Wozniak","Hariprasath Govindarajan","Marvin Klingner","Camille Maurice","B Ravi Kiran","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2410.23085v2.pdf","comment":"Accepted for WACV 2025"},{"id":"http://arxiv.org/abs/2410.22392v2","updated":"2024-11-04T09:56:16Z","published":"2024-10-29T17:56:05Z","title":"EfficientNet with Hybrid Attention Mechanisms for Enhanced Breast\n Histopathology Classification: A Comprehensive Approach","summary":" Breast cancer histopathology image classification is crucial for early cancer\ndetection, offering the potential to reduce mortality rates through timely\ndiagnosis. This paper introduces a novel approach integrating Hybrid\nEfficientNet models with advanced attention mechanisms, including Convolutional\nBlock Attention Module (CBAM), Self-Attention, and Deformable Attention, to\nenhance feature extraction and focus on critical image regions. We evaluate the\nperformance of our models across multiple magnification scales using publicly\navailable histopathological datasets. Our method achieves significant\nimprovements, with accuracy reaching 98.42% at 400X magnification, surpassing\nseveral state-of-the-art models, including VGG and ResNet architectures. The\nresults are validated using metrics such as accuracy, F1-score, precision, and\nrecall, demonstrating the clinical potential of our model in improving\ndiagnostic accuracy. Furthermore, the proposed method shows increased\ncomputational efficiency, making it suitable for integration into real-time\ndiagnostic workflows.\n","authors":["Naren Sengodan"],"pdf_url":"https://arxiv.org/pdf/2410.22392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01925v1","updated":"2024-11-04T09:43:33Z","published":"2024-11-04T09:43:33Z","title":"Exploiting Contextual Uncertainty of Visual Data for Efficient Training\n of Deep Models","summary":" Objects, in the real world, rarely occur in isolation and exhibit typical\narrangements governed by their independent utility, and their expected\ninteraction with humans and other objects in the context. For example, a chair\nis expected near a table, and a computer is expected on top. Humans use this\nspatial context and relative placement as an important cue for visual\nrecognition in case of ambiguities. Similar to human's, DNN's exploit\ncontextual information from data to learn representations. Our research focuses\non harnessing the contextual aspects of visual data to optimize data annotation\nand enhance the training of deep networks. 
Our contributions can be summarized\nas follows: (1) We introduce the notion of contextual diversity for active\nlearning CDAL and show its applicability in three different visual tasks\nsemantic segmentation, object detection and image classification, (2) We\npropose a data repair algorithm to curate contextually fair data to reduce\nmodel bias, enabling the model to detect objects out of their obvious context,\n(3) We propose Class-based annotation, where contextually relevant classes are\nselected that are complementary for model training under domain shift.\nUnderstanding the importance of well-curated data, we also emphasize the\nnecessity of involving humans in the loop to achieve accurate annotations and\nto develop novel interaction strategies that allow humans to serve as\nfact-checkers. In line with this we are working on developing image retrieval\nsystem for wildlife camera trap images and reliable warning system for poor\nquality rural roads. For large-scale annotation, we are employing a strategic\ncombination of human expertise and zero-shot models, while also integrating\nhuman input at various stages for continuous feedback.\n","authors":["Sharat Agarwal"],"pdf_url":"https://arxiv.org/pdf/2411.01925v1.pdf","comment":"ICVGIP, Young Researchers Symposium"},{"id":"http://arxiv.org/abs/2411.01919v1","updated":"2024-11-04T09:34:55Z","published":"2024-11-04T09:34:55Z","title":"Real-Time Polygonal Semantic Mapping for Humanoid Robot Stair Climbing","summary":" We present a novel algorithm for real-time planar semantic mapping tailored\nfor humanoid robots navigating complex terrains such as staircases. Our method\nis adaptable to any odometry input and leverages GPU-accelerated processes for\nplanar extraction, enabling the rapid generation of globally consistent\nsemantic maps. We utilize an anisotropic diffusion filter on depth images to\neffectively minimize noise from gradient jumps while preserving essential edge\ndetails, enhancing normal vector images' accuracy and smoothness. Both the\nanisotropic diffusion and the RANSAC-based plane extraction processes are\noptimized for parallel processing on GPUs, significantly enhancing\ncomputational efficiency. Our approach achieves real-time performance,\nprocessing single frames at rates exceeding $30~Hz$, which facilitates detailed\nplane extraction and map management swiftly and efficiently. Extensive testing\nunderscores the algorithm's capabilities in real-time scenarios and\ndemonstrates its practical application in humanoid robot gait planning,\nsignificantly improving its ability to navigate dynamic environments.\n","authors":["Teng Bin","Jianming Yao","Tin Lun Lam","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.01919v1.pdf","comment":"Accepted by The 2024 IEEE-RAS International Conference on Humanoid\n Robots. The code: https://github.com/BTFrontier/polygon_mapping"},{"id":"http://arxiv.org/abs/2406.14056v3","updated":"2024-11-04T09:31:06Z","published":"2024-06-20T07:24:43Z","title":"VGA: Vision GUI Assistant -- Minimizing Hallucinations through\n Image-Centric Fine-Tuning","summary":" Recent advances in Large Vision-Language Models (LVLMs) have significantly\nimprove performance in image comprehension tasks, such as formatted charts and\nrich-content images. 
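The stair-climbing mapping abstract above smooths depth images with an anisotropic diffusion filter before estimating normal vectors, preserving edges such as stair risers. A standard CPU Perona-Malik iteration in NumPy illustrates the filter; the paper's GPU implementation and parameter choices are not reproduced:

import numpy as np

def anisotropic_diffusion(depth, n_iter=20, kappa=0.05, lam=0.2):
    """Perona-Malik diffusion: smooths near-flat regions of a depth map while
    preserving strong gradients (e.g. stair risers), via an edge-stopping
    weight g = exp(-(|grad|/kappa)^2) on each neighbour difference."""
    d = depth.astype(np.float64).copy()
    for _ in range(n_iter):
        dn = np.zeros_like(d); dn[1:, :] = d[:-1, :] - d[1:, :]    # north neighbour
        ds = np.zeros_like(d); ds[:-1, :] = d[1:, :] - d[:-1, :]   # south
        de = np.zeros_like(d); de[:, :-1] = d[:, 1:] - d[:, :-1]   # east
        dw = np.zeros_like(d); dw[:, 1:] = d[:, :-1] - d[:, 1:]    # west
        d += lam * sum(np.exp(-(g / kappa) ** 2) * g for g in (dn, ds, de, dw))
    return d

# synthetic "staircase" depth map with mild sensor noise
steps = np.fromfunction(lambda i, j: i // 20, (100, 100))
noisy = steps + 0.01 * np.random.default_rng(0).standard_normal(steps.shape)
print(float(np.abs(anisotropic_diffusion(noisy) - steps).mean()))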
Yet, Graphical User Interface (GUI) pose a greater\nchallenge due to their structured format and detailed textual information.\nExisting LVLMs often overly depend on internal knowledge and neglect image\ncontent, resulting in hallucinations and incorrect responses in GUI\ncomprehension. To address these issues, we introduce VGA, a fine-tuned model\ndesigned for comprehensive GUI understanding. Our model aims to enhance the\ninterpretation of visual data of GUI and reduce hallucinations. We first\nconstruct a Vision Question Answering (VQA) dataset of 63.8k high-quality\nexamples with our propose Referent Method, which ensures the model's responses\nare highly depend on visual content within the image. We then design a\ntwo-stage fine-tuning method called Foundation and Advanced Comprehension (FAC)\nto enhance both the model's ability to extract information from image content\nand alignment with human intent. Experiments show that our approach enhances\nthe model's ability to extract information from images and achieves\nstate-of-the-art results in GUI understanding tasks. Our dataset and\nfine-tuning script will be released soon.\n","authors":["Ziyang Meng","Yu Dai","Zezheng Gong","Shaoxiong Guo","Minglong Tang","Tongquan Wei"],"pdf_url":"https://arxiv.org/pdf/2406.14056v3.pdf","comment":"Accepted by EMNLP2024"},{"id":"http://arxiv.org/abs/2411.01916v1","updated":"2024-11-04T09:28:18Z","published":"2024-11-04T09:28:18Z","title":"Masked Autoencoders are Parameter-Efficient Federated Continual Learners","summary":" Federated learning is a specific distributed learning paradigm in which a\ncentral server aggregates updates from multiple clients' local models, thereby\nenabling the server to learn without requiring clients to upload their private\ndata, maintaining data privacy. While existing federated learning methods are\nprimarily designed for static data, real-world applications often require\nclients to learn new categories over time. This challenge necessitates the\nintegration of continual learning techniques, resulting in federated continual\nlearning (FCL). Although advanced prompt-based continual learning methods\nleverage pre-trained transformers to mitigate catastrophic forgetting, they do\nnot adequately address the non-IID challenges in federated learning. To address\nboth catastrophic forgetting and non-IID issues, we propose to use masked\nautoencoders (MAEs) as parameter-efficient federated continual learners, called\npMAE. pMAE learns reconstructive prompt on the client side through image\nreconstruction using MAEs. On the server side, it reconstructs the uploaded\nrestore information to capture the data distribution across previous tasks and\ndifferent clients, using these reconstructed images to finetune discriminative\nprompt and classifier parameters designed for classification, thereby\nalleviating catastrophic forgetting and non-IID challenges on a global scale.\nExperimental results demonstrate that pMAE achieves performance comparable to\nexisting prompt-based methods and can enhance their effectiveness, particularly\nwhen using self-supervised pre-trained transformers as the backbone. 
Code is\navailable at: https://github.com/ycheoo/pMAE.\n","authors":["Yuchen He","Xiangfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01904v1","updated":"2024-11-04T09:15:21Z","published":"2024-11-04T09:15:21Z","title":"FPPL: An Efficient and Non-IID Robust Federated Continual Learning\n Framework","summary":" Federated continual learning (FCL) aims to learn from sequential data stream\nin the decentralized federated learning setting, while simultaneously\nmitigating the catastrophic forgetting issue in classical continual learning.\nExisting FCL methods usually employ typical rehearsal mechanisms, which could\nresult in privacy violations or additional onerous storage and computational\nburdens. In this work, an efficient and non-IID robust federated continual\nlearning framework, called Federated Prototype-Augmented Prompt Learning\n(FPPL), is proposed. The FPPL can collaboratively learn lightweight prompts\naugmented by prototypes without rehearsal. On the client side, a fusion\nfunction is employed to fully leverage the knowledge contained in task-specific\nprompts for alleviating catastrophic forgetting. Additionally, global\nprototypes aggregated from the server are used to obtain unified representation\nthrough contrastive learning, mitigating the impact of non-IID-derived data\nheterogeneity. On the server side, locally uploaded prototypes are utilized to\nperform debiasing on the classifier, further alleviating the performance\ndegradation caused by both non-IID and catastrophic forgetting. Empirical\nevaluations demonstrate the effectiveness of FPPL, achieving notable\nperformance with an efficient design while remaining robust to diverse non-IID\ndegrees. Code is available at: https://github.com/ycheoo/FPPL.\n","authors":["Yuchen He","Chuyun Shen","Xiangfeng Wang","Bo Jin"],"pdf_url":"https://arxiv.org/pdf/2411.01904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05645v3","updated":"2024-11-04T09:14:03Z","published":"2024-07-08T06:14:37Z","title":"OneDiff: A Generalist Model for Image Difference Captioning","summary":" In computer vision, Image Difference Captioning (IDC) is crucial for\naccurately describing variations between closely related images. Traditional\nIDC methods often rely on specialist models, which restrict their applicability\nacross varied contexts. This paper introduces the OneDiff model, a novel\ngeneralist approach that utilizes a robust vision-language model architecture,\nintegrating a siamese image encoder with a Visual Delta Module. This innovative\nconfiguration allows for the precise detection and articulation of fine-grained\ndifferences between image pairs. OneDiff is trained through a dual-phase\nstrategy, encompassing Coupled Sample Training and multi-task learning across a\ndiverse array of data types, supported by our newly developed DiffCap Dataset.\nThis dataset merges real-world and synthetic data, enhancing the training\nprocess and bolstering the model's robustness. Extensive testing on diverse IDC\nbenchmarks, such as Spot-the-Diff, Image-Editing-Request, and Birds-to-Words,\nshows that OneDiff consistently outperforms existing state-of-the-art models in\naccuracy and adaptability, achieving improvements of up to 97% CIDEr points in\naverage. By setting a new benchmark in IDC, OneDiff paves the way for more\nversatile and effective applications in detecting and describing visual\ndifferences. 
The code, models, and data will be made publicly available.\n","authors":["Erdong Hu","Longteng Guo","Tongtian Yue","Zijia Zhao","Shuning Xue","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2407.05645v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15706v3","updated":"2024-11-04T09:06:40Z","published":"2024-03-23T03:56:31Z","title":"GACL: Exemplar-Free Generalized Analytic Continual Learning","summary":" Class incremental learning (CIL) trains a network on sequential tasks with\nseparated categories in each task but suffers from catastrophic forgetting,\nwhere models quickly lose previously learned knowledge when acquiring new\ntasks. The generalized CIL (GCIL) aims to address the CIL problem in a more\nreal-world scenario, where incoming data have mixed data categories and unknown\nsample size distribution. Existing attempts for the GCIL either have poor\nperformance or invade data privacy by saving exemplars. In this paper, we\npropose a new exemplar-free GCIL technique named generalized analytic continual\nlearning (GACL). The GACL adopts analytic learning (a gradient-free training\ntechnique) and delivers an analytical (i.e., closed-form) solution to the GCIL\nscenario. This solution is derived via decomposing the incoming data into\nexposed and unexposed classes, thereby attaining a weight-invariant property, a\nrare yet valuable property supporting an equivalence between incremental\nlearning and its joint training. Such an equivalence is crucial in GCIL\nsettings as data distributions among different tasks no longer pose challenges\nto adopting our GACL. Theoretically, this equivalence property is validated\nthrough matrix analysis tools. Empirically, we conduct extensive experiments\nwhere, compared with existing GCIL methods, our GACL exhibits a consistently\nleading performance across various datasets and GCIL settings. Source code is\navailable at https://github.com/CHEN-YIZHU/GACL.\n","authors":["Huiping Zhuang","Yizhu Chen","Di Fang","Run He","Kai Tong","Hongxin Wei","Ziqian Zeng","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.15706v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01896v1","updated":"2024-11-04T09:03:43Z","published":"2024-11-04T09:03:43Z","title":"MBDRes-U-Net: Multi-Scale Lightweight Brain Tumor Segmentation Network","summary":" Accurate segmentation of brain tumors plays a key role in the diagnosis and\ntreatment of brain tumor diseases. It serves as a critical technology for\nquantifying tumors and extracting their features. With the increasing\napplication of deep learning methods, the computational burden has become\nprogressively heavier. To achieve a lightweight model with good segmentation\nperformance, this study proposes the MBDRes-U-Net model using the\nthree-dimensional (3D) U-Net codec framework, which integrates multibranch\nresidual blocks and fused attention into the model. The computational burden of\nthe model is reduced by the branch strategy, which effectively uses the rich\nlocal features in multimodal images and enhances the segmentation performance\nof subtumor regions. Additionally, during encoding, an adaptive weighted\nexpansion convolution layer is introduced into the multi-branch residual block,\nwhich enriches the feature expression and improves the segmentation accuracy of\nthe model. 
Experiments on the Brain Tumor Segmentation (BraTS) Challenge 2018\nand 2019 datasets show that the architecture could maintain a high precision of\nbrain tumor segmentation while considerably reducing the calculation\noverhead.Our code is released at\nhttps://github.com/Huaibei-normal-university-cv-laboratory/mbdresunet\n","authors":["Longfeng Shen","Yanqi Hou","Jiacong Chen","Liangjin Diao","Yaxi Duan"],"pdf_url":"https://arxiv.org/pdf/2411.01896v1.pdf","comment":"Brain tumor segmentation, lightweight model, Brain Tumor Segmentation\n (BraTS) Challenge, group convolution"},{"id":"http://arxiv.org/abs/2405.17815v2","updated":"2024-11-04T09:03:31Z","published":"2024-05-28T04:23:00Z","title":"Visual Anchors Are Strong Information Aggregators For Multimodal Large\n Language Model","summary":" In the realm of Multimodal Large Language Models (MLLMs), vision-language\nconnector plays a crucial role to link the pre-trained vision encoders with\nLarge Language Models (LLMs). Despite its importance, the vision-language\nconnector has been relatively less explored. In this study, we aim to propose a\nstrong vision-language connector that enables MLLMs to achieve high accuracy\nwhile maintain low computation cost. We first reveal the existence of the\nvisual anchors in Vision Transformer and propose a cost-effective search\nalgorithm to extract them. Building on these findings, we introduce the Anchor\nFormer (AcFormer), a novel vision-language connector designed to leverage the\nrich prior knowledge obtained from these visual anchors during pretraining,\nguiding the aggregation of information. Through extensive experimentation, we\ndemonstrate that the proposed method significantly reduces computational costs\nby nearly two-thirds compared with baseline, while simultaneously outperforming\nbaseline methods. This highlights the effectiveness and efficiency of AcFormer.\nCodes are available at https://github.com/liuhaogeng/Anchor-Former.\n","authors":["Haogeng Liu","Quanzeng You","Xiaotian Han","Yongfei Liu","Huaibo Huang","Ran He","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2405.17815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05643v2","updated":"2024-11-04T08:58:14Z","published":"2024-10-08T02:46:30Z","title":"TRACE: Temporal Grounding Video LLM via Causal Event Modeling","summary":" Video Temporal Grounding (VTG) is a crucial capability for video\nunderstanding models and plays a vital role in downstream tasks such as video\nbrowsing and editing. To effectively handle various tasks simultaneously and\nenable zero-shot prediction, there is a growing trend in employing video LLMs\nfor VTG tasks. However, current video LLM-based methods rely exclusively on\nnatural language generation, lacking the ability to model the clear structure\ninherent in videos, which restricts their effectiveness in tackling VTG tasks.\nTo address this issue, this paper first formally introduces causal event\nmodeling framework, which represents videos as sequences of events, and predict\nthe current event using previous events, video inputs, and textural\ninstructions. Each event consists of three components: timestamps, salient\nscores, and textual captions. We then propose a novel task-interleaved video\nLLM called TRACE to effectively implement the causal event modeling framework\nin practice. 
The TRACE processes visual frames, timestamps, salient scores, and\ntext as distinct tasks, employing various encoders and decoding heads for each.\nTask tokens are arranged in an interleaved sequence according to the causal\nevent modeling framework's formulation. Extensive experiments on various VTG\ntasks and datasets demonstrate the superior performance of TRACE compared to\nstate-of-the-art video LLMs. Our model and code are available at\n\\url{https://github.com/gyxxyg/TRACE}.\n","authors":["Yongxin Guo","Jingyu Liu","Mingda Li","Xiaoying Tang","Qingbin Liu","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2410.05643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01893v1","updated":"2024-11-04T08:50:16Z","published":"2024-11-04T08:50:16Z","title":"A Global Depth-Range-Free Multi-View Stereo Transformer Network with\n Pose Embedding","summary":" In this paper, we propose a novel multi-view stereo (MVS) framework that gets\nrid of the depth range prior. Unlike recent prior-free MVS methods that work in\na pair-wise manner, our method simultaneously considers all the source images.\nSpecifically, we introduce a Multi-view Disparity Attention (MDA) module to\naggregate long-range context information within and across multi-view images.\nConsidering the asymmetry of the epipolar disparity flow, the key to our method\nlies in accurately modeling multi-view geometric constraints. We integrate pose\nembedding to encapsulate information such as multi-view camera poses, providing\nimplicit geometric constraints for multi-view disparity feature fusion\ndominated by attention. Additionally, we construct corresponding hidden states\nfor each source image due to significant differences in the observation quality\nof the same pixel in the reference frame across multiple source frames. We\nexplicitly estimate the quality of the current pixel corresponding to sampled\npoints on the epipolar line of the source image and dynamically update hidden\nstates through the uncertainty estimation module. Extensive results on the DTU\ndataset and Tanks&Temple benchmark demonstrate the effectiveness of our method.\nThe code is available at our project page:\nhttps://zju3dv.github.io/GD-PoseMVS/.\n","authors":["Yitong Dong","Yijin Li","Zhaoyang Huang","Weikang Bian","Jingbo Liu","Hujun Bao","Zhaopeng Cui","Hongsheng Li","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.01893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02554v2","updated":"2024-11-04T08:48:19Z","published":"2024-02-04T15:59:35Z","title":"DeSparsify: Adversarial Attack Against Token Sparsification Mechanisms\n in Vision Transformers","summary":" Vision transformers have contributed greatly to advancements in the computer\nvision domain, demonstrating state-of-the-art performance in diverse tasks\n(e.g., image classification, object detection). However, their high\ncomputational requirements grow quadratically with the number of tokens used.\nToken sparsification mechanisms have been proposed to address this issue. These\nmechanisms employ an input-dependent strategy, in which uninformative tokens\nare discarded from the computation pipeline, improving the model's efficiency.\nHowever, their dynamism and average-case assumption makes them vulnerable to a\nnew threat vector - carefully crafted adversarial examples capable of fooling\nthe sparsification mechanism, resulting in worst-case performance. 
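TRACE, summarized above, represents a video as a sequence of events, each carrying a timestamp, a salient score, and a caption, and arranges the corresponding task tokens in an interleaved sequence. The schematic below shows one possible flattening of such events into an interleaved token stream; the token formats and the end-of-event marker are purely illustrative assumptions:

from dataclasses import dataclass
from typing import List

@dataclass
class Event:
    timestamp: float       # seconds into the video
    salient_score: float   # how important the event is
    caption: str           # textual description of the event

def interleave(events: List[Event]) -> List[str]:
    """Flatten events into one interleaved token stream: for each event, a time
    token, then a score token, then caption tokens (layout is illustrative)."""
    seq: List[str] = []
    for e in events:
        seq += [f"<time={e.timestamp:.1f}>", f"<score={e.salient_score:.2f}>"]
        seq += e.caption.split() + ["<eoe>"]     # assumed end-of-event marker
    return seq

events = [Event(3.2, 0.91, "person opens the door"),
          Event(7.8, 0.42, "person walks to the desk")]
print(interleave(events))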
In this\npaper, we present DeSparsify, an attack targeting the availability of vision\ntransformers that use token sparsification mechanisms. The attack aims to\nexhaust the operating system's resources, while maintaining its stealthiness.\nOur evaluation demonstrates the attack's effectiveness on three token\nsparsification mechanisms and examines the attack's transferability between\nthem and its effect on the GPU resources. To mitigate the impact of the attack,\nwe propose various countermeasures.\n","authors":["Oryan Yehezkel","Alon Zolfi","Amit Baras","Yuval Elovici","Asaf Shabtai"],"pdf_url":"https://arxiv.org/pdf/2402.02554v2.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.15751v2","updated":"2024-11-04T08:48:10Z","published":"2024-03-23T07:39:13Z","title":"F-OAL: Forward-only Online Analytic Learning with Fast Training and Low\n Memory Footprint in Class Incremental Learning","summary":" Online Class Incremental Learning (OCIL) aims to train models incrementally,\nwhere data arrive in mini-batches, and previous data are not accessible. A\nmajor challenge in OCIL is Catastrophic Forgetting, i.e., the loss of\npreviously learned knowledge. Among existing baselines, replay-based methods\nshow competitive results but requires extra memory for storing exemplars, while\nexemplar-free (i.e., data need not be stored for replay in production) methods\nare resource-friendly but often lack accuracy. In this paper, we propose an\nexemplar-free approach--Forward-only Online Analytic Learning (F-OAL). Unlike\ntraditional methods, F-OAL does not rely on back-propagation and is\nforward-only, significantly reducing memory usage and computational time.\nCooperating with a pre-trained frozen encoder with Feature Fusion, F-OAL only\nneeds to update a linear classifier by recursive least square. This approach\nsimultaneously achieves high accuracy and low resource consumption. Extensive\nexperiments on benchmark datasets demonstrate F-OAL's robust performance in\nOCIL scenarios. Code is available at https://github.com/liuyuchen-cz/F-OAL.\n","authors":["Huiping Zhuang","Yuchen Liu","Run He","Kai Tong","Ziqian Zeng","Cen Chen","Yi Wang","Lap-Pui Chau"],"pdf_url":"https://arxiv.org/pdf/2403.15751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19458v3","updated":"2024-11-04T08:43:30Z","published":"2024-05-29T19:12:08Z","title":"MemControl: Mitigating Memorization in Diffusion Models via Automated\n Parameter Selection","summary":" Diffusion models excel in generating images that closely resemble their\ntraining data but are also susceptible to data memorization, raising privacy,\nethical, and legal concerns, particularly in sensitive domains such as medical\nimaging. We hypothesize that this memorization stems from the\noverparameterization of deep models and propose that regularizing model\ncapacity during fine-tuning can mitigate this issue. Firstly, we empirically\nshow that regulating the model capacity via Parameter-efficient fine-tuning\n(PEFT) mitigates memorization to some extent, however, it further requires the\nidentification of the exact parameter subsets to be fine-tuned for high-quality\ngeneration. To identify these subsets, we introduce a bi-level optimization\nframework, MemControl, that automates parameter selection using memorization\nand generation quality metrics as rewards during fine-tuning. The parameter\nsubsets discovered through MemControl achieve a superior tradeoff between\ngeneration quality and memorization. 
For the task of medical image generation,\nour approach outperforms existing state-of-the-art memorization mitigation\nstrategies by fine-tuning as few as 0.019% of model parameters. Moreover, we\ndemonstrate that the discovered parameter subsets are transferable to\nnon-medical domains. Our framework is scalable to large datasets, agnostic to\nreward functions, and can be integrated with existing approaches for further\nmemorization mitigation. To the best of our knowledge, this is the first study\nto empirically evaluate memorization in medical images and propose a targeted\nyet universal mitigation strategy. The code is available at\nhttps://github.com/Raman1121/Diffusion_Memorization_HPO\n","authors":["Raman Dutt","Ondrej Bohdal","Pedro Sanchez","Sotirios A. Tsaftaris","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2405.19458v3.pdf","comment":"Accepted at WACV'25 (Applications Track)"},{"id":"http://arxiv.org/abs/2411.01889v1","updated":"2024-11-04T08:37:12Z","published":"2024-11-04T08:37:12Z","title":"LiDAttack: Robust Black-box Attack on LiDAR-based Object Detection","summary":" Since DNN is vulnerable to carefully crafted adversarial examples,\nadversarial attack on LiDAR sensors have been extensively studied. We introduce\na robust black-box attack dubbed LiDAttack. It utilizes a genetic algorithm\nwith a simulated annealing strategy to strictly limit the location and number\nof perturbation points, achieving a stealthy and effective attack. And it\nsimulates scanning deviations, allowing it to adapt to dynamic changes in real\nworld scenario variations. Extensive experiments are conducted on 3 datasets\n(i.e., KITTI, nuScenes, and self-constructed data) with 3 dominant object\ndetection models (i.e., PointRCNN, PointPillar, and PV-RCNN++). The results\nreveal the efficiency of the LiDAttack when targeting a wide range of object\ndetection models, with an attack success rate (ASR) up to 90%.\n","authors":["Jinyin Chen","Danxin Liao","Sheng Xiang","Haibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.01889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01870v1","updated":"2024-11-04T07:57:44Z","published":"2024-11-04T07:57:44Z","title":"Mining and Transferring Feature-Geometry Coherence for Unsupervised\n Point Cloud Registration","summary":" Point cloud registration, a fundamental task in 3D vision, has achieved\nremarkable success with learning-based methods in outdoor environments.\nUnsupervised outdoor point cloud registration methods have recently emerged to\ncircumvent the need for costly pose annotations. However, they fail to\nestablish reliable optimization objectives for unsupervised training, either\nrelying on overly strong geometric assumptions, or suffering from poor-quality\npseudo-labels due to inadequate integration of low-level geometric and\nhigh-level contextual information. We have observed that in the feature space,\nlatent new inlier correspondences tend to cluster around respective positive\nanchors that summarize features of existing inliers. Motivated by this\nobservation, we propose a novel unsupervised registration method termed INTEGER\nto incorporate high-level contextual information for reliable pseudo-label\nmining. Specifically, we propose the Feature-Geometry Coherence Mining module\nto dynamically adapt the teacher for each mini-batch of data during training\nand discover reliable pseudo-labels by considering both high-level feature\nrepresentations and low-level geometric cues. 
Furthermore, we propose\nAnchor-Based Contrastive Learning to facilitate contrastive learning with\nanchors for a robust feature space. Lastly, we introduce a Mixed-Density\nStudent to learn density-invariant features, addressing challenges related to\ndensity variation and low overlap in the outdoor scenario. Extensive\nexperiments on KITTI and nuScenes datasets demonstrate that our INTEGER\nachieves competitive performance in terms of accuracy and generalizability.\n","authors":["Kezheng Xiong","Haoen Xiang","Qingshan Xu","Chenglu Wen","Siqi Shen","Jonathan Li","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01870v1.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2406.05967v2","updated":"2024-11-04T07:55:31Z","published":"2024-06-10T01:59:00Z","title":"CVQA: Culturally-diverse Multilingual Visual Question Answering\n Benchmark","summary":" Visual Question Answering (VQA) is an important task in multimodal AI, and it\nis often used to test the ability of vision-language models to understand and\nreason on knowledge present in both visual and textual data. However, most of\nthe current VQA models use datasets that are primarily focused on English and a\nfew major world languages, with images that are typically Western-centric.\nWhile recent efforts have tried to increase the number of languages covered on\nVQA datasets, they still lack diversity in low-resource languages. More\nimportantly, although these datasets often extend their linguistic range via\ntranslation or some other approaches, they usually keep images the same,\nresulting in narrow cultural representation. To address these limitations, we\nconstruct CVQA, a new Culturally-diverse multilingual Visual Question Answering\nbenchmark, designed to cover a rich set of languages and cultures, where we\nengage native speakers and cultural experts in the data collection process. As\na result, CVQA includes culturally-driven images and questions from across 30\ncountries on four continents, covering 31 languages with 13 scripts, providing\na total of 10k questions. We then benchmark several Multimodal Large Language\nModels (MLLMs) on CVQA, and show that the dataset is challenging for the\ncurrent state-of-the-art models. 
This benchmark can serve as a probing\nevaluation suite for assessing the cultural capability and bias of multimodal\nmodels and hopefully encourage more research efforts toward increasing cultural\nawareness and linguistic diversity in this field.\n","authors":["David Romero","Chenyang Lyu","Haryo Akbarianto Wibowo","Teresa Lynn","Injy Hamed","Aditya Nanda Kishore","Aishik Mandal","Alina Dragonetti","Artem Abzaliev","Atnafu Lambebo Tonja","Bontu Fufa Balcha","Chenxi Whitehouse","Christian Salamea","Dan John Velasco","David Ifeoluwa Adelani","David Le Meur","Emilio Villa-Cueva","Fajri Koto","Fauzan Farooqui","Frederico Belcavello","Ganzorig Batnasan","Gisela Vallejo","Grainne Caulfield","Guido Ivetta","Haiyue Song","Henok Biadglign Ademtew","Hernán Maina","Holy Lovenia","Israel Abebe Azime","Jan Christian Blaise Cruz","Jay Gala","Jiahui Geng","Jesus-German Ortiz-Barajas","Jinheon Baek","Jocelyn Dunstan","Laura Alonso Alemany","Kumaranage Ravindu Yasas Nagasinghe","Luciana Benotti","Luis Fernando D'Haro","Marcelo Viridiano","Marcos Estecha-Garitagoitia","Maria Camila Buitrago Cabrera","Mario Rodríguez-Cantelar","Mélanie Jouitteau","Mihail Mihaylov","Mohamed Fazli Mohamed Imam","Muhammad Farid Adilazuarda","Munkhjargal Gochoo","Munkh-Erdene Otgonbold","Naome Etori","Olivier Niyomugisha","Paula Mónica Silva","Pranjal Chitale","Raj Dabre","Rendi Chevi","Ruochen Zhang","Ryandito Diandaru","Samuel Cahyawijaya","Santiago Góngora","Soyeong Jeong","Sukannya Purkayastha","Tatsuki Kuribayashi","Teresa Clifford","Thanmay Jayakumar","Tiago Timponi Torrent","Toqeer Ehsan","Vladimir Araujo","Yova Kementchedjhieva","Zara Burzo","Zheng Wei Lim","Zheng Xin Yong","Oana Ignat","Joan Nwatu","Rada Mihalcea","Thamar Solorio","Alham Fikri Aji"],"pdf_url":"https://arxiv.org/pdf/2406.05967v2.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2411.01859v1","updated":"2024-11-04T07:21:06Z","published":"2024-11-04T07:21:06Z","title":"A Novel Deep Learning Tractography Fiber Clustering Framework for\n Functionally Consistent White Matter Parcellation Using Multimodal Diffusion\n MRI and Functional MRI","summary":" Tractography fiber clustering using diffusion MRI (dMRI) is a crucial\nstrategy for white matter (WM) parcellation. Current methods primarily use the\ngeometric information of fibers (i.e., the spatial trajectories) to group\nsimilar fibers into clusters, overlooking the important functional signals\npresent along the fiber tracts. There is increasing evidence that neural\nactivity in the WM can be measured using functional MRI (fMRI), offering\npotentially valuable multimodal information for fiber clustering. In this\npaper, we develop a novel deep learning fiber clustering framework, namely Deep\nMulti-view Fiber Clustering (DMVFC), that uses joint dMRI and fMRI data to\nenable functionally consistent WM parcellation. DMVFC can effectively integrate\nthe geometric characteristics of the WM fibers with the fMRI BOLD signals along\nthe fiber tracts. It includes two major components: 1) a multi-view pretraining\nmodule to compute embedding features from fiber geometric information and\nfunctional signals separately, and 2) a collaborative fine-tuning module to\nsimultaneously refine the two kinds of embeddings. 
In the experiments, we\ncompare DMVFC with two state-of-the-art fiber clustering methods and\ndemonstrate superior performance in achieving functionally meaningful and\nconsistent WM parcellation results.\n","authors":["Jin Wang","Bocheng Guo","Yijie Li","Junyi Wang","Yuqian Chen","Jarrett Rushmore","Nikos Makris","Yogesh Rathi","Lauren J O'Donnell","Fan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.01859v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.03879v2","updated":"2024-11-04T07:16:54Z","published":"2024-06-06T09:14:32Z","title":"Decay Pruning Method: Smooth Pruning With a Self-Rectifying Procedure","summary":" Current structured pruning methods often result in considerable accuracy\ndrops due to abrupt network changes and loss of information from pruned\nstructures. To address these issues, we introduce the Decay Pruning Method\n(DPM), a novel smooth pruning approach with a self-rectifying mechanism. DPM\nconsists of two key components: (i) Smooth Pruning: It converts conventional\nsingle-step pruning into multi-step smooth pruning, gradually reducing\nredundant structures to zero over N steps with ongoing optimization. (ii)\nSelf-Rectifying: This procedure further enhances the aforementioned process by\nrectifying sub-optimal pruning based on gradient information. Our approach\ndemonstrates strong generalizability and can be easily integrated with various\nexisting pruning methods. We validate the effectiveness of DPM by integrating\nit with three popular pruning methods: OTOv2, Depgraph, and Gate Decorator.\nExperimental results show consistent improvements in performance compared to\nthe original pruning methods, along with further reductions of FLOPs in most\nscenarios.\n","authors":["Minghao Yang","Linlin Gao","Pengyuan Li","Wenbo Li","Yihong Dong","Zhiying Cui"],"pdf_url":"https://arxiv.org/pdf/2406.03879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01853v1","updated":"2024-11-04T07:07:31Z","published":"2024-11-04T07:07:31Z","title":"GVKF: Gaussian Voxel Kernel Functions for Highly Efficient Surface\n Reconstruction in Open Scenes","summary":" In this paper we present a novel method for efficient and effective 3D\nsurface reconstruction in open scenes. Existing Neural Radiance Fields (NeRF)\nbased works typically require extensive training and rendering time due to the\nadopted implicit representations. In contrast, 3D Gaussian splatting (3DGS)\nuses an explicit and discrete representation, hence the reconstructed surface\nis built by the huge number of Gaussian primitives, which leads to excessive\nmemory consumption and rough surface details in sparse Gaussian areas. To\naddress these issues, we propose Gaussian Voxel Kernel Functions (GVKF), which\nestablish a continuous scene representation based on discrete 3DGS through\nkernel regression. The GVKF integrates fast 3DGS rasterization and highly\neffective scene implicit representations, achieving high-fidelity open scene\nsurface reconstruction. 
Experiments on challenging scene datasets demonstrate\nthe efficiency and effectiveness of our proposed GVKF, which features high\nreconstruction quality, real-time rendering speed, and significant savings in\nstorage and training memory consumption.\n","authors":["Gaochao Song","Chong Cheng","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01853v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.01851v1","updated":"2024-11-04T07:05:47Z","published":"2024-11-04T07:05:47Z","title":"Silver medal Solution for Image Matching Challenge 2024","summary":" Image Matching Challenge 2024 is a competition focused on building 3D maps\nfrom diverse image sets, requiring participants to solve fundamental computer\nvision challenges in image matching across varying angles, lighting, and\nseasonal changes. This project develops a Pipeline method that combines\nmultiple advanced techniques: using pre-trained EfficientNet-B7 for initial\nfeature extraction and cosine distance-based image pair filtering, employing\nboth KeyNetAffNetHardNet and SuperPoint for keypoint feature extraction,\nutilizing AdaLAM and SuperGlue for keypoint matching, and finally applying\nPycolmap for 3D spatial analysis. The methodology achieved an excellent score\nof 0.167 on the private leaderboard, with experimental results demonstrating\nthat the combination of KeyNetAffNetHardNet and SuperPoint provides significant\nadvantages in keypoint detection and matching, particularly when dealing with\nchallenging variations in surface texture and environmental conditions that\ntypically degrade traditional algorithm performance.\n","authors":["Yian Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01846v1","updated":"2024-11-04T06:42:24Z","published":"2024-11-04T06:42:24Z","title":"KptLLM: Unveiling the Power of Large Language Model for Keypoint\n Comprehension","summary":" Recent advancements in Multimodal Large Language Models (MLLMs) have greatly\nimproved their abilities in image understanding. However, these models often\nstruggle with grasping pixel-level semantic details, e.g., the keypoints of an\nobject. To bridge this gap, we introduce the novel challenge of Semantic\nKeypoint Comprehension, which aims to comprehend keypoints across different\ntask scenarios, including keypoint semantic understanding, visual prompt-based\nkeypoint detection, and textual prompt-based keypoint detection. Moreover, we\nintroduce KptLLM, a unified multimodal model that utilizes an\nidentify-then-detect strategy to effectively address these challenges. KptLLM\nunderscores the initial discernment of semantics in keypoints, followed by the\nprecise determination of their positions through a chain-of-thought process.\nWith several carefully designed modules, KptLLM adeptly handles various\nmodality inputs, facilitating the interpretation of both semantic contents and\nkeypoint locations. 
Our extensive experiments demonstrate KptLLM's superiority\nin various keypoint detection benchmarks and its unique semantic capabilities\nin interpreting keypoints.\n","authors":["Jie Yang","Wang Zeng","Sheng Jin","Lumin Xu","Wentao Liu","Chen Qian","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.01846v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.20024v3","updated":"2024-11-04T06:08:30Z","published":"2024-06-28T16:13:55Z","title":"eMoE-Tracker: Environmental MoE-based Transformer for Robust\n Event-guided Object Tracking","summary":" The unique complementarity of frame-based and event cameras for high frame\nrate object tracking has recently inspired some research attempts to develop\nmulti-modal fusion approaches. However, these methods directly fuse both\nmodalities and thus ignore the environmental attributes, e.g., motion blur,\nillumination variance, occlusion, scale variation, etc. Meanwhile, insufficient\ninteraction between search and template features makes distinguishing target\nobjects and backgrounds difficult. As a result, performance degradation is\ninduced especially in challenging conditions. This paper proposes a novel and\neffective Transformer-based event-guided tracking framework, called\neMoE-Tracker, which achieves new SOTA performance under various conditions. Our\nkey idea is to disentangle the environment into several learnable attributes to\ndynamically learn the attribute-specific features and strengthen the target\ninformation by improving the interaction between the target template and search\nregions. To achieve the goal, we first propose an environmental Mix-of-Experts\n(eMoE) module that is built upon the environmental Attributes Disentanglement\nto learn attribute-specific features and environmental Attributes Assembling to\nassemble the attribute-specific features by the learnable attribute scores\ndynamically. The eMoE module is a subtle router that prompt-tunes the\ntransformer backbone more efficiently. We then introduce a contrastive relation\nmodeling (CRM) module to emphasize target information by leveraging a\ncontrastive learning strategy between the target template and search regions.\nExtensive experiments on diverse event-based benchmark datasets showcase the\nsuperior performance of our eMoE-Tracker compared to the prior arts.\n","authors":["Yucheng Chen","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2406.20024v3.pdf","comment":"RGB-event single object tracking"},{"id":"http://arxiv.org/abs/2411.01833v1","updated":"2024-11-04T06:07:43Z","published":"2024-11-04T06:07:43Z","title":"OwMatch: Conditional Self-Labeling with Consistency for Open-World\n Semi-Supervised Learning","summary":" Semi-supervised learning (SSL) offers a robust framework for harnessing the\npotential of unannotated data. Traditionally, SSL mandates that all classes\npossess labeled instances. However, the emergence of open-world SSL (OwSSL)\nintroduces a more practical challenge, wherein unlabeled data may encompass\nsamples from unseen classes. This scenario leads to misclassification of unseen\nclasses as known ones, consequently undermining classification accuracy. To\novercome this challenge, this study revisits two methodologies from\nself-supervised and semi-supervised learning, self-labeling and consistency,\ntailoring them to address the OwSSL problem. Specifically, we propose an\neffective framework called OwMatch, combining conditional self-labeling and\nopen-world hierarchical thresholding. 
Theoretically, we analyze the estimation\nof class distribution on unlabeled data through rigorous statistical analysis,\nthus demonstrating that OwMatch can ensure the unbiasedness of the self-label\nassignment estimator with reliability. Comprehensive empirical analyses\ndemonstrate that our method yields substantial performance enhancements across\nboth known and unknown classes in comparison to previous studies. Code is\navailable at https://github.com/niusj03/OwMatch.\n","authors":["Shengjie Niu","Lifan Lin","Jian Huang","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01833v1.pdf","comment":"NeurIPS 2024 camera-ready (10 pages, 4 figures) with the appendices\n (10 pages, 7 figures)"},{"id":"http://arxiv.org/abs/2310.02901v3","updated":"2024-11-04T06:06:02Z","published":"2023-10-04T15:39:57Z","title":"Efficient Vectorized Backpropagation Algorithms for Training Feedforward\n Networks Composed of Quadratic Neurons","summary":" Higher order artificial neurons whose outputs are computed by applying an\nactivation function to a higher order multinomial function of the inputs have\nbeen considered in the past, but did not gain acceptance due to the extra\nparameters and computational cost. However, higher order neurons have\nsignificantly greater learning capabilities since the decision boundaries of\nhigher order neurons can be complex surfaces instead of just hyperplanes. The\nboundary of a single quadratic neuron can be a general hyper-quadric surface\nallowing it to learn many nonlinearly separable datasets. Since quadratic forms\ncan be represented by symmetric matrices, only $\\frac{n(n+1)}{2}$ additional\nparameters are needed instead of $n^2$. A quadratic Logistic regression model\nis first presented. Solutions to the XOR problem with a single quadratic neuron\nare considered. The complete vectorized equations for both forward and backward\npropagation in feedforward networks composed of quadratic neurons are derived.\nA reduced parameter quadratic neural network model with just $ n $ additional\nparameters per neuron that provides a compromise between learning ability and\ncomputational cost is presented. Comparison on benchmark classification\ndatasets are used to demonstrate that a final layer of quadratic neurons\nenables networks to achieve higher accuracy with significantly fewer hidden\nlayer neurons. In particular this paper shows that any dataset composed of\n$\\mathcal{C}$ bounded clusters can be separated with only a single layer of\n$\\mathcal{C}$ quadratic neurons.\n","authors":["Mathew Mithra Noel","Venkataraman Muthiah-Nakarajan"],"pdf_url":"https://arxiv.org/pdf/2310.02901v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.16999v3","updated":"2024-11-04T05:50:56Z","published":"2024-03-25T17:59:23Z","title":"Visual CoT: Advancing Multi-Modal Language Models with a Comprehensive\n Dataset and Benchmark for Chain-of-Thought Reasoning","summary":" Multi-Modal Large Language Models (MLLMs) have demonstrated impressive\nperformance in various VQA tasks. However, they often lack interpretability and\nstruggle with complex visual inputs, especially when the resolution of the\ninput image is high or when the interested region that could provide key\ninformation for answering the question is small. To address these challenges,\nwe collect and introduce the large-scale Visual CoT dataset comprising 438k\nquestion-answer pairs, annotated with intermediate bounding boxes highlighting\nkey regions essential for answering the questions. 
Additionally, about 98k\npairs of them are annotated with detailed reasoning steps. Importantly, we\npropose a multi-turn processing pipeline that dynamically focuses on visual\ninputs and provides interpretable thoughts. We also introduce the related\nbenchmark to evaluate the MLLMs in scenarios requiring specific local region\nidentification. Extensive experiments demonstrate the effectiveness of our\nframework and shed light on better inference strategies. The Visual CoT\ndataset, benchmark, and pre-trained models are available on\nhttps://hao-shao.com/projects/viscot.html to support further research in this\narea.\n","authors":["Hao Shao","Shengju Qian","Han Xiao","Guanglu Song","Zhuofan Zong","Letian Wang","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.16999v3.pdf","comment":"Project Page: https://hao-shao.com/projects/viscot.html"},{"id":"http://arxiv.org/abs/2410.24060v2","updated":"2024-11-04T05:44:08Z","published":"2024-10-31T15:57:04Z","title":"Understanding Generalizability of Diffusion Models Requires Rethinking\n the Hidden Gaussian Structure","summary":" In this work, we study the generalizability of diffusion models by looking\ninto the hidden properties of the learned score functions, which are\nessentially a series of deep denoisers trained on various noise levels. We\nobserve that as diffusion models transition from memorization to\ngeneralization, their corresponding nonlinear diffusion denoisers exhibit\nincreasing linearity. This discovery leads us to investigate the linear\ncounterparts of the nonlinear diffusion models, which are a series of linear\nmodels trained to match the function mappings of the nonlinear diffusion\ndenoisers. Surprisingly, these linear denoisers are approximately the optimal\ndenoisers for a multivariate Gaussian distribution characterized by the\nempirical mean and covariance of the training dataset. This finding implies\nthat diffusion models have the inductive bias towards capturing and utilizing\nthe Gaussian structure (covariance information) of the training dataset for\ndata generation. We empirically demonstrate that this inductive bias is a\nunique property of diffusion models in the generalization regime, which becomes\nincreasingly evident when the model's capacity is relatively small compared to\nthe training dataset size. In the case that the model is highly\noverparameterized, this inductive bias emerges during the initial training\nphases before the model fully memorizes its training data. Our study provides\ncrucial insights into understanding the notable strong generalization\nphenomenon recently observed in real-world diffusion models.\n","authors":["Xiang Li","Yixiang Dai","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2410.24060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08590v2","updated":"2024-11-04T05:43:17Z","published":"2024-04-12T16:38:48Z","title":"Vision-Aware Text Features in Referring Image Segmentation: From Object\n Understanding to Context Understanding","summary":" Referring image segmentation is a challenging task that involves generating\npixel-wise segmentation masks based on natural language descriptions. The\ncomplexity of this task increases with the intricacy of the sentences provided.\nExisting methods have relied mostly on visual features to generate the\nsegmentation masks while treating text features as supporting components.\nHowever, this under-utilization of text understanding limits the model's\ncapability to fully comprehend the given expressions. 
In this work, we propose\na novel framework that specifically emphasizes object and context comprehension\ninspired by human cognitive processes through Vision-Aware Text Features.\nFirstly, we introduce a CLIP Prior module to localize the main object of\ninterest and embed the object heatmap into the query initialization process.\nSecondly, we propose a combination of two components: Contextual Multimodal\nDecoder and Meaning Consistency Constraint, to further enhance the coherent and\nconsistent interpretation of language cues with the contextual understanding\nobtained from the image. Our method achieves significant performance\nimprovements on three benchmark datasets: RefCOCO, RefCOCO+ and G-Ref. Project\npage: \\url{https://vatex.hkustvgd.com/}.\n","authors":["Hai Nguyen-Truong","E-Ro Nguyen","Tuan-Anh Vu","Minh-Triet Tran","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2404.08590v2.pdf","comment":"This paper is accepted in WACV 2025"},{"id":"http://arxiv.org/abs/2411.01822v1","updated":"2024-11-04T05:41:31Z","published":"2024-11-04T05:41:31Z","title":"Distribution alignment based transfer fusion frameworks on quantum\n devices for seeking quantum advantages","summary":" The scarcity of labelled data is a particularly urgent challenge in the\nfield of quantum machine learning (QML). Two transfer fusion frameworks are\nproposed in this paper to predict the labels of target domain data by\naligning its distribution to a different but related labelled source domain on\nquantum devices. The frameworks fuse the quantum data from two different, but\nrelated domains through a quantum information infusion channel. The predicting\ntasks in the target domain can be achieved with quantum advantages by\npost-processing quantum measurement results. One framework, the quantum basic\nlinear algebra subroutines (QBLAS) based implementation, can theoretically\nachieve the procedure of transfer fusion with quadratic speedup on a universal\nquantum computer. In addition, the other framework, a hardware-scalable\narchitecture, is implemented on noisy intermediate-scale quantum (NISQ)\ndevices through a variational hybrid quantum-classical procedure. Numerical\nexperiments on the synthetic and handwritten digits datasets demonstrate that\nthe variational transfer fusion (TF) framework can reach state-of-the-art\n(SOTA) quantum DA method performance.\n","authors":["Xi He","Feiyu Du","Xiaohan Yu","Yang Zhao","Tao Lei"],"pdf_url":"https://arxiv.org/pdf/2411.01822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01819v1","updated":"2024-11-04T05:39:01Z","published":"2024-11-04T05:39:01Z","title":"DiffuMask-Editor: A Novel Paradigm of Integration Between the\n Segmentation Diffusion Model and Image Editing to Improve Segmentation\n Ability","summary":" Semantic segmentation models, like mask2former, often demand a substantial\namount of manually annotated data, which is time-consuming and inefficient to\nacquire. Leveraging state-of-the-art text-to-image models like Midjourney and\nStable Diffusion has emerged as an effective strategy for automatically\ngenerating synthetic data instead of human annotations. However, prior\napproaches have been constrained to synthesizing single-instance images due to\nthe instability inherent in generating multiple instances with Stable\nDiffusion. To expand the domains and diversity of synthetic datasets, this\npaper introduces a novel paradigm named DiffuMask-Editor, which combines the\nDiffusion Model for Segmentation with Image Editing. 
By integrating multiple\nobjects into images using Text2Image models, our method facilitates the\ncreation of more realistic datasets that closely resemble open-world settings\nwhile simultaneously generating accurate masks. Our approach significantly\nreduces the laborious effort associated with manual annotation while ensuring\nprecise mask generation. Experimental results demonstrate that synthetic data\ngenerated by DiffuMask-Editor enable segmentation methods to achieve superior\nperformance compared to real data. Particularly in zero-shot backgrounds,\nDiffuMask-Editor achieves new state-of-the-art results on Unseen classes of VOC\n2012. The code and models will be publicly available soon.\n","authors":["Bo Gao","Fangxu Xing","Daniel Tang"],"pdf_url":"https://arxiv.org/pdf/2411.01819v1.pdf","comment":"13 pages,4 figures"},{"id":"http://arxiv.org/abs/2411.01801v1","updated":"2024-11-04T05:00:49Z","published":"2024-11-04T05:00:49Z","title":"Bootstrapping Top-down Information for Self-modulating Slot Attention","summary":" Object-centric learning (OCL) aims to learn representations of individual\nobjects within visual scenes without manual supervision, facilitating efficient\nand effective visual reasoning. Traditional OCL methods primarily employ\nbottom-up approaches that aggregate homogeneous visual features to represent\nobjects. However, in complex visual environments, these methods often fall\nshort due to the heterogeneous nature of visual features within an object. To\naddress this, we propose a novel OCL framework incorporating a top-down\npathway. This pathway first bootstraps the semantics of individual objects and\nthen modulates the model to prioritize features relevant to these semantics. By\ndynamically modulating the model based on its own output, our top-down pathway\nenhances the representational quality of objects. Our framework achieves\nstate-of-the-art performance across multiple synthetic and real-world\nobject-discovery benchmarks.\n","authors":["Dongwon Kim","Seoyeon Kim","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2411.01801v1.pdf","comment":"Accepted to NeurIPS2 2024"},{"id":"http://arxiv.org/abs/2407.04172v2","updated":"2024-11-04T04:59:45Z","published":"2024-07-04T22:16:40Z","title":"ChartGemma: Visual Instruction-tuning for Chart Reasoning in the Wild","summary":" Given the ubiquity of charts as a data analysis, visualization, and\ndecision-making tool across industries and sciences, there has been a growing\ninterest in developing pre-trained foundation models as well as general purpose\ninstruction-tuned models for chart understanding and reasoning. However,\nexisting methods suffer crucial drawbacks across two critical axes affecting\nthe performance of chart representation models: they are trained on data\ngenerated from underlying data tables of the charts, ignoring the visual trends\nand patterns in chart images, and use weakly aligned vision-language backbone\nmodels for domain-specific training, limiting their generalizability when\nencountering charts in the wild. We address these important drawbacks and\nintroduce ChartGemma, a novel chart understanding and reasoning model developed\nover PaliGemma. Rather than relying on underlying data tables, ChartGemma is\ntrained on instruction-tuning data generated directly from chart images, thus\ncapturing both high-level trends and low-level visual information from a\ndiverse set of charts. 
Our simple approach achieves state-of-the-art results\nacross $5$ benchmarks spanning chart summarization, question answering, and\nfact-checking, and our elaborate qualitative studies on real-world charts show\nthat ChartGemma generates more realistic and factually correct summaries\ncompared to its contemporaries. We release the code, model checkpoints,\ndataset, and demos at https://github.com/vis-nlp/ChartGemma.\n","authors":["Ahmed Masry","Megh Thakkar","Aayush Bajaj","Aaryaman Kartha","Enamul Hoque","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2407.04172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01800v1","updated":"2024-11-04T04:58:20Z","published":"2024-11-04T04:58:20Z","title":"Expanding Sparse Tuning for Low Memory Usage","summary":" Parameter-efficient fine-tuning (PEFT) is an effective method for adapting\npre-trained vision models to downstream tasks by tuning a small subset of\nparameters. Among PEFT methods, sparse tuning achieves superior performance by\nonly adjusting the weights most relevant to downstream tasks, rather than\ndensely tuning the whole weight matrix. However, this performance improvement\nhas been accompanied by increases in memory usage, which stems from two\nfactors, i.e., the storage of the whole weight matrix as learnable parameters\nin the optimizer and the additional storage of tunable weight indexes. In this\npaper, we propose a method named SNELL (Sparse tuning with kerNELized LoRA) for\nsparse tuning with low memory usage. To achieve low memory usage, SNELL\ndecomposes the tunable matrix for sparsification into two learnable low-rank\nmatrices, saving from the costly storage of the whole original matrix. A\ncompetition-based sparsification mechanism is further proposed to avoid the\nstorage of tunable weight indexes. To maintain the effectiveness of sparse\ntuning with low-rank matrices, we extend the low-rank decomposition by applying\nnonlinear kernel functions to the whole-matrix merging. Consequently, we gain\nan increase in the rank of the merged matrix, enhancing the ability of SNELL in\nadapting the pre-trained models to downstream tasks. Extensive experiments on\nmultiple downstream tasks show that SNELL achieves state-of-the-art performance\nwith low memory usage, endowing PEFT with sparse tuning to large-scale models.\nCodes are available at https://github.com/ssfgunner/SNELL.\n","authors":["Shufan Shen","Junshu Sun","Xiangyang Ji","Qingming Huang","Shuhui Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01800v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.01797v1","updated":"2024-11-04T04:45:45Z","published":"2024-11-04T04:45:45Z","title":"AIWR: Aerial Image Water Resource Dataset for Segmentation Analysis","summary":" Effective water resource management is crucial in agricultural regions like\nnortheastern Thailand, where limited water retention in sandy soils poses\nsignificant challenges. In response to this issue, the Aerial Image Water\nResource (AIWR) dataset was developed, comprising 800 aerial images focused on\nnatural and artificial water bodies in this region. The dataset was created\nusing Bing Maps and follows the standards of the Fundamental Geographic Data\nSet (FGDS). It includes ground truth annotations validated by experts in remote\nsensing, making it an invaluable resource for researchers in geoinformatics,\ncomputer vision, and artificial intelligence. 
The AIWR dataset presents\nconsiderable challenges, such as segmentation due to variations in the size,\ncolor, shape, and similarity of water bodies, which often resemble other land\nuse categories.\n","authors":["Sangdaow Noppitaka","Emmanuel Okafor","Olarik Surinta"],"pdf_url":"https://arxiv.org/pdf/2411.01797v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.01788v1","updated":"2024-11-04T04:21:41Z","published":"2024-11-04T04:21:41Z","title":"Non rigid geometric distortions correction -- Application to atmospheric\n turbulence stabilization","summary":" A novel approach is presented to recover an image degraded by atmospheric\nturbulence. Given a sequence of frames affected by turbulence, we construct a\nvariational model to characterize the static image. The optimization problem is\nsolved by Bregman Iteration and the operator splitting method. Our algorithm is\nsimple, efficient, and can be easily generalized for different scenarios.\n","authors":["Yu Mao","Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.01788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01781v1","updated":"2024-11-04T04:14:39Z","published":"2024-11-04T04:14:39Z","title":"MSTA3D: Multi-scale Twin-attention for 3D Instance Segmentation","summary":" Recently, transformer-based techniques incorporating superpoints have become\nprevalent in 3D instance segmentation. However, they often encounter an\nover-segmentation problem, especially noticeable with large objects.\nAdditionally, unreliable mask predictions stemming from superpoint mask\nprediction further compound this issue. To address these challenges, we propose\na novel framework called MSTA3D. It leverages multi-scale feature\nrepresentation and introduces a twin-attention mechanism to effectively capture\nthem. Furthermore, MSTA3D integrates a box query with a box regularizer,\noffering a complementary spatial constraint alongside semantic queries.\nExperimental evaluations on ScanNetV2, ScanNet200 and S3DIS datasets\ndemonstrate that our approach surpasses state-of-the-art 3D instance\nsegmentation methods.\n","authors":["Duc Dang Trung Tran","Byeongkeun Kang","Yeejin Lee"],"pdf_url":"https://arxiv.org/pdf/2411.01781v1.pdf","comment":"14 pages, 9 figures, 7 tables, conference"},{"id":"http://arxiv.org/abs/2411.01777v1","updated":"2024-11-04T03:58:09Z","published":"2024-11-04T03:58:09Z","title":"Learning predictable and robust neural representations by straightening\n image sequences","summary":" Prediction is a fundamental capability of all living organisms, and has been\nproposed as an objective for learning sensory representations. Recent work\ndemonstrates that in primate visual systems, prediction is facilitated by\nneural representations that follow straighter temporal trajectories than their\ninitial photoreceptor encoding, which allows for prediction by linear\nextrapolation. Inspired by these experimental findings, we develop a\nself-supervised learning (SSL) objective that explicitly quantifies and\npromotes straightening. We demonstrate the power of this objective in training\ndeep feedforward neural networks on smoothly-rendered synthetic image sequences\nthat mimic commonly-occurring properties of natural videos. The learned model\ncontains neural embeddings that are predictive, but also factorize the\ngeometric, photometric, and semantic attributes of objects. 
The representations\nalso prove more robust to noise and adversarial attacks compared to previous\nSSL methods that optimize for invariance to random augmentations. Moreover,\nthese beneficial properties can be transferred to other training procedures by\nusing the straightening objective as a regularizer, suggesting a broader\nutility for straightening as a principle for robust unsupervised learning.\n","authors":["Xueyan Niu","Cristina Savin","Eero P. Simoncelli"],"pdf_url":"https://arxiv.org/pdf/2411.01777v1.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.01769v1","updated":"2024-11-04T03:29:51Z","published":"2024-11-04T03:29:51Z","title":"ARN-LSTM: A Multi-Stream Attention-Based Model for Action Recognition\n with Temporal Dynamics","summary":" This paper presents ARN-LSTM, a novel multi-stream action recognition model\ndesigned to address the challenge of simultaneously capturing spatial motion\nand temporal dynamics in action sequences. Traditional methods often focus\nsolely on spatial or temporal features, limiting their ability to comprehend\ncomplex human activities fully. Our proposed model integrates joint, motion,\nand temporal information through a multi-stream fusion architecture.\nSpecifically, it comprises a joint stream for extracting skeleton features, a\ntemporal stream for capturing dynamic temporal features, and an ARN-LSTM block\nthat utilizes Time-Distributed Long Short-Term Memory (TD-LSTM) layers followed\nby an Attention Relation Network (ARN) to model temporal relations. The outputs\nfrom these streams are fused in a fully connected layer to provide the final\naction prediction. Evaluations on the NTU RGB+D 60 and NTU RGB+D 120 datasets\ndemonstrate the effectiveness of our model, achieving effective performance,\nparticularly in group activity recognition.\n","authors":["Chuanchuan Wang","Ahmad Sufril Azlan Mohmamed","Xiao Yang","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2411.01769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14133v2","updated":"2024-11-04T02:59:25Z","published":"2023-03-24T16:38:58Z","title":"Survey on Adversarial Attack and Defense for Medical Image Analysis:\n Methods and Challenges","summary":" Deep learning techniques have achieved superior performance in computer-aided\nmedical image analysis, yet they are still vulnerable to imperceptible\nadversarial attacks, resulting in potential misdiagnosis in clinical practice.\nOppositely, recent years have also witnessed remarkable progress in defense\nagainst these tailored adversarial examples in deep medical diagnosis systems.\nIn this exposition, we present a comprehensive survey on recent advances in\nadversarial attacks and defenses for medical image analysis with a systematic\ntaxonomy in terms of the application scenario. We also provide a unified\nframework for different types of adversarial attack and defense methods in the\ncontext of medical image analysis. For a fair comparison, we establish a new\nbenchmark for adversarially robust medical diagnosis models obtained by\nadversarial training under various scenarios. To the best of our knowledge,\nthis is the first survey paper that provides a thorough evaluation of\nadversarially robust medical diagnosis models. By analyzing qualitative and\nquantitative results, we conclude this survey with a detailed discussion of\ncurrent challenges for adversarial attack and defense in medical image analysis\nsystems to shed light on future research directions. 
Code is available on\n\\href{https://github.com/tomvii/Adv_MIA}{\\color{red}{GitHub}}.\n","authors":["Junhao Dong","Junxi Chen","Xiaohua Xie","Jianhuang Lai","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2303.14133v2.pdf","comment":"Accepted by ACM Computing Surveys (CSUR) (DOI:\n https://doi.org/10.1145/3702638)"},{"id":"http://arxiv.org/abs/2305.04161v3","updated":"2024-11-04T02:52:56Z","published":"2023-05-07T02:26:00Z","title":"Camera-Based HRV Prediction for Remote Learning Environments","summary":" In recent years, due to the widespread use of internet videos, remote\nphotoplethysmography (rPPG) has gained increasing attention in the field of\naffective computing. Restoring blood volume pulse (BVP) signals from facial\nvideos is a challenging task that involves a series of preprocessing, image\nalgorithms, and postprocessing to restore waveforms. Not only is the heart rate\nmetric utilized for affective computing, but the heart rate variability (HRV)\nmetric is even more significant. The challenge in obtaining HRV indices through\nrPPG lies in the necessity for algorithms to precisely predict the BVP peak\npositions. In this paper, we collected the Remote Learning Affect and\nPhysiology (RLAP) dataset, which includes over 32 hours of highly synchronized\nvideo and labels from 58 subjects. This is a public dataset whose BVP labels\nhave been meticulously designed to better suit the training of HRV models.\nUsing the RLAP dataset, we trained a new model called Seq-rPPG, a model\nbased on one-dimensional convolution. Experimental results reveal that this\nstructure is more suitable for handling HRV tasks; it outperformed all other\nbaselines in HRV performance and also demonstrated significant advantages in\ncomputational efficiency.\n","authors":["Kegang Wang","Yantao Wei","Jiankai Tang","Yuntao Wang","Mingwen Tong","Jie Gao","Yujian Ma","Zhongjin Zhao"],"pdf_url":"https://arxiv.org/pdf/2305.04161v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01759v1","updated":"2024-11-04T02:52:02Z","published":"2024-11-04T02:52:02Z","title":"Automatic Structured Pruning for Efficient Architecture in Federated\n Learning","summary":" In Federated Learning (FL), training is conducted on client devices,\ntypically with limited computational resources and storage capacity. To address\nthese constraints, we propose an automatic pruning scheme tailored for FL\nsystems. Our solution improves computation efficiency on client devices, while\nminimizing communication costs. One of the challenges of tuning pruning\nhyper-parameters in FL systems is the restricted access to local data. Thus, we\nintroduce an automatic pruning paradigm that dynamically determines pruning\nboundaries. Additionally, we utilize a structured pruning algorithm optimized\nfor mobile devices that lack hardware support for sparse computations.\nExperimental results demonstrate the effectiveness of our approach, achieving\naccuracy comparable to existing methods. Our method notably reduces the number\nof parameters by 89% and FLOPS by 90%, with minimal impact on the accuracy of\nthe FEMNIST and CelebFaces datasets. 
Furthermore, our pruning method decreases\ncommunication overhead by up to 5x and halves inference time when deployed on\nAndroid devices.\n","authors":["Thai Vu Nguyen","Long Bao Le","Anderson Avila"],"pdf_url":"https://arxiv.org/pdf/2411.01759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01758v1","updated":"2024-11-04T02:50:52Z","published":"2024-11-04T02:50:52Z","title":"Disentangled PET Lesion Segmentation","summary":" PET imaging is an invaluable tool in clinical settings as it captures the\nfunctional activity of both healthy anatomy and cancerous lesions. Developing\nautomatic lesion segmentation methods for PET images is crucial since manual\nlesion segmentation is laborious and prone to inter- and intra-observer\nvariability. We propose PET-Disentangler, a 3D disentanglement method that uses\na 3D UNet-like encoder-decoder architecture to disentangle disease and normal\nhealthy anatomical features with losses for segmentation, reconstruction, and\nhealthy component plausibility. A critic network is used to encourage the\nhealthy latent features to match the distribution of healthy samples and thus\nencourages these features to not contain any lesion-related features. Our\nquantitative results show that PET-Disentangler is less prone to incorrectly\ndeclaring healthy and high tracer uptake regions as cancerous lesions, since\nsuch uptake pattern would be assigned to the disentangled healthy component.\n","authors":["Tanya Gatsak","Kumar Abhishek","Hanene Ben Yedder","Saeid Asgari Taghanaki","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2411.01758v1.pdf","comment":"4 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2411.01756v1","updated":"2024-11-04T02:43:55Z","published":"2024-11-04T02:43:55Z","title":"ChatTracker: Enhancing Visual Tracking Performance via Chatting with\n Multimodal Large Language Model","summary":" Visual object tracking aims to locate a targeted object in a video sequence\nbased on an initial bounding box. Recently, Vision-Language~(VL) trackers have\nproposed to utilize additional natural language descriptions to enhance\nversatility in various applications. However, VL trackers are still inferior to\nState-of-The-Art (SoTA) visual trackers in terms of tracking performance. We\nfound that this inferiority primarily results from their heavy reliance on\nmanual textual annotations, which include the frequent provision of ambiguous\nlanguage descriptions. In this paper, we propose ChatTracker to leverage the\nwealth of world knowledge in the Multimodal Large Language Model (MLLM) to\ngenerate high-quality language descriptions and enhance tracking performance.\nTo this end, we propose a novel reflection-based prompt optimization module to\niteratively refine the ambiguous and inaccurate descriptions of the target with\ntracking feedback. To further utilize semantic information produced by MLLM, a\nsimple yet effective VL tracking framework is proposed and can be easily\nintegrated as a plug-and-play module to boost the performance of both VL and\nvisual trackers. 
Experimental results show that our proposed ChatTracker\nachieves a performance comparable to existing methods.\n","authors":["Yiming Sun","Fan Yu","Shaoxiang Chen","Yu Zhang","Junwei Huang","Chenhui Li","Yang Li","Changbo Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09728v2","updated":"2024-11-04T02:34:59Z","published":"2024-06-14T05:33:01Z","title":"Neural Pose Representation Learning for Generating and Transferring\n Non-Rigid Object Poses","summary":" We propose a novel method for learning representations of poses for 3D\ndeformable objects, which specializes in 1) disentangling pose information from\nthe object's identity, 2) facilitating the learning of pose variations, and 3)\ntransferring pose information to other object identities. Based on these\nproperties, our method enables the generation of 3D deformable objects with\ndiversity in both identities and poses, using variations of a single object. It\ndoes not require explicit shape parameterization such as skeletons or joints,\npoint-level or shape-level correspondence supervision, or variations of the\ntarget object for pose transfer. To achieve pose disentanglement, compactness\nfor generative models, and transferability, we first design the pose extractor\nto represent the pose as a keypoint-based hybrid representation and the pose\napplier to learn an implicit deformation field. To better distill pose\ninformation from the object's geometry, we propose the implicit pose applier to\noutput an intrinsic mesh property, the face Jacobian. Once the extracted pose\ninformation is transferred to the target object, the pose applier is fine-tuned\nin a self-supervised manner to better describe the target object's shapes with\npose variations. The extracted poses are also used to train a cascaded\ndiffusion model to enable the generation of novel poses. Our experiments with\nthe DeformThings4D and Human datasets demonstrate state-of-the-art performance\nin pose transfer and the ability to generate diverse deformed shapes with\nvarious objects and poses.\n","authors":["Seungwoo Yoo","Juil Koo","Kyeongmin Yeo","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2406.09728v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.19464v2","updated":"2024-11-04T02:21:30Z","published":"2024-06-27T18:06:38Z","title":"ManiWAV: Learning Robot Manipulation from In-the-Wild Audio-Visual Data","summary":" Audio signals provide rich information for the robot interaction and object\nproperties through contact. This information can surprisingly ease the learning\nof contact-rich robot manipulation skills, especially when the visual\ninformation alone is ambiguous or incomplete. However, the usage of audio data\nin robot manipulation has been constrained to teleoperated demonstrations\ncollected by either attaching a microphone to the robot or object, which\nsignificantly limits its usage in robot learning pipelines. In this work, we\nintroduce ManiWAV: an 'ear-in-hand' data collection device to collect\nin-the-wild human demonstrations with synchronous audio and visual feedback,\nand a corresponding policy interface to learn robot manipulation policy\ndirectly from the demonstrations. We demonstrate the capabilities of our system\nthrough four contact-rich manipulation tasks that require either passively\nsensing the contact events and modes, or actively sensing the object surface\nmaterials and states. 
In addition, we show that our system can generalize to\nunseen in-the-wild environments by learning from diverse in-the-wild human\ndemonstrations.\n","authors":["Zeyi Liu","Cheng Chi","Eric Cousineau","Naveen Kuppuswamy","Benjamin Burchfiel","Shuran Song"],"pdf_url":"https://arxiv.org/pdf/2406.19464v2.pdf","comment":"Conference on Robot Learning (CoRL) 2024; Project website:\n https://maniwav.github.io/"},{"id":"http://arxiv.org/abs/2411.01749v1","updated":"2024-11-04T02:20:22Z","published":"2024-11-04T02:20:22Z","title":"Multi-task Geometric Estimation of Depth and Surface Normal from\n Monocular 360° Images","summary":" Geometric estimation is required for scene understanding and analysis in\npanoramic 360{\\deg} images. Current methods usually predict a single feature,\nsuch as depth or surface normal. These methods can lack robustness, especially\nwhen dealing with intricate textures or complex object surfaces. We introduce a\nnovel multi-task learning (MTL) network that simultaneously estimates depth and\nsurface normals from 360{\\deg} images. Our first innovation is our MTL\narchitecture, which enhances predictions for both tasks by integrating\ngeometric information from depth and surface normal estimation, enabling a\ndeeper understanding of 3D scene structure. Another innovation is our fusion\nmodule, which bridges the two tasks, allowing the network to learn shared\nrepresentations that improve accuracy and robustness. Experimental results\ndemonstrate that our MTL architecture significantly outperforms\nstate-of-the-art methods in both depth and surface normal estimation, showing\nsuperior performance in complex and diverse scenes. Our model's effectiveness\nand generalizability, particularly in handling intricate surface textures,\nestablish it as a new benchmark in 360{\\deg} image geometric estimation. The\ncode and model are available at\n\\url{https://github.com/huangkun101230/360MTLGeometricEstimation}.\n","authors":["Kun Huang","Fang-Lue Zhang","Fangfang Zhang","Yu-Kun Lai","Paul Rosin","Neil A. Dodgson"],"pdf_url":"https://arxiv.org/pdf/2411.01749v1.pdf","comment":"18 pages, this paper is accepted by Computational Visual Media\n Journal (CVMJ) but not pushlished yet"},{"id":"http://arxiv.org/abs/2411.01748v1","updated":"2024-11-04T02:13:41Z","published":"2024-11-04T02:13:41Z","title":"Rotation Perturbation Robustness in Point Cloud Analysis: A Perspective\n of Manifold Distillation","summary":" Point cloud is often regarded as a discrete sampling of Riemannian manifold\nand plays a pivotal role in the 3D image interpretation. Particularly, rotation\nperturbation, an unexpected small change in rotation caused by various factors\n(like equipment offset, system instability, measurement errors and so on), can\neasily lead to the inferior results in point cloud learning tasks. However,\nclassical point cloud learning methods are sensitive to rotation perturbation,\nand the existing networks with rotation robustness also have much room for\nimprovements in terms of performance and noise tolerance. Given these, this\npaper remodels the point cloud from the perspective of manifold as well as\ndesigns a manifold distillation method to achieve the robustness of rotation\nperturbation without any coordinate transformation. In brief, during the\ntraining phase, we introduce a teacher network to learn the rotation robustness\ninformation and transfer this information to the student network through online\ndistillation. 
In the inference phase, the student network directly utilizes the\noriginal 3D coordinate information to achieve the robustness of rotation\nperturbation. Experiments carried out on four different datasets verify the\neffectiveness of our method. On average, on the Modelnet40 and ScanobjectNN\nclassification datasets with random rotation perturbations, our classification\naccuracy has improved by 4.92% and 4.41%, respectively, compared to popular\nrotation-robust networks; on the ShapeNet and S3DIS segmentation datasets,\ncompared to the rotation-robust networks, the improvements of mIoU are 7.36%\nand 4.82%, respectively. Besides, from the experimental results, the proposed\nalgorithm also shows excellent performance in resisting noise and outliers.\n","authors":["Xinyu Xu","Huazhen Liu","Feiming Wei","Huilin Xiong","Wenxian Yu","Tao Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.01748v1.pdf","comment":"13 pages, 8 figures, submitted to TCSVT"},{"id":"http://arxiv.org/abs/2411.01742v1","updated":"2024-11-04T01:51:50Z","published":"2024-11-04T01:51:50Z","title":"Learning from Convolution-based Unlearnable Datasets","summary":" The construction of large datasets for deep learning has raised concerns\nregarding unauthorized use of online data, leading to increased interest in\nprotecting data from third parties who want to use it for training. The\nConvolution-based Unlearnable DAtaset (CUDA) method aims to make data\nunlearnable by applying class-wise blurs to every image in the dataset so that\nneural networks learn relations between blur kernels and labels, as opposed to\ninformative features for classifying clean data. In this work, we evaluate\nwhether CUDA data remains unlearnable after image sharpening and frequency\nfiltering, finding that this combination of simple transforms improves the\nutility of CUDA data for training. In particular, we observe a substantial\nincrease in test accuracy over adversarial training for models trained with\nCUDA unlearnable data from CIFAR-10, CIFAR-100, and ImageNet-100. In training\nmodels to high accuracy using unlearnable data, we underscore the need for\nongoing refinement in data poisoning techniques to ensure data privacy. Our\nmethod opens new avenues for enhancing the robustness of unlearnable datasets\nby highlighting that simple methods such as sharpening and frequency filtering\nare capable of breaking convolution-based unlearnable datasets.\n","authors":["Dohyun Kim","Pedro Sandoval-Segura"],"pdf_url":"https://arxiv.org/pdf/2411.01742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01739v1","updated":"2024-11-04T01:42:41Z","published":"2024-11-04T01:42:41Z","title":"Not Just Object, But State: Compositional Incremental Learning without\n Forgetting","summary":" Most incremental learners excessively prioritize coarse classes of objects\nwhile neglecting various kinds of states (e.g. color and material) attached to\nthe objects. As a result, they are limited in the ability to reason about the\nfine-grained compositionality of state-object pairs. To remedy this limitation,\nwe propose a novel task called Compositional Incremental Learning\n(composition-IL), enabling the model to recognize state-object compositions as\na whole in an incremental learning fashion. Given the lack of suitable\nbenchmarks, we re-organize two existing datasets and make them tailored for\ncomposition-IL. 
Then, we propose a prompt-based Composition Incremental Learner\n(CompILer), to overcome the ambiguous composition boundary problem which\nlargely challenges composition-IL. Specifically, we exploit multi-pool prompt\nlearning, which is regularized by inter-pool prompt discrepancy and intra-pool\nprompt diversity. Besides, we devise object-injected state prompting by using\nobject prompts to guide the selection of state prompts. Furthermore, we fuse\nthe selected prompts by a generalized-mean strategy, to eliminate irrelevant\ninformation learned in the prompts. Extensive experiments on two datasets\nexhibit state-of-the-art performance achieved by CompILer.\n","authors":["Yanyi Zhang","Binglin Qiu","Qi Jia","Yu Liu","Ran He"],"pdf_url":"https://arxiv.org/pdf/2411.01739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01734v1","updated":"2024-11-04T01:32:09Z","published":"2024-11-04T01:32:09Z","title":"Next Best View For Point-Cloud Model Acquisition: Bayesian Approximation\n and Uncertainty Analysis","summary":" The Next Best View problem is a computer vision problem widely studied in\nrobotics. To solve it, several methodologies have been proposed over the years.\nSome, more recently, propose the use of deep learning models. Predictions\nobtained with the help of deep learning models naturally have some uncertainty\nassociated with them. Despite this, the standard models do not allow for their\nquantification. However, Bayesian estimation theory contributed to the\ndemonstration that dropout layers allow the estimation of prediction uncertainty in\nneural networks.\n This work adapts the point-net-based neural network for Next-Best-View\n(PC-NBV). It incorporates dropout layers into the model's architecture, thus\nallowing the computation of the uncertainty estimate associated with its\npredictions. The aim of the work is to improve the network's accuracy in\ncorrectly predicting the next best viewpoint, proposing a way to make the 3D\nreconstruction process more efficient.\n Two uncertainty measurements capable of reflecting the prediction's error and\naccuracy, respectively, were obtained. These enabled the reduction of the\nmodel's error and the increase in its accuracy from 30\\% to 80\\% by identifying\nand disregarding predictions with high values of uncertainty. Another method\nthat directly uses these uncertainty metrics to improve the final prediction\nwas also proposed. However, it showed only marginal improvements.\n","authors":["Madalena Caldeira","Plinio Moreno"],"pdf_url":"https://arxiv.org/pdf/2411.01734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14370v4","updated":"2024-11-04T01:25:37Z","published":"2024-03-21T12:57:30Z","title":"SyncTweedies: A General Generative Framework Based on Synchronized\n Diffusions","summary":" We introduce a general framework for generating diverse visual content,\nincluding ambiguous images, panorama images, mesh textures, and Gaussian splat\ntextures, by synchronizing multiple diffusion processes. We present an exhaustive\ninvestigation into all possible scenarios for synchronizing multiple diffusion\nprocesses through a canonical space and analyze their characteristics across\napplications. In doing so, we reveal a previously unexplored case: averaging\nthe outputs of Tweedie's formula while conducting denoising in multiple\ninstance spaces. This case also provides the best quality with the widest\napplicability to downstream tasks. We name this case SyncTweedies. 
In our\nexperiments generating the aforementioned visual content, we demonstrate the\nsuperior quality of generation by SyncTweedies compared to other\nsynchronization methods as well as optimization-based and iterative-update-based methods.\n","authors":["Jaihoon Kim","Juil Koo","Kyeongmin Yeo","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2403.14370v4.pdf","comment":"Project page: https://synctweedies.github.io/ (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2406.13155v2","updated":"2024-11-04T00:55:06Z","published":"2024-06-19T02:09:44Z","title":"Convolutional Kolmogorov-Arnold Networks","summary":" In this paper, we introduce Convolutional Kolmogorov-Arnold Networks\n(Convolutional KANs), an innovative alternative to the standard Convolutional\nNeural Networks (CNNs) that have revolutionized the field of computer vision.\nBy integrating the learnable non-linear activation functions presented in\nKolmogorov-Arnold Networks (KANs) into convolutions, we propose a new layer.\nThroughout the paper, we empirically validate the performance of Convolutional\nKANs against traditional architectures on the Fashion-MNIST dataset, finding\nthat, in some cases, this new approach maintains a similar level of accuracy\nwhile using half the number of parameters. These experiments show that KAN\nConvolutions seem to learn more per kernel, which opens up a new horizon of\npossibilities in deep learning for computer vision.\n","authors":["Alexander Dylan Bodner","Antonio Santiago Tepsich","Jack Natan Spolski","Santiago Pourteau"],"pdf_url":"https://arxiv.org/pdf/2406.13155v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01725v1","updated":"2024-11-04T00:49:47Z","published":"2024-11-04T00:49:47Z","title":"A Probabilistic Formulation of LiDAR Mapping with Neural Radiance Fields","summary":" In this paper we reexamine the process through which a Neural Radiance Field\n(NeRF) can be trained to produce novel LiDAR views of a scene. Unlike image\napplications where camera pixels integrate light over time, LiDAR pulses arrive\nat specific times. As such, multiple LiDAR returns are possible for any given\ndetector and the classification of these returns is inherently probabilistic.\nApplying a traditional NeRF training routine can result in the network learning\nphantom surfaces in free space between conflicting range measurements, similar\nto how floater aberrations may be produced by an image model. We show that by\nformulating loss as an integral of probability (rather than as an integral of\noptical density) the network can learn multiple peaks for a given ray, allowing\nthe sampling of first, nth, or strongest returns from a single output channel.\nCode is available at https://github.com/mcdermatt/PLINK\n","authors":["Matthew McDermott","Jason Rife"],"pdf_url":"https://arxiv.org/pdf/2411.01725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15748v2","updated":"2024-11-04T00:48:12Z","published":"2023-05-25T05:55:53Z","title":"ReactFace: Online Multiple Appropriate Facial Reaction Generation in\n Dyadic Interactions","summary":" In dyadic interaction, predicting the listener's facial reactions is\nchallenging as different reactions could be appropriate in response to the same\nspeaker's behaviour. Previous approaches predominantly treated this task as an\ninterpolation or fitting problem, emphasizing deterministic outcomes but\nignoring the diversity and uncertainty of human facial reactions. 
Furthermore,\nthese methods often failed to model short-range and long-range dependencies\nwithin the interaction context, leading to issues in the synchrony and\nappropriateness of the generated facial reactions. To address these\nlimitations, this paper reformulates the task as an extrapolation or prediction\nproblem, and proposes a novel framework (called ReactFace) to generate\nmultiple different but appropriate facial reactions from a speaker behaviour\nrather than merely replicating the corresponding listener facial behaviours.\nOur ReactFace generates multiple different but appropriate photo-realistic\nhuman facial reactions by: (i) learning an appropriate facial reaction\ndistribution representing multiple different but appropriate facial reactions;\nand (ii) synchronizing the generated facial reactions with the speaker verbal\nand non-verbal behaviours at each time stamp, resulting in realistic 2D facial\nreaction sequences. Experimental results demonstrate the effectiveness of our\napproach in generating multiple diverse, synchronized, and appropriate facial\nreactions from each speaker's behaviour. The quality of the generated facial\nreactions is intimately tied to the speaker's speech and facial expressions,\nachieved through our novel speaker-listener interaction modules. Our code is\nmade publicly available at \\url{https://github.com/lingjivoo/ReactFace}.\n","authors":["Cheng Luo","Siyang Song","Weicheng Xie","Micol Spitale","Zongyuan Ge","Linlin Shen","Hatice Gunes"],"pdf_url":"https://arxiv.org/pdf/2305.15748v2.pdf","comment":"Accepted to IEEE Transactions on Visualization and Computer Graphics\n (TVCG), 18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.02673v1","updated":"2024-11-04T23:15:21Z","published":"2024-11-04T23:15:21Z","title":"Multi-Transmotion: Pre-trained Model for Human Motion Prediction","summary":" The ability of intelligent systems to predict human behaviors is crucial,\nparticularly in fields such as autonomous vehicle navigation and social\nrobotics. However, the complexity of human motion has prevented the\ndevelopment of a standardized dataset for human motion prediction, thereby\nhindering the establishment of pre-trained models. In this paper, we address\nthese limitations by integrating multiple datasets, encompassing both\ntrajectory and 3D pose keypoints, to propose a pre-trained model for human\nmotion prediction. We merge seven distinct datasets across varying modalities\nand standardize their formats. To facilitate multimodal pre-training, we\nintroduce Multi-Transmotion, an innovative transformer-based model designed for\ncross-modality pre-training. Additionally, we present a novel masking strategy\nto capture rich representations. Our methodology demonstrates competitive\nperformance across various datasets on several downstream tasks, including\ntrajectory prediction in the NBA and JTA datasets, as well as pose prediction\nin the AMASS and 3DPW datasets. 
The code is publicly available:\nhttps://github.com/vita-epfl/multi-transmotion\n","authors":["Yang Gao","Po-Chien Luan","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2411.02673v1.pdf","comment":"CoRL 2024"},{"id":"http://arxiv.org/abs/2411.02669v1","updated":"2024-11-04T23:07:51Z","published":"2024-11-04T23:07:51Z","title":"Semantic-Aligned Adversarial Evolution Triangle for High-Transferability\n Vision-Language Attack","summary":" Vision-language pre-training (VLP) models excel at interpreting both images\nand text but remain vulnerable to multimodal adversarial examples (AEs).\nAdvancing the generation of transferable AEs, which succeed across unseen\nmodels, is key to developing more robust and practical VLP models. Previous\napproaches augment image-text pairs to enhance diversity within the adversarial\nexample generation process, aiming to improve transferability by expanding the\ncontrast space of image-text features. However, these methods focus solely on\ndiversity around the current AEs, yielding limited gains in transferability. To\naddress this issue, we propose to increase the diversity of AEs by leveraging\nthe intersection regions along the adversarial trajectory during optimization.\nSpecifically, we propose sampling from adversarial evolution triangles composed\nof clean, historical, and current adversarial examples to enhance adversarial\ndiversity. We provide a theoretical analysis to demonstrate the effectiveness\nof the proposed adversarial evolution triangle. Moreover, we find that\nredundant inactive dimensions can dominate similarity calculations, distorting\nfeature matching and making AEs model-dependent with reduced transferability.\nHence, we propose to generate AEs in the semantic image-text feature contrast\nspace, which can project the original feature space into a semantic corpus\nsubspace. The proposed semantic-aligned subspace can reduce the image feature\nredundancy, thereby improving adversarial transferability. Extensive\nexperiments across different datasets and models demonstrate that the proposed\nmethod can effectively improve adversarial transferability and outperform\nstate-of-the-art adversarial attack methods. The code is released at\nhttps://github.com/jiaxiaojunQAQ/SA-AET.\n","authors":["Xiaojun Jia","Sensen Gao","Qing Guo","Ke Ma","Yihao Huang","Simeng Qin","Yang Liu","Ivor Tsang Fellow","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2411.02669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13163v4","updated":"2024-11-04T22:45:50Z","published":"2024-03-19T21:31:31Z","title":"DeblurDiNAT: A Generalizable Transformer for Perceptual Image Deblurring","summary":" Although prior state-of-the-art (SOTA) deblurring networks achieve high\nmetric scores on synthetic datasets, there are two challenges which prevent\nthem from perceptual image deblurring. First, a deblurring model overtrained on\nsynthetic datasets may collapse in a broad range of unseen real-world\nscenarios. Second, the conventional metrics PSNR and SSIM may not correctly\nreflect the perceptual quality observed by human eyes. To this end, we propose\nDeblurDiNAT, a generalizable and efficient encoder-decoder Transformer which\nrestores clean images visually close to the ground truth. We adopt an\nalternating dilation factor structure to capture local and global blur\npatterns. We propose a local cross-channel learner to assist self-attention\nlayers to learn short-range cross-channel relationships. 
In addition, we\npresent a linear feed-forward network and a non-linear dual-stage feature\nfusion module for faster feature propagation across the network. Compared to\nnearest competitors, our model demonstrates the strongest generalization\nability and achieves the best perceptual quality on mainstream image deblurring\ndatasets with 3%-68% fewer parameters.\n","authors":["Hanzhou Liu","Binghan Li","Chengkai Liu","Mi Lu"],"pdf_url":"https://arxiv.org/pdf/2403.13163v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02639v1","updated":"2024-11-04T21:56:48Z","published":"2024-11-04T21:56:48Z","title":"Active Prompt Tuning Enables GPT-4o To Do Efficient Classification Of\n Microscopy Images","summary":" Traditional deep learning-based methods for classifying cellular features in\nmicroscopy images require time- and labor-intensive processes for training\nmodels. Among the current limitations are major time commitments from domain\nexperts for accurate ground truth preparation, and the need for a large amount\nof input image data. We previously proposed a solution that overcomes these\nchallenges using OpenAI's GPT-4(V) model on a pilot dataset (Iba-1\nimmuno-stained tissue sections from 11 mouse brains). Results on the pilot\ndataset were equivalent in accuracy, with a substantial improvement in\nthroughput efficiency compared to the baseline using a traditional\nConvolutional Neural Net (CNN)-based approach.\n The present study builds upon this framework using a second unique and\nsubstantially larger dataset of microscopy images. Our current approach uses a\nnewer and faster model, GPT-4o, along with improved prompts. It was evaluated\non a microscopy image dataset captured at low (10x) magnification from\ncresyl-violet-stained sections through the cerebellum of a total of 18 mouse\nbrains (9 Lurcher mice, 9 wild-type controls). We used our approach to classify\nthese images either as a control group or Lurcher mutant. Using 6 mice in the\nprompt set, the results were correct classification for 11 out of the 12 mice\n(92%) with 96% higher efficiency, reduced image requirements, and lower demands\non time and effort of domain experts compared to the baseline method (snapshot\nensemble of CNN models). These results confirm that our approach is effective\nacross multiple datasets from different brain regions and magnifications, with\nminimal overhead.\n","authors":["Abhiram Kandiyana","Peter R. Mouton","Yaroslav Kolinko","Lawrence O. Hall","Dmitry Goldgof"],"pdf_url":"https://arxiv.org/pdf/2411.02639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02637v1","updated":"2024-11-04T21:55:52Z","published":"2024-11-04T21:55:52Z","title":"FUSECAPS: Investigating Feature Fusion Based Framework for Capsule\n Endoscopy Image Classification","summary":" In order to improve model accuracy and generalization and to address class imbalance\nissues, this work offers a robust methodology for classifying endoscopic\nimages. We suggest a hybrid feature extraction method that combines\nconvolutional neural networks (CNNs), multi-layer perceptrons (MLPs), and\nradiomics. Rich, multi-scale feature extraction is made possible by this\ncombination, which captures both deep and handcrafted representations. These\nfeatures are then used by a classification head to classify diseases, producing\na model with higher generalization and accuracy. 
In this framework we have\nachieved a validation accuracy of 76.2% in the capsule endoscopy video frame\nclassification task.\n","authors":["Bidisha Chakraborty","Shree Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.02637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02635v1","updated":"2024-11-04T21:54:11Z","published":"2024-11-04T21:54:11Z","title":"Data-Driven Hierarchical Open Set Recognition","summary":" This paper presents a novel data-driven hierarchical approach to open set\nrecognition (OSR) for robust perception in robotics and computer vision,\nutilizing constrained agglomerative clustering to automatically build a\nhierarchy of known classes in embedding space without requiring manual\nrelational information. The method, demonstrated on the Animals with Attributes\n2 (AwA2) dataset, achieves competitive results with an AUC ROC score of 0.82\nand utility score of 0.85, while introducing two classification approaches\n(score-based and traversal-based) and a new Concentration Centrality (CC)\nmetric for measuring hierarchical classification consistency. Although not\nsurpassing existing models in accuracy, the approach provides valuable\nadditional information about unknown classes through automatically generated\nhierarchies, requires no supplementary information beyond typical supervised\nmodel requirements, and introduces the Class Concentration Centrality (CCC)\nmetric for evaluating unknown class placement consistency, with future work\naimed at improving accuracy, validating the CC metric, and expanding to\nLarge-Scale Open-Set Classification Protocols for ImageNet.\n","authors":["Andrew Hannum","Max Conway","Mario Lopez","André Harrison"],"pdf_url":"https://arxiv.org/pdf/2411.02635v1.pdf","comment":"Accepted as Extended Abstract to the IEEE ICRA@40 2024"},{"id":"http://arxiv.org/abs/2406.14852v2","updated":"2024-11-04T21:51:07Z","published":"2024-06-21T03:53:37Z","title":"Is A Picture Worth A Thousand Words? Delving Into Spatial Reasoning for\n Vision Language Models","summary":" Large language models (LLMs) and vision-language models (VLMs) have\ndemonstrated remarkable performance across a wide range of tasks and domains.\nDespite this promise, spatial understanding and reasoning -- a fundamental\ncomponent of human cognition -- remains under-explored. We propose SpatialEval,\na novel benchmark that covers diverse aspects of spatial reasoning such as\nrelationship understanding, navigation, and counting. We conduct a\ncomprehensive evaluation of competitive language and vision-language models.\nOur findings reveal several counter-intuitive insights that have been\noverlooked in the literature: (1) Spatial reasoning poses significant\nchallenges where competitive models can fall behind random guessing; (2)\nDespite additional visual input, VLMs often under-perform compared to their LLM\ncounterparts; (3) When both textual and visual information is available,\nmulti-modal language models become less reliant on visual information if\nsufficient textual clues are provided. Additionally, we demonstrate that\nleveraging redundancy between vision and text can significantly enhance model\nperformance. 
We hope our study will inform the development of multimodal models\nto improve spatial intelligence and further close the gap with human\nintelligence.\n","authors":["Jiayu Wang","Yifei Ming","Zhenmei Shi","Vibhav Vineet","Xin Wang","Yixuan Li","Neel Joshi"],"pdf_url":"https://arxiv.org/pdf/2406.14852v2.pdf","comment":"Accepted to NeurIPS 2024"}]},"2024-11-03T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2409.15590v2","updated":"2024-11-03T23:51:33Z","published":"2024-09-23T22:48:04Z","title":"MapEx: Indoor Structure Exploration with Probabilistic Information Gain\n from Global Map Predictions","summary":" Exploration is a critical challenge in robotics, centered on understanding\nunknown environments. In this work, we focus on robots exploring structured\nindoor environments which are often predictable and composed of repeating\npatterns. Most existing approaches, such as conventional frontier approaches,\nhave difficulty leveraging the predictability and explore with simple\nheuristics such as `closest first'. Recent works use deep learning techniques\nto predict unknown regions of the map, using these predictions for information\ngain calculation. However, these approaches are often sensitive to the\npredicted map quality or do not reason over sensor coverage. To overcome these\nissues, our key insight is to jointly reason over what the robot can observe\nand its uncertainty to calculate probabilistic information gain. We introduce\nMapEx, a new exploration framework that uses predicted maps to form\nprobabilistic sensor model for information gain estimation. MapEx generates\nmultiple predicted maps based on observed information, and takes into\nconsideration both the computed variances of predicted maps and estimated\nvisible area to estimate the information gain of a given viewpoint. Experiments\non the real-world KTH dataset showed on average 12.4% improvement than\nrepresentative map-prediction based exploration and 25.4% improvement than\nnearest frontier approach.\n","authors":["Cherie Ho","Seungchan Kim","Brady Moon","Aditya Parandekar","Narek Harutyunyan","Chen Wang","Katia Sycara","Graeme Best","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2409.15590v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2411.01707v1","updated":"2024-11-03T22:37:56Z","published":"2024-11-03T22:37:56Z","title":"Large-Scale Multi-Robot Coverage Path Planning on Grids with Path\n Deconfliction","summary":" We study Multi-Robot Coverage Path Planning (MCPP) on a 4-neighbor 2D grid G,\nwhich aims to compute paths for multiple robots to cover all cells of G.\nTraditional approaches are limited as they first compute coverage trees on a\nquadrant coarsened grid H and then employ the Spanning Tree Coverage (STC)\nparadigm to generate paths on G, making them inapplicable to grids with\npartially obstructed 2x2 blocks. To address this limitation, we reformulate the\nproblem directly on G, revolutionizing grid-based MCPP solving and establishing\nnew NP-hardness results. We introduce Extended-STC (ESTC), a novel paradigm\nthat extends STC to ensure complete coverage with bounded suboptimality, even\nwhen H includes partially obstructed blocks. Furthermore, we present LS-MCPP, a\nnew algorithmic framework that integrates ESTC with three novel types of\nneighborhood operators within a local search strategy to optimize coverage\npaths directly on G. 
Unlike prior grid-based MCPP work, our approach also\nincorporates a versatile post-processing procedure that applies Multi-Agent\nPath Finding (MAPF) techniques to MCPP for the first time, enabling a fusion of\nthese two important fields in multi-robot coordination. This procedure\neffectively resolves inter-robot conflicts and accommodates turning costs by\nsolving a MAPF variant, making our MCPP solutions more practical for real-world\napplications. Extensive experiments demonstrate that our approach significantly\nimproves solution quality and efficiency, managing up to 100 robots on grids as\nlarge as 256x256 within minutes of runtime. Validation with physical robots\nconfirms the feasibility of our solutions under real-world conditions.\n","authors":["Jingtao Tang","Zining Mao","Hang Ma"],"pdf_url":"https://arxiv.org/pdf/2411.01707v1.pdf","comment":"Submitted to T-RO"},{"id":"http://arxiv.org/abs/2411.01665v1","updated":"2024-11-03T19:34:58Z","published":"2024-11-03T19:34:58Z","title":"Neural Inverse Source Problems","summary":" Reconstructing unknown external source functions is an important perception\ncapability for a large range of robotics domains including manipulation,\naerial, and underwater robotics. In this work, we propose a Physics-Informed\nNeural Network (PINN [1]) based approach for solving the inverse source\nproblems in robotics, jointly identifying unknown source functions and the\ncomplete state of a system given partial and noisy observations. Our approach\ndemonstrates several advantages over prior works (Finite Element Methods (FEM)\nand data-driven approaches): it offers flexibility in integrating diverse\nconstraints and boundary conditions; eliminates the need for complex\ndiscretizations (e.g., meshing); easily accommodates gradients from real\nmeasurements; and does not limit performance based on the diversity and quality\nof training data. We validate our method across three simulation and real-world\nscenarios involving up to 4th order partial differential equations (PDEs),\nconstraints such as Signorini and Dirichlet, and various regression losses\nincluding Chamfer distance and L2 norm.\n","authors":["Youngsun Wi","Jayjun Lee","Miquel Oller","Nima Fazeli"],"pdf_url":"https://arxiv.org/pdf/2411.01665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01639v1","updated":"2024-11-03T17:32:00Z","published":"2024-11-03T17:32:00Z","title":"Know Where You're Uncertain When Planning with Multimodal Foundation\n Models: A Formal Framework","summary":" Multimodal foundation models offer a promising framework for robotic\nperception and planning by processing sensory inputs to generate actionable\nplans. However, addressing uncertainty in both perception (sensory\ninterpretation) and decision-making (plan generation) remains a critical\nchallenge for ensuring task reliability. We present a comprehensive framework\nto disentangle, quantify, and mitigate these two forms of uncertainty. We first\nintroduce a framework for uncertainty disentanglement, isolating perception\nuncertainty arising from limitations in visual understanding and decision\nuncertainty relating to the robustness of generated plans.\n To quantify each type of uncertainty, we propose methods tailored to the\nunique properties of perception and decision-making: we use conformal\nprediction to calibrate perception uncertainty and introduce\nFormal-Methods-Driven Prediction (FMDP) to quantify decision uncertainty,\nleveraging formal verification techniques for theoretical guarantees. 
Building\non this quantification, we implement two targeted intervention mechanisms: an\nactive sensing process that dynamically re-observes high-uncertainty scenes to\nenhance visual input quality and an automated refinement procedure that\nfine-tunes the model on high-certainty data, improving its capability to meet\ntask specifications. Empirical validation in real-world and simulated robotic\ntasks demonstrates that our uncertainty disentanglement framework reduces\nvariability by up to 40% and enhances task success rates by 5% compared to\nbaselines. These improvements are attributed to the combined effect of both\ninterventions and highlight the importance of uncertainty disentanglement which\nfacilitates targeted interventions that enhance the robustness and reliability\nof autonomous systems.\n","authors":["Neel P. Bhatt","Yunhao Yang","Rohan Siva","Daniel Milan","Ufuk Topcu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01639v1.pdf","comment":"Fine-tuned models, code, and datasets are available at\n https://tinyurl.com/uncertainty-disentanglement"},{"id":"http://arxiv.org/abs/2405.07391v3","updated":"2024-11-03T16:22:30Z","published":"2024-05-12T22:51:35Z","title":"AnyRotate: Gravity-Invariant In-Hand Object Rotation with Sim-to-Real\n Touch","summary":" Human hands are capable of in-hand manipulation in the presence of different\nhand motions. For a robot hand, harnessing rich tactile information to achieve\nthis level of dexterity still remains a significant challenge. In this paper,\nwe present AnyRotate, a system for gravity-invariant multi-axis in-hand object\nrotation using dense featured sim-to-real touch. We tackle this problem by\ntraining a dense tactile policy in simulation and present a sim-to-real method\nfor rich tactile sensing to achieve zero-shot policy transfer. Our formulation\nallows the training of a unified policy to rotate unseen objects about\narbitrary rotation axes in any hand direction. In our experiments, we highlight\nthe benefit of capturing detailed contact information when handling objects of\nvarying properties. Interestingly, we found rich multi-fingered tactile sensing\ncan detect unstable grasps and provide a reactive behavior that improves the\nrobustness of the policy. The project website can be found at\nhttps://maxyang27896.github.io/anyrotate/.\n","authors":["Max Yang","Chenghua Lu","Alex Church","Yijiong Lin","Chris Ford","Haoran Li","Efi Psomopoulou","David A. W. Barton","Nathan F. Lepora"],"pdf_url":"https://arxiv.org/pdf/2405.07391v3.pdf","comment":"Project website can be found at\n https://maxyang27896.github.io/anyrotate/"},{"id":"http://arxiv.org/abs/2409.18053v2","updated":"2024-11-03T15:59:58Z","published":"2024-09-26T16:58:04Z","title":"DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving","summary":" We present a novel autonomous driving framework, DualAD, designed to imitate\nhuman reasoning during driving. DualAD comprises two layers: a rule-based\nmotion planner at the bottom layer that handles routine driving tasks requiring\nminimal reasoning, and an upper layer featuring a rule-based text encoder that\nconverts driving scenarios from absolute states into text description. This\ntext is then processed by a large language model (LLM) to make driving\ndecisions. 
The upper layer intervenes in the bottom layer's decisions when\npotential danger is detected, mimicking human reasoning in critical situations.\nClosed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained\nmodel, significantly outperforms rule-based motion planners that lack reasoning\nabilities. Our experiments also highlight the effectiveness of the text\nencoder, which considerably enhances the model's scenario understanding.\nAdditionally, the integrated DualAD model improves with stronger LLMs,\nindicating the framework's potential for further enhancement. Code and\nbenchmarks are available at github.com/TUM-AVS/DualAD.\n","authors":["Dingrui Wang","Marc Kaufeld","Johannes Betz"],"pdf_url":"https://arxiv.org/pdf/2409.18053v2.pdf","comment":"Autonomous Driving, Large Language Models (LLMs), Human Reasoning,\n Critical Scenario"},{"id":"http://arxiv.org/abs/2411.01608v1","updated":"2024-11-03T15:27:26Z","published":"2024-11-03T15:27:26Z","title":"GITSR: Graph Interaction Transformer-based Scene Representation for\n Multi Vehicle Collaborative Decision-making","summary":" In this study, we propose GITSR, an effective framework for Graph Interaction\nTransformer-based Scene Representation for multi-vehicle collaborative\ndecision-making in intelligent transportation system. In the context of mixed\ntraffic where Connected Automated Vehicles (CAVs) and Human Driving Vehicles\n(HDVs) coexist, in order to enhance the understanding of the environment by\nCAVs to improve decision-making capabilities, this framework focuses on\nefficient scene representation and the modeling of spatial interaction\nbehaviors of traffic states. We first extract features of the driving\nenvironment based on the background of intelligent networking. Subsequently,\nthe local scene representation, which is based on the agent-centric and dynamic\noccupation grid, is calculated by the Transformer module. Besides, feasible\nregion of the map is captured through the multi-head attention mechanism to\nreduce the collision of vehicles. Notably, spatial interaction behaviors, based\non motion information, are modeled as graph structures and extracted via Graph\nNeural Network (GNN). Ultimately, the collaborative decision-making among\nmultiple vehicles is formulated as a Markov Decision Process (MDP), with\ndriving actions output by Reinforcement Learning (RL) algorithms. Our\nalgorithmic validation is executed within the extremely challenging scenario of\nhighway off-ramp task, thereby substantiating the superiority of agent-centric\napproach to scene representation. Simulation results demonstrate that the GITSR\nmethod can not only effectively capture scene representation but also extract\nspatial interaction data, outperforming the baseline method across various\ncomparative metrics.\n","authors":["Xingyu Hu","Lijun Zhang","Dejian Meng","Ye Han","Lisha Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.01608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01603v1","updated":"2024-11-03T15:16:28Z","published":"2024-11-03T15:16:28Z","title":"An Aerial Transport System in Marine GNSS-Denied Environment","summary":" This paper presents an autonomous aerial system specifically engineered for\noperation in challenging marine GNSS-denied environments, aimed at transporting\nsmall cargo from a target vessel. 
In these environments, characterized by\nweakly textured sea surfaces with few feature points, chaotic deck oscillations\ndue to waves, and significant wind gusts, conventional navigation methods often\nprove inadequate. Leveraging the DJI M300 platform, our system is designed to\nautonomously navigate and transport cargo while overcoming these environmental\nchallenges. In particular, this paper proposes an anchor-based localization\nmethod using ultrawideband (UWB) and QR code facilities, which decouples the\nUAV's attitude from that of the moving landing platform, thus reducing control\noscillations caused by platform movement. Additionally, a motor-driven\nattachment mechanism for cargo is designed, which enhances the UAV's field of\nview during descent and ensures a reliable attachment to the cargo upon\nlanding. The system's reliability and effectiveness were progressively enhanced\nthrough multiple outdoor experimental iterations and were validated by the\nsuccessful cargo transport during the 2024 Mohamed Bin Zayed International\nRobotics Challenge (MBZIRC2024) competition. Crucially, the system addresses\nuncertainties and interferences inherent in maritime transportation missions\nwithout prior knowledge of cargo locations on the deck and with strict\nlimitations on intervention throughout the transportation.\n","authors":["Jianjun Sun","Zhenwei Niu","Yihao Dong","Fenglin Zhang","Muhayy Ud Din","Lakmal Seneviratne","Defu Lin","Irfan Hussain","Shaoming He"],"pdf_url":"https://arxiv.org/pdf/2411.01603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21348v2","updated":"2024-11-03T15:01:14Z","published":"2024-07-31T05:38:49Z","title":"SuperVINS: A Real-Time Visual-Inertial SLAM Framework for Challenging\n Imaging Conditions","summary":" Traditional visual-inertial SLAM systems often struggle with stability\nunder low-light or motion-blur conditions, leading to potential loss of\ntrajectory tracking. High accuracy and robustness are essential for the\nlong-term and stable localization capabilities of SLAM systems. Addressing the\nchallenges of enhancing robustness and accuracy in visual-inertial SLAM, this\npaper proposes SuperVINS, a real-time visual-inertial SLAM framework designed\nfor challenging imaging conditions. In contrast to geometric modeling, deep\nlearning features are capable of fully leveraging the implicit information\npresent in images, which is often not captured by geometric features.\nTherefore, SuperVINS, developed as an enhancement of VINS-Fusion, integrates\nthe deep learning neural network model SuperPoint for feature point extraction\nand loop closure detection. At the same time, the deep learning LightGlue model\nfor associating feature points is integrated into front-end\nfeature matching. A feature matching enhancement strategy based on the RANSAC\nalgorithm is proposed. The system is allowed to set different masks and RANSAC\nthresholds for various environments, thereby balancing computational cost and\nlocalization accuracy. Additionally, it allows for flexible training of\na specific SuperPoint bag of words tailored for loop closure detection in\nparticular environments. The system enables real-time localization and mapping.\nExperimental validation on the well-known EuRoC dataset demonstrates that\nSuperVINS is comparable to other visual-inertial SLAM systems in accuracy and\nrobustness across the most challenging sequences. This paper analyzes the\nadvantages of SuperVINS in terms of accuracy, real-time performance, and\nrobustness. 
To facilitate knowledge exchange within the field, we have made the\ncode for this paper publicly available.\n","authors":["Hongkun Luo","Yang Liu","Chi Guo","Zengke Li","Weiwei Song"],"pdf_url":"https://arxiv.org/pdf/2407.21348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01568v1","updated":"2024-11-03T13:46:30Z","published":"2024-11-03T13:46:30Z","title":"Addressing Failures in Robotics using Vision-Based Language Models\n (VLMs) and Behavior Trees (BT)","summary":" In this paper, we propose an approach that combines Vision Language Models\n(VLMs) and Behavior Trees (BTs) to address failures in robotics. Current\nrobotic systems can handle known failures with pre-existing recovery\nstrategies, but they are often ill-equipped to manage unknown failures or\nanomalies. We introduce VLMs as a monitoring tool to detect and identify\nfailures during task execution. Additionally, VLMs generate missing conditions\nor skill templates that are then incorporated into the BT, ensuring the system\ncan autonomously address similar failures in future tasks. We validate our\napproach through simulations in several failure scenarios.\n","authors":["Faseeh Ahmad","Jonathan Styrud","Volker Krueger"],"pdf_url":"https://arxiv.org/pdf/2411.01568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09645v2","updated":"2024-11-03T13:21:57Z","published":"2024-04-15T10:24:32Z","title":"Real-world Instance-specific Image Goal Navigation: Bridging Domain Gaps\n via Contrastive Learning","summary":" Improving instance-specific image goal navigation (InstanceImageNav), which\nlocates the identical object in a real-world environment from a query image, is\nessential for robotic systems to assist users in finding desired objects. The\nchallenge lies in the domain gap between low-quality images observed by the\nmoving robot, characterized by motion blur and low-resolution, and high-quality\nquery images provided by the user. Such domain gaps could significantly reduce\nthe task success rate but have not been the focus of previous work. To address\nthis, we propose a novel method called Few-shot Cross-quality Instance-aware\nAdaptation (CrossIA), which employs contrastive learning with an instance\nclassifier to align features between massive low- and few high-quality images.\nThis approach effectively reduces the domain gap by bringing the latent\nrepresentations of cross-quality images closer on an instance basis.\nAdditionally, the system integrates an object image collection with a\npre-trained deblurring model to enhance the observed image quality. Our method\nfine-tunes the SimSiam model, pre-trained on ImageNet, using CrossIA. We\nevaluated our method's effectiveness through an InstanceImageNav task with 20\ndifferent types of instances, where the robot identifies the same instance in a\nreal-world environment as a high-quality query image. Our experiments showed\nthat our method improves the task success rate by up to three times compared to\nthe baseline, a conventional approach based on SuperGlue. These findings\nhighlight the potential of leveraging contrastive learning and image\nenhancement techniques to bridge the domain gap and improve object localization\nin robotic applications. 
The project website is\nhttps://emergentsystemlabstudent.github.io/DomainBridgingNav/.\n","authors":["Taichi Sakaguchi","Akira Taniguchi","Yoshinobu Hagiwara","Lotfi El Hafi","Shoichi Hasegawa","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2404.09645v2.pdf","comment":"See website at\n https://emergentsystemlabstudent.github.io/DomainBridgingNav/. Accepted to\n IEEE IRC2024"},{"id":"http://arxiv.org/abs/2306.06656v2","updated":"2024-11-03T11:46:02Z","published":"2023-06-11T12:00:33Z","title":"PVPUFormer: Probabilistic Visual Prompt Unified Transformer for\n Interactive Image Segmentation","summary":" Integration of diverse visual prompts like clicks, scribbles, and boxes in\ninteractive image segmentation significantly facilitates users' interaction as\nwell as improves interaction efficiency. However, existing studies primarily\nencode the position or pixel regions of prompts without considering the\ncontextual areas around them, resulting in insufficient prompt feedback, which\nis not conducive to performance acceleration. To tackle this problem, this\npaper proposes a simple yet effective Probabilistic Visual Prompt Unified\nTransformer (PVPUFormer) for interactive image segmentation, which allows users\nto flexibly input diverse visual prompts with the probabilistic prompt encoding\nand feature post-processing to excavate sufficient and robust prompt features\nfor performance boosting. Specifically, we first propose a Probabilistic\nPrompt-unified Encoder (PPuE) to generate a unified one-dimensional vector by\nexploring both prompt and non-prompt contextual information, offering richer\nfeedback cues to accelerate performance improvement. On this basis, we further\npresent a Prompt-to-Pixel Contrastive (P$^2$C) loss to accurately align both\nprompt and pixel features, bridging the representation gap between them to\noffer consistent feature representations for mask prediction. Moreover, our\napproach designs a Dual-cross Merging Attention (DMA) module to implement\nbidirectional feature interaction between image and prompt features, generating\nnotable features for performance improvement. A comprehensive variety of\nexperiments on several challenging datasets demonstrates that the proposed\ncomponents achieve consistent improvements, yielding state-of-the-art\ninteractive segmentation performance. Our code is available at\nhttps://github.com/XuZhang1211/PVPUFormer.\n","authors":["Xu Zhang","Kailun Yang","Jiacheng Lin","Jin Yuan","Zhiyong Li","Shutao Li"],"pdf_url":"https://arxiv.org/pdf/2306.06656v2.pdf","comment":"Accepted to IEEE Transactions on Image Processing (TIP). Code is\n available at https://github.com/XuZhang1211/PVPUFormer"},{"id":"http://arxiv.org/abs/2411.01475v1","updated":"2024-11-03T08:22:02Z","published":"2024-11-03T08:22:02Z","title":"Interaction-Aware Trajectory Prediction for Safe Motion Planning in\n Autonomous Driving: A Transformer-Transfer Learning Approach","summary":" A critical aspect of safe and efficient motion planning for autonomous\nvehicles (AVs) is to handle the complex and uncertain behavior of surrounding\nhuman-driven vehicles (HDVs). Despite intensive research on driver behavior\nprediction, existing approaches typically overlook the interactions between AVs\nand HDVs assuming that HDV trajectories are not affected by AV actions. 
To\naddress this gap, we present a transformer-transfer learning-based\ninteraction-aware trajectory predictor for safe motion planning of autonomous\ndriving, focusing on a vehicle-to-vehicle (V2V) interaction scenario consisting\nof an AV and an HDV. Specifically, we construct a transformer-based\ninteraction-aware trajectory predictor using widely available datasets of HDV\ntrajectory data and further transfer the learned predictor using a small set of\nAV-HDV interaction data. Then, to better incorporate the proposed trajectory\npredictor into the motion planning module of AVs, we introduce an uncertainty\nquantification method to characterize the errors of the predictor, which are\nintegrated into the path-planning process. Our experimental results demonstrate\nthe value of explicitly considering interactions and handling uncertainties.\n","authors":["Jinhao Liang","Chaopeng Tan","Longhao Yan","Jingyuan Zhou","Guodong Yin","Kaidi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.01475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10840v3","updated":"2024-11-03T06:31:10Z","published":"2024-03-16T07:26:50Z","title":"MSI-NeRF: Linking Omni-Depth with View Synthesis through Multi-Sphere\n Image aided Generalizable Neural Radiance Field","summary":" Panoramic observation using fisheye cameras is significant in virtual reality\n(VR) and robot perception. However, panoramic images synthesized by traditional\nmethods lack depth information and can only provide three degrees-of-freedom\n(3DoF) rotation rendering in VR applications. To fully preserve and exploit the\nparallax information within the original fisheye cameras, we introduce\nMSI-NeRF, which combines deep learning omnidirectional depth estimation and\nnovel view synthesis. We construct a multi-sphere image as a cost volume\nthrough feature extraction and warping of the input images. We further build an\nimplicit radiance field using spatial points and interpolated 3D feature\nvectors as input, which can simultaneously realize omnidirectional depth\nestimation and 6DoF view synthesis. Leveraging the knowledge from depth\nestimation task, our method can learn scene appearance by source view\nsupervision only. It does not require novel target views and can be trained\nconveniently on existing panorama depth estimation datasets. Our network has\nthe generalization ability to reconstruct unknown scenes efficiently using only\nfour images. Experimental results show that our method outperforms existing\nmethods in both depth estimation and novel view synthesis tasks.\n","authors":["Dongyu Yan","Guanyu Huang","Fengyu Quan","Haoyao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.10840v3.pdf","comment":"10 pages, 9 figures, Accepted to IEEE/CVF Winter Conference on\n Applications of Computer Vision"},{"id":"http://arxiv.org/abs/2401.13957v2","updated":"2024-11-03T03:36:47Z","published":"2024-01-25T05:29:04Z","title":"Automatic Tissue Traction Using Miniature Force-Sensing Forceps for\n Minimally Invasive Surgery","summary":" A common limitation of autonomous tissue manipulation in robotic minimally\ninvasive surgery (MIS) is the absence of force sensing and control at the tool\nlevel. Recently, our team has developed miniature force-sensing forceps that\ncan simultaneously measure the grasping and pulling forces during tissue\nmanipulation. Based on this design, here we further present a method to\nautomate tissue traction that comprises grasping and pulling stages. 
During\nthis process, the grasping and pulling forces can be controlled either\nseparately or simultaneously through force decoupling. The force controller is\nbuilt upon a static model of tissue manipulation, considering the interaction\nbetween the force-sensing forceps and soft tissue. The efficacy of this force\ncontrol approach is validated through a series of experiments comparing\ntargeted, estimated, and actual reference forces. To verify the feasibility of\nthe proposed method in surgical applications, various tissue resections are\nconducted on ex vivo tissues employing a dual-arm robotic setup. Finally, we\ndiscuss the benefits of multi-force control in tissue traction, evidenced\nthrough comparative analyses of various ex vivo tissue resections with and\nwithout the proposed method, and the potential generalization with traction on\ndifferent tissues. The results affirm the feasibility of implementing automatic\ntissue traction using miniature forceps with multi-force control, suggesting\nits potential to promote autonomous MIS. A video demonstrating the experiments\ncan be found at https://youtu.be/f5gXuXe67Ak.\n","authors":["Tangyou Liu","Xiaoyi Wang","Jay Katupitiya","Jiaole Wang","Liao Wu"],"pdf_url":"https://arxiv.org/pdf/2401.13957v2.pdf","comment":"15 pages, 14 figures, accepted by T-RO"},{"id":"http://arxiv.org/abs/2306.06766v3","updated":"2024-11-03T02:09:53Z","published":"2023-06-11T20:33:22Z","title":"Digital Twin-Enhanced Wireless Indoor Navigation: Achieving Efficient\n Environment Sensing with Zero-Shot Reinforcement Learning","summary":" Millimeter-wave (mmWave) communication is a vital component of future\ngenerations of mobile networks, offering not only high data rates but also\nprecise beams, making it ideal for indoor navigation in complex environments.\nHowever, the challenges of multipath propagation and noisy signal measurements\nin indoor spaces complicate the use of mmWave signals for navigation tasks.\nTraditional physics-based methods, such as following the angle of arrival\n(AoA), often fall short in complex scenarios, highlighting the need for more\nsophisticated approaches. Digital twins, as virtual replicas of physical\nenvironments, offer a powerful tool for simulating and optimizing mmWave signal\npropagation in such settings. By creating detailed, physics-based models of\nreal-world spaces, digital twins enable the training of machine learning\nalgorithms in virtual environments, reducing the costs and limitations of\nphysical testing. Despite their advantages, current machine learning models\ntrained in digital twins often overfit specific virtual environments and\nrequire costly retraining when applied to new scenarios. In this paper, we\npropose a Physics-Informed Reinforcement Learning (PIRL) approach that\nleverages the physical insights provided by digital twins to shape the\nreinforcement learning (RL) reward function. By integrating physics-based\nmetrics such as signal strength, AoA, and path reflections into the learning\nprocess, PIRL enables efficient learning and improved generalization to new\nenvironments without retraining. 
Our experiments demonstrate that the proposed\nPIRL, supported by digital twin simulations, outperforms traditional heuristics\nand standard RL models, achieving zero-shot generalization in unseen\nenvironments and offering a cost-effective, scalable solution for wireless\nindoor navigation.\n","authors":["Tao Li","Haozhe Lei","Hao Guo","Mingsheng Yin","Yaqi Hu","Quanyan Zhu","Sundeep Rangan"],"pdf_url":"https://arxiv.org/pdf/2306.06766v3.pdf","comment":"Submitted to IEEE Open Journal of the Communications Society"},{"id":"http://arxiv.org/abs/2411.01396v1","updated":"2024-11-03T01:21:43Z","published":"2024-11-03T01:21:43Z","title":"Exploring the Edges of Latent State Clusters for Goal-Conditioned\n Reinforcement Learning","summary":" Exploring unknown environments efficiently is a fundamental challenge in\nunsupervised goal-conditioned reinforcement learning. While selecting\nexploratory goals at the frontier of previously explored states is an effective\nstrategy, the policy during training may still have limited capability of\nreaching rare goals on the frontier, resulting in reduced exploratory behavior.\nWe propose \"Cluster Edge Exploration\" ($CE^2$), a new goal-directed exploration\nalgorithm that, when choosing goals in sparsely explored areas of the state\nspace, gives priority to goal states that remain accessible to the agent. The\nkey idea is clustering to group states that are easily reachable from one\nanother by the current policy under training in a latent space and traversing\nto states holding significant exploration potential on the boundary of these\nclusters before doing exploratory behavior. In challenging robotics\nenvironments including navigating a maze with a multi-legged ant robot,\nmanipulating objects with a robot arm on a cluttered tabletop, and rotating\nobjects in the palm of an anthropomorphic robotic hand, $CE^2$ demonstrates\nsuperior efficiency in exploration compared to baseline methods and ablations.\n","authors":["Yuanlin Duan","Guofeng Cui","He Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.01396v1.pdf","comment":"NeurIPS2024 Poster"},{"id":"http://arxiv.org/abs/2411.01387v1","updated":"2024-11-03T00:48:37Z","published":"2024-11-03T00:48:37Z","title":"Wallbounce : Push wall to navigate with Contact-Implicit MPC","summary":" In this work, we introduce a framework that enables highly maneuverable\nlocomotion using non-periodic contacts. This task is challenging for\ntraditional optimization and planning methods to handle due to difficulties in\nspecifying contact mode sequences in real-time. To address this, we use a\nbi-level contact-implicit planner and hybrid model predictive controller to\ndraft and execute a motion plan. We investigate how this method allows us to\nplan arm contact events on the shmoobot, a smaller ballbot, which uses an\ninverse mouse-ball drive to achieve dynamic balancing with a low number of\nactuators. Through multiple experiments we show how the arms allow for\nacceleration, deceleration and dynamic obstacle avoidance that are not\nachievable with the mouse-ball drive alone. This demonstrates how a holistic\napproach to locomotion can increase the control authority of unique robot\nmorphologies without additional hardware by leveraging robot arms that are\ntypically used only for manipulation. Project website:\nhttps://cmushmoobot.github.io/Wallbounce\n","authors":["Xiaohan Liu","Cunxi Dai","John Z. 
Zhang","Arun Bishop","Zachary Manchester","Ralph Hollis"],"pdf_url":"https://arxiv.org/pdf/2411.01387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02446v1","updated":"2024-11-03T01:35:06Z","published":"2024-11-03T01:35:06Z","title":"Learning World Models for Unconstrained Goal Navigation","summary":" Learning world models offers a promising avenue for goal-conditioned\nreinforcement learning with sparse rewards. By allowing agents to plan actions\nor exploratory goals without direct interaction with the environment, world\nmodels enhance exploration efficiency. The quality of a world model hinges on\nthe richness of data stored in the agent's replay buffer, with expectations of\nreasonable generalization across the state space surrounding recorded\ntrajectories. However, challenges arise in generalizing learned world models to\nstate transitions backward along recorded trajectories or between states across\ndifferent trajectories, hindering their ability to accurately model real-world\ndynamics. To address these challenges, we introduce a novel goal-directed\nexploration algorithm, MUN (short for \"World Models for Unconstrained Goal\nNavigation\"). This algorithm is capable of modeling state transitions between\narbitrary subgoal states in the replay buffer, thereby facilitating the\nlearning of policies to navigate between any \"key\" states. Experimental results\ndemonstrate that MUN strengthens the reliability of world models and\nsignificantly improves the policy's capacity to generalize across new goal\nsettings.\n","authors":["Yuanlin Duan","Wensen Mao","He Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.02446v1.pdf","comment":"NeurIPS2024 Poster. arXiv admin note: substantial text overlap with\n arXiv:2411.01396"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.15590v2","updated":"2024-11-03T23:51:33Z","published":"2024-09-23T22:48:04Z","title":"MapEx: Indoor Structure Exploration with Probabilistic Information Gain\n from Global Map Predictions","summary":" Exploration is a critical challenge in robotics, centered on understanding\nunknown environments. In this work, we focus on robots exploring structured\nindoor environments which are often predictable and composed of repeating\npatterns. Most existing approaches, such as conventional frontier approaches,\nhave difficulty leveraging the predictability and explore with simple\nheuristics such as `closest first'. Recent works use deep learning techniques\nto predict unknown regions of the map, using these predictions for information\ngain calculation. However, these approaches are often sensitive to the\npredicted map quality or do not reason over sensor coverage. To overcome these\nissues, our key insight is to jointly reason over what the robot can observe\nand its uncertainty to calculate probabilistic information gain. We introduce\nMapEx, a new exploration framework that uses predicted maps to form\nprobabilistic sensor model for information gain estimation. MapEx generates\nmultiple predicted maps based on observed information, and takes into\nconsideration both the computed variances of predicted maps and estimated\nvisible area to estimate the information gain of a given viewpoint. 
Experiments\non the real-world KTH dataset showed on average a 12.4% improvement over\nrepresentative map-prediction based exploration and a 25.4% improvement over the\nnearest frontier approach.\n","authors":["Cherie Ho","Seungchan Kim","Brady Moon","Aditya Parandekar","Narek Harutyunyan","Chen Wang","Katia Sycara","Graeme Best","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2409.15590v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2411.01713v1","updated":"2024-11-03T23:36:53Z","published":"2024-11-03T23:36:53Z","title":"Rethinking Weight Decay for Robust Fine-Tuning of Foundation Models","summary":" Modern optimizers such as AdamW, equipped with momentum and adaptive learning\nrate, are designed to escape local minima and explore the vast parameter space.\nThis exploration is beneficial for finding good loss basins when training from\nscratch. It is not necessarily ideal when resuming from a powerful foundation\nmodel because it can lead to large deviations from the pre-trained\ninitialization and, consequently, worse robustness and generalization. At the\nsame time, strong regularization on all parameters can lead to under-fitting.\nWe hypothesize that selectively regularizing the parameter space is the key to\nfitting and retaining the pre-trained knowledge. This paper proposes a new\nweight decay technique, Selective Projection Decay (SPD), that selectively\nimposes a strong penalty on certain layers while allowing others to change\nfreely. Intuitively, SPD expands and contracts the parameter search space for\nlayers with consistent and inconsistent loss reduction, respectively.\nExperimentally, when equipped with SPD, Adam consistently provides better\nin-distribution generalization and out-of-distribution robustness performance\non multiple popular vision and language benchmarks. Code available\nat~\\url{https://github.com/GT-RIPL/Selective-Projection-Decay.git}\n","authors":["Junjiao Tian","Chengyue Huang","Zsolt Kira"],"pdf_url":"https://arxiv.org/pdf/2411.01713v1.pdf","comment":"Accepted to Neurips 2024"},{"id":"http://arxiv.org/abs/2410.02401v4","updated":"2024-11-03T22:57:56Z","published":"2024-10-03T11:29:09Z","title":"SynCo: Synthetic Hard Negatives in Contrastive Learning for Better\n Unsupervised Visual Representations","summary":" Contrastive learning has become a dominant approach in self-supervised visual\nrepresentation learning. Hard negatives - samples closely resembling the anchor\n- are key to enhancing learned representations' discriminative power. However,\nefficiently leveraging hard negatives remains challenging. We introduce SynCo\n(Synthetic Negatives in Contrastive learning), a novel approach that improves\nmodel performance by generating synthetic hard negatives on the representation\nspace. Building on the MoCo framework, SynCo introduces six strategies for\ncreating diverse synthetic hard negatives on-the-fly with minimal computational\noverhead. SynCo achieves faster training and better representation learning,\nreaching 67.9% top-1 accuracy on ImageNet ILSVRC-2012 linear evaluation after\n200 pretraining epochs, surpassing MoCo's 67.5% using the same ResNet-50\nencoder. It also transfers more effectively to detection tasks: on PASCAL VOC,\nit outperforms both the supervised baseline and MoCo with 82.5% AP; on COCO, it\nsets new benchmarks with 40.9% AP for bounding box detection and 35.5% AP for\ninstance segmentation. 
Our synthetic hard negative generation approach\nsignificantly enhances visual representations learned through self-supervised\ncontrastive learning. Code is available at\nhttps://github.com/giakoumoglou/synco.\n","authors":["Nikolaos Giakoumoglou","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2410.02401v4.pdf","comment":"10 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2410.09474v2","updated":"2024-11-03T22:31:31Z","published":"2024-10-12T10:27:23Z","title":"Distilling Invariant Representations with Dual Augmentation","summary":" Knowledge distillation (KD) has been widely used to transfer knowledge from\nlarge, accurate models (teachers) to smaller, efficient ones (students). Recent\nmethods have explored enforcing consistency by incorporating causal\ninterpretations to distill invariant representations. In this work, we extend\nthis line of research by introducing a dual augmentation strategy to promote\ninvariant feature learning in both teacher and student models. Our approach\nleverages different augmentations applied to both models during distillation,\npushing the student to capture robust, transferable features. This dual\naugmentation strategy complements invariant causal distillation by ensuring\nthat the learned representations remain stable across a wider range of data\nvariations and transformations. Extensive experiments on CIFAR-100 demonstrate\nthe effectiveness of this approach, achieving competitive results in\nsame-architecture KD.\n","authors":["Nikolaos Giakoumoglou","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2410.09474v2.pdf","comment":"This paper presents preliminary results from a project that we have\n since discontinued, as our research focus has shifted to new directions"},{"id":"http://arxiv.org/abs/2403.14468v4","updated":"2024-11-03T21:16:54Z","published":"2024-03-21T15:15:00Z","title":"AnyV2V: A Tuning-Free Framework For Any Video-to-Video Editing Tasks","summary":" In the dynamic field of digital content creation using generative models,\nstate-of-the-art video editing models still do not offer the level of quality\nand control that users desire. Previous works on video editing either extended\nfrom image-based generative models in a zero-shot manner or necessitated\nextensive fine-tuning, which can hinder the production of fluid video edits.\nFurthermore, these methods frequently rely on textual input as the editing\nguidance, leading to ambiguities and limiting the types of edits they can\nperform. Recognizing these challenges, we introduce AnyV2V, a novel tuning-free\nparadigm designed to simplify video editing into two primary steps: (1)\nemploying an off-the-shelf image editing model to modify the first frame, (2)\nutilizing an existing image-to-video generation model to generate the edited\nvideo through temporal feature injection. AnyV2V can leverage any existing\nimage editing tools to support an extensive array of video editing tasks,\nincluding prompt-based editing, reference-based style transfer, subject-driven\nediting, and identity manipulation, which were unattainable by previous\nmethods. AnyV2V can also support any video length. Our evaluation shows that\nAnyV2V achieved CLIP-scores comparable to other baseline methods. 
Furthermore,\nAnyV2V significantly outperformed these baselines in human evaluations,\ndemonstrating notable improvements in visual consistency with the source video\nwhile producing high-quality edits across all editing tasks.\n","authors":["Max Ku","Cong Wei","Weiming Ren","Harry Yang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14468v4.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR 2024)\n (11/2024)"},{"id":"http://arxiv.org/abs/2411.01683v1","updated":"2024-11-03T20:46:50Z","published":"2024-11-03T20:46:50Z","title":"ROAD-Waymo: Action Awareness at Scale for Autonomous Driving","summary":" Autonomous Vehicle (AV) perception systems require more than simply seeing,\nvia e.g., object detection or scene segmentation. They need a holistic\nunderstanding of what is happening within the scene for safe interaction with\nother road users. Few datasets exist for the purpose of developing and training\nalgorithms to comprehend the actions of other road users. This paper presents\nROAD-Waymo, an extensive dataset for the development and benchmarking of\ntechniques for agent, action, location and event detection in road scenes,\nprovided as a layer upon the (US) Waymo Open dataset. Considerably larger and\nmore challenging than any existing dataset (and encompassing multiple cities),\nit comes with 198k annotated video frames, 54k agent tubes, 3.9M bounding boxes\nand a total of 12.4M labels. The integrity of the dataset has been confirmed\nand enhanced via a novel annotation pipeline designed for automatically\nidentifying violations of requirements specifically designed for this dataset.\nAs ROAD-Waymo is compatible with the original (UK) ROAD dataset, it provides\nthe opportunity to tackle domain adaptation between real-world road scenarios\nin different countries within a novel benchmark: ROAD++.\n","authors":["Salman Khan","Izzeddin Teeti","Reza Javanmard Alitappeh","Mihaela C. Stoian","Eleonora Giunchiglia","Gurkirt Singh","Andrew Bradley","Fabio Cuzzolin"],"pdf_url":"https://arxiv.org/pdf/2411.01683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00985v3","updated":"2024-11-03T20:34:35Z","published":"2024-06-03T04:43:56Z","title":"ParallelEdits: Efficient Multi-Aspect Text-Driven Image Editing with\n Attention Grouping","summary":" Text-driven image synthesis has made significant advancements with the\ndevelopment of diffusion models, transforming how visual content is generated\nfrom text prompts. Despite these advances, text-driven image editing, a key\narea in computer graphics, faces unique challenges. A major challenge is making\nsimultaneous edits across multiple objects or attributes. Applying these\nmethods sequentially for multi-attribute edits increases computational demands\nand efficiency losses. In this paper, we address these challenges with\nsignificant contributions. Our main contribution is the development of\nParallelEdits, a method that seamlessly manages simultaneous edits across\nmultiple attributes. In contrast to previous approaches, ParallelEdits not only\npreserves the quality of single attribute edits but also significantly improves\nthe performance of multitasking edits. This is achieved through innovative\nattention distribution mechanism and multi-branch design that operates across\nseveral processing heads. 
Additionally, we introduce the PIE-Bench++ dataset,\nan expansion of the original PIE-Bench dataset, to better support evaluating\nimage-editing tasks involving multiple objects and attributes simultaneously.\nThis dataset is a benchmark for evaluating text-driven image editing methods in\nmultifaceted scenarios.\n","authors":["Mingzhen Huang","Jialing Cai","Shan Jia","Vishnu Suresh Lokhande","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2406.00985v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13743v3","updated":"2024-11-03T20:22:32Z","published":"2024-06-19T18:00:07Z","title":"GenAI-Bench: Evaluating and Improving Compositional Text-to-Visual\n Generation","summary":" While text-to-visual models now produce photo-realistic images and videos,\nthey struggle with compositional text prompts involving attributes,\nrelationships, and higher-order reasoning such as logic and comparison. In this\nwork, we conduct an extensive human study on GenAI-Bench to evaluate the\nperformance of leading image and video generation models in various aspects of\ncompositional text-to-visual generation. We also compare automated evaluation\nmetrics against our collected human ratings and find that VQAScore -- a metric\nmeasuring the likelihood that a VQA model views an image as accurately\ndepicting the prompt -- significantly outperforms previous metrics such as\nCLIPScore. In addition, VQAScore can improve generation in a black-box manner\n(without finetuning) via simply ranking a few (3 to 9) candidate images.\nRanking by VQAScore is 2x to 3x more effective than other scoring methods like\nPickScore, HPSv2, and ImageReward at improving human alignment ratings for\nDALL-E 3 and Stable Diffusion, especially on compositional prompts that require\nadvanced visio-linguistic reasoning. We release a new GenAI-Rank benchmark with\nover 40,000 human ratings to evaluate scoring metrics on ranking images\ngenerated from the same prompt. Lastly, we discuss promising areas for\nimprovement in VQAScore, such as addressing fine-grained visual details. We\nwill release all human ratings (over 80,000) to facilitate scientific\nbenchmarking of both generative models and automated metrics.\n","authors":["Baiqi Li","Zhiqiu Lin","Deepak Pathak","Jiayao Li","Yixin Fei","Kewen Wu","Tiffany Ling","Xide Xia","Pengchuan Zhang","Graham Neubig","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2406.13743v3.pdf","comment":"We open-source our dataset, model, and code at:\n https://linzhiqiu.github.io/papers/genai_bench ; Project page:\n https://linzhiqiu.github.io/papers/genai_bench ; GenAI-Bench was first\n introduced in arxiv:2404.01291. This article extends it with an additional\n GenAI-Rank benchmark"},{"id":"http://arxiv.org/abs/2411.01669v1","updated":"2024-11-03T19:49:52Z","published":"2024-11-03T19:49:52Z","title":"MamT$^4$: Multi-view Attention Networks for Mammography Cancer\n Classification","summary":" In this study, we introduce a novel method, called MamT$^4$, which is used\nfor simultaneous analysis of four mammography images. 
A decision is made based\non one image of a breast, with attention also devoted to three additional\nimages: another view of the same breast and two images of the other breast.\nThis approach enables the algorithm to closely replicate the practice of a\nradiologist who reviews the entire set of mammograms for a patient.\nFurthermore, this paper emphasizes the preprocessing of images, specifically\nproposing a cropping model (U-Net based on ResNet-34) to help the method remove\nimage artifacts and focus on the breast region. To the best of our knowledge,\nthis study is the first to achieve a ROC-AUC of 84.0 $\\pm$ 1.7 and an F1 score\nof 56.0 $\\pm$ 1.3 on an independent test dataset of Vietnam digital mammography\n(VinDr-Mammo), which is preprocessed with the cropping model.\n","authors":["Alisher Ibragimov","Sofya Senotrusova","Arsenii Litvinov","Egor Ushakov","Evgeny Karpulevich","Yury Markin"],"pdf_url":"https://arxiv.org/pdf/2411.01669v1.pdf","comment":"The crop model is available here:\n https://github.com/ispras/mammo_crop"},{"id":"http://arxiv.org/abs/2403.07536v2","updated":"2024-11-03T19:21:04Z","published":"2024-03-12T11:19:46Z","title":"LaB-GATr: geometric algebra transformers for large biomedical surface\n and volume meshes","summary":" Many anatomical structures can be described by surface or volume meshes.\nMachine learning is a promising tool to extract information from these 3D\nmodels. However, high-fidelity meshes often contain hundreds of thousands of\nvertices, which creates unique challenges in building deep neural network\narchitectures. Furthermore, patient-specific meshes may not be canonically\naligned which limits the generalisation of machine learning algorithms. We\npropose LaB-GATr, a transformer neural network with geometric tokenisation that\ncan effectively learn with large-scale (bio-)medical surface and volume meshes\nthrough sequence compression and interpolation. Our method extends the recently\nproposed geometric algebra transformer (GATr) and thus respects all Euclidean\nsymmetries, i.e. rotation, translation and reflection, effectively mitigating\nthe problem of canonical alignment between patients. LaB-GATr achieves\nstate-of-the-art results on three tasks in cardiovascular hemodynamics\nmodelling and neurodevelopmental phenotype prediction, featuring meshes of up\nto 200,000 vertices. Our results demonstrate that LaB-GATr is a powerful\narchitecture for learning with high-fidelity meshes which has the potential to\nenable interesting downstream applications. Our implementation is publicly\navailable.\n","authors":["Julian Suk","Baris Imre","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2403.07536v2.pdf","comment":"First published in \"Medical Image Computing and Computer Assisted\n Intervention\" (MICCAI), pp 185-195, 2024 by Springer Nature"},{"id":"http://arxiv.org/abs/2410.22489v2","updated":"2024-11-03T19:00:34Z","published":"2024-10-29T19:28:41Z","title":"Multimodality Helps Few-Shot 3D Point Cloud Semantic Segmentation","summary":" Few-shot 3D point cloud segmentation (FS-PCS) aims at generalizing models to\nsegment novel categories with minimal annotated support samples. While existing\nFS-PCS methods have shown promise, they primarily focus on unimodal point cloud\ninputs, overlooking the potential benefits of leveraging multimodal\ninformation. In this paper, we address this gap by introducing a cost-free\nmultimodal FS-PCS setup, utilizing textual labels and the potentially available\n2D image modality. 
Under this easy-to-achieve setup, we present the MultiModal\nFew-Shot SegNet (MM-FSS), a model effectively harnessing complementary\ninformation from multiple modalities. MM-FSS employs a shared backbone with two\nheads to extract intermodal and unimodal visual features, and a pretrained text\nencoder to generate text embeddings. To fully exploit the multimodal\ninformation, we propose a Multimodal Correlation Fusion (MCF) module to\ngenerate multimodal correlations, and a Multimodal Semantic Fusion (MSF) module\nto refine the correlations using text-aware semantic guidance. Additionally, we\npropose a simple yet effective Test-time Adaptive Cross-modal Calibration\n(TACC) technique to mitigate training bias, further improving generalization.\nExperimental results on S3DIS and ScanNet datasets demonstrate significant\nperformance improvements achieved by our method. The efficacy of our approach\nindicates the benefits of leveraging commonly-ignored free modalities for\nFS-PCS, providing valuable insights for future research. The code is available\nat https://github.com/ZhaochongAn/Multimodality-3D-Few-Shot\n","authors":["Zhaochong An","Guolei Sun","Yun Liu","Runjia Li","Min Wu","Ming-Ming Cheng","Ender Konukoglu","Serge Belongie"],"pdf_url":"https://arxiv.org/pdf/2410.22489v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01656v1","updated":"2024-11-03T18:57:19Z","published":"2024-11-03T18:57:19Z","title":"Degradation-Aware Residual-Conditioned Optimal Transport for Unified\n Image Restoration","summary":" All-in-one image restoration has emerged as a practical and promising\nlow-level vision task for real-world applications. In this context, the key\nissue lies in how to deal with different types of degraded images\nsimultaneously. In this work, we present a Degradation-Aware\nResidual-Conditioned Optimal Transport (DA-RCOT) approach that models\n(all-in-one) image restoration as an optimal transport (OT) problem for\nunpaired and paired settings, introducing the transport residual as a\ndegradation-specific cue for both the transport cost and the transport map.\nSpecifically, we formalize image restoration with a residual-guided OT\nobjective by exploiting the degradation-specific patterns of the Fourier\nresidual in the transport cost. More crucially, we design the transport map for\nrestoration as a two-pass DA-RCOT map, in which the transport residual is\ncomputed in the first pass and then encoded as multi-scale residual embeddings\nto condition the second-pass restoration. This conditioning process injects\nintrinsic degradation knowledge (e.g., degradation type and level) and\nstructural information from the multi-scale residual embeddings into the OT\nmap, which thereby can dynamically adjust its behaviors for all-in-one\nrestoration. Extensive experiments across five degradations demonstrate the\nfavorable performance of DA-RCOT as compared to state-of-the-art methods, in\nterms of distortion measures, perceptual quality, and image structure\npreservation. 
Notably, DA-RCOT delivers superior adaptability to real-world\nscenarios even with multiple degradations and shows distinctive robustness to\nboth degradation levels and the number of degradations.\n","authors":["Xiaole Tang","Xiang Gu","Xiaoyi He","Xin Hu","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2411.01656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01652v1","updated":"2024-11-03T18:30:37Z","published":"2024-11-03T18:30:37Z","title":"Optimizing Gastrointestinal Diagnostics: A CNN-Based Model for VCE Image\n Classification","summary":" In recent years, the diagnosis of gastrointestinal (GI) diseases has advanced\ngreatly with the advent of high-tech video capsule endoscopy (VCE) technology,\nwhich allows for non-invasive observation of the digestive system. The MisaHub\nCapsule Vision Challenge encourages the development of vendor-independent\nartificial intelligence models that can autonomously classify GI anomalies from\nVCE images. This paper presents a CNN architecture designed specifically for\nmulticlass classification of ten gut pathologies, including angioectasia,\nbleeding, erosion, erythema, foreign bodies, lymphangiectasia, polyps, ulcers,\nand worms as well as their normal state.\n","authors":["Vaneeta Ahlawat","Rohit Sharma"," Urush"],"pdf_url":"https://arxiv.org/pdf/2411.01652v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.18415v2","updated":"2024-11-03T18:23:45Z","published":"2024-05-28T17:57:06Z","title":"Why are Visually-Grounded Language Models Bad at Image Classification?","summary":" Image classification is one of the most fundamental capabilities of machine\nvision intelligence. In this work, we revisit the image classification task\nusing visually-grounded language models (VLMs) such as GPT-4V and LLaVA. We\nfind that existing proprietary and public VLMs, despite often using CLIP as a\nvision encoder and having many more parameters, significantly underperform CLIP\non standard image classification benchmarks like ImageNet. To understand the\nreason, we explore several hypotheses concerning the inference algorithms,\ntraining objectives, and data processing in VLMs. Our analysis reveals that the\nprimary cause is data-related: critical information for image classification is\nencoded in the VLM's latent space but can only be effectively decoded with\nenough training data. Specifically, there is a strong correlation between the\nfrequency of class exposure during VLM training and instruction-tuning and the\nVLM's performance in those classes; when trained with sufficient data, VLMs can\nmatch the accuracy of state-of-the-art classification models. Based on these\nfindings, we enhance a VLM by integrating classification-focused datasets into\nits training, and demonstrate that the enhanced classification performance of\nthe VLM transfers to its general capabilities, resulting in an improvement of\n11.8% on the newly collected ImageWikiQA dataset.\n","authors":["Yuhui Zhang","Alyssa Unell","Xiaohan Wang","Dhruba Ghosh","Yuchang Su","Ludwig Schmidt","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2405.18415v2.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2401.15563v3","updated":"2024-11-03T18:04:52Z","published":"2024-01-28T04:07:59Z","title":"BrepGen: A B-rep Generative Diffusion Model with Structured Latent\n Geometry","summary":" This paper presents BrepGen, a diffusion-based generative approach that\ndirectly outputs a Boundary representation (B-rep) Computer-Aided Design (CAD)\nmodel. 
BrepGen represents a B-rep model as a novel structured latent geometry\nin a hierarchical tree. With the root node representing a whole CAD solid, each\nelement of a B-rep model (i.e., a face, an edge, or a vertex) progressively\nturns into a child-node from top to bottom. B-rep geometry information goes\ninto the nodes as the global bounding box of each primitive along with a latent\ncode describing the local geometric shape. The B-rep topology information is\nimplicitly represented by node duplication. When two faces share an edge, the\nedge curve will appear twice in the tree, and a T-junction vertex with three\nincident edges appears six times in the tree with identical node features.\nStarting from the root and progressing to the leaf, BrepGen employs\nTransformer-based diffusion models to sequentially denoise node features while\nduplicated nodes are detected and merged, recovering the B-Rep topology\ninformation. Extensive experiments show that BrepGen advances the task of CAD\nB-rep generation, surpassing existing methods on various benchmarks. Results on\nour newly collected furniture dataset further showcase its exceptional\ncapability in generating complicated geometry. While previous methods were\nlimited to generating simple prismatic shapes, BrepGen incorporates free-form\nand doubly-curved surfaces for the first time. Additional applications of\nBrepGen include CAD autocomplete and design interpolation. The code, pretrained\nmodels, and dataset are available at https://github.com/samxuxiang/BrepGen.\n","authors":["Xiang Xu","Joseph G. Lambourne","Pradeep Kumar Jayaraman","Zhengqing Wang","Karl D. D. Willis","Yasutaka Furukawa"],"pdf_url":"https://arxiv.org/pdf/2401.15563v3.pdf","comment":"Accepted to ACM SIGGRAPH 2024. Code at\n https://github.com/samxuxiang/BrepGen"}]},"2024-11-02T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.01360v1","updated":"2024-11-02T20:35:13Z","published":"2024-11-02T20:35:13Z","title":"Use Digital Twins to Support Fault Diagnosis From System-level\n Condition-monitoring Data","summary":" Deep learning models have created great opportunities for data-driven fault\ndiagnosis but they require large amount of labeled failure data for training.\nIn this paper, we propose to use a digital twin to support developing\ndata-driven fault diagnosis model to reduce the amount of failure data used in\nthe training process. The developed fault diagnosis models are also able to\ndiagnose component-level failures based on system-level condition-monitoring\ndata. The proposed framework is evaluated on a real-world robot system. The\nresults showed that the deep learning model trained by digital twins is able to\ndiagnose the locations and modes of 9 faults/failure from $4$ different motors.\nHowever, the performance of the model trained by a digital twin can still be\nimproved, especially when the digital twin model has some discrepancy with the\nreal system.\n","authors":["Killian Mc Court","Xavier Mc Court","Shijia Du","Zhiguo Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.01360v1.pdf","comment":"6 pages, 4 figure. Paper submitted to 2025 22nd International\n Multi-Conference on Systems, Signals & Devices (SSD)"},{"id":"http://arxiv.org/abs/2411.01349v1","updated":"2024-11-02T19:33:28Z","published":"2024-11-02T19:33:28Z","title":"The Role of Domain Randomization in Training Diffusion Policies for\n Whole-Body Humanoid Control","summary":" Humanoids have the potential to be the ideal embodiment in environments\ndesigned for humans. 
Thanks to the structural similarity to the human body,\nthey benefit from rich sources of demonstration data, e.g., collected via\nteleoperation, motion capture, or even using videos of humans performing tasks.\nHowever, distilling a policy from demonstrations is still a challenging\nproblem. While Diffusion Policies (DPs) have shown impressive results in\nrobotic manipulation, their applicability to locomotion and humanoid control\nremains underexplored. In this paper, we investigate how dataset diversity and\nsize affect the performance of DPs for humanoid whole-body control. In a\nsimulated IsaacGym environment, we generate synthetic demonstrations by\ntraining Adversarial Motion Prior (AMP) agents under various Domain\nRandomization (DR) conditions, and we compare DPs fitted to datasets of\ndifferent size and diversity. Our findings show that, although DPs can achieve\nstable walking behavior, successful training of locomotion policies requires\nsignificantly larger and more diverse datasets compared to manipulation tasks,\neven in simple scenarios.\n","authors":["Oleg Kaidanov","Firas Al-Hafez","Yusuf Suvari","Boris Belousov","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2411.01349v1.pdf","comment":"Conference on Robot Learning, Workshop on Whole-Body Control and\n Bimanual Manipulation"},{"id":"http://arxiv.org/abs/2408.04295v2","updated":"2024-11-02T18:07:20Z","published":"2024-08-08T08:18:05Z","title":"Assigning Credit with Partial Reward Decoupling in Multi-Agent Proximal\n Policy Optimization","summary":" Multi-agent proximal policy optimization (MAPPO) has recently demonstrated\nstate-of-the-art performance on challenging multi-agent reinforcement learning\ntasks. However, MAPPO still struggles with the credit assignment problem,\nwherein the sheer difficulty in ascribing credit to individual agents' actions\nscales poorly with team size. In this paper, we propose a multi-agent\nreinforcement learning algorithm that adapts recent developments in credit\nassignment to improve upon MAPPO. Our approach leverages partial reward\ndecoupling (PRD), which uses a learned attention mechanism to estimate which of\na particular agent's teammates are relevant to its learning updates. We use\nthis estimate to dynamically decompose large groups of agents into smaller,\nmore manageable subgroups. We empirically demonstrate that our approach,\nPRD-MAPPO, decouples agents from teammates that do not influence their expected\nfuture reward, thereby streamlining credit assignment. We additionally show\nthat PRD-MAPPO yields significantly higher data efficiency and asymptotic\nperformance compared to both MAPPO and other state-of-the-art methods across\nseveral multi-agent tasks, including StarCraft II. Finally, we propose a\nversion of PRD-MAPPO that is applicable to \\textit{shared} reward settings,\nwhere PRD was previously not applicable, and empirically show that this also\nleads to performance improvements over MAPPO.\n","authors":["Aditya Kapoor","Benjamin Freed","Howie Choset","Jeff Schneider"],"pdf_url":"https://arxiv.org/pdf/2408.04295v2.pdf","comment":"20 pages, 5 figures, 12 tables, Reinforcement Learning Journal and\n Reinforcement Learning Conference 2024"},{"id":"http://arxiv.org/abs/2403.16967v5","updated":"2024-11-02T18:04:23Z","published":"2024-03-25T17:26:08Z","title":"Visual Whole-Body Control for Legged Loco-Manipulation","summary":" We study the problem of mobile manipulation using legged robots equipped with\nan arm, namely legged loco-manipulation. 
The robot legs, while usually utilized\nfor mobility, offer an opportunity to amplify the manipulation capabilities by\nconducting whole-body control. That is, the robot can control the legs and the\narm at the same time to extend its workspace. We propose a framework that can\nconduct the whole-body control autonomously with visual observations. Our\napproach, namely Visual Whole-Body Control(VBC), is composed of a low-level\npolicy using all degrees of freedom to track the body velocities along with the\nend-effector position, and a high-level policy proposing the velocities and\nend-effector position based on visual inputs. We train both levels of policies\nin simulation and perform Sim2Real transfer for real robot deployment. We\nperform extensive experiments and show significant improvements over baselines\nin picking up diverse objects in different configurations (heights, locations,\norientations) and environments.\n","authors":["Minghuan Liu","Zixuan Chen","Xuxin Cheng","Yandong Ji","Ri-Zhao Qiu","Ruihan Yang","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16967v5.pdf","comment":"CoRL 2024 Oral. Project page: https://wholebody-b1.github.io"},{"id":"http://arxiv.org/abs/2411.01321v1","updated":"2024-11-02T17:53:45Z","published":"2024-11-02T17:53:45Z","title":"Control Strategies for Pursuit-Evasion Under Occlusion Using Visibility\n and Safety Barrier Functions","summary":" This paper develops a control strategy for pursuit-evasion problems in\nenvironments with occlusions. We address the challenge of a mobile pursuer\nkeeping a mobile evader within its field of view (FoV) despite line-of-sight\nobstructions. The signed distance function (SDF) of the FoV is used to\nformulate visibility as a control barrier function (CBF) constraint on the\npursuer's control inputs. Similarly, obstacle avoidance is formulated as a CBF\nconstraint based on the SDF of the obstacle set. While the visibility and\nsafety CBFs are Lipschitz continuous, they are not differentiable everywhere,\nnecessitating the use of generalized gradients. To achieve non-myopic pursuit,\nwe generate reference control trajectories leading to evader visibility using a\nsampling-based kinodynamic planner. The pursuer then tracks this reference via\nconvex optimization under the CBF constraints. We validate our approach in\nCARLA simulations and real-world robot experiments, demonstrating successful\nvisibility maintenance using only onboard sensing, even under severe occlusions\nand dynamic evader movements.\n","authors":["Minnan Zhou","Mustafa Shaikh","Vatsalya Chaubey","Patrick Haggerty","Shumon Koga","Dimitra Panagou","Nikolay Atanasov"],"pdf_url":"https://arxiv.org/pdf/2411.01321v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.01297v1","updated":"2024-11-02T16:06:29Z","published":"2024-11-02T16:06:29Z","title":"Receding Hamiltonian-Informed Optimal Neural Control and State\n Estimation for Closed-Loop Dynamical Systems","summary":" This paper formalizes Hamiltonian-Informed Optimal Neural (Hion) controllers,\na novel class of neural network-based controllers for dynamical systems and\nexplicit non-linear model predictive control. Hion controllers estimate future\nstates and compute optimal control inputs using Pontryagin's Maximum Principle.\nThe proposed framework allows for customization of transient behavior,\naddressing limitations of existing methods. 
The Taylored Multi-Faceted Approach\nfor Neural ODE and Optimal Control (T-mano) architecture facilitates training\nand ensures accurate state estimation. Optimal control strategies are\ndemonstrated for both linear and non-linear dynamical systems.\n","authors":["Josue N. Rivera","Dengfeng Sun"],"pdf_url":"https://arxiv.org/pdf/2411.01297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01286v1","updated":"2024-11-02T15:32:55Z","published":"2024-11-02T15:32:55Z","title":"Mixed-Integer MPC-Based Motion Planning Using Hybrid Zonotopes with\n Tight Relaxations","summary":" Autonomous vehicle (AV) motion planning problems often involve non-convex\nconstraints, which present a major barrier to applying model predictive control\n(MPC) in real time on embedded hardware. This paper presents an approach for\nefficiently solving mixed-integer MPC motion planning problems using a hybrid\nzonotope representation of the obstacle-free space. The MPC optimization\nproblem is formulated as a multi-stage mixed-integer quadratic program (MIQP)\nusing a hybrid zonotope representation of the non-convex constraints.\nRisk-aware planning is supported by assigning costs to different regions of the\nobstacle-free space within the MPC cost function. A multi-stage MIQP solver is\npresented that exploits the structure of the hybrid zonotope constraints. For\nsome hybrid zonotope representations, it is shown that the convex relaxation is\ntight, i.e., equal to the convex hull. In conjunction with logical constraints\nderived from the AV motion planning context, this property is leveraged to\ngenerate tight quadratic program (QP) sub-problems within a branch-and-bound\nmixed-integer solver. The hybrid zonotope structure is further leveraged to\nreduce the number of matrix factorizations that need to be computed within the\nQP sub-problems. Simulation studies are presented for obstacle-avoidance and\nrisk-aware motion planning problems using polytopic maps and occupancy grids.\nIn most cases, the proposed solver finds the optimal solution an order of\nmagnitude faster than a state-of-the-art commercial solver.\nProcessor-in-the-loop studies demonstrate the utility of the solver for\nreal-time implementations on embedded hardware.\n","authors":["Joshua A. Robbins","Jacob A. Siefert","Sean Brennan","Herschel C. Pangborn"],"pdf_url":"https://arxiv.org/pdf/2411.01286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01284v1","updated":"2024-11-02T15:28:06Z","published":"2024-11-02T15:28:06Z","title":"Task-Oriented Hierarchical Object Decomposition for Visuomotor Control","summary":" Good pre-trained visual representations could enable robots to learn\nvisuomotor policy efficiently. Still, existing representations take a\none-size-fits-all-tasks approach that comes with two important drawbacks: (1)\nBeing completely task-agnostic, these representations cannot effectively ignore\nany task-irrelevant information in the scene, and (2) They often lack the\nrepresentational capacity to handle unconstrained/complex real-world scenes.\nInstead, we propose to train a large combinatorial family of representations\norganized by scene entities: objects and object parts. This hierarchical object\ndecomposition for task-oriented representations (HODOR) permits selectively\nassembling different representations specific to each task while scaling in\nrepresentational capacity with the complexity of the scene and the task. 
In our\nexperiments, we find that HODOR outperforms prior pre-trained representations,\nboth scene vector representations and object-centric representations, for\nsample-efficient imitation learning across 5 simulated and 5 real-world\nmanipulation tasks. We further find that the invariances captured in HODOR are\ninherited into downstream policies, which can robustly generalize to\nout-of-distribution test conditions, permitting zero-shot skill chaining.\nAppendix, code, and videos: https://sites.google.com/view/hodor-corl24.\n","authors":["Jianing Qian","Yunshuang Li","Bernadette Bucher","Dinesh Jayaraman"],"pdf_url":"https://arxiv.org/pdf/2411.01284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04837v2","updated":"2024-11-02T15:03:17Z","published":"2024-09-07T14:25:08Z","title":"Context-Aware Replanning with Pre-explored Semantic Map for Object\n Navigation","summary":" Pre-explored Semantic Maps, constructed through prior exploration using\nvisual language models (VLMs), have proven effective as foundational elements\nfor training-free robotic applications. However, existing approaches assume the\nmap's accuracy and do not provide effective mechanisms for revising decisions\nbased on incorrect maps. To address this, we introduce Context-Aware Replanning\n(CARe), which estimates map uncertainty through confidence scores and\nmulti-view consistency, enabling the agent to revise erroneous decisions\nstemming from inaccurate maps without requiring additional labels. We\ndemonstrate the effectiveness of our proposed method by integrating it with two\nmodern mapping backbones, VLMaps and OpenMask3D, and observe significant\nperformance improvements in object navigation tasks. More details can be found\non the project page: https://care-maps.github.io/\n","authors":["Po-Chen Ko","Hung-Ting Su","Ching-Yuan Chen","Jia-Fong Yeh","Min Sun","Winston H. Hsu"],"pdf_url":"https://arxiv.org/pdf/2409.04837v2.pdf","comment":"CoRL 2024 camera ready. The first three authors contributed equally,\n and their order of authorship is interchangeable. Project page:\n https://care-maps.github.io/"},{"id":"http://arxiv.org/abs/2411.01274v1","updated":"2024-11-02T14:53:26Z","published":"2024-11-02T14:53:26Z","title":"Efficient Collaborative Navigation through Perception Fusion for\n Multi-Robots in Unknown Environments","summary":" For tasks conducted in unknown environments with efficiency requirements,\nreal-time navigation of multi-robot systems remains challenging due to\nunfamiliarity with surroundings. In this paper, we propose a novel multi-robot\ncollaborative planning method that leverages the perception of different robots\nto intelligently select search directions and improve planning efficiency.\nSpecifically, a foundational planner is employed to ensure reliable exploration\ntowards targets in unknown environments, and we introduce a Graph Attention\nArchitecture with Information Gain Weight (GIWT) to synthesize the information\nfrom the target robot and its teammates to facilitate effective navigation\naround obstacles. In GIWT, after regionally encoding the relative positions of\nthe robots along with their perceptual features, we compute the shared\nattention scores and incorporate the information gain obtained from neighboring\nrobots as a supplementary weight. We design a corresponding expert data\ngeneration scheme to simulate real-world decision-making conditions for network\ntraining. 
Simulation experiments and real robot tests demonstrate that the\nproposed method significantly improves efficiency and enables collaborative\nplanning for multiple robots. Our method achieves approximately 82% accuracy on\nthe expert dataset and reduces the average path length by about 8% and 6%\nacross two types of tasks compared to the foundational planner in ROS tests, and\nachieves a path length reduction of over 6% in real-world experiments.\n","authors":["Qingquan Lin","Weining Lu","Litong Meng","Chenxi Li","Bin Liang"],"pdf_url":"https://arxiv.org/pdf/2411.01274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01227v1","updated":"2024-11-02T12:15:32Z","published":"2024-11-02T12:15:32Z","title":"Rotational Odometry using Ultra Low Resolution Thermal Cameras","summary":" This letter provides what is, to the best of our knowledge, a first study on\nthe applicability of ultra-low-resolution thermal cameras for providing\nrotational odometry measurements to navigational devices such as rovers and\ndrones. Our use of an ultra-low-resolution thermal camera instead of other\nmodalities such as an RGB camera is motivated by its robustness to lighting\nconditions, while being one order of magnitude less cost-expensive compared to\nhigher-resolution thermal cameras. After setting up a custom data acquisition\nsystem and acquiring thermal camera data together with its associated\nrotational speed label, we train a small 4-layer Convolutional Neural Network\n(CNN) for regressing the rotational speed from the thermal data. Experiments\nand ablation studies are conducted for determining the impact of thermal camera\nresolution and the number of successive frames on the CNN estimation precision.\nFinally, our novel dataset for the study of low-resolution thermal odometry is\nopenly released with the hope of benefiting future research.\n","authors":["Ali Safa"],"pdf_url":"https://arxiv.org/pdf/2411.01227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01226v1","updated":"2024-11-02T12:15:29Z","published":"2024-11-02T12:15:29Z","title":"MonoPlane: Exploiting Monocular Geometric Cues for Generalizable 3D\n Plane Reconstruction","summary":" This paper presents a generalizable 3D plane detection and reconstruction\nframework named MonoPlane. Unlike previous robust estimator-based works (which\nrequire multiple images or RGB-D input) and learning-based works (which suffer\nfrom domain shift), MonoPlane combines the best of two worlds and establishes a\nplane reconstruction pipeline based on monocular geometric cues, resulting in\naccurate, robust and scalable 3D plane detection and reconstruction in the\nwild. Specifically, we first leverage large-scale pre-trained neural networks\nto obtain the depth and surface normals from a single image. These monocular\ngeometric cues are then incorporated into a proximity-guided RANSAC framework\nto sequentially fit each plane instance. We exploit effective 3D point\nproximity and model such proximity via a graph within RANSAC to guide the plane\nfitting from noisy monocular depths, followed by image-level multi-plane joint\noptimization to improve the consistency among all plane instances. We further\ndesign a simple but effective pipeline to extend this single-view solution to\nsparse-view 3D plane reconstruction. Extensive experiments on a list of\ndatasets demonstrate our superior zero-shot generalizability over baselines,\nachieving state-of-the-art plane reconstruction performance in a transferring\nsetting. 
Our code is available at https://github.com/thuzhaowang/MonoPlane .\n","authors":["Wang Zhao","Jiachen Liu","Sheng Zhang","Yishu Li","Sili Chen","Sharon X Huang","Yong-Jin Liu","Hengkai Guo"],"pdf_url":"https://arxiv.org/pdf/2411.01226v1.pdf","comment":"IROS 2024 (oral)"},{"id":"http://arxiv.org/abs/2411.01200v1","updated":"2024-11-02T10:09:08Z","published":"2024-11-02T10:09:08Z","title":"GarmentLab: A Unified Simulation and Benchmark for Garment Manipulation","summary":" Manipulating garments and fabrics has long been a critical endeavor in the\ndevelopment of home-assistant robots. However, due to complex dynamics and\ntopological structures, garment manipulations pose significant challenges.\nRecent successes in reinforcement learning and vision-based methods offer\npromising avenues for learning garment manipulation. Nevertheless, these\napproaches are severely constrained by current benchmarks, which offer limited\ndiversity of tasks and unrealistic simulation behavior. Therefore, we present\nGarmentLab, a content-rich benchmark and realistic simulation designed for\ndeformable object and garment manipulation. Our benchmark encompasses a diverse\nrange of garment types, robotic systems and manipulators. The abundant tasks in\nthe benchmark further explore the interactions between garments, deformable\nobjects, rigid bodies, fluids, and the human body. Moreover, by incorporating\nmultiple simulation methods such as FEM and PBD, along with our proposed\nsim-to-real algorithms and real-world benchmark, we aim to significantly narrow\nthe sim-to-real gap. We evaluate state-of-the-art vision methods, reinforcement\nlearning, and imitation learning approaches on these tasks, highlighting the\nchallenges faced by current algorithms, notably their limited generalization\ncapabilities. Our proposed open-source environments and comprehensive analysis\nshow a promising boost to future research in garment manipulation by unlocking\nthe full potential of these methods. We guarantee that we will open-source our\ncode as soon as possible. You can watch the videos in supplementary files to\nlearn more about the details of our work. Our project page is available at:\nhttps://garmentlab.github.io/\n","authors":["Haoran Lu","Ruihai Wu","Yitong Li","Sijie Li","Ziyu Zhu","Chuanruo Ning","Yan Shen","Longzan Luo","Yuanpei Chen","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2411.01200v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2403.05500v4","updated":"2024-11-02T08:04:16Z","published":"2024-03-08T18:17:56Z","title":"Using Fiber Optic Bundles to Miniaturize Vision-Based Tactile Sensors","summary":" Vision-based tactile sensors have recently become popular due to their\ncombination of low cost, very high spatial resolution, and ease of integration\nusing widely available miniature cameras. The associated field of view and\nfocal length, however, are difficult to package in a human-sized finger. In\nthis paper we employ optical fiber bundles to achieve a form factor that, at 15\nmm diameter, is smaller than an average human fingertip. The electronics and\ncamera are also located remotely, further reducing package size. The sensor\nachieves a spatial resolution of 0.22 mm and a minimum force resolution of 5 mN\nfor normal and shear contact forces. With these attributes, the DIGIT Pinki\nsensor is suitable for applications such as robotic and teleoperated digital\npalpation. 
We demonstrate its utility for palpation of the prostate gland and\nshow that it can achieve clinically relevant discrimination of prostate\nstiffness for phantom and ex vivo tissue.\n","authors":["Julia Di","Zdravko Dugonjic","Will Fu","Tingfan Wu","Romeo Mercado","Kevin Sawyer","Victoria Rose Most","Gregg Kammerer","Stefanie Speidel","Richard E. Fan","Geoffrey Sonn","Mark R. Cutkosky","Mike Lambeta","Roberto Calandra"],"pdf_url":"https://arxiv.org/pdf/2403.05500v4.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n The CAD design files of DIGIT Pinki are available at\n https://github.com/facebookresearch/digit-design"},{"id":"http://arxiv.org/abs/2410.23059v2","updated":"2024-11-02T07:35:02Z","published":"2024-10-30T14:33:22Z","title":"FilMBot: A High-Speed Soft Parallel Robotic Micromanipulator","summary":" Soft robotic manipulators are generally slow despite their great\nadaptability, resilience, and compliance. This limitation also extends to\ncurrent soft robotic micromanipulators. Here, we introduce FilMBot, a 3-DOF\nfilm-based, electromagnetically actuated, soft kinematic robotic\nmicromanipulator achieving speeds up to 2117 $\\deg$/s and 2456 $\\deg$/s in\n$\\alpha$ and $\\beta$ angular motions, with corresponding linear velocities of\n1.61 m/s and 1.92 m/s using a 4-cm needle end-effector, and 1.57 m/s along the\nZ axis. The robot can reach ~1.50 m/s in path-following tasks, operates at\nfrequencies up to 30 Hz, and remains functional up to 50 Hz. It demonstrates\nhigh precision (~6.3 $\\mu$m, or ~0.05% of its workspace) in small\npath-following tasks. The novel combination of the low-stiffness soft kinematic\nfilm structure and strong electromagnetic actuation in FilMBot opens new\navenues for soft robotics. Furthermore, its simple construction and\ninexpensive, readily accessible components could broaden the application of\nmicromanipulators beyond current academic and professional users.\n","authors":["Jiangkun Yu","Houari Bettahar","Hakan Kandemir","Quan Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.23059v2.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2410.22931v3","updated":"2024-11-02T06:09:23Z","published":"2024-10-30T11:37:47Z","title":"GPTR: Gaussian Process Trajectory Representation for Continuous-Time\n Motion Estimation","summary":" Continuous-time trajectory representation has gained significant popularity\nin recent years, as it offers an elegant formulation that allows the fusion of\na larger number of sensors and sensing modalities, overcoming limitations of\ntraditional discrete-time frameworks. To bolster the adoption of the\ncontinuous-time paradigm, we propose a so-called Gaussian Process Trajectory\nRepresentation (GPTR) framework for continuous-time motion estimation (CTME)\ntasks. Our approach stands out by employing a third-order random jerk model,\nfeaturing closed-form expressions for both rotational and translational state\nderivatives. This model provides smooth, continuous trajectory representations\nthat are crucial for precise estimation of complex motion. To support the wider\nrobotics and computer vision communities, we have made the source code for GPTR\navailable as a light-weight header-only library. This format was chosen for its\nease of integration, allowing developers to incorporate GPTR into existing\nsystems without needing extensive code modifications. 
Moreover, we also provide\na set of optimization examples with LiDAR, camera, IMU, UWB factors, and\nclosed-form analytical Jacobians under the proposed GP framework. Our\nexperiments demonstrate the efficacy and efficiency of GP-based trajectory\nrepresentation in various motion estimation tasks, and the examples can serve\nas the prototype to help researchers quickly develop future applications such\nas batch optimization, calibration, sensor fusion, trajectory planning, etc.,\nwith continuous-time trajectory representation. Our project is accessible at\nhttps://github.com/brytsknguyen/gptr .\n","authors":["Thien-Minh Nguyen","Ziyu Cao","Kailai Li","Shenghai Yuan","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2410.22931v3.pdf","comment":"The source code has been released. All feedbacks are welcome"},{"id":"http://arxiv.org/abs/2410.24164v2","updated":"2024-11-02T04:00:56Z","published":"2024-10-31T17:22:30Z","title":"$π_0$: A Vision-Language-Action Flow Model for General Robot Control","summary":" Robot learning holds tremendous promise to unlock the full potential of\nflexible, general, and dexterous robot systems, as well as to address some of\nthe deepest questions in artificial intelligence. However, bringing robot\nlearning to the level of generality required for effective real-world systems\nfaces major obstacles in terms of data, generalization, and robustness. In this\npaper, we discuss how generalist robot policies (i.e., robot foundation models)\ncan address these challenges, and how we can design effective generalist robot\npolicies for complex and highly dexterous tasks. We propose a novel flow\nmatching architecture built on top of a pre-trained vision-language model (VLM)\nto inherit Internet-scale semantic knowledge. We then discuss how this model\ncan be trained on a large and diverse dataset from multiple dexterous robot\nplatforms, including single-arm robots, dual-arm robots, and mobile\nmanipulators. We evaluate our model in terms of its ability to perform tasks in\nzero shot after pre-training, follow language instructions from people and from\na high-level VLM policy, and its ability to acquire new skills via fine-tuning.\nOur results cover a wide variety of tasks, such as laundry folding, table\ncleaning, and assembling boxes.\n","authors":["Kevin Black","Noah Brown","Danny Driess","Adnan Esmail","Michael Equi","Chelsea Finn","Niccolo Fusai","Lachy Groom","Karol Hausman","Brian Ichter","Szymon Jakubczak","Tim Jones","Liyiming Ke","Sergey Levine","Adrian Li-Bell","Mohith Mothukuri","Suraj Nair","Karl Pertsch","Lucy Xiaoyang Shi","James Tanner","Quan Vuong","Anna Walling","Haohuan Wang","Ury Zhilinsky"],"pdf_url":"https://arxiv.org/pdf/2410.24164v2.pdf","comment":"See project website for videos:\n https://physicalintelligence.company/blog/pi0"},{"id":"http://arxiv.org/abs/2411.01120v1","updated":"2024-11-02T03:20:33Z","published":"2024-11-02T03:20:33Z","title":"Generation of Conservative Dynamical Systems Based on Stiffness Encoding","summary":" Dynamical systems (DSs) provide a framework for high flexibility, robustness,\nand control reliability and are widely used in motion planning and physical\nhuman-robot interaction. The properties of the DS directly determine the\nrobot's specific motion patterns and the performance of the closed-loop control\nsystem. In this paper, we establish a quantitative relationship between\nstiffness properties and DS. We propose a stiffness encoding framework to\nmodulate DS properties by embedding specific stiffnesses. 
In particular, from\nthe perspective of the closed-loop control system's passivity, a conservative\nDS is learned by encoding a conservative stiffness. The generated DS has a\nsymmetric attraction behavior and a variable stiffness profile. The proposed\nmethod is applicable to demonstration trajectories belonging to different\nmanifolds and types (e.g., closed and self-intersecting trajectories), and the\nclosed-loop control system is always guaranteed to be passive in different\ncases. For controllers tracking the general DS, the passivity of the system\nneeds to be guaranteed by the energy tank. We further propose a generic vector\nfield decomposition strategy based on conservative stiffness, which effectively\nslows down the decay rate of energy in the energy tank and improves the\nstability margin of the control system. Finally, a series of simulations in\nvarious scenarios and experiments on planar and curved motion tasks demonstrate\nthe validity of our theory and methodology.\n","authors":["Tengyu Hou","Hanming Bai","Ye Ding","Han Ding"],"pdf_url":"https://arxiv.org/pdf/2411.01120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01119v1","updated":"2024-11-02T03:20:06Z","published":"2024-11-02T03:20:06Z","title":"AquaFuse: Waterbody Fusion for Physics Guided View Synthesis of\n Underwater Scenes","summary":" We introduce the idea of AquaFuse, a physics-based method for synthesizing\nwaterbody properties in underwater imagery. We formulate a closed-form solution\nfor waterbody fusion that facilitates realistic data augmentation and\ngeometrically consistent underwater scene rendering. AquaFuse leverages the\nphysical characteristics of light propagation underwater to synthesize the\nwaterbody from one scene to the object contents of another. Unlike data-driven\nstyle transfer, AquaFuse preserves the depth consistency and object geometry in\nan input scene. We validate this unique feature by comprehensive experiments\nover diverse underwater scenes. We find that the AquaFused images preserve over\n94% depth consistency and 90-95% structural similarity of the input scenes. We\nalso demonstrate that it generates accurate 3D view synthesis by preserving\nobject geometry while adapting to the inherent waterbody fusion process.\nAquaFuse opens up a new research direction in data augmentation by\ngeometry-preserving style transfer for underwater imaging and robot vision\napplications.\n","authors":["Md Abu Bakr Siddique","Jiayi Wu","Ioannis Rekleitis","Md Jahidul Islam"],"pdf_url":"https://arxiv.org/pdf/2411.01119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19236v3","updated":"2024-11-02T02:14:09Z","published":"2024-06-27T15:01:42Z","title":"Human-Aware Vision-and-Language Navigation: Bridging Simulation to\n Reality with Dynamic Human Interactions","summary":" Vision-and-Language Navigation (VLN) aims to develop embodied agents that\nnavigate based on human instructions. However, current VLN frameworks often\nrely on static environments and optimal expert supervision, limiting their\nreal-world applicability. To address this, we introduce Human-Aware\nVision-and-Language Navigation (HA-VLN), extending traditional VLN by\nincorporating dynamic human activities and relaxing key assumptions. We propose\nthe Human-Aware 3D (HA3D) simulator, which combines dynamic human activities\nwith the Matterport3D dataset, and the Human-Aware Room-to-Room (HA-R2R)\ndataset, extending R2R with human activity descriptions. 
To tackle HA-VLN\nchallenges, we present the Expert-Supervised Cross-Modal (VLN-CM) and\nNon-Expert-Supervised Decision Transformer (VLN-DT) agents, utilizing\ncross-modal fusion and diverse training strategies for effective navigation in\ndynamic human environments. A comprehensive evaluation, including metrics\nconsidering human activities, and systematic analysis of HA-VLN's unique\nchallenges, underscores the need for further research to enhance HA-VLN agents'\nreal-world robustness and adaptability. Ultimately, this work provides\nbenchmarks and insights for future research on embodied AI and Sim2Real\ntransfer, paving the way for more realistic and applicable VLN systems in\nhuman-populated environments.\n","authors":["Heng Li","Minghan Li","Zhi-Qi Cheng","Yifei Dong","Yuxuan Zhou","Jun-Yan He","Qi Dai","Teruko Mitamura","Alexander G. Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2406.19236v3.pdf","comment":"Spotlight at NeurIPS 2024 D&B Track. 32 pages, 18 figures, Project\n Page: https://lpercc.github.io/HA3D_simulator/"},{"id":"http://arxiv.org/abs/2410.21795v2","updated":"2024-11-02T02:09:58Z","published":"2024-10-29T07:00:47Z","title":"Robot Policy Learning with Temporal Optimal Transport Reward","summary":" Reward specification is one of the most tricky problems in Reinforcement\nLearning, which usually requires tedious hand engineering in practice. One\npromising approach to tackle this challenge is to adopt existing expert video\ndemonstrations for policy learning. Some recent work investigates how to learn\nrobot policies from only a single/few expert video demonstrations. For example,\nreward labeling via Optimal Transport (OT) has been shown to be an effective\nstrategy to generate a proxy reward by measuring the alignment between the\nrobot trajectory and the expert demonstrations. However, previous work mostly\noverlooks that the OT reward is invariant to temporal order information, which\ncould bring extra noise to the reward signal. To address this issue, in this\npaper, we introduce the Temporal Optimal Transport (TemporalOT) reward to\nincorporate temporal order information for learning a more accurate OT-based\nproxy reward. Extensive experiments on the Meta-world benchmark tasks validate\nthe efficacy of the proposed method. Code is available at:\nhttps://github.com/fuyw/TemporalOT\n","authors":["Yuwei Fu","Haichao Zhang","Di Wu","Wei Xu","Benoit Boulet"],"pdf_url":"https://arxiv.org/pdf/2410.21795v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2301.05294v4","updated":"2024-11-02T01:59:45Z","published":"2023-01-12T21:09:58Z","title":"Learning to Control and Coordinate Mixed Traffic Through Robot Vehicles\n at Complex and Unsignalized Intersections","summary":" Intersections are essential road infrastructures for traffic in modern\nmetropolises. However, they can also be the bottleneck of traffic flows as a\nresult of traffic incidents or the absence of traffic coordination mechanisms\nsuch as traffic lights. Recently, various control and coordination mechanisms\nthat are beyond traditional control methods have been proposed to improve the\nefficiency of intersection traffic by leveraging the ability of autonomous\nvehicles. 
Amongst these methods, the control of foreseeable mixed traffic that\nconsists of human-driven vehicles (HVs) and robot vehicles (RVs) has emerged.\nWe propose a decentralized multi-agent reinforcement learning approach for the\ncontrol and coordination of mixed traffic by RVs at real-world, complex\nintersections -- an open challenge to date. We design comprehensive experiments\nto evaluate the effectiveness, robustness, generalizability, and adaptability\nof our approach. In particular, our method can prevent congestion formation via\nmerely 5% RVs under a real-world traffic demand of 700 vehicles per hour. In\ncontrast, without RVs, congestion will form when the traffic demand reaches as\nlow as 200 vehicles per hour. Moreover, when the RV penetration rate exceeds\n60%, our method starts to outperform traffic signal control in terms of the\naverage waiting time of all vehicles. Our method is not only robust against\nblackout events, sudden RV percentage drops, and V2V communication error, but\nalso enjoys excellent generalizability, evidenced by its successful deployment\nin five unseen intersections. Lastly, our method performs well under various\ntraffic rules, demonstrating its adaptability to diverse scenarios. Videos and\ncode of our work are available at\nhttps://sites.google.com/view/mixedtrafficcontrol\n","authors":["Dawei Wang","Weizi Li","Lei Zhu","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2301.05294v4.pdf","comment":"This paper introduces the first method to control and coordinate\n mixed traffic (i.e., human-driven vehicles and robot vehicles) at\n unsignalized intersections with both complicated topology and real-world\n traffic demands. The International Journal of Robotics Research. 2024;0(0)"},{"id":"http://arxiv.org/abs/2309.08079v3","updated":"2024-11-02T00:23:06Z","published":"2023-09-15T00:39:56Z","title":"MPCGPU: Real-Time Nonlinear Model Predictive Control through\n Preconditioned Conjugate Gradient on the GPU","summary":" Nonlinear Model Predictive Control (NMPC) is a state-of-the-art approach for\nlocomotion and manipulation which leverages trajectory optimization at each\ncontrol step. While the performance of this approach is computationally\nbounded, implementations of direct trajectory optimization that use iterative\nmethods to solve the underlying moderately-large and sparse linear systems, are\na natural fit for parallel hardware acceleration. In this work, we introduce\nMPCGPU, a GPU-accelerated, real-time NMPC solver that leverages an accelerated\npreconditioned conjugate gradient (PCG) linear system solver at its core. We\nshow that MPCGPU increases the scalability and real-time performance of NMPC,\nsolving larger problems, at faster rates. In particular, for tracking tasks\nusing the Kuka IIWA manipulator, MPCGPU is able to scale to kilohertz control\nrates with trajectories as long as 512 knot points. 
This is driven by a custom\nPCG solver which outperforms state-of-the-art, CPU-based, linear system solvers\nby at least 10x for a majority of solves and 3.6x on average.\n","authors":["Emre Adabag","Miloni Atal","William Gerard","Brian Plancher"],"pdf_url":"https://arxiv.org/pdf/2309.08079v3.pdf","comment":"Accepted to ICRA 2024, 8 pages, 6 figures"}]},"2024-11-05T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.03303v1","updated":"2024-11-05T17:56:27Z","published":"2024-11-05T17:56:27Z","title":"Monocular Event-Based Vision for Obstacle Avoidance with a Quadrotor","summary":" We present the first static-obstacle avoidance method for quadrotors using\njust an onboard, monocular event camera. Quadrotors are capable of fast and\nagile flight in cluttered environments when piloted manually, but vision-based\nautonomous flight in unknown environments is difficult in part due to the\nsensor limitations of traditional onboard cameras. Event cameras, however,\npromise nearly zero motion blur and high dynamic range, but produce a very\nlarge volume of events under significant ego-motion and further lack a\ncontinuous-time sensor model in simulation, making direct sim-to-real transfer\nnot possible. By leveraging depth prediction as a pretext task in our learning\nframework, we can pre-train a reactive obstacle avoidance events-to-control\npolicy with approximated, simulated events and then fine-tune the perception\ncomponent with limited events-and-depth real-world data to achieve obstacle\navoidance in indoor and outdoor settings. We demonstrate this across two\nquadrotor-event camera platforms in multiple settings and find, contrary to\ntraditional vision-based works, that low speeds (1m/s) make the task harder and\nmore prone to collisions, while high speeds (5m/s) result in better event-based\ndepth estimation and avoidance. We also find that success rates in outdoor\nscenes can be significantly higher than in certain indoor scenes.\n","authors":["Anish Bhattacharya","Marco Cannici","Nishanth Rao","Yuezhan Tao","Vijay Kumar","Nikolai Matni","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2411.03303v1.pdf","comment":"18 pages with supplementary"},{"id":"http://arxiv.org/abs/2404.00318v2","updated":"2024-11-05T17:51:36Z","published":"2024-03-30T10:54:59Z","title":"Cognitive Planning for Object Goal Navigation using Generative AI Models","summary":" Recent advancements in Generative AI, particularly in Large Language Models\n(LLMs) and Large Vision-Language Models (LVLMs), offer new possibilities for\nintegrating cognitive planning into robotic systems. In this work, we present a\nnovel framework for solving the object goal navigation problem that generates\nefficient exploration strategies. Our approach enables a robot to navigate\nunfamiliar environments by leveraging LLMs and LVLMs to understand the semantic\nstructure of the scene. To address the challenge of representing complex\nenvironments without overwhelming the system, we propose a 3D modular scene\nrepresentation, enriched with semantic descriptions. This representation is\ndynamically pruned using an LLM-based mechanism, which filters irrelevant\ninformation and focuses on task-specific data. By combining these elements, our\nsystem generates high-level sub-goals that guide the exploration of the robot\ntoward the target object. 
We validate our approach in simulated environments,\ndemonstrating its ability to enhance object search efficiency while maintaining\nscalability in complex settings.\n","authors":["Arjun P S","Andrew Melnik","Gora Chand Nandi"],"pdf_url":"https://arxiv.org/pdf/2404.00318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03294v1","updated":"2024-11-05T17:41:14Z","published":"2024-11-05T17:41:14Z","title":"Out-of-Distribution Recovery with Object-Centric Keypoint Inverse Policy\n For Visuomotor Imitation Learning","summary":" We propose an object-centric recovery policy framework to address the\nchallenges of out-of-distribution (OOD) scenarios in visuomotor policy\nlearning. Previous behavior cloning (BC) methods rely heavily on a large amount\nof labeled data coverage, failing in unfamiliar spatial states. Without relying\non extra data collection, our approach learns a recovery policy constructed by\nan inverse policy inferred from object keypoint manifold gradient in the\noriginal training data. The recovery policy serves as a simple add-on to any\nbase visuomotor BC policy, agnostic to a specific method, guiding the system\nback towards the training distribution to ensure task success even in OOD\nsituations. We demonstrate the effectiveness of our object-centric framework in\nboth simulation and real robot experiments, achieving an improvement of\n$\\textbf{77.7\\%}$ over the base policy in OOD. Project Website:\nhttps://sites.google.com/view/ocr-penn\n","authors":["George Jiayuan Gao","Tianyu Li","Nadia Figueroa"],"pdf_url":"https://arxiv.org/pdf/2411.03294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03289v1","updated":"2024-11-05T17:38:03Z","published":"2024-11-05T17:38:03Z","title":"Data-Driven Sampling Based Stochastic MPC for Skid-Steer Mobile Robot\n Navigation","summary":" Traditional approaches to motion modeling for skid-steer robots struggle with\ncapturing nonlinear tire-terrain dynamics, especially during high-speed\nmaneuvers. In this paper, we tackle such nonlinearities by enhancing a dynamic\nunicycle model with Gaussian Process (GP) regression outputs. This enables us\nto develop an adaptive, uncertainty-informed navigation formulation. We solve\nthe resultant stochastic optimal control problem using a chance-constrained\nModel Predictive Path Integral (MPPI) control method. This approach formulates\nboth obstacle avoidance and path-following as chance constraints, accounting\nfor residual uncertainties from the GP to ensure safety and reliability in\ncontrol. Leveraging GPU acceleration, we efficiently manage the non-convex\nnature of the problem, ensuring real-time performance. Our approach unifies\npath-following and obstacle avoidance across different terrains, unlike prior\nworks which typically focus on one or the other. We compare our GP-MPPI method\nagainst unicycle and data-driven kinematic models within the MPPI framework. In\nsimulations, our approach shows superior tracking accuracy and obstacle\navoidance. We further validate our approach through hardware experiments on a\nskid-steer robot platform, demonstrating its effectiveness in high-speed\nnavigation. 
The GPU implementation of the proposed method and supplementary\nvideo footage are available at https://stochasticmppi.github.io.\n","authors":["Ananya Trivedi","Sarvesh Prajapati","Anway Shirgaonkar","Mark Zolotas","Taskin Padir"],"pdf_url":"https://arxiv.org/pdf/2411.03289v1.pdf","comment":"Currently under review for ICRA 2025"},{"id":"http://arxiv.org/abs/2411.03287v1","updated":"2024-11-05T17:36:32Z","published":"2024-11-05T17:36:32Z","title":"The Future of Intelligent Healthcare: A Systematic Analysis and\n Discussion on the Integration and Impact of Robots Using Large Language\n Models for Healthcare","summary":" The potential use of large language models (LLMs) in healthcare robotics can\nhelp address the significant demand put on healthcare systems around the world\nwith respect to an aging demographic and a shortage of healthcare\nprofessionals. Even though LLMs have already been integrated into medicine to\nassist both clinicians and patients, the integration of LLMs within healthcare\nrobots has not yet been explored for clinical settings. In this perspective\npaper, we investigate the groundbreaking developments in robotics and LLMs to\nuniquely identify the needed system requirements for designing health specific\nLLM based robots in terms of multi modal communication through human robot\ninteractions (HRIs), semantic reasoning, and task planning. Furthermore, we\ndiscuss the ethical issues, open challenges, and potential future research\ndirections for this emerging innovative field.\n","authors":["Souren Pashangpour","Goldie Nejat"],"pdf_url":"https://arxiv.org/pdf/2411.03287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13297v3","updated":"2024-11-05T17:35:25Z","published":"2024-03-20T04:39:15Z","title":"POLICEd RL: Learning Closed-Loop Robot Control Policies with Provable\n Satisfaction of Hard Constraints","summary":" In this paper, we seek to learn a robot policy guaranteed to satisfy state\nconstraints. To encourage constraint satisfaction, existing RL algorithms\ntypically rely on Constrained Markov Decision Processes and discourage\nconstraint violations through reward shaping. However, such soft constraints\ncannot offer verifiable safety guarantees. To address this gap, we propose\nPOLICEd RL, a novel RL algorithm explicitly designed to enforce affine hard\nconstraints in closed-loop with a black-box environment. Our key insight is to\nforce the learned policy to be affine around the unsafe set and use this affine\nregion as a repulsive buffer to prevent trajectories from violating the\nconstraint. We prove that such policies exist and guarantee constraint\nsatisfaction. Our proposed framework is applicable to both systems with\ncontinuous and discrete state and action spaces and is agnostic to the choice\nof the RL training algorithm. 
Our results demonstrate the capacity of POLICEd\nRL to enforce hard constraints in robotic tasks while significantly\noutperforming existing methods.\n","authors":["Jean-Baptiste Bouvier","Kartik Nagpal","Negar Mehr"],"pdf_url":"https://arxiv.org/pdf/2403.13297v3.pdf","comment":"Robotics: Science and Systems (RSS) 2024,\n https://www.roboticsproceedings.org/rss20/p104.html"},{"id":"http://arxiv.org/abs/2410.16481v3","updated":"2024-11-05T16:53:48Z","published":"2024-10-21T20:12:45Z","title":"Caging in Time: A Framework for Robust Object Manipulation under\n Uncertainties and Limited Robot Perception","summary":" Real-world object manipulation has been commonly challenged by physical\nuncertainties and perception limitations. Being an effective strategy, while\ncaging configuration-based manipulation frameworks have successfully provided\nrobust solutions, they are not broadly applicable due to their strict\nrequirements on the availability of multiple robots, widely distributed\ncontacts, or specific geometries of the robots or the objects. To this end,\nthis work proposes a novel concept, termed Caging in Time, to allow caging\nconfigurations to be formed even if there is just one robot engaged in a task.\nThis novel concept can be explained by an insight that even if a caging\nconfiguration is needed to constrain the motion of an object, only a small\nportion of the cage is actively manipulating at a time. As such, we can switch\nthe configuration of the robot strategically so that by collapsing its\nconfiguration in time, we will see a cage formed and its necessary portion\nactive whenever needed. We instantiate our Caging in Time theory on challenging\nquasistatic and dynamic manipulation tasks, showing that Caging in Time can be\nachieved in general state spaces including geometry-based and energy-based\nspaces. With extensive experiments, we show robust and accurate manipulation,\nin an open-loop manner, without requiring detailed knowledge of the object\ngeometry or physical properties, nor realtime accurate feedback on the\nmanipulation states. In addition to being an effective and robust open-loop\nmanipulation solution, the proposed theory can be a supplementary strategy to\nother manipulation systems affected by uncertain or limited robot perception.\n","authors":["Gaotian Wang","Kejia Ren","Andrew S. Morgan","Kaiyu Hang"],"pdf_url":"https://arxiv.org/pdf/2410.16481v3.pdf","comment":"24 pages, 25 figures, video available at:\n www.youtube.com/watch?v=Ag_jTzazuSM"},{"id":"http://arxiv.org/abs/2310.20605v4","updated":"2024-11-05T16:46:53Z","published":"2023-10-31T16:39:58Z","title":"Learning Lyapunov-Stable Polynomial Dynamical Systems through Imitation","summary":" Imitation learning is a paradigm to address complex motion planning problems\nby learning a policy to imitate an expert's behavior. However, relying solely\non the expert's data might lead to unsafe actions when the robot deviates from\nthe demonstrated trajectories. Stability guarantees have previously been\nprovided utilizing nonlinear dynamical systems, acting as high-level motion\nplanners, in conjunction with the Lyapunov stability theorem. Yet, these\nmethods are prone to inaccurate policies, high computational cost, sample\ninefficiency, or quasi stability when replicating complex and highly nonlinear\ntrajectories. To mitigate this problem, we present an approach for learning a\nglobally stable nonlinear dynamical system as a motion planning policy. 
We\nmodel the nonlinear dynamical system as a parametric polynomial and learn the\npolynomial's coefficients jointly with a Lyapunov candidate. To showcase its\nsuccess, we compare our method against the state of the art in simulation and\nconduct real-world experiments with the Kinova Gen3 Lite manipulator arm. Our\nexperiments demonstrate the sample efficiency and reproduction accuracy of our\nmethod for various expert trajectories, while remaining stable in the face of\nperturbations.\n","authors":["Amin Abyaneh","Hsiu-Chin Lin"],"pdf_url":"https://arxiv.org/pdf/2310.20605v4.pdf","comment":"In 7th Annual Conference on Robot Learning 2023 Aug 30"},{"id":"http://arxiv.org/abs/2406.17620v2","updated":"2024-11-05T16:15:08Z","published":"2024-06-25T15:05:00Z","title":"OCCAM: Online Continuous Controller Adaptation with Meta-Learned Models","summary":" Control tuning and adaptation present a significant challenge to the usage of\nrobots in diverse environments. It is often nontrivial to find a single set of\ncontrol parameters by hand that work well across the broad array of\nenvironments and conditions that a robot might encounter. Automated adaptation\napproaches must utilize prior knowledge about the system while adapting to\nsignificant domain shifts to find new control parameters quickly. In this work,\nwe present a general framework for online controller adaptation that deals with\nthese challenges. We combine meta-learning with Bayesian recursive estimation\nto learn prior predictive models of system performance that quickly adapt to\nonline data, even when there is significant domain shift. These predictive\nmodels can be used as cost functions within efficient sampling-based\noptimization routines to find new control parameters online that maximize\nsystem performance. Our framework is powerful and flexible enough to adapt\ncontrollers for four diverse systems: a simulated race car, a simulated\nquadrupedal robot, and a simulated and physical quadrotor. The video and code\ncan be found at https://hersh500.github.io/occam.\n","authors":["Hersh Sanghvi","Spencer Folk","Camillo Jose Taylor"],"pdf_url":"https://arxiv.org/pdf/2406.17620v2.pdf","comment":"8 pages, 4 figures. Accepted to Conference on Robot Learning (CoRL)\n 2024"},{"id":"http://arxiv.org/abs/2411.03213v1","updated":"2024-11-05T16:02:54Z","published":"2024-11-05T16:02:54Z","title":"What Makes an Educational Robot Game Fun? Framework Analysis of\n Children's Design Ideas","summary":" Fun acts as a catalyst for learning by enhancing motivation, active\nengagement and knowledge retention. As social robots gain traction as\neducational tools, understanding how their unique affordances can be leveraged\nto cultivate fun becomes crucial. This research investigates the concept of fun\nin educational games involving social robots to support the design of REMind: a\nrobot-mediated role-play game aimed at encouraging bystander intervention\nagainst peer bullying among children. To incorporate fun elements into the design\nof REMind, we conducted a user-centered Research through Design (RtD) study\nwith focus groups of children to gain a deeper understanding of their\nperceptions of fun. We analyzed children's ideas by using Framework Analysis\nand leveraging LeBlanc's Taxonomy of Game Pleasures and identified 28 elements\nof fun that can be incorporated into robot-mediated games. 
We present our\nobservations, discuss their impact on REMind's design, and offer\nrecommendations for designing fun educational games using social robots.\n","authors":["Elaheh Sanoubari","John Edison Muñoz","Ali Yamini","Neil Randall","Kerstni Dautenhahn"],"pdf_url":"https://arxiv.org/pdf/2411.03213v1.pdf","comment":"This is a pre-print of a manuscript that was accepted to\n International Conference on Social Robotics 2024 (ICSR'24 + AI), 2024, which\n was held in Odense, Denmark"},{"id":"http://arxiv.org/abs/2405.17779v2","updated":"2024-11-05T15:56:14Z","published":"2024-05-28T03:19:15Z","title":"Online Analytic Exemplar-Free Continual Learning with Large Models for\n Imbalanced Autonomous Driving Task","summary":" In autonomous driving, even a meticulously trained model can encounter\nfailures when facing unfamiliar scenarios. One of these scenarios can be\nformulated as an online continual learning (OCL) problem. That is, data come in\nan online fashion, and models are updated according to these streaming data.\nTwo major OCL challenges are catastrophic forgetting and data imbalance. To\naddress these challenges, in this paper, we propose an Analytic Exemplar-Free\nOnline Continual Learning algorithm (AEF-OCL). The AEF-OCL leverages analytic\ncontinual learning principles and employs ridge regression as a classifier for\nfeatures extracted by a large backbone network. It solves the OCL problem by\nrecursively calculating the analytical solution, ensuring an equalization\nbetween the continual learning and its joint-learning counterpart, and works\nwithout the need to save any used samples (i.e., exemplar-free). Additionally,\nwe introduce a Pseudo-Features Generator (PFG) module that recursively\nestimates the mean and the variance of real features for each class. It\nover-samples offset pseudo-features from the same normal distribution as the\nreal features, thereby addressing the data imbalance issue. Experimental\nresults demonstrate that despite being an exemplar-free strategy, our method\noutperforms various methods on the autonomous driving SODA10M dataset. Source\ncode is available at https://github.com/ZHUANGHP/Analytic-continual-learning.\n","authors":["Huiping Zhuang","Di Fang","Kai Tong","Yuchen Liu","Ziqian Zeng","Xu Zhou","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2405.17779v2.pdf","comment":"This paper is to be published in IEEE Transactions on Vehicular\n Technology"},{"id":"http://arxiv.org/abs/2411.03194v1","updated":"2024-11-05T15:39:36Z","published":"2024-11-05T15:39:36Z","title":"Energy Consumption in Robotics: A Simplified Modeling Approach","summary":" The energy use of a robot is trajectory-dependent, and thus can be reduced by\noptimization of the trajectory. Current methods for robot trajectory\noptimization can reduce energy up to 15\\% for fixed start and end points,\nhowever their use in industrial robot planning is still restricted due to model\ncomplexity and lack of integration with planning tools which address other\nconcerns (e.g. collision avoidance). We propose an approach that uses\ndifferentiable inertial and kinematic models from standard open-source tools,\nintegrating with standard ROS planning methods. An inverse dynamics-based\nenergy model is optionally extended with a single-parameter electrical model,\nsimplifying the model identification process. 
We compare the inertial and\nelectrical models on a collaborative robot, showing that simplified models\nprovide competitive accuracy and are easier to deploy in practice.\n","authors":["Valentyn Petrichenko","Lisa Lokstein","Gregor Thiele","Kevin Haninger"],"pdf_url":"https://arxiv.org/pdf/2411.03194v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2411.03189v1","updated":"2024-11-05T15:34:25Z","published":"2024-11-05T15:34:25Z","title":"Energy-Aware Predictive Motion Planning for Autonomous Vehicles Using a\n Hybrid Zonotope Constraint Representation","summary":" Uncrewed aerial systems have tightly coupled energy and motion dynamics which\nmust be accounted for by onboard planning algorithms. This work proposes a\nstrategy for coupled motion and energy planning using model predictive control\n(MPC). A reduced-order linear time-invariant model of coupled energy and motion\ndynamics is presented. Constrained zonotopes are used to represent state and\ninput constraints, and hybrid zonotopes are used to represent non-convex\nconstraints tied to a map of the environment. The structures of these\nconstraint representations are exploited within a mixed-integer quadratic\nprogram solver tailored to MPC motion planning problems. Results apply the\nproposed methodology to coupled motion and energy utilization planning problems\nfor 1) a hybrid-electric vehicle that must restrict engine usage when flying\nover regions with noise restrictions, and 2) an electric package delivery drone\nthat must track waysets with both position and battery state of charge\nrequirements. By leveraging the structure-exploiting solver, the proposed\nmixed-integer MPC formulations can be implemented in real time.\n","authors":["Joshua A. Robbins","Andrew F. Thompson","Sean Brennan","Herschel C. Pangborn"],"pdf_url":"https://arxiv.org/pdf/2411.03189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03176v1","updated":"2024-11-05T15:22:21Z","published":"2024-11-05T15:22:21Z","title":"Developing Simulation Models for Soft Robotic Grippers in Webots","summary":" Robotic simulators provide cost-effective and risk-free virtual environments\nfor studying robotic designs, control algorithms, and sensor integrations. They\ntypically host extensive libraries of sensors and actuators that facilitate\nrapid prototyping and design evaluations in simulation. The use of the most\nprominent existing robotic simulators is however limited to simulation of\nrigid-link robots. On the other hand, there exist dedicated specialized\nenvironments for simulating soft robots. This separation limits the study of\nsoft robotic systems, particularly in hybrid scenarios where soft and rigid\nsub-systems co-exist. In this work, we develop a lightweight open-source\ndigital twin of a commercially available soft gripper, directly integrated\nwithin the robotic simulator Webots. We use a Rigid-Link-Discretization (RLD)\nmodel to simulate the soft gripper. Using a Particle Swarm Optimization (PSO)\napproach, we identify the parameters of the RLD model based on the kinematics\nand dynamics of the physical system and show the efficacy of our modeling\napproach in validation experiments. 
All software and experimental details are\navailable on github: https://github.com/anonymousgituser1/Robosoft2025\n","authors":["Yulyan Wahyu Hadi","Lars Hof","Bayu Jayawardhana","Bahar Haghighat"],"pdf_url":"https://arxiv.org/pdf/2411.03176v1.pdf","comment":"7 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.06178v2","updated":"2024-11-05T15:05:40Z","published":"2024-04-09T09:58:59Z","title":"Resilient Movement Planning for Continuum Robots","summary":" The paper presents an experimental study of resilient path planning for\ncontinuum robots taking into account the multi-objective optimisation problem.\nTo do this, we used two well-known algorithms, namely Genetic algorithm and A*\nalgorithm, for path planning and the Analytical Hierarchy Process algorithm for\npaths evaluation. In our experiment Analytical Hierarchy Process algorithm\nconsiders four different criteria, i.e. distance, motors damage, mechanical\ndamage and accuracy each considered to contribute to the resilience of a\ncontinuum robot. The use of different criteria is necessary to increase the\ntime to maintenance operations of the robot. The experiment shows that on the\none hand both algorithms can be used in combination with Analytical Hierarchy\nProcess algorithm for multi criteria path-planning, while Genetic algorithm\nshows superior performance in the comparison of the two algorithms.\n","authors":["Oxana Shamilyan","Ievgen Kabin","Zoya Dyka","Peter Langendoerfer"],"pdf_url":"https://arxiv.org/pdf/2404.06178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04815v2","updated":"2024-11-05T15:02:25Z","published":"2024-06-07T10:35:29Z","title":"Skill-aware Mutual Information Optimisation for Generalisation in\n Reinforcement Learning","summary":" Meta-Reinforcement Learning (Meta-RL) agents can struggle to operate across\ntasks with varying environmental features that require different optimal skills\n(i.e., different modes of behaviour). Using context encoders based on\ncontrastive learning to enhance the generalisability of Meta-RL agents is now\nwidely studied but faces challenges such as the requirement for a large sample\nsize, also referred to as the $\\log$-$K$ curse. To improve RL generalisation to\ndifferent tasks, we first introduce Skill-aware Mutual Information (SaMI), an\noptimisation objective that aids in distinguishing context embeddings according\nto skills, thereby equipping RL agents with the ability to identify and execute\ndifferent skills across tasks. We then propose Skill-aware Noise Contrastive\nEstimation (SaNCE), a $K$-sample estimator used to optimise the SaMI objective.\nWe provide a framework for equipping an RL agent with SaNCE in practice and\nconduct experimental validation on modified MuJoCo and Panda-gym benchmarks. We\nempirically find that RL agents that learn by maximising SaMI achieve\nsubstantially improved zero-shot generalisation to unseen tasks. Additionally,\nthe context encoder trained with SaNCE demonstrates greater robustness to a\nreduction in the number of available samples, thus possessing the potential to\novercome the $\\log$-$K$ curse.\n","authors":["Xuehui Yu","Mhairi Dunion","Xin Li","Stefano V. 
Albrecht"],"pdf_url":"https://arxiv.org/pdf/2406.04815v2.pdf","comment":"The Thirty-eighth Annual Conference on Neural Information Processing\n Systems (NeurIPS), 2024"},{"id":"http://arxiv.org/abs/2409.11935v2","updated":"2024-11-05T14:23:27Z","published":"2024-09-18T12:50:28Z","title":"Reinforcement Learning with Lie Group Orientations for Robotics","summary":" Handling orientations of robots and objects is a crucial aspect of many\napplications. Yet, ever so often, there is a lack of mathematical correctness\nwhen dealing with orientations, especially in learning pipelines involving, for\nexample, artificial neural networks. In this paper, we investigate\nreinforcement learning with orientations and propose a simple modification of\nthe network's input and output that adheres to the Lie group structure of\norientations. As a result, we obtain an easy and efficient implementation that\nis directly usable with existing learning libraries and achieves significantly\nbetter performance than other common orientation representations. We briefly\nintroduce Lie theory specifically for orientations in robotics to motivate and\noutline our approach. Subsequently, a thorough empirical evaluation of\ndifferent combinations of orientation representations for states and actions\ndemonstrates the superior performance of our proposed approach in different\nscenarios, including: direct orientation control, end effector orientation\ncontrol, and pick-and-place tasks.\n","authors":["Martin Schuck","Jan Brüdigam","Sandra Hirche","Angela Schoellig"],"pdf_url":"https://arxiv.org/pdf/2409.11935v2.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2411.03048v1","updated":"2024-11-05T12:31:20Z","published":"2024-11-05T12:31:20Z","title":"UNet: A Generic and Reliable Multi-UAV Communication and Networking\n Architecture for Heterogeneous Applications","summary":" The rapid growth of UAV applications necessitates a robust communication and\nnetworking architecture capable of addressing the diverse requirements of\nvarious applications concurrently, rather than relying on application-specific\nsolutions. This paper proposes a generic and reliable multi-UAV communication\nand networking architecture designed to support the varying demands of\nheterogeneous applications, including short-range and long-range communication,\nstar and mesh topologies, different data rates, and multiple wireless\nstandards. Our architecture accommodates both adhoc and infrastructure\nnetworks, ensuring seamless connectivity throughout the network. Additionally,\nwe present the design of a multi-protocol UAV gateway that enables\ninteroperability among various communication protocols. Furthermore, we\nintroduce a data processing and service layer framework with a graphical user\ninterface of a ground control station that facilitates remote control and\nmonitoring from any location at any time. 
We practically implemented the\nproposed architecture and evaluated its performance using different metrics,\ndemonstrating its effectiveness.\n","authors":["Sanku Kumar Roy","Mohamed Samshad","Ketan Rajawat"],"pdf_url":"https://arxiv.org/pdf/2411.03048v1.pdf","comment":"11 pages, 20 figures, Journal paper"},{"id":"http://arxiv.org/abs/2411.03011v1","updated":"2024-11-05T11:19:30Z","published":"2024-11-05T11:19:30Z","title":"Set-Membership Estimation for Fault Diagnosis of Nonlinear Systems","summary":" This paper introduces a Fault Diagnosis (Detection, Isolation, and\nEstimation) method using Set-Membership Estimation (SME) designed for a class\nof nonlinear systems that are linear to the fault parameters. The methodology\nadvances fault diagnosis by continuously evaluating an estimate of the fault\nparameter and a feasible parameter set where the true fault parameter belongs.\nUnlike previous SME approaches, in this work, we address nonlinear systems\nsubjected to both input and output uncertainties by utilizing inclusion\nfunctions and interval arithmetic. Additionally, we present an approach to\nouter-approximate the polytopic description of the feasible parameter set by\neffectively balancing approximation accuracy with computational efficiency\nresulting in improved fault detectability. Lastly, we introduce adaptive\nregularization of the parameter estimates to enhance the estimation process\nwhen the input-output data are sparse or non-informative, enhancing fault\nidentifiability. We demonstrate the effectiveness of this method in simulations\ninvolving an Autonomous Surface Vehicle in both a path-following and a\nrealistic collision avoidance scenario, underscoring its potential to enhance\nsafety and reliability in critical applications.\n","authors":["A. Tsolakis","L. Ferranti","V. Reppa"],"pdf_url":"https://arxiv.org/pdf/2411.03011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02983v1","updated":"2024-11-05T10:45:30Z","published":"2024-11-05T10:45:30Z","title":"Autonomous Decision Making for UAV Cooperative Pursuit-Evasion Game with\n Reinforcement Learning","summary":" The application of intelligent decision-making in unmanned aerial vehicle\n(UAV) is increasing, and with the development of UAV 1v1 pursuit-evasion game,\nmulti-UAV cooperative game has emerged as a new challenge. This paper proposes\na deep reinforcement learning-based model for decision-making in multi-role UAV\ncooperative pursuit-evasion game, to address the challenge of enabling UAV to\nautonomously make decisions in complex game environments. In order to enhance\nthe training efficiency of the reinforcement learning algorithm in UAV\npursuit-evasion game environment that has high-dimensional state-action space,\nthis paper proposes multi-environment asynchronous double deep Q-network with\npriority experience replay algorithm to effectively train the UAV's game\npolicy. Furthermore, aiming to improve cooperation ability and task completion\nefficiency, as well as minimize the cost of UAVs in the pursuit-evasion game,\nthis paper focuses on the allocation of roles and targets within multi-UAV\nenvironment. The cooperative game decision model with varying numbers of UAVs\nare obtained by assigning diverse tasks and roles to the UAVs in different\nscenarios. 
The simulation results demonstrate that the proposed method enables\nautonomous decision-making of the UAVs in pursuit-evasion game scenarios and\nexhibits significant capabilities in cooperation.\n","authors":["Yang Zhao","Zidong Nie","Kangsheng Dong","Qinghua Huang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2411.02983v1.pdf","comment":"11 pages, 12 figures, 31 conference"},{"id":"http://arxiv.org/abs/2411.02975v1","updated":"2024-11-05T10:24:45Z","published":"2024-11-05T10:24:45Z","title":"Transformer-Based Fault-Tolerant Control for Fixed-Wing UAVs Using\n Knowledge Distillation and In-Context Adaptation","summary":" This study presents a transformer-based approach for fault-tolerant control\nin fixed-wing Unmanned Aerial Vehicles (UAVs), designed to adapt in real time\nto dynamic changes caused by structural damage or actuator failures. Unlike\ntraditional Flight Control Systems (FCSs) that rely on classical control theory\nand struggle under severe alterations in dynamics, our method directly maps\nouter-loop reference values -- altitude, heading, and airspeed -- into control\ncommands using the in-context learning and attention mechanisms of\ntransformers, thus bypassing inner-loop controllers and fault-detection layers.\nEmploying a teacher-student knowledge distillation framework, the proposed\napproach trains a student agent with partial observations by transferring\nknowledge from a privileged expert agent with full observability, enabling\nrobust performance across diverse failure scenarios. Experimental results\ndemonstrate that our transformer-based controller outperforms industry-standard\nFCS and state-of-the-art reinforcement learning (RL) methods, maintaining high\ntracking accuracy and stability in nominal conditions and extreme failure\ncases, highlighting its potential for enhancing UAV operational safety and\nreliability.\n","authors":["Francisco Giral","Ignacio Gómez","Ricardo Vinuesa","Soledad Le-Clainche"],"pdf_url":"https://arxiv.org/pdf/2411.02975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02969v1","updated":"2024-11-05T10:13:23Z","published":"2024-11-05T10:13:23Z","title":"Multi-modal NeRF Self-Supervision for LiDAR Semantic Segmentation","summary":" LiDAR Semantic Segmentation is a fundamental task in autonomous driving\nperception consisting of associating each LiDAR point to a semantic label.\nFully-supervised models have widely tackled this task, but they require labels\nfor each scan, which either limits their domain or requires impractical amounts\nof expensive annotations. Camera images, which are generally recorded alongside\nLiDAR pointclouds, can be processed by the widely available 2D foundation\nmodels, which are generic and dataset-agnostic. However, distilling knowledge\nfrom 2D data to improve LiDAR perception raises domain adaptation challenges.\nFor example, the classical perspective projection suffers from the parallax\neffect produced by the position shift between both sensors at their respective\ncapture times. We propose a Semi-Supervised Learning setup to leverage\nunlabeled LiDAR pointclouds alongside distilled knowledge from the camera\nimages. To self-supervise our model on the unlabeled scans, we add an auxiliary\nNeRF head and cast rays from the camera viewpoint over the unlabeled voxel\nfeatures. The NeRF head predicts densities and semantic logits at each sampled\nray location which are used for rendering pixel semantics. 
Concurrently, we\nquery the Segment-Anything (SAM) foundation model with the camera image to\ngenerate a set of unlabeled generic masks. We fuse the masks with the rendered\npixel semantics from LiDAR to produce pseudo-labels that supervise the pixel\npredictions. During inference, we drop the NeRF head and run our model with\nonly LiDAR. We show the effectiveness of our approach in three public LiDAR\nSemantic Segmentation benchmarks: nuScenes, SemanticKITTI and ScribbleKITTI.\n","authors":["Xavier Timoneda","Markus Herb","Fabian Duerr","Daniel Goehring","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2411.02969v1.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS) 2024"},{"id":"http://arxiv.org/abs/2402.16398v2","updated":"2024-11-05T09:55:14Z","published":"2024-02-26T08:47:35Z","title":"AsynEVO: Asynchronous Event-Driven Visual Odometry for Pure Event\n Streams","summary":" Event cameras are bio-inspired vision sensors that asynchronously measure\nper-pixel brightness changes. The high-temporal resolution and asynchronicity of\nevent cameras offer great potential for estimating robot motion states. Recent\nworks have adopted the continuous-time estimation methods to exploit the\ninherent nature of event cameras. However, existing methods either have poor\nruntime performance or neglect the high-temporal resolution of event cameras.\nTo alleviate it, an Asynchronous Event-driven Visual Odometry (AsynEVO) based\non sparse Gaussian Process (GP) regression is proposed to efficiently infer the\nmotion trajectory from pure event streams. Concretely, an asynchronous frontend\npipeline is designed to adapt event-driven feature tracking and manage feature\ntrajectories; a parallel dynamic sliding-window backend is presented within the\nframework of sparse GP regression on $SE(3)$. Notably, a dynamic\nmarginalization strategy is employed to ensure the consistency and sparsity of\nthis GP regression. Experiments conducted on public datasets and real-world\nscenarios demonstrate that AsynEVO achieves competitive precision and superior\nrobustness compared to the state-of-the-art. The experiment in the\nrepeated-texture scenario indicates that the high-temporal resolution of\nAsynEVO plays a vital role in the estimation of high-speed movement.\nFurthermore, we show that the computational efficiency of AsynEVO significantly\noutperforms the incremental method.\n","authors":["Zhixiang Wang","Xudong Li","Yizhai Zhang","Panfeng Huang"],"pdf_url":"https://arxiv.org/pdf/2402.16398v2.pdf","comment":"Submitted to IEEE Transactions on Intelligent Transportation Systems\n (2024-07-15)"},{"id":"http://arxiv.org/abs/2411.02938v1","updated":"2024-11-05T09:31:30Z","published":"2024-11-05T09:31:30Z","title":"Multi-Modal 3D Scene Graph Updater for Shared and Dynamic Environments","summary":" The advent of generalist Large Language Models (LLMs) and Large Vision Models\n(VLMs) has streamlined the construction of semantically enriched maps that can\nenable robots to ground high-level reasoning and planning into their\nrepresentations. One of the most widely used semantic map formats is the 3D\nScene Graph, which captures both metric (low-level) and semantic (high-level)\ninformation. However, these maps often assume a static world, while real\nenvironments, like homes and offices, are dynamic. Even small changes in these\nspaces can significantly impact task performance. 
To integrate robots into\ndynamic environments, they must detect changes and update the scene graph in\nreal-time. This update process is inherently multimodal, requiring input from\nvarious sources, such as human agents, the robot's own perception system, time,\nand its actions. This work proposes a framework that leverages these multimodal\ninputs to maintain the consistency of scene graphs during real-time operation,\npresenting promising initial results and outlining a roadmap for future\nresearch.\n","authors":["Emilio Olivastri","Jonathan Francis","Alberto Pretto","Niko Sünderhauf","Krishan Rana"],"pdf_url":"https://arxiv.org/pdf/2411.02938v1.pdf","comment":"This paper has been accepted at the Workshop on Lifelong Learning for\n Home Robots at the 8th Conference on Robot Learning (CoRL 2024), Munich,\n Germany"},{"id":"http://arxiv.org/abs/2411.02891v1","updated":"2024-11-05T08:11:37Z","published":"2024-11-05T08:11:37Z","title":"Nature's All-in-One: Multitasking Robots Inspired by Dung Beetles","summary":" Dung beetles impressively coordinate their six legs simultaneously to\neffectively roll large dung balls. They are also capable of rolling dung balls\nvarying in the weight on different terrains. The mechanisms underlying how\ntheir motor commands are adapted to walk and simultaneously roll balls\n(multitasking behavior) under different conditions remain unknown. Therefore,\nthis study unravels the mechanisms of how dung beetles roll dung balls and\nadapt their leg movements to stably roll balls over different terrains for\nmultitasking robots. We synthesize a modular neural-based loco-manipulation\ncontrol inspired by and based on ethological observations of the ball-rolling\nbehavior of dung beetles. The proposed neural-based control contains various\nneural modules, including a central pattern generator (CPG) module, a pattern\nformation network (PFN) module, and a robot orientation control (ROC) module.\nThe integrated neural control mechanisms can successfully control a dung\nbeetle-like robot (ALPHA) with biomechanical feet to perform adaptive robust\n(multitasking) loco-manipulation (walking and ball-rolling) on various terrains\n(flat and uneven). It can also deal with different ball weights (2.0 and 4.6\nkg) and ball types (soft and rigid). The control mechanisms can serve as\nguiding principles for solving complex sensory-motor coordination for\nmultitasking robots. Furthermore, this study contributes to biological research\nby enhancing our scientific understanding of sensory-motor coordination for\ncomplex adaptive (multitasking) loco-manipulation behavior in animals.\n","authors":["Binggwong Leung","Stanislav Gorb","Poramate Manoonpong"],"pdf_url":"https://arxiv.org/pdf/2411.02891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11031v3","updated":"2024-11-05T06:21:42Z","published":"2024-04-17T03:13:58Z","title":"TaCOS: Task-Specific Camera Optimization with Simulation","summary":" The performance of perception tasks is heavily influenced by imaging systems.\nHowever, designing cameras with high task performance is costly, requiring\nextensive camera knowledge and experimentation with physical hardware.\nAdditionally, cameras and perception tasks are mostly designed in isolation,\nwhereas recent methods that jointly design cameras and tasks have shown\nimproved performance. Therefore, we present a novel end-to-end optimization\napproach that co-designs cameras with specific vision tasks. 
This method\ncombines derivative-free and gradient-based optimizers to support both\ncontinuous and discrete camera parameters within manufacturing constraints. We\nleverage recent computer graphics techniques and physical camera\ncharacteristics to simulate the cameras in virtual environments, making the\ndesign process cost-effective. We validate our simulations against physical\ncameras and provide a procedurally generated virtual environment. Our\nexperiments demonstrate that our method designs cameras that outperform common\noff-the-shelf options, and more efficiently compared to the state-of-the-art\napproach, requiring only 2 minutes to design a camera on an example experiment\ncompared with 67 minutes for the competing method. Designed to support the\ndevelopment of cameras under manufacturing constraints, multiple cameras, and\nunconventional cameras, we believe this approach can advance the fully\nautomated design of cameras.\n","authors":["Chengyang Yan","Donald G. Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.11031v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02788v1","updated":"2024-11-05T03:54:00Z","published":"2024-11-05T03:54:00Z","title":"When to Localize? A Risk-Constrained Reinforcement Learning Approach","summary":" In a standard navigation pipeline, a robot localizes at every time step to\nlower navigational errors. However, in some scenarios, a robot needs to\nselectively localize when it is expensive to obtain observations. For example,\nan underwater robot surfacing to localize too often hinders it from searching\nfor critical items underwater, such as black boxes from crashed aircraft. On\nthe other hand, if the robot never localizes, poor state estimates cause\nfailure to find the items due to inadvertently leaving the search area or\nentering hazardous, restricted areas. Motivated by these scenarios, we\ninvestigate approaches to help a robot determine \"when to localize?\" We\nformulate this as a bi-criteria optimization problem: minimize the number of\nlocalization actions while ensuring the probability of failure (due to\ncollision or not reaching a desired goal) remains bounded. In recent work, we\nshowed how to formulate this active localization problem as a constrained\nPartially Observable Markov Decision Process (POMDP), which was solved using an\nonline POMDP solver. However, this approach is too slow and requires full\nknowledge of the robot transition and observation models. In this paper, we\npresent RiskRL, a constrained Reinforcement Learning (RL) framework that\novercomes these limitations. RiskRL uses particle filtering and recurrent Soft\nActor-Critic network to learn a policy that minimizes the number of\nlocalizations while ensuring the probability of failure constraint is met. 
Our\nnumerical experiments show that RiskRL learns a robust policy that outperforms\nthe baseline by at least 13% while also generalizing to unseen environments.\n","authors":["Chak Lam Shek","Kasra Torshizi","Troi Williams","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2411.02788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02772v1","updated":"2024-11-05T03:33:54Z","published":"2024-11-05T03:33:54Z","title":"Communication and Energy-Aware Multi-UAV Coverage Path Planning for\n Networked Operations","summary":" This paper presents a communication and energy-aware Multi-UAV Coverage Path\nPlanning (mCPP) method for scenarios requiring continuous inter-UAV\ncommunication, such as cooperative search and rescue and surveillance missions.\nUnlike existing mCPP solutions that focus on energy, time, or coverage\nefficiency, our approach generates coverage paths that require a minimal\ncommunication range to maintain inter-UAV connectivity while also optimizing\nenergy consumption. The mCPP problem is formulated as a multi-objective\noptimization task, aiming to minimize both the communication range requirement\nand energy consumption. Our approach significantly reduces the communication\nrange needed for maintaining connectivity while ensuring energy efficiency,\noutperforming state-of-the-art methods. Its effectiveness is validated through\nsimulations on complex and arbitrarily shaped regions of interest, including\nscenarios with no-fly zones. Additionally, real-world experiments demonstrate\nits high accuracy, achieving 99\\% consistency between the estimated and actual\ncommunication range required during a multi-UAV coverage mission involving\nthree UAVs.\n","authors":["Mohamed Samshad","Ketan Rajawat"],"pdf_url":"https://arxiv.org/pdf/2411.02772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01796v2","updated":"2024-11-05T03:28:38Z","published":"2024-11-04T04:41:12Z","title":"Constrained Human-AI Cooperation: An Inclusive Embodied Social\n Intelligence Challenge","summary":" We introduce Constrained Human-AI Cooperation (CHAIC), an inclusive embodied\nsocial intelligence challenge designed to test social perception and\ncooperation in embodied agents. In CHAIC, the goal is for an embodied agent\nequipped with egocentric observations to assist a human who may be operating\nunder physical constraints -- e.g., unable to reach high places or confined to\na wheelchair -- in performing common household or outdoor tasks as efficiently\nas possible. To achieve this, a successful helper must: (1) infer the human's\nintents and constraints by following the human and observing their behaviors\n(social perception), and (2) make a cooperative plan tailored to the human\npartner to solve the task as quickly as possible, working together as a team\n(cooperative planning). To benchmark this challenge, we create four new agents\nwith real physical constraints and eight long-horizon tasks featuring both\nindoor and outdoor scenes with various constraints, emergency events, and\npotential risks. We benchmark planning- and learning-based baselines on the\nchallenge and introduce a new method that leverages large language models and\nbehavior modeling. Empirical evaluations demonstrate the effectiveness of our\nbenchmark in enabling systematic assessment of key aspects of machine social\nintelligence. 
Our benchmark and code are publicly available at\nhttps://github.com/UMass-Foundation-Model/CHAIC.\n","authors":["Weihua Du","Qiushi Lyu","Jiaming Shan","Zhenting Qi","Hongxin Zhang","Sunli Chen","Andi Peng","Tianmin Shu","Kwonjoon Lee","Behzad Dariush","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2411.01796v2.pdf","comment":"NeurIPS 2024 Dataset and Benchmark Track. The first two authors\n contributed equally. Project Website at https://vis-www.cs.umass.edu/CHAIC/"},{"id":"http://arxiv.org/abs/2401.06518v2","updated":"2024-11-05T01:29:40Z","published":"2024-01-12T11:34:38Z","title":"Transitional Grid Maps: Joint Modeling of Static and Dynamic Occupancy","summary":" Autonomous agents rely on sensor data to construct representations of their\nenvironments, essential for predicting future events and planning their\nactions. However, sensor measurements suffer from limited range, occlusions,\nand sensor noise. These challenges become more evident in highly dynamic\nenvironments. This work proposes a probabilistic framework to jointly infer\nwhich parts of an environment are statically and which parts are dynamically\noccupied. We formulate the problem as a Bayesian network and introduce minimal\nassumptions that significantly reduce the complexity of the problem. Based on\nthose, we derive Transitional Grid Maps (TGMs), an efficient analytical\nsolution. Using real data, we demonstrate how this approach produces better\nmaps by keeping track of both static and dynamic elements and, as a side\neffect, can help improve existing SLAM algorithms.\n","authors":["José Manuel Gaspar Sánchez","Leonard Bruns","Jana Tumova","Patric Jensfelt","Martin Törngren"],"pdf_url":"https://arxiv.org/pdf/2401.06518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02706v1","updated":"2024-11-05T01:09:51Z","published":"2024-11-05T01:09:51Z","title":"Safety Verification for Evasive Collision Avoidance in Autonomous\n Vehicles with Enhanced Resolutions","summary":" This paper presents a comprehensive hazard analysis, risk assessment, and\nloss evaluation for an Evasive Minimum Risk Maneuvering (EMRM) system designed\nfor autonomous vehicles. The EMRM system is engineered to enhance collision\navoidance and mitigate loss severity by drawing inspiration from professional\ndrivers who perform aggressive maneuvers while maintaining stability for\neffective risk mitigation. Recent advancements in autonomous vehicle technology\ndemonstrate a growing capability for high-performance maneuvers. This paper\ndiscusses a comprehensive safety verification process and establishes a clear\nsafety goal to enhance testing validation. The study systematically identifies\npotential hazards and assesses their risks to overall safety and the protection\nof vulnerable road users. A novel loss evaluation approach is introduced,\nfocusing on the impact of mitigation maneuvers on loss severity. Additionally,\nthe proposed mitigation integrity level can be used to verify the minimum-risk\nmaneuver feature. 
This paper applies a verification method to evasive\nmaneuvering, contributing to the development of more reliable active safety\nfeatures in autonomous driving systems.\n","authors":["Aliasghar Arab","Milad Khaleghi","Alireza Partovi","Alireza Abbaspour","Chaitanya Shinde","Yashar Mousavi","Vahid Azimi","Ali Karimmoddini"],"pdf_url":"https://arxiv.org/pdf/2411.02706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02704v1","updated":"2024-11-05T01:02:51Z","published":"2024-11-05T01:02:51Z","title":"RT-Affordance: Affordances are Versatile Intermediate Representations\n for Robot Manipulation","summary":" We explore how intermediate policy representations can facilitate\ngeneralization by providing guidance on how to perform manipulation tasks.\nExisting representations such as language, goal images, and trajectory sketches\nhave been shown to be helpful, but these representations either do not provide\nenough context or provide over-specified context that yields less robust\npolicies. We propose conditioning policies on affordances, which capture the\npose of the robot at key stages of the task. Affordances offer expressive yet\nlightweight abstractions, are easy for users to specify, and facilitate\nefficient learning by transferring knowledge from large internet datasets. Our\nmethod, RT-Affordance, is a hierarchical model that first proposes an\naffordance plan given the task language, and then conditions the policy on this\naffordance plan to perform manipulation. Our model can flexibly bridge\nheterogeneous sources of supervision including large web datasets and robot\ntrajectories. We additionally train our model on cheap-to-collect in-domain\naffordance images, allowing us to learn new tasks without collecting any\nadditional costly robot trajectories. We show on a diverse set of novel tasks\nhow RT-Affordance exceeds the performance of existing methods by over 50%, and\nwe empirically demonstrate that affordances are robust to novel settings.\nVideos available at https://snasiriany.me/rt-affordance\n","authors":["Soroush Nasiriany","Sean Kirmani","Tianli Ding","Laura Smith","Yuke Zhu","Danny Driess","Dorsa Sadigh","Ted Xiao"],"pdf_url":"https://arxiv.org/pdf/2411.02704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02703v1","updated":"2024-11-05T01:01:48Z","published":"2024-11-05T01:01:48Z","title":"LVI-GS: Tightly-coupled LiDAR-Visual-Inertial SLAM using 3D Gaussian\n Splatting","summary":" 3D Gaussian Splatting (3DGS) has shown its ability in rapid rendering and\nhigh-fidelity mapping. In this paper, we introduce LVI-GS, a tightly-coupled\nLiDAR-Visual-Inertial mapping framework with 3DGS, which leverages the\ncomplementary characteristics of LiDAR and image sensors to capture both\ngeometric structures and visual details of 3D scenes. To this end, the 3D\nGaussians are initialized from colourized LiDAR points and optimized using\ndifferentiable rendering. In order to achieve high-fidelity mapping, we\nintroduce a pyramid-based training approach to effectively learn multi-level\nfeatures and incorporate depth loss derived from LiDAR measurements to improve\ngeometric feature perception. Through well-designed strategies for Gaussian-Map\nexpansion, keyframe selection, thread management, and custom CUDA acceleration,\nour framework achieves real-time photo-realistic mapping. 
Numerical experiments\nare performed to evaluate the superior performance of our method compared to\nstate-of-the-art 3D reconstruction systems.\n","authors":["Huibin Zhao","Weipeng Guan","Peng Lu"],"pdf_url":"https://arxiv.org/pdf/2411.02703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15122v4","updated":"2024-11-05T00:58:32Z","published":"2023-12-23T00:07:06Z","title":"Scaling Is All You Need: Autonomous Driving with JAX-Accelerated\n Reinforcement Learning","summary":" Reinforcement learning has been demonstrated to outperform even the best\nhumans in complex domains like video games. However, running reinforcement\nlearning experiments on the required scale for autonomous driving is extremely\ndifficult. Building a large scale reinforcement learning system and\ndistributing it across many GPUs is challenging. Gathering experience during\ntraining on real world vehicles is prohibitive from a safety and scalability\nperspective. Therefore, an efficient and realistic driving simulator is\nrequired that uses a large amount of data from real-world driving. We bring\nthese capabilities together and conduct large-scale reinforcement learning\nexperiments for autonomous driving. We demonstrate that our policy performance\nimproves with increasing scale. Our best performing policy reduces the failure\nrate by 64% while improving the rate of driving progress by 25% compared to the\npolicies produced by state-of-the-art machine learning for autonomous driving.\n","authors":["Moritz Harmel","Anubhav Paras","Andreas Pasternak","Nicholas Roy","Gary Linscott"],"pdf_url":"https://arxiv.org/pdf/2312.15122v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03556v1","updated":"2024-11-05T23:34:27Z","published":"2024-11-05T23:34:27Z","title":"VQ-ACE: Efficient Policy Search for Dexterous Robotic Manipulation via\n Action Chunking Embedding","summary":" Dexterous robotic manipulation remains a significant challenge due to the\nhigh dimensionality and complexity of hand movements required for tasks like\nin-hand manipulation and object grasping. This paper addresses this issue by\nintroducing Vector Quantized Action Chunking Embedding (VQ-ACE), a novel\nframework that compresses human hand motion into a quantized latent space,\nsignificantly reducing the action space's dimensionality while preserving key\nmotion characteristics. By integrating VQ-ACE with both Model Predictive\nControl (MPC) and Reinforcement Learning (RL), we enable more efficient\nexploration and policy learning in dexterous manipulation tasks using a\nbiomimetic robotic hand. Our results show that latent space sampling with MPC\nproduces more human-like behavior in tasks such as Ball Rolling and Object\nPicking, leading to higher task success rates and reduced control costs. For\nRL, action chunking accelerates learning and improves exploration, demonstrated\nthrough faster convergence in tasks like cube stacking and in-hand cube\nreorientation. These findings suggest that VQ-ACE offers a scalable and\neffective solution for robotic manipulation tasks involving complex,\nhigh-dimensional state spaces, contributing to more natural and adaptable\nrobotic systems.\n","authors":["Chenyu Yang","Davide Liconti","Robert K. 
Katzschmann"],"pdf_url":"https://arxiv.org/pdf/2411.03556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03555v1","updated":"2024-11-05T23:28:57Z","published":"2024-11-05T23:28:57Z","title":"Object and Contact Point Tracking in Demonstrations Using 3D Gaussian\n Splatting","summary":" This paper introduces a method to enhance Interactive Imitation Learning\n(IIL) by extracting touch interaction points and tracking object movement from\nvideo demonstrations. The approach extends current IIL systems by providing\nrobots with detailed knowledge of both where and how to interact with objects,\nparticularly complex articulated ones like doors and drawers. By leveraging\ncutting-edge techniques such as 3D Gaussian Splatting and FoundationPose for\ntracking, this method allows robots to better understand and manipulate objects\nin dynamic environments. The research lays the foundation for more effective\ntask learning and execution in autonomous robotic systems.\n","authors":["Michael Büttner","Jonathan Francis","Helge Rhodin","Andrew Melnik"],"pdf_url":"https://arxiv.org/pdf/2411.03555v1.pdf","comment":"CoRL 2024, Workshop on Lifelong Learning for Home Robots, Munich,\n Germany"},{"id":"http://arxiv.org/abs/2411.03540v1","updated":"2024-11-05T22:42:41Z","published":"2024-11-05T22:42:41Z","title":"VLA-3D: A Dataset for 3D Semantic Scene Understanding and Navigation","summary":" With the recent rise of Large Language Models (LLMs), Vision-Language Models\n(VLMs), and other general foundation models, there is growing potential for\nmultimodal, multi-task embodied agents that can operate in diverse environments\ngiven only natural language as input. One such application area is indoor\nnavigation using natural language instructions. However, despite recent\nprogress, this problem remains challenging due to the spatial reasoning and\nsemantic understanding required, particularly in arbitrary scenes that may\ncontain many objects belonging to fine-grained classes. To address this\nchallenge, we curate the largest real-world dataset for Vision and\nLanguage-guided Action in 3D Scenes (VLA-3D), consisting of over 11.5K scanned\n3D indoor rooms from existing datasets, 23.5M heuristically generated semantic\nrelations between objects, and 9.7M synthetically generated referential\nstatements. Our dataset consists of processed 3D point clouds, semantic object\nand room annotations, scene graphs, navigable free space annotations, and\nreferential language statements that specifically focus on view-independent\nspatial relations for disambiguating objects. The goal of these features is to\naid the downstream task of navigation, especially on real-world systems where\nsome level of robustness must be guaranteed in an open world of changing scenes\nand imperfect language. We benchmark our dataset with current state-of-the-art\nmodels to obtain a performance baseline. All code to generate and visualize the\ndataset is publicly released, see https://github.com/HaochenZ11/VLA-3D. 
With\nthe release of this dataset, we hope to provide a resource for progress in\nsemantic 3D scene understanding that is robust to changes and one which will\naid the development of interactive indoor navigation systems.\n","authors":["Haochen Zhang","Nader Zantout","Pujith Kachana","Zongyuan Wu","Ji Zhang","Wenshan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03540v1.pdf","comment":"Accepted and presented at the 1st Workshop on Semantic Reasoning and\n Goal Understanding in Robotics (SemRob), Robotics Science and Systems\n Conference (RSS 2024)"},{"id":"http://arxiv.org/abs/2411.03532v1","updated":"2024-11-05T22:15:28Z","published":"2024-11-05T22:15:28Z","title":"A Behavior Architecture for Fast Humanoid Robot Door Traversals","summary":" Towards the role of humanoid robots as squad mates in urban operations and\nother domains, we identified doors as a major area lacking capability\ndevelopment. In this paper, we focus on the ability of humanoid robots to\nnavigate and deal with doors. Human-sized doors are ubiquitous in many\nenvironment domains and the humanoid form factor is uniquely suited to operate\nand traverse them. We present an architecture which incorporates GPU\naccelerated perception and a tree based interactive behavior coordination\nsystem with a whole body motion and walking controller. Our system is capable\nof performing door traversals on a variety of door types. It supports rapid\nauthoring of behaviors for unseen door types and techniques to achieve\nre-usability of those authored behaviors. The behaviors are modelled using\ntrees and feature logical reactivity and action sequences that can be executed\nwith layered concurrency to increase speed. Primitive actions are built on top\nof our existing whole body controller which supports manipulation while\nwalking. We include a perception system using both neural networks and\nclassical computer vision for door mechanism detection outside of the lab\nenvironment. We present operator-robot interdependence analysis charts to\nexplore how human cognition is combined with artificial intelligence to produce\ncomplex robot behavior. Finally, we present and discuss real robot performances\nof fast door traversals on our Nadia humanoid robot. Videos online at\nhttps://www.youtube.com/playlist?list=PLXuyT8w3JVgMPaB5nWNRNHtqzRK8i68dy.\n","authors":["Duncan Calvert","Luigi Penco","Dexton Anderson","Tomasz Bialek","Arghya Chatterjee","Bhavyansh Mishra","Geoffrey Clark","Sylvain Bertrand","Robert Griffin"],"pdf_url":"https://arxiv.org/pdf/2411.03532v1.pdf","comment":"15 pages, 23 figures, for submission to Elsevier RAS"},{"id":"http://arxiv.org/abs/2411.03494v1","updated":"2024-11-05T20:18:29Z","published":"2024-11-05T20:18:29Z","title":"An Open-source Sim2Real Approach for Sensor-independent Robot Navigation\n in a Grid","summary":" This paper presents a Sim2Real (Simulation to Reality) approach to bridge the\ngap between a trained agent in a simulated environment and its real-world\nimplementation in navigating a robot in a similar setting. Specifically, we\nfocus on navigating a quadruped robot in a real-world grid-like environment\ninspired by the Gymnasium Frozen Lake -- a highly user-friendly and free\nApplication Programming Interface (API) to develop and test Reinforcement\nLearning (RL) algorithms. 
We detail the development of a pipeline to transfer\nmotion policies learned in the Frozen Lake simulation to a physical quadruped\nrobot, thus enabling autonomous navigation and obstacle avoidance in a grid\nwithout relying on expensive localization and mapping sensors. The work\ninvolves training an RL agent in the Frozen Lake environment and utilizing the\nresulting Q-table to control a 12 Degrees-of-Freedom (DOF) quadruped robot. In\naddition to detailing the RL implementation, inverse kinematics-based quadruped\ngaits, and the transfer policy pipeline, we open-source the project on GitHub\nand include a demonstration video of our Sim2Real transfer approach. This work\nprovides an accessible, straightforward, and low-cost framework for\nresearchers, students, and hobbyists to explore and implement RL-based robot\nnavigation in real-world grid environments.\n","authors":["Murad Mehrab Abrar","Souryadeep Mondal","Michelle Hickner"],"pdf_url":"https://arxiv.org/pdf/2411.03494v1.pdf","comment":"Accepted for publication at the 9th IEEE International Conference on\n Robotics and Automation Engineering (IEEE ICRAE 2024), Singapore"},{"id":"http://arxiv.org/abs/2411.03487v1","updated":"2024-11-05T20:11:41Z","published":"2024-11-05T20:11:41Z","title":"Enhancing Exploratory Capability of Visual Navigation Using Uncertainty\n of Implicit Scene Representation","summary":" In the context of visual navigation in unknown scenes, both \"exploration\" and\n\"exploitation\" are equally crucial. Robots must first establish environmental\ncognition through exploration and then utilize the cognitive information to\naccomplish target searches. However, most existing methods for image-goal\nnavigation prioritize target search over the generation of exploratory\nbehavior. To address this, we propose the Navigation with Uncertainty-driven\nExploration (NUE) pipeline, which uses an implicit and compact scene\nrepresentation, NeRF, as a cognitive structure. We estimate the uncertainty of\nNeRF and augment the exploratory ability by the uncertainty to in turn\nfacilitate the construction of implicit representation. Simultaneously, we\nextract memory information from NeRF to enhance the robot's reasoning ability\nfor determining the location of the target. Ultimately, we seamlessly combine\nthe two generated abilities to produce navigational actions. Our pipeline is\nend-to-end, with the environmental cognitive structure being constructed\nonline. Extensive experimental results on image-goal navigation demonstrate the\ncapability of our pipeline to enhance exploratory behaviors, while also\nenabling a natural transition from the exploration to exploitation phase. This\nenables our model to outperform existing memory-based cognitive navigation\nstructures in terms of navigation performance.\n","authors":["Yichen Wang","Qiming Liu","Zhe Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03483v1","updated":"2024-11-05T20:07:47Z","published":"2024-11-05T20:07:47Z","title":"Augmented-Reality Enabled Crop Monitoring with Robot Assistance","summary":" The integration of augmented reality (AR), extended reality (XR), and virtual\nreality (VR) technologies in agriculture has shown significant promise in\nenhancing various agricultural practices. Mobile robots have also been adopted\nas assessment tools in precision agriculture, improving economic efficiency and\nproductivity, and minimizing undesired effects such as weeds and pests. 
Despite\nconsiderable work on both fronts, the combination of a versatile User Interface\n(UI) provided by an AR headset with the integration and direct interaction and\ncontrol of a mobile field robot has not yet been fully explored or\nstandardized. This work aims to address this gap by providing real-time data\ninput and control output of a mobile robot for precision agriculture through a\nvirtual environment enabled by an AR headset interface. The system leverages\nopen-source computational tools and off-the-shelf hardware for effective\nintegration. Distinctive case studies are presented where growers or\ntechnicians can interact with a legged robot via an AR headset and a UI. Users\ncan teleoperate the robot to gather information in an area of interest, request\nreal-time graphed status of an area, or have the robot autonomously navigate to\nselected areas for measurement updates. The proposed system utilizes a custom\nlocal navigation method with a fixed holographic coordinate system in\ncombination with QR codes. This step toward fusing AR and robotics in\nagriculture aims to provide practical solutions for real-time data management\nand control enabled by human-robot interaction. The implementation can be\nextended to various robot applications in agriculture and beyond, promoting a\nunified framework for on-demand and autonomous robot operation in the field.\n","authors":["Caio Mucchiani","Dimitrios Chatziparaschis","Konstantinos Karydis"],"pdf_url":"https://arxiv.org/pdf/2411.03483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03481v1","updated":"2024-11-05T20:07:27Z","published":"2024-11-05T20:07:27Z","title":"Chance-Constrained Convex MPC for Robust Quadruped Locomotion Under\n Parametric and Additive Uncertainties","summary":" Recent advances in quadrupedal locomotion have focused on improving stability\nand performance across diverse environments. However, existing methods often\nlack adequate safety analysis and struggle to adapt to varying payloads and\ncomplex terrains, typically requiring extensive tuning. To overcome these\nchallenges, we propose a Chance-Constrained Model Predictive Control (CCMPC)\nframework that explicitly models payload and terrain variability as\ndistributions of parametric and additive disturbances within the single rigid\nbody dynamics (SRBD) model. Our approach ensures safe and consistent\nperformance under uncertain dynamics by expressing the model friction cone\nconstraints, which define the feasible set of ground reaction forces, as chance\nconstraints. Moreover, we solve the resulting stochastic control problem using\na computationally efficient quadratic programming formulation. Extensive Monte\nCarlo simulations of quadrupedal locomotion across varying payloads and complex\nterrains demonstrate that CCMPC significantly outperforms two competitive\nbenchmarks: Linear MPC (LMPC) and MPC with hand-tuned safety margins to\nmaintain stability, reduce foot slippage, and track the center of mass.\nHardware experiments on the Unitree Go1 robot show successful locomotion across\nvarious indoor and outdoor terrains with unknown loads exceeding 50% of the\nrobot body weight, despite no additional parameter tuning. 
A video of the\nresults and accompanying code can be found at: https://cc-mpc.github.io/.\n","authors":["Ananya Trivedi","Sarvesh Prajapati","Mark Zolotas","Michael Everett","Taskin Padir"],"pdf_url":"https://arxiv.org/pdf/2411.03481v1.pdf","comment":"Under review for Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2411.03465v1","updated":"2024-11-05T19:35:23Z","published":"2024-11-05T19:35:23Z","title":"Digital Twin for Autonomous Surface Vessels: Enabler for Safe Maritime\n Navigation","summary":" Autonomous surface vessels (ASVs) are becoming increasingly significant in\nenhancing the safety and sustainability of maritime operations. To ensure the\nreliability of modern control algorithms utilized in these vessels, digital\ntwins (DTs) provide a robust framework for conducting safe and effective\nsimulations within a virtual environment. Digital twins are generally\nclassified on a scale from 0 to 5, with each level representing a progression\nin complexity and functionality: Level 0 (Standalone) employs offline modeling\ntechniques; Level 1 (Descriptive) integrates sensors and online modeling to\nenhance situational awareness; Level 2 (Diagnostic) focuses on condition\nmonitoring and cybersecurity; Level 3 (Predictive) incorporates predictive\nanalytics; Level 4 (Prescriptive) embeds decision-support systems; and Level 5\n(Autonomous) enables advanced functionalities such as collision avoidance and\npath following. These digital representations not only provide insights into\nthe vessel's current state and operational efficiency but also predict future\nscenarios and assess life endurance. By continuously updating with real-time\nsensor data, the digital twin effectively corrects modeling errors and enhances\ndecision-making processes. Since DTs are key enablers for complex autonomous\nsystems, this paper introduces a comprehensive methodology for establishing a\ndigital twin framework specifically tailored for ASVs. Through a detailed\nliterature survey, we explore existing state-of-the-art enablers across the\ndefined levels, offering valuable recommendations for future research and\ndevelopment in this rapidly evolving field.\n","authors":["Daniel Menges","Adil Rasheed"],"pdf_url":"https://arxiv.org/pdf/2411.03465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16859v3","updated":"2024-11-05T19:34:22Z","published":"2024-03-25T15:23:14Z","title":"A Semi-Lagrangian Approach for Time and Energy Path Planning\n Optimization in Static Flow Fields","summary":" Efficient path planning for autonomous mobile robots is a critical problem\nacross numerous domains, where optimizing both time and energy consumption is\nparamount. This paper introduces a novel methodology that considers the dynamic\ninfluence of an environmental flow field and considers geometric constraints,\nincluding obstacles and forbidden zones, enriching the complexity of the\nplanning problem. We formulate it as a multi-objective optimal control problem,\npropose a novel transformation called Harmonic Transformation, and apply a\nsemi-Lagrangian scheme to solve it. The set of Pareto efficient solutions is\nobtained considering two distinct approaches: a deterministic method and an\nevolutionary-based one, both of which are designed to make use of the proposed\nHarmonic Transformation. Through an extensive analysis of these approaches, we\ndemonstrate their efficacy in finding optimized paths.\n","authors":["Víctor C. da S. Campos","Armando A. Neto","Douglas G. 
Macharet"],"pdf_url":"https://arxiv.org/pdf/2403.16859v3.pdf","comment":"50 pages, reviewed version; Preprint submitted to Journal of the\n Franklin Institute (under review)"},{"id":"http://arxiv.org/abs/2411.03416v1","updated":"2024-11-05T18:57:45Z","published":"2024-11-05T18:57:45Z","title":"Accelerating Gaussian Variational Inference for Motion Planning Under\n Uncertainty","summary":" This work addresses motion planning under uncertainty as a stochastic optimal\ncontrol problem. The path distribution induced by the optimal controller\ncorresponds to a posterior path distribution with a known form. To approximate\nthis posterior, we frame an optimization problem in the space of Gaussian\ndistributions, which aligns with the Gaussian Variational Inference Motion\nPlanning (GVIMP) paradigm introduced in \\cite{yu2023gaussian}. In this\nframework, the computation bottleneck lies in evaluating the expectation of\ncollision costs over a dense discretized trajectory and computing the marginal\ncovariances. This work exploits the sparse motion planning factor graph, which\nallows for parallel computing collision costs and Gaussian Belief Propagation\n(GBP) marginal covariance computation, to introduce a computationally efficient\napproach to solving GVIMP. We term the novel paradigm as the Parallel Gaussian\nVariational Inference Motion Planning (P-GVIMP). We validate the proposed\nframework on various robotic systems, demonstrating significant speed\nacceleration achieved by leveraging Graphics Processing Units (GPUs) for\nparallel computation. An open-sourced implementation is presented at\nhttps://github.com/hzyu17/VIMP.\n","authors":["Zinuo Chang","Hongzhe Yu","Patricio Vela","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2411.03416v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2411.03409v1","updated":"2024-11-05T18:48:12Z","published":"2024-11-05T18:48:12Z","title":"STEER: Flexible Robotic Manipulation via Dense Language Grounding","summary":" The complexity of the real world demands robotic systems that can\nintelligently adapt to unseen situations. We present STEER, a robot learning\nframework that bridges high-level, commonsense reasoning with precise, flexible\nlow-level control. Our approach translates complex situational awareness into\nactionable low-level behavior through training language-grounded policies with\ndense annotation. By structuring policy training around fundamental, modular\nmanipulation skills expressed in natural language, STEER exposes an expressive\ninterface for humans or Vision-Language Models (VLMs) to intelligently\norchestrate the robot's behavior by reasoning about the task and context. Our\nexperiments demonstrate the skills learned via STEER can be combined to\nsynthesize novel behaviors to adapt to new situations or perform completely new\ntasks without additional data collection or training.\n","authors":["Laura Smith","Alex Irpan","Montserrat Gonzalez Arenas","Sean Kirmani","Dmitry Kalashnikov","Dhruv Shah","Ted Xiao"],"pdf_url":"https://arxiv.org/pdf/2411.03409v1.pdf","comment":"Project website: https://lauramsmith.github.io/steer/"},{"id":"http://arxiv.org/abs/2411.03408v1","updated":"2024-11-05T18:47:22Z","published":"2024-11-05T18:47:22Z","title":"Learning Few-Shot Object Placement with Intra-Category Transfer","summary":" Efficient learning from demonstration for long-horizon tasks remains an open\nchallenge in robotics. 
While significant effort has been directed toward\nlearning trajectories, a recent resurgence of object-centric approaches has\ndemonstrated improved sample efficiency, enabling transferable robotic skills.\nSuch approaches model tasks as a sequence of object poses over time. In this\nwork, we propose a scheme for transferring observed object arrangements to\nnovel object instances by learning these arrangements on canonical class\nframes. We then employ this scheme to enable a simple yet effective approach\nfor training models from as few as five demonstrations to predict arrangements\nof a wide range of objects including tableware, cutlery, furniture, and desk\nspaces. We propose a method for optimizing the learned models to enable\nefficient learning of tasks such as setting a table or tidying up an office\nwith intra-category transfer, even in the presence of distractors. We present\nextensive experimental results in simulation and on a real robotic system for\ntable setting which, based on human evaluations, scored 73.3% compared to a\nhuman baseline. We make the code and trained models publicly available at\nhttp://oplict.cs.uni-freiburg.de.\n","authors":["Adrian Röfer","Russell Buchanan","Max Argus","Sethu Vijayakumar","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2411.03408v1.pdf","comment":"8 pages, 7 figures, 2 tables, submitted to RA-L"},{"id":"http://arxiv.org/abs/2411.02914v1","updated":"2024-11-05T08:58:35Z","published":"2024-11-05T08:58:35Z","title":"Exploring the Interplay Between Video Generation and World Models in\n Autonomous Driving: A Survey","summary":" World models and video generation are pivotal technologies in the domain of\nautonomous driving, each playing a critical role in enhancing the robustness\nand reliability of autonomous systems. World models, which simulate the\ndynamics of real-world environments, and video generation models, which produce\nrealistic video sequences, are increasingly being integrated to improve\nsituational awareness and decision-making capabilities in autonomous vehicles.\nThis paper investigates the relationship between these two technologies,\nfocusing on how their structural parallels, particularly in diffusion-based\nmodels, contribute to more accurate and coherent simulations of driving\nscenarios. We examine leading works such as JEPA, Genie, and Sora, which\nexemplify different approaches to world model design, thereby highlighting the\nlack of a universally accepted definition of world models. These diverse\ninterpretations underscore the field's evolving understanding of how world\nmodels can be optimized for various autonomous driving tasks. Furthermore, this\npaper discusses the key evaluation metrics employed in this domain, such as\nChamfer distance for 3D scene reconstruction and Fr\\'echet Inception Distance\n(FID) for assessing the quality of generated video content. By analyzing the\ninterplay between video generation and world models, this survey identifies\ncritical challenges and future research directions, emphasizing the potential\nof these technologies to jointly advance the performance of autonomous driving\nsystems. 
The findings presented in this paper aim to provide a comprehensive\nunderstanding of how the integration of video generation and world models can\ndrive innovation in the development of safer and more reliable autonomous\nvehicles.\n","authors":["Ao Fu","Yi Zhou","Tao Zhou","Yi Yang","Bojun Gao","Qun Li","Guobin Wu","Ling Shao"],"pdf_url":"https://arxiv.org/pdf/2411.02914v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.03314v1","updated":"2024-11-05T18:59:51Z","published":"2024-11-05T18:59:51Z","title":"MME-Finance: A Multimodal Finance Benchmark for Expert-level\n Understanding and Reasoning","summary":" In recent years, multimodal benchmarks for general domains have guided the\nrapid development of multimodal models on general tasks. However, the financial\nfield has its peculiarities. It features unique graphical images (e.g.,\ncandlestick charts, technical indicator charts) and possesses a wealth of\nspecialized financial knowledge (e.g., futures, turnover rate). Therefore,\nbenchmarks from general fields often fail to measure the performance of\nmultimodal models in the financial domain, and thus cannot effectively guide\nthe rapid development of large financial models. To promote the development of\nlarge financial multimodal models, we propose MME-Finance, a bilingual\nopen-ended and practical usage-oriented Visual Question Answering (VQA)\nbenchmark. The characteristics of our benchmark are finance and expertise,\nwhich include constructing charts that reflect the actual usage needs of users\n(e.g., computer screenshots and mobile photography), creating questions\naccording to the preferences in financial domain inquiries, and annotating\nquestions by experts with 10+ years of experience in the financial industry.\nAdditionally, we have developed a custom-designed financial evaluation system\nin which visual information is first introduced in the multi-modal evaluation\nprocess. Extensive experimental evaluations of 19 mainstream MLLMs are\nconducted to test their perception, reasoning, and cognition capabilities. The\nresults indicate that models performing well on general benchmarks cannot do\nwell on MME-Finance; for instance, the top-performing open-source and\nclosed-source models obtain 65.69 (Qwen2VL-72B) and 63.18 (GPT-4o),\nrespectively. Their performance is particularly poor in categories most\nrelevant to finance, such as candlestick charts and technical indicator charts.\nIn addition, we propose a Chinese version, which helps compare the performance of\nMLLMs in a Chinese context.\n","authors":["Ziliang Gan","Yu Lu","Dong Zhang","Haohan Li","Che Liu","Jian Liu","Ji Liu","Haipang Wu","Chaoyou Fu","Zenglin Xu","Rongjunchen Zhang","Yong Dai"],"pdf_url":"https://arxiv.org/pdf/2411.03314v1.pdf","comment":"Project Page: https://hithink-research.github.io/MME-Finance/"},{"id":"http://arxiv.org/abs/2411.03313v1","updated":"2024-11-05T18:58:15Z","published":"2024-11-05T18:58:15Z","title":"Classification Done Right for Vision-Language Pre-Training","summary":" We introduce SuperClass, a super simple classification method for\nvision-language pre-training on image-text data. Unlike its contrastive\ncounterpart CLIP, which contrasts with a text encoder, SuperClass directly utilizes\ntokenized raw text as supervised classification labels, without the need for\nadditional text filtering or selection. 
Due to the absence of the text encoding\nas contrastive target, SuperClass does not require a text encoder and does not\nneed to maintain a large batch size as CLIP does. SuperClass demonstrated\nsuperior performance on various downstream tasks, including classic computer\nvision benchmarks and vision language downstream tasks. We further explored the\nscaling behavior of SuperClass on model size, training length, or data size,\nand reported encouraging results and comparisons to CLIP.\nhttps://github.com/x-cls/superclass\n","authors":["Huang Zilong","Ye Qinghao","Kang Bingyi","Feng Jiashi","Fan Haoqi"],"pdf_url":"https://arxiv.org/pdf/2411.03313v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03312v1","updated":"2024-11-05T18:54:21Z","published":"2024-11-05T18:54:21Z","title":"Inference Optimal VLMs Need Only One Visual Token but Larger Models","summary":" Vision Language Models (VLMs) have demonstrated strong capabilities across\nvarious visual understanding and reasoning tasks. However, their real-world\ndeployment is often constrained by high latency during inference due to\nsubstantial compute required to process the large number of input tokens\n(predominantly from the image) by the LLM. To reduce inference costs, one can\neither downsize the LLM or reduce the number of input image-tokens, the latter\nof which has been the focus of many recent works around token compression.\nHowever, it is unclear what the optimal trade-off is, as both the factors\ndirectly affect the VLM performance. We first characterize this optimal\ntrade-off between the number of visual tokens and LLM parameters by\nestablishing scaling laws that capture variations in performance with these two\nfactors. Our results reveal a surprising trend: for visual reasoning tasks, the\ninference-optimal behavior in VLMs, i.e., minimum downstream error at any given\nfixed inference compute, is achieved when using the largest LLM that fits\nwithin the inference budget while minimizing visual token count - often to a\nsingle token. While the token reduction literature has mainly focused on\nmaintaining base model performance by modestly reducing the token count (e.g.,\n$5-10\\times$), our results indicate that the compute-optimal inference regime\nrequires operating under even higher token compression ratios. Based on these\ninsights, we take some initial steps towards building approaches tailored for\nhigh token compression settings. Code is available at\nhttps://github.com/locuslab/llava-token-compression.\n","authors":["Kevin Y. Li","Sachin Goyal","Joao D. Semedo","J. Zico Kolter"],"pdf_url":"https://arxiv.org/pdf/2411.03312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05438v2","updated":"2024-11-05T18:44:55Z","published":"2024-10-07T19:04:24Z","title":"DAAL: Density-Aware Adaptive Line Margin Loss for Multi-Modal Deep\n Metric Learning","summary":" Multi-modal deep metric learning is crucial for effectively capturing diverse\nrepresentations in tasks such as face verification, fine-grained object\nrecognition, and product search. Traditional approaches to metric learning,\nwhether based on distance or margin metrics, primarily emphasize class\nseparation, often overlooking the intra-class distribution essential for\nmulti-modal feature learning. In this context, we propose a novel loss function\ncalled Density-Aware Adaptive Margin Loss(DAAL), which preserves the density\ndistribution of embeddings while encouraging the formation of adaptive\nsub-clusters within each class. 
By employing an adaptive line strategy, DAAL\nnot only enhances intra-class variance but also ensures robust inter-class\nseparation, facilitating effective multi-modal representation. Comprehensive\nexperiments on benchmark fine-grained datasets demonstrate the superior\nperformance of DAAL, underscoring its potential in advancing retrieval\napplications and multi-modal deep metric learning.\n","authors":["Hadush Hailu Gebrerufael","Anil Kumar Tiwari","Gaurav Neupane","Goitom Ybrah Hailu"],"pdf_url":"https://arxiv.org/pdf/2410.05438v2.pdf","comment":"13 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.00318v2","updated":"2024-11-05T17:51:36Z","published":"2024-03-30T10:54:59Z","title":"Cognitive Planning for Object Goal Navigation using Generative AI Models","summary":" Recent advancements in Generative AI, particularly in Large Language Models\n(LLMs) and Large Vision-Language Models (LVLMs), offer new possibilities for\nintegrating cognitive planning into robotic systems. In this work, we present a\nnovel framework for solving the object goal navigation problem that generates\nefficient exploration strategies. Our approach enables a robot to navigate\nunfamiliar environments by leveraging LLMs and LVLMs to understand the semantic\nstructure of the scene. To address the challenge of representing complex\nenvironments without overwhelming the system, we propose a 3D modular scene\nrepresentation, enriched with semantic descriptions. This representation is\ndynamically pruned using an LLM-based mechanism, which filters irrelevant\ninformation and focuses on task-specific data. By combining these elements, our\nsystem generates high-level sub-goals that guide the exploration of the robot\ntoward the target object. We validate our approach in simulated environments,\ndemonstrating its ability to enhance object search efficiency while maintaining\nscalability in complex settings.\n","authors":["Arjun P S","Andrew Melnik","Gora Chand Nandi"],"pdf_url":"https://arxiv.org/pdf/2404.00318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03286v1","updated":"2024-11-05T17:35:41Z","published":"2024-11-05T17:35:41Z","title":"DiT4Edit: Diffusion Transformer for Image Editing","summary":" Despite recent advances in UNet-based image editing, methods for shape-aware\nobject editing in high-resolution images are still lacking. Compared to UNet,\nDiffusion Transformers (DiT) demonstrate superior capabilities to effectively\ncapture the long-range dependencies among patches, leading to higher-quality\nimage generation. In this paper, we propose DiT4Edit, the first Diffusion\nTransformer-based image editing framework. Specifically, DiT4Edit uses the\nDPM-Solver inversion algorithm to obtain the inverted latents, reducing the\nnumber of steps compared to the DDIM inversion algorithm commonly used in\nUNet-based frameworks. Additionally, we design unified attention control and\npatches merging, tailored for transformer computation streams. This integration\nallows our framework to generate higher-quality edited images faster. 
Our\ndesign leverages the advantages of DiT, enabling it to surpass UNet structures\nin image editing, especially in high-resolution and arbitrary-size images.\nExtensive experiments demonstrate the strong performance of DiT4Edit across\nvarious editing scenarios, highlighting the potential of Diffusion Transformers\nin supporting image editing.\n","authors":["Kunyu Feng","Yue Ma","Bingyuan Wang","Chenyang Qi","Haozhe Chen","Qifeng Chen","Zeyu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02188v2","updated":"2024-11-05T17:09:54Z","published":"2024-11-04T15:42:22Z","title":"Digi2Real: Bridging the Realism Gap in Synthetic Data Face Recognition\n via Foundation Models","summary":" The accuracy of face recognition systems has improved significantly in the\npast few years, thanks to the large amount of data collected and the\nadvancement in neural network architectures. However, these large-scale\ndatasets are often collected without explicit consent, raising ethical and\nprivacy concerns. To address this, there have been proposals to use synthetic\ndatasets for training face recognition models. Yet, such models still rely on\nreal data to train the generative models and generally exhibit inferior\nperformance compared to those trained on real datasets. One of these datasets,\nDigiFace, uses a graphics pipeline to generate different identities and\ndifferent intra-class variations without using real data in training the\nmodels. However, the performance of this approach is poor on face recognition\nbenchmarks, possibly due to the lack of realism in the images generated from\nthe graphics pipeline. In this work, we introduce a novel framework for realism\ntransfer aimed at enhancing the realism of synthetically generated face images.\nOur method leverages the large-scale face foundation model, and we adapt the\npipeline for realism enhancement. By integrating the controllable aspects of\nthe graphics pipeline with our realism enhancement technique, we generate a\nlarge amount of realistic variations-combining the advantages of both\napproaches. Our empirical evaluations demonstrate that models trained using our\nenhanced dataset significantly improve the performance of face recognition\nsystems over the baseline. The source code and datasets will be made available\npublicly.\n","authors":["Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2411.02188v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.03260v1","updated":"2024-11-05T16:59:06Z","published":"2024-11-05T16:59:06Z","title":"ShadowMamba: State-Space Model with Boundary-Region Selective Scan for\n Shadow Removal","summary":" Image shadow removal is a typical low-level vision problem, where the\npresence of shadows leads to abrupt changes in brightness in certain regions,\naffecting the accuracy of upstream tasks. Current shadow removal methods still\nface challenges such as residual boundary artifacts, and capturing feature\ninformation at shadow boundaries is crucial for removing shadows and\neliminating residual boundary artifacts. Recently, Mamba has achieved\nremarkable success in computer vision by globally modeling long-sequence\ninformation with linear complexity. However, when applied to image shadow\nremoval, the original Mamba scanning method overlooks the semantic continuity\nof shadow boundaries as well as the continuity of semantics within the same\nregion. 
Based on the unique characteristics of shadow images, this paper\nproposes a novel selective scanning method called boundary-region selective\nscanning. This method scans boundary regions, shadow regions, and non-shadow\nregions independently, bringing pixels of the same region type closer together\nin the long sequence, especially focusing on the local information at the\nboundaries, which is crucial for shadow removal. This method is combined with\nglobal scanning and channel scanning to jointly accomplish shadow removal.\nWe name our model ShadowMamba, the first Mamba-based model for shadow removal.\nExtensive experimental results show that our method outperforms current\nstate-of-the-art models across most metrics on multiple datasets. The code for\nShadowMamba will be released upon acceptance.\n","authors":["Xiujin Zhu","Chee-Onn Chow","Joon Huang Chuah"],"pdf_url":"https://arxiv.org/pdf/2411.03260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17387v3","updated":"2024-11-05T16:52:39Z","published":"2024-03-26T05:12:18Z","title":"Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object\n Detection","summary":" We delve into pseudo-labeling for semi-supervised monocular 3D object\ndetection (SSM3OD) and discover two primary issues: a misalignment between the\nprediction quality of 3D and 2D attributes and the tendency of depth\nsupervision derived from pseudo-labels to be noisy, leading to significant\noptimization conflicts with other reliable forms of supervision. We introduce a\nnovel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach\nfeatures a Decoupled Pseudo-label Generation (DPG) module, designed to\nefficiently generate pseudo-labels by separately processing 2D and 3D\nattributes. This module incorporates a unique homography-based method for\nidentifying dependable pseudo-labels in BEV space, specifically for 3D\nattributes. Additionally, we present a DepthGradient Projection (DGP) module to\nmitigate optimization conflicts caused by noisy depth supervision of\npseudo-labels, effectively decoupling the depth gradient and removing\nconflicting gradients. This dual decoupling strategy, at both the pseudo-label\ngeneration and gradient levels, significantly improves the utilization of\npseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark\ndemonstrate the superiority of our method over existing approaches.\n","authors":["Jiacheng Zhang","Jiaming Li","Xiangru Lin","Wei Zhang","Xiao Tan","Junyu Han","Errui Ding","Jingdong Wang","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.17387v3.pdf","comment":"accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2411.03239v1","updated":"2024-11-05T16:37:30Z","published":"2024-11-05T16:37:30Z","title":"Decoupling Fine Detail and Global Geometry for Compressed Depth Map\n Super-Resolution","summary":" Recovering high-quality depth maps from compressed sources has gained\nsignificant attention due to the limitations of consumer-grade depth cameras\nand the bandwidth restrictions during data transmission. However, current\nmethods still suffer from two challenges. First, bit-depth compression produces\na uniform depth representation in regions with subtle variations, hindering the\nrecovery of detailed information. 
Second, densely distributed random noise\nreduces the accuracy of estimating the global geometric structure of the scene.\nTo address these challenges, we propose a novel framework, termed\ngeometry-decoupled network (GDNet), for compressed depth map super-resolution\nthat decouples the high-quality depth map reconstruction process by handling\nglobal and detailed geometric features separately. To be specific, we propose\nthe fine geometry detail encoder (FGDE), which is designed to aggregate fine\ngeometry details in high-resolution low-level image features while\nsimultaneously enriching them with complementary information from\nlow-resolution context-level image features. In addition, we develop the global\ngeometry encoder (GGE) that aims at suppressing noise and extracting global\ngeometric information effectively via constructing compact feature\nrepresentation in a low-rank space. We conduct experiments on multiple\nbenchmark datasets, demonstrating that our GDNet significantly outperforms\ncurrent methods in terms of geometric consistency and detail recovery. In the\nECCV 2024 AIM Compressed Depth Upsampling Challenge, our solution won the 1st\nplace award. Our codes will be available.\n","authors":["Huan Zheng","Wencheng Han","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2411.03239v1.pdf","comment":"The 1st solution for the ECCV 2024 AIM Compressed Depth Upsampling\n Challenge"},{"id":"http://arxiv.org/abs/2311.17898v3","updated":"2024-11-05T16:31:24Z","published":"2023-11-29T18:51:46Z","title":"Contextual Knowledge Pursuit for Faithful Visual Synthesis","summary":" Modern text-to-vision generative models often hallucinate when the prompt\ndescribing the scene to be generated is underspecified. In large language\nmodels (LLMs), a prevalent strategy to reduce hallucinations is to retrieve\nfactual knowledge from an external database. While such retrieval augmentation\nstrategies have great potential to enhance text-to-vision generators, existing\nstatic top-K retrieval methods explore the knowledge pool once, missing the\nbroader context necessary for high-quality generation. Furthermore, LLMs\ninternally possess rich world knowledge learned during large-scale training\n(parametric knowledge) that could mitigate the need for external data\nretrieval. This paper proposes Contextual Knowledge Pursuit (CKPT), a framework\nthat leverages the complementary strengths of external and parametric knowledge\nto help generators produce reliable visual content. Instead of the one-time\nretrieval of facts from an external database to improve a given prompt, CKPT\nuses (1) an LLM to decide whether to seek external knowledge or to self-elicit\ndescriptions from LLM parametric knowledge, (2) a knowledge pursuit process to\ncontextually seek and sequentially gather most relevant facts, (3) a knowledge\naggregator for prompt enhancement with the gathered fact context, and (4) a\nfiltered fine-tuning objective to improve visual synthesis with richer prompts.\nWe evaluate CKPT across multiple text-driven generative tasks (image, 3D\nrendering, and video) on datasets of rare objects and daily scenarios. 
Our\nresults show that CKPT is capable of generating faithful and semantically rich\ncontent across diverse visual domains, offering a promising data source for\nzero-shot synthesis and filtered fine-tuning of text-to-vision generative\nmodels.\n","authors":["Jinqi Luo","Kwan Ho Ryan Chan","Dimitris Dimos","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2311.17898v3.pdf","comment":"Accepted in ECCV 2024 SDCV Workshop. GitHub repository at\n https://github.com/peterljq/Contextual-Knowledge-Pursuit"},{"id":"http://arxiv.org/abs/2409.18336v2","updated":"2024-11-05T16:30:30Z","published":"2024-09-26T23:18:25Z","title":"DeBaRA: Denoising-Based 3D Room Arrangement Generation","summary":" Generating realistic and diverse layouts of furnished indoor 3D scenes\nunlocks multiple interactive applications impacting a wide range of industries.\nThe inherent complexity of object interactions, the limited amount of available\ndata and the requirement to fulfill spatial constraints all make generative\nmodeling for 3D scene synthesis and arrangement challenging. Current methods\naddress these challenges autoregressively or by using off-the-shelf diffusion\nobjectives by simultaneously predicting all attributes without 3D reasoning\nconsiderations. In this paper, we introduce DeBaRA, a score-based model\nspecifically tailored for precise, controllable and flexible arrangement\ngeneration in a bounded environment. We argue that the most critical component\nof a scene synthesis system is to accurately establish the size and position of\nvarious objects within a restricted area. Based on this insight, we propose a\nlightweight conditional score-based model designed with 3D spatial awareness at\nits core. We demonstrate that by focusing on spatial attributes of objects, a\nsingle trained DeBaRA model can be leveraged at test time to perform several\ndownstream applications such as scene synthesis, completion and re-arrangement.\nFurther, we introduce a novel Self Score Evaluation procedure so it can be\noptimally employed alongside external LLM models. We evaluate our approach\nthrough extensive experiments and demonstrate significant improvement upon\nstate-of-the-art approaches in a range of scenarios.\n","authors":["Léopold Maillard","Nicolas Sereyjol-Garros","Tom Durand","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2409.18336v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2311.16484v2","updated":"2024-11-05T16:25:05Z","published":"2023-11-26T05:14:06Z","title":"Seeing Eye to AI: Comparing Human Gaze and Model Attention in Video\n Memorability","summary":" Understanding what makes a video memorable has important applications in\nadvertising or education technology. Towards this goal, we investigate\nspatio-temporal attention mechanisms underlying video memorability. Different\nfrom previous works that fuse multiple features, we adopt a simple\nCNN+Transformer architecture that enables analysis of spatio-temporal attention\nwhile matching state-of-the-art (SoTA) performance on video memorability\nprediction. We compare model attention against human gaze fixations collected\nthrough a small-scale eye-tracking study where humans perform the video memory\ntask. 
We uncover the following insights: (i) Quantitative saliency metrics show\nthat our model, trained only to predict a memorability score, exhibits similar\nspatial attention patterns to human gaze, especially for more memorable videos.\n(ii) The model assigns greater importance to initial frames in a video,\nmimicking human attention patterns. (iii) Panoptic segmentation reveals that\nboth (model and humans) assign a greater share of attention to things and less\nattention to stuff as compared to their occurrence probability.\n","authors":["Prajneya Kumar","Eshika Khandelwal","Makarand Tapaswi","Vishnu Sreekumar"],"pdf_url":"https://arxiv.org/pdf/2311.16484v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2411.03228v1","updated":"2024-11-05T16:20:14Z","published":"2024-11-05T16:20:14Z","title":"Topograph: An efficient Graph-Based Framework for Strictly Topology\n Preserving Image Segmentation","summary":" Topological correctness plays a critical role in many image segmentation\ntasks, yet most networks are trained using pixel-wise loss functions, such as\nDice, neglecting topological accuracy. Existing topology-aware methods often\nlack robust topological guarantees, are limited to specific use cases, or\nimpose high computational costs. In this work, we propose a novel, graph-based\nframework for topologically accurate image segmentation that is both\ncomputationally efficient and generally applicable. Our method constructs a\ncomponent graph that fully encodes the topological information of both the\nprediction and ground truth, allowing us to efficiently identify topologically\ncritical regions and aggregate a loss based on local neighborhood information.\nFurthermore, we introduce a strict topological metric capturing the homotopy\nequivalence between the union and intersection of prediction-label pairs. We\nformally prove the topological guarantees of our approach and empirically\nvalidate its effectiveness on binary and multi-class datasets. Our loss\ndemonstrates state-of-the-art performance with up to fivefold faster loss\ncomputation compared to persistent homology methods.\n","authors":["Laurin Lux","Alexander H. Berger","Alexander Weers","Nico Stucki","Daniel Rueckert","Ulrich Bauer","Johannes C. Paetzold"],"pdf_url":"https://arxiv.org/pdf/2411.03228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03226v1","updated":"2024-11-05T16:18:57Z","published":"2024-11-05T16:18:57Z","title":"Kernel Orthogonality does not necessarily imply a Decrease in Feature\n Map Redundancy in CNNs: Convolutional Similarity Minimization","summary":" Convolutional Neural Networks (CNNs) have been heavily used in Deep Learning\ndue to their success in various tasks. Nonetheless, it has been observed that\nCNNs suffer from redundancy in feature maps, leading to inefficient capacity\nutilization. Efforts to mitigate and solve this problem led to the emergence of\nmultiple methods, amongst which is kernel orthogonality through variant means.\nIn this work, we challenge the common belief that kernel orthogonality leads to\na decrease in feature map redundancy, which is, supposedly, the ultimate\nobjective behind kernel orthogonality. We prove, theoretically and empirically,\nthat kernel orthogonality has an unpredictable effect on feature map similarity\nand does not necessarily decrease it. Based on our theoretical result, we\npropose an effective method to reduce feature map similarity independently of\nthe input of the CNN. 
This is done by minimizing a novel loss function we call\nConvolutional Similarity. Empirical results show that minimizing the\nConvolutional Similarity increases the performance of classification models and\ncan accelerate their convergence. Furthermore, using our proposed method pushes\ntowards a more efficient use of the capacity of models, allowing the use of\nsignificantly smaller models to achieve the same levels of performance.\n","authors":["Zakariae Belmekki","Jun Li","Patrick Reuter","David Antonio Gómez Jáuregui","Karl Jenkins"],"pdf_url":"https://arxiv.org/pdf/2411.03226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24010v2","updated":"2024-11-05T16:16:29Z","published":"2024-10-31T15:10:38Z","title":"Re-assembling the past: The RePAIR dataset and benchmark for real world\n 2D and 3D puzzle solving","summary":" This paper proposes the RePAIR dataset that represents a challenging\nbenchmark to test modern computational and data driven methods for\npuzzle-solving and reassembly tasks. Our dataset has unique properties that are\nuncommon to current benchmarks for 2D and 3D puzzle solving. The fragments and\nfractures are realistic, caused by a collapse of a fresco during a World War II\nbombing at the Pompeii archaeological park. The fragments are also eroded and\nhave missing pieces with irregular shapes and different dimensions, challenging\nfurther the reassembly algorithms. The dataset is multi-modal providing high\nresolution images with characteristic pictorial elements, detailed 3D scans of\nthe fragments and meta-data annotated by the archaeologists. Ground truth has\nbeen generated through several years of unceasing fieldwork, including the\nexcavation and cleaning of each fragment, followed by manual puzzle solving by\narchaeologists of a subset of approx. 1000 pieces among the 16000 available.\nAfter digitizing all the fragments in 3D, a benchmark was prepared to challenge\ncurrent reassembly and puzzle-solving methods that often solve more simplistic\nsynthetic scenarios. The tested baselines show that there clearly exists a gap\nto fill in solving this computationally complex problem.\n","authors":["Theodore Tsesmelis","Luca Palmieri","Marina Khoroshiltseva","Adeela Islam","Gur Elkin","Ofir Itzhak Shahar","Gianluca Scarpellini","Stefano Fiorini","Yaniv Ohayon","Nadav Alali","Sinem Aslan","Pietro Morerio","Sebastiano Vascon","Elena Gravina","Maria Cristina Napolitano","Giuseppe Scarpati","Gabriel Zuchtriegel","Alexandra Spühler","Michel E. Fuchs","Stuart James","Ohad Ben-Shahar","Marcello Pelillo","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2410.24010v2.pdf","comment":"NeurIPS 2024, Track Datasets and Benchmarks, 10 pages"},{"id":"http://arxiv.org/abs/2411.03225v1","updated":"2024-11-05T16:15:33Z","published":"2024-11-05T16:15:33Z","title":"Knowledge Graphs of Driving Scenes to Empower the Emerging Capabilities\n of Neurosymbolic AI","summary":" In the era of Generative AI, Neurosymbolic AI is emerging as a powerful\napproach for tasks spanning from perception to cognition. The use of\nNeurosymbolic AI has been shown to achieve enhanced capabilities, including\nimproved grounding, alignment, explainability, and reliability. However, due to\nits nascent stage, there is a lack of widely available real-world benchmark\ndatasets tailored to Neurosymbolic AI tasks. 
To address this gap and support\nthe evaluation of current and future methods, we introduce DSceneKG -- a suite\nof knowledge graphs of driving scenes built from real-world, high-quality\nscenes from multiple open autonomous driving datasets. In this article, we\ndetail the construction process of DSceneKG and highlight its application in\nseven different tasks. DSceneKG is publicly accessible at:\nhttps://github.com/ruwantw/DSceneKG\n","authors":["Ruwan Wickramarachchi","Cory Henson","Amit Sheth"],"pdf_url":"https://arxiv.org/pdf/2411.03225v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2411.03223v1","updated":"2024-11-05T16:12:12Z","published":"2024-11-05T16:12:12Z","title":"Beyond Grid Data: Exploring Graph Neural Networks for Earth Observation","summary":" Earth Observation (EO) data analysis has been significantly revolutionized by\ndeep learning (DL), with applications typically limited to grid-like data\nstructures. Graph Neural Networks (GNNs) emerge as an important innovation,\npropelling DL into the non-Euclidean domain. Naturally, GNNs can effectively\ntackle the challenges posed by diverse modalities, multiple sensors, and the\nheterogeneous nature of EO data. To introduce GNNs in the related domains, our\nreview begins by offering fundamental knowledge on GNNs. Then, we summarize the\ngeneric problems in EO, to which GNNs can offer potential solutions. Following\nthis, we explore a broad spectrum of GNNs' applications to scientific problems\nin Earth systems, covering areas such as weather and climate analysis, disaster\nmanagement, air quality monitoring, agriculture, land cover classification,\nhydrological process modeling, and urban modeling. The rationale behind\nadopting GNNs in these fields is explained, alongside methodologies for\norganizing graphs and designing favorable architectures for various tasks.\nFurthermore, we highlight methodological challenges of implementing GNNs in\nthese domains and possible solutions that could guide future research. While\nacknowledging that GNNs are not a universal solution, we conclude the paper by\ncomparing them with other popular architectures like transformers and analyzing\ntheir potential synergies.\n","authors":["Shan Zhao","Zhaiyu Chen","Zhitong Xiong","Yilei Shi","Sudipan Saha","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.03223v1.pdf","comment":"Accepted for publication in Geoscience and Remote Sensing Magazine\n (GRSM)"},{"id":"http://arxiv.org/abs/2403.09918v4","updated":"2024-11-05T15:37:00Z","published":"2024-03-14T23:31:41Z","title":"Attention-based Class-Conditioned Alignment for Multi-Source Domain\n Adaptation of Object Detectors","summary":" Domain adaptation methods for object detection (OD) strive to mitigate the\nimpact of distribution shifts by promoting feature alignment across source and\ntarget domains. Multi-source domain adaptation (MSDA) allows leveraging\nmultiple annotated source datasets and unlabeled target data to improve the\naccuracy and robustness of the detection model. Most state-of-the-art MSDA\nmethods for OD perform feature alignment in a class-agnostic manner. This is\nchallenging since the objects have unique modal information due to variations\nin object appearance across domains. A recent prototype-based approach proposed\na class-wise alignment, yet it suffers from error accumulation due to noisy\npseudo-labels that can negatively affect adaptation with imbalanced data. 
To\novercome these limitations, we propose an attention-based class-conditioned\nalignment method for MSDA that aligns instances of each object category across\ndomains. In particular, an attention module coupled with an adversarial domain\nclassifier allows learning domain-invariant and class-specific instance\nrepresentations. Experimental results on multiple benchmarking MSDA datasets\nindicate that our method outperforms the state-of-the-art methods and is robust\nto class imbalance using a conceptually simple class-conditioning method. Our\ncode is available at https://github.com/imatif17/ACIA.\n","authors":["Atif Belal","Akhil Meethal","Francisco Perdigon Romero","Marco Pedersoli","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2403.09918v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2309.14950"},{"id":"http://arxiv.org/abs/2411.03177v1","updated":"2024-11-05T15:22:26Z","published":"2024-11-05T15:22:26Z","title":"On Improved Conditioning Mechanisms and Pre-training Strategies for\n Diffusion Models","summary":" Large-scale training of latent diffusion models (LDMs) has enabled\nunprecedented quality in image generation. However, the key components of the\nbest performing LDM training recipes are oftentimes not available to the\nresearch community, preventing apple-to-apple comparisons and hindering the\nvalidation of progress in the field. In this work, we perform an in-depth study\nof LDM training recipes focusing on the performance of models and their\ntraining efficiency. To ensure apple-to-apple comparisons, we re-implement five\npreviously published models with their corresponding recipes. Through our\nstudy, we explore the effects of (i)~the mechanisms used to condition the\ngenerative model on semantic information (e.g., text prompt) and control\nmetadata (e.g., crop size, random flip flag, etc.) on the model performance,\nand (ii)~the transfer of the representations learned on smaller and\nlower-resolution datasets to larger ones on the training efficiency and model\nperformance. We then propose a novel conditioning mechanism that disentangles\nsemantic and control metadata conditionings and sets a new state-of-the-art in\nclass-conditional generation on the ImageNet-1k dataset -- with FID\nimprovements of 7% on 256 and 8% on 512 resolutions -- as well as text-to-image\ngeneration on the CC12M dataset -- with FID improvements of 8% on 256 and 23%\non 512 resolution.\n","authors":["Tariq Berrada Ifriqi","Pietro Astolfi","Melissa Hall","Reyhane Askari-Hemmat","Yohann Benchetrit","Marton Havasi","Matthew Muckley","Karteek Alahari","Adriana Romero-Soriano","Jakob Verbeek","Michal Drozdzal"],"pdf_url":"https://arxiv.org/pdf/2411.03177v1.pdf","comment":"Accepted as a conference paper (poster) for NeurIPS 2024"},{"id":"http://arxiv.org/abs/2309.05756v3","updated":"2024-11-05T15:18:15Z","published":"2023-09-11T18:35:14Z","title":"GlobalDoc: A Cross-Modal Vision-Language Framework for Real-World\n Document Image Retrieval and Classification","summary":" Visual document understanding (VDU) has rapidly advanced with the development\nof powerful multi-modal language models. However, these models typically\nrequire extensive document pre-training data to learn intermediate\nrepresentations and often suffer a significant performance drop in real-world\nonline industrial settings. 
A primary issue is their heavy reliance on OCR\nengines to extract local positional information within document pages, which\nlimits the models' ability to capture global information and hinders their\ngeneralizability, flexibility, and robustness. In this paper, we introduce\nGlobalDoc, a cross-modal transformer-based architecture pre-trained in a\nself-supervised manner using three novel pretext objective tasks. GlobalDoc\nimproves the learning of richer semantic concepts by unifying language and\nvisual representations, resulting in more transferable models. For proper\nevaluation, we also propose two novel document-level downstream VDU tasks,\nFew-Shot Document Image Classification (DIC) and Content-based Document Image\nRetrieval (DIR), designed to simulate industrial scenarios more closely.\nExtensive experimentation has been conducted to demonstrate GlobalDoc's\neffectiveness in practical settings.\n","authors":["Souhail Bakkali","Sanket Biswas","Zuheng Ming","Mickaël Coustaty","Marçal Rusiñol","Oriol Ramos Terrades","Josep Lladós"],"pdf_url":"https://arxiv.org/pdf/2309.05756v3.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2411.03169v1","updated":"2024-11-05T15:18:02Z","published":"2024-11-05T15:18:02Z","title":"Pre-trained Visual Dynamics Representations for Efficient Policy\n Learning","summary":" Pre-training for Reinforcement Learning (RL) with purely video data is a\nvaluable yet challenging problem. Although in-the-wild videos are readily\navailable and inhere a vast amount of prior world knowledge, the absence of\naction annotations and the common domain gap with downstream tasks hinder\nutilizing videos for RL pre-training. To address the challenge of pre-training\nwith videos, we propose Pre-trained Visual Dynamics Representations (PVDR) to\nbridge the domain gap between videos and downstream tasks for efficient policy\nlearning. By adopting video prediction as a pre-training task, we use a\nTransformer-based Conditional Variational Autoencoder (CVAE) to learn visual\ndynamics representations. The pre-trained visual dynamics representations\ncapture the visual dynamics prior knowledge in the videos. This abstract prior\nknowledge can be readily adapted to downstream tasks and aligned with\nexecutable actions through online adaptation. We conduct experiments on a\nseries of robotics visual control tasks and verify that PVDR is an effective\nform for pre-training with videos to promote policy learning.\n","authors":["Hao Luo","Bohan Zhou","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2411.03169v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2410.02401v5","updated":"2024-11-05T14:38:30Z","published":"2024-10-03T11:29:09Z","title":"SynCo: Synthetic Hard Negatives in Contrastive Learning for Better\n Unsupervised Visual Representations","summary":" Contrastive learning has become a dominant approach in self-supervised visual\nrepresentation learning. Hard negatives - samples closely resembling the anchor\n- are key to enhancing learned representations' discriminative power. However,\nefficiently leveraging hard negatives remains challenging. We introduce SynCo\n(Synthetic Negatives in Contrastive learning), a novel approach that improves\nmodel performance by generating synthetic hard negatives on the representation\nspace. Building on the MoCo framework, SynCo introduces six strategies for\ncreating diverse synthetic hard negatives on-the-fly with minimal computational\noverhead. 
SynCo achieves faster training and better representation learning,\nreaching 67.9% top-1 accuracy on ImageNet ILSVRC-2012 linear evaluation after\n200 pretraining epochs, surpassing MoCo's 67.5% using the same ResNet-50\nencoder. It also transfers more effectively to detection tasks: on PASCAL VOC,\nit outperforms both the supervised baseline and MoCo with 82.5% AP; on COCO, it\nsets new benchmarks with 40.9% AP for bounding box detection and 35.5% AP for\ninstance segmentation. Our synthetic hard negative generation approach\nsignificantly enhances visual representations learned through self-supervised\ncontrastive learning. Code is available at\nhttps://github.com/giakoumoglou/synco.\n","authors":["Nikolaos Giakoumoglou","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2410.02401v5.pdf","comment":"10 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2411.02293v2","updated":"2024-11-05T14:33:41Z","published":"2024-11-04T17:21:42Z","title":"Tencent Hunyuan3D-1.0: A Unified Framework for Text-to-3D and\n Image-to-3D Generation","summary":" While 3D generative models have greatly improved artists' workflows, the\nexisting diffusion models for 3D generation suffer from slow generation and\npoor generalization. To address this issue, we propose a two-stage approach\nnamed Hunyuan3D-1.0 including a lite version and a standard version, that both\nsupport text- and image-conditioned generation. In the first stage, we employ a\nmulti-view diffusion model that efficiently generates multi-view RGB in\napproximately 4 seconds. These multi-view images capture rich details of the 3D\nasset from different viewpoints, relaxing the tasks from single-view to\nmulti-view reconstruction. In the second stage, we introduce a feed-forward\nreconstruction model that rapidly and faithfully reconstructs the 3D asset\ngiven the generated multi-view images in approximately 7 seconds. The\nreconstruction network learns to handle noise and inconsistency introduced by\nthe multi-view diffusion and leverages the available information from the\ncondition image to efficiently recover the 3D structure. Our framework involves\nthe text-to-image model, i.e., Hunyuan-DiT, making it a unified framework to\nsupport both text- and image-conditioned 3D generation. Our standard version\nhas 3x more parameters than our lite version and other existing models. Our\nHunyuan3D-1.0 achieves an impressive balance between speed and quality,\nsignificantly reducing generation time while maintaining the quality and\ndiversity of the produced assets.\n","authors":["Xianghui Yang","Huiwen Shi","Bowen Zhang","Fan Yang","Jiacheng Wang","Hongxu Zhao","Xinhai Liu","Xinzhou Wang","Qingxiang Lin","Jiaao Yu","Lifu Wang","Zhuo Chen","Sicong Liu","Yuhong Liu","Yong Yang","Di Wang","Jie Jiang","Chunchao Guo"],"pdf_url":"https://arxiv.org/pdf/2411.02293v2.pdf","comment":"Technical Report; 3D Generation"},{"id":"http://arxiv.org/abs/2411.03129v1","updated":"2024-11-05T14:21:01Z","published":"2024-11-05T14:21:01Z","title":"MA^2: A Self-Supervised and Motion Augmenting Autoencoder for Gait-Based\n Automatic Disease Detection","summary":" Ground reaction force (GRF) is the force exerted by the ground on a body in\ncontact with it. GRF-based automatic disease detection (ADD) has become an\nemerging medical diagnosis method, which aims to learn and identify disease\npatterns corresponding to different gait pressures based on deep learning\nmethods. 
Although existing ADD methods can save doctors time in making\ndiagnoses, training deep models still struggles with the cost of the labeling\neffort required for a large amount of gait diagnostic data from subjects.\nOn the other hand, the accuracy of the deep model under the unified benchmark\nGRF dataset and the generalization ability on scalable gait datasets need to be\nfurther improved. To address these issues, we propose MA2, a GRF-based\nself-supervised and motion augmenting auto-encoder, which models the ADD task\nas an encoder-decoder paradigm. In the encoder, we introduce an embedding block\nincluding a 3-layer 1D convolution for extracting tokens and a mask\ngenerator to randomly mask out the sequence of tokens to maximize the model's\npotential to capture high-level, discriminative, intrinsic representations.\nThereafter, the decoder utilizes this information to reconstruct the pixel\nsequence of the original input and calculate the reconstruction loss to optimize\nthe network. Moreover, the backbone of the auto-encoder is multi-head\nself-attention that can consider the global information of the token from the\ninput, not just the local neighborhood. This allows the model to capture\ngeneralized contextual information. Extensive experiments demonstrate that MA2 achieves\nSOTA performance of 90.91% accuracy on 1% limited pathological GRF samples with\nlabels, and good generalization ability of 78.57% accuracy on the scalable\nParkinson's disease dataset.\n","authors":["Yiqun Liu","Ke Zhang","Yin Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.03129v1.pdf","comment":"8 pages, 11 figures, article"},{"id":"http://arxiv.org/abs/2410.20595v3","updated":"2024-11-05T14:13:56Z","published":"2024-10-27T21:02:37Z","title":"A Framework for Real-Time Volcano-Seismic Event Recognition Based on\n Multi-Station Seismograms and Semantic Segmentation Models","summary":" In volcano monitoring, effective recognition of seismic events is essential\nfor understanding volcanic activity and raising timely warning alerts.\nTraditional methods rely on manual analysis, which can be subjective and\nlabor-intensive. Furthermore, current automatic approaches often tackle\ndetection and classification separately, mostly rely on single-station\ninformation and generally require tailored preprocessing and representations to\nperform predictions. These limitations often hinder their application to\nreal-time monitoring and utilization across different volcano conditions. This\nstudy introduces a novel approach that utilizes Semantic Segmentation models to\nautomate seismic event recognition by applying a straightforward\ntransformation of multi-channel 1D signals into 2D representations, enabling\ntheir use as images. Our framework employs a data-driven, end-to-end design\nthat integrates multi-station seismic data with minimal preprocessing,\nperforming both detection and classification simultaneously for five seismic\nevent classes. We evaluated four state-of-the-art segmentation models (UNet,\nUNet++, DeepLabV3+ and SwinUNet) on approximately 25,000 seismic events\nrecorded at four different Chilean volcanoes: Nevados del Chill\'an Volcanic\nComplex, Laguna del Maule, Villarrica and Puyehue-Cord\'on Caulle. 
Among these\nmodels, the UNet architecture was identified as the most effective model,\nachieving mean F1 and Intersection over Union (IoU) scores of up to 0.91 and\n0.88, respectively, and demonstrating superior noise robustness and model\nflexibility to unseen volcano datasets.\n","authors":["Camilo Espinosa-Curilem","Millaray Curilem","Daniel Basualto"],"pdf_url":"https://arxiv.org/pdf/2410.20595v3.pdf","comment":"10 pages, 9 figures. This is a pre-print, it is currently under\n review for publication"},{"id":"http://arxiv.org/abs/2312.05327v3","updated":"2024-11-05T14:12:40Z","published":"2023-12-08T19:24:05Z","title":"Better, Not Just More: Data-Centric Machine Learning for Earth\n Observation","summary":" Recent developments and research in modern machine learning have led to\nsubstantial improvements in the geospatial field. Although numerous deep\nlearning architectures and models have been proposed, the majority of them have\nbeen solely developed on benchmark datasets that lack strong real-world\nrelevance. Furthermore, the performance of many methods has already saturated\non these datasets. We argue that a shift from a model-centric view to a\ncomplementary data-centric perspective is necessary for further improvements in\naccuracy, generalization ability, and real impact on end-user applications.\nFurthermore, considering the entire machine learning cycle-from problem\ndefinition to model deployment with feedback-is crucial for enhancing machine\nlearning models that can be reliable in unforeseen situations. This work\npresents a definition as well as a precise categorization and overview of\nautomated data-centric learning approaches for geospatial data. It highlights\nthe complementary role of data-centric learning with respect to model-centric\nin the larger machine learning deployment cycle. We review papers across the\nentire geospatial field and categorize them into different groups. A set of\nrepresentative experiments shows concrete implementation examples. These\nexamples provide concrete steps to act on geospatial data with data-centric\nmachine learning approaches.\n","authors":["Ribana Roscher","Marc Rußwurm","Caroline Gevaert","Michael Kampffmeyer","Jefersson A. dos Santos","Maria Vakalopoulou","Ronny Hänsch","Stine Hansen","Keiller Nogueira","Jonathan Prexl","Devis Tuia"],"pdf_url":"https://arxiv.org/pdf/2312.05327v3.pdf","comment":"Accepted to Geoscience and Remote Sensing Magazine"},{"id":"http://arxiv.org/abs/2411.03114v1","updated":"2024-11-05T14:03:36Z","published":"2024-11-05T14:03:36Z","title":"Investigating the Applicability of a Snapshot Computed Tomography\n Imaging Spectrometer for the Prediction of Brix and pH of Grapes","summary":" In this paper, a recently developed snapshot hyperspectral imaging (HSI)\nsystem based on Computed Tomography Imaging Spectroscopy (CTIS) is utilized to\ndetermine Brix and pH values in Sheegene 20 table grapes through Partial Least\nSquares Regression (PLSR) modeling. The performance of the CTIS system is\ncompared with that of a state-of-the-art line scan HSI system by imaging 100\ngrapes across both platforms. Reference measurements of Brix and pH values are\nobtained directly using a refractometer and a pH meter, as these parameters are\nessential for assessing the quality of table and wine grapes. The findings\nindicate that the spectra captured by the CTIS camera correlate well with the\nreference measurements, despite the system's narrower spectral range. 
The CTIS\ncamera's advantages, including its lower cost, portability, and reduced\nsusceptibility to motion errors, highlight its potential for promising in-field\napplications in grape quality assessment.\n","authors":["Mads Svanborg Peters","Mads Juul Ahlebæk","Mads Toudal Frandsen","Bjarke Jørgensen","Christian Hald Jessen","Andreas Krogh Carlsen","Wei-Chih Huang","René Lynge Eriksen"],"pdf_url":"https://arxiv.org/pdf/2411.03114v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2401.17789v3","updated":"2024-11-05T14:00:12Z","published":"2024-01-31T12:32:17Z","title":"Robustly overfitting latents for flexible neural image compression","summary":" Neural image compression has made a great deal of progress. State-of-the-art\nmodels are based on variational autoencoders and are outperforming classical\nmodels. Neural compression models learn to encode an image into a quantized\nlatent representation that can be efficiently sent to the decoder, which\ndecodes the quantized latent into a reconstructed image. While these models\nhave proven successful in practice, they lead to sub-optimal results due to\nimperfect optimization and limitations in the encoder and decoder capacity.\nRecent work shows how to use stochastic Gumbel annealing (SGA) to refine the\nlatents of pre-trained neural image compression models. We extend this idea by\nintroducing SGA+, which contains three different methods that build upon SGA.\nWe show how our method improves the overall compression performance in terms of\nthe R-D trade-off, compared to its predecessors. Additionally, we show how\nrefinement of the latents with our best-performing method improves the\ncompression performance on both the Tecnick and CLIC dataset. Our method is\ndeployed for a pre-trained hyperprior and for a more flexible model. Further,\nwe give a detailed analysis of our proposed methods and show that they are less\nsensitive to hyperparameter choices. Finally, we show how each method can be\nextended to three- instead of two-class rounding.\n","authors":["Yura Perugachi-Diaz","Arwin Gansekoele","Sandjai Bhulai"],"pdf_url":"https://arxiv.org/pdf/2401.17789v3.pdf","comment":"Accepted at Neural Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2411.03098v1","updated":"2024-11-05T13:44:25Z","published":"2024-11-05T13:44:25Z","title":"Local Lesion Generation is Effective for Capsule Endoscopy Image Data\n Augmentation in a Limited Data Setting","summary":" Limited medical imaging datasets challenge deep learning models by increasing\nrisks of overfitting and reduced generalization, particularly in Generative\nAdversarial Networks (GANs), where discriminators may overfit, leading to\ntraining divergence. This constraint also impairs classification models trained\non small datasets. Generative Data Augmentation (GDA) addresses this by\nexpanding training datasets with synthetic data, although it requires training\na generative model. We propose and evaluate two local lesion generation\napproaches to address the challenge of augmenting small medical image datasets.\nThe first approach employs the Poisson Image Editing algorithm, a classical\nimage processing technique, to create realistic image composites that\noutperform current state-of-the-art methods. 
The second approach introduces a\nnovel generative method, leveraging a fine-tuned Image Inpainting GAN to\nsynthesize realistic lesions within specified regions of real training images.\nA comprehensive comparison of the two proposed methods demonstrates that\neffective local lesion generation in a data-constrained setting allows for\nreaching new state-of-the-art results in capsule endoscopy lesion\nclassification. Combination of our techniques achieves a macro F1-score of\n33.07%, surpassing the previous best result by 7.84 percentage points (p.p.) on\nthe highly imbalanced Kvasir Capsule Dataset, a benchmark for capsule\nendoscopy. To the best of our knowledge, this work is the first to apply a\nfine-tuned Image Inpainting GAN for GDA in medical imaging, demonstrating that\nan image-conditional GAN can be adapted effectively to limited datasets to\ngenerate high-quality examples, facilitating effective data augmentation.\nAdditionally, we show that combining this GAN-based approach with classical\nimage processing techniques further enhances the results.\n","authors":["Adrian B. Chłopowiec","Adam R. Chłopowiec","Krzysztof Galus","Wojciech Cebula","Martin Tabakov"],"pdf_url":"https://arxiv.org/pdf/2411.03098v1.pdf","comment":"45 pages, 27 figures"},{"id":"http://arxiv.org/abs/2411.03086v1","updated":"2024-11-05T13:31:04Z","published":"2024-11-05T13:31:04Z","title":"HFGaussian: Learning Generalizable Gaussian Human with Integrated Human\n Features","summary":" Recent advancements in radiance field rendering show promising results in 3D\nscene representation, where Gaussian splatting-based techniques emerge as\nstate-of-the-art due to their quality and efficiency. Gaussian splatting is\nwidely used for various applications, including 3D human representation.\nHowever, previous 3D Gaussian splatting methods either use parametric body\nmodels as additional information or fail to provide any underlying structure,\nlike human biomechanical features, which are essential for different\napplications. In this paper, we present a novel approach called HFGaussian that\ncan estimate novel views and human features, such as the 3D skeleton, 3D key\npoints, and dense pose, from sparse input images in real time at 25 FPS. The\nproposed method leverages generalizable Gaussian splatting technique to\nrepresent the human subject and its associated features, enabling efficient and\ngeneralizable reconstruction. By incorporating a pose regression network and\nthe feature splatting technique with Gaussian splatting, HFGaussian\ndemonstrates improved capabilities over existing 3D human methods, showcasing\nthe potential of 3D human representations with integrated biomechanics. We\nthoroughly evaluate our HFGaussian method against the latest state-of-the-art\ntechniques in human Gaussian splatting and pose estimation, demonstrating its\nreal-time, state-of-the-art performance.\n","authors":["Arnab Dey","Cheng-You Lu","Andrew I. 
Comport","Srinath Sridhar","Chin-Teng Lin","Jean Martinet"],"pdf_url":"https://arxiv.org/pdf/2411.03086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03082v1","updated":"2024-11-05T13:26:31Z","published":"2024-11-05T13:26:31Z","title":"Self-supervised cross-modality learning for uncertainty-aware object\n detection and recognition in applications which lack pre-labelled training\n data","summary":" This paper shows how an uncertainty-aware, deep neural network can be trained\nto detect, recognise and localise objects in 2D RGB images, in applications\nlacking annotated train-ng datasets. We propose a self-supervising\nteacher-student pipeline, in which a relatively simple teacher classifier,\ntrained with only a few labelled 2D thumbnails, automatically processes a\nlarger body of unlabelled RGB-D data to teach a student network based on a\nmodified YOLOv3 architecture. Firstly, 3D object detection with back projection\nis used to automatically extract and teach 2D detection and localisation\ninformation to the student network. Secondly, a weakly supervised 2D thumbnail\nclassifier, with minimal training on a small number of hand-labelled images, is\nused to teach object category recognition. Thirdly, we use a Gaussian Process\nGP to encode and teach a robust uncertainty estimation functionality, so that\nthe student can output confidence scores with each categorization. The\nresulting student significantly outperforms the same YOLO architecture trained\ndirectly on the same amount of labelled data. Our GP-based approach yields\nrobust and meaningful uncertainty estimations for complex industrial object\nclassifications. The end-to-end network is also capable of real-time\nprocessing, needed for robotics applications. Our method can be applied to many\nimportant industrial tasks, where labelled datasets are typically unavailable.\nIn this paper, we demonstrate an example of detection, localisation, and object\ncategory recognition of nuclear mixed-waste materials in highly cluttered and\nunstructured scenes. This is critical for robotic sorting and handling of\nlegacy nuclear waste, which poses complex environmental remediation challenges\nin many nuclearised nations.\n","authors":["Irum Mehboob","Li Sun","Alireza Astegarpanah","Rustam Stolkin"],"pdf_url":"https://arxiv.org/pdf/2411.03082v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2410.22566v2","updated":"2024-11-05T13:21:26Z","published":"2024-10-29T22:15:03Z","title":"Deep Priors for Video Quality Prediction","summary":" In this work, we designed a completely blind video quality assessment\nalgorithm using the deep video prior. This work mainly explores the utility of\ndeep video prior in estimating the visual quality of the video. In our work, we\nhave used a single distorted video and a reference video pair to learn the deep\nvideo prior. At inference time, the learned deep prior is used to restore the\noriginal videos from the distorted videos. The ability of learned deep video\nprior to restore the original video from the distorted video is measured to\nquantify distortion in the video. Our hypothesis is that the learned deep video\nprior fails in restoring the highly distorted videos. The restoring ability of\ndeep video prior is proportional to the distortion present in the video.\nTherefore, we propose to use the distance between the distorted video and the\nrestored video as the perceptual quality of the video. Our algorithm is trained\nusing a single video pair and it does not need any labelled data. 
We show that\nour proposed algorithm outperforms the existing unsupervised video quality\nassessment algorithms in terms of LCC and SROCC on a synthetically distorted\nvideo quality assessment dataset.\n","authors":["Siddharath Narayan Shakya","Parimala Kancharla"],"pdf_url":"https://arxiv.org/pdf/2410.22566v2.pdf","comment":"Indian Conference on Computer Vision, Graphics and Image Processing\n (ICVGIP) 2024 conference tiny paper"},{"id":"http://arxiv.org/abs/2410.20535v3","updated":"2024-11-05T13:18:23Z","published":"2024-10-27T17:57:30Z","title":"Asynchronous Perception Machine For Efficient Test-Time-Training","summary":" In this work, we propose the Asynchronous Perception Machine (APM), a\ncomputationally-efficient architecture for test-time-training (TTT). APM can\nprocess patches of an image one at a time in any order asymmetrically and still\nencode semantic-awareness in the net. We demonstrate APM's ability to recognize\nout-of-distribution images without dataset-specific pre-training, augmentation\nor any pretext task. APM offers competitive performance over existing TTT\napproaches. To perform TTT, APM just distills the test sample's representation\nonce. APM possesses a unique property: it can learn using just this single\nrepresentation and starts predicting semantically-aware features.\n APM demonstrates potential applications beyond test-time-training: APM can\nscale up to a dataset of 2D images and yield semantic clusterings in a single\nforward pass. APM also provides the first empirical evidence towards validating\nGLOM's insight, i.e. that the input percept is a field. Therefore, APM helps us converge\ntowards an implementation which can do both interpolation and perception on\nshared-connectionist hardware. Our code is publicly available at this link:\nhttps://rajatmodi62.github.io/apm_project_page/.\n","authors":["Rajat Modi","Yogesh Singh Rawat"],"pdf_url":"https://arxiv.org/pdf/2410.20535v3.pdf","comment":"Accepted to NeurIPS 2024 Main Track. APM is a step to getting\n Geoffrey Hinton's GLOM working"},{"id":"http://arxiv.org/abs/2404.11614v3","updated":"2024-11-05T13:16:54Z","published":"2024-04-17T17:59:55Z","title":"Dynamic Typography: Bringing Text to Life via Video Diffusion Prior","summary":" Text animation serves as an expressive medium, transforming static\ncommunication into dynamic experiences by infusing words with motion to evoke\nemotions, emphasize meanings, and construct compelling narratives. Crafting\nanimations that are semantically aware poses significant challenges, demanding\nexpertise in graphic design and animation. We present an automated text\nanimation scheme, termed \"Dynamic Typography\", which combines two challenging\ntasks. It deforms letters to convey semantic meaning and infuses them with\nvibrant movements based on user prompts. Our technique harnesses vector\ngraphics representations and an end-to-end optimization-based framework. This\nframework employs neural displacement fields to convert letters into base\nshapes and applies per-frame motion, encouraging coherence with the intended\ntextual concept. Shape preservation techniques and perceptual loss\nregularization are employed to maintain legibility and structural integrity\nthroughout the animation process. We demonstrate the generalizability of our\napproach across various text-to-video models and highlight the superiority of\nour end-to-end methodology over baseline methods, which might comprise separate\ntasks. 
Through quantitative and qualitative evaluations, we demonstrate the\neffectiveness of our framework in generating coherent text animations that\nfaithfully interpret user prompts while maintaining readability. Our code is\navailable at: https://animate-your-word.github.io/demo/.\n","authors":["Zichen Liu","Yihao Meng","Hao Ouyang","Yue Yu","Bolin Zhao","Daniel Cohen-Or","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2404.11614v3.pdf","comment":"Our demo and code is available at:\n https://animate-your-word.github.io/demo/"},{"id":"http://arxiv.org/abs/2410.02761v3","updated":"2024-11-05T13:14:23Z","published":"2024-10-03T17:59:34Z","title":"FakeShield: Explainable Image Forgery Detection and Localization via\n Multi-modal Large Language Models","summary":" The rapid development of generative AI is a double-edged sword, which not\nonly facilitates content creation but also makes image manipulation easier and\nmore difficult to detect. Although current image forgery detection and\nlocalization (IFDL) methods are generally effective, they tend to face two\nchallenges: \\textbf{1)} black-box nature with unknown detection principle,\n\\textbf{2)} limited generalization across diverse tampering methods (e.g.,\nPhotoshop, DeepFake, AIGC-Editing). To address these issues, we propose the\nexplainable IFDL task and design FakeShield, a multi-modal framework capable of\nevaluating image authenticity, generating tampered region masks, and providing\na judgment basis based on pixel-level and image-level tampering clues.\nAdditionally, we leverage GPT-4o to enhance existing IFDL datasets, creating\nthe Multi-Modal Tamper Description dataSet (MMTD-Set) for training FakeShield's\ntampering analysis capabilities. Meanwhile, we incorporate a Domain Tag-guided\nExplainable Forgery Detection Module (DTE-FDM) and a Multi-modal Forgery\nLocalization Module (MFLM) to address various types of tamper detection\ninterpretation and achieve forgery localization guided by detailed textual\ndescriptions. Extensive experiments demonstrate that FakeShield effectively\ndetects and localizes various tampering techniques, offering an explainable and\nsuperior solution compared to previous IFDL methods.\n","authors":["Zhipei Xu","Xuanyu Zhang","Runyi Li","Zecheng Tang","Qing Huang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02761v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03064v1","updated":"2024-11-05T12:54:01Z","published":"2024-11-05T12:54:01Z","title":"Exploiting the Segment Anything Model (SAM) for Lung Segmentation in\n Chest X-ray Images","summary":" Segment Anything Model (SAM), a new AI model from Meta AI released in April\n2023, is an ambitious tool designed to identify and separate individual objects\nwithin a given image through semantic interpretation. The advanced capabilities\nof SAM are the result of its training with millions of images and masks, and a\nfew days after its release, several researchers began testing the model on\nmedical images to evaluate its performance in this domain. With this\nperspective in focus -- i.e., optimizing work in the healthcare field -- this\nwork proposes the use of this new technology to evaluate and study chest X-ray\nimages. The approach adopted for this work, with the aim of improving the\nmodel's performance for lung segmentation, involved a transfer learning\nprocess, specifically the fine-tuning technique. 
After applying this\nadjustment, a substantial improvement was observed in the evaluation metrics\nused to assess SAM's performance compared to the masks provided by the\ndatasets. The results obtained by the model after the adjustments were\nsatisfactory and comparable to those of cutting-edge neural networks, such as U-Net.\n","authors":["Gabriel Bellon de Carvalho","Jurandy Almeida"],"pdf_url":"https://arxiv.org/pdf/2411.03064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03055v1","updated":"2024-11-05T12:42:42Z","published":"2024-11-05T12:42:42Z","title":"ATM: Improving Model Merging by Alternating Tuning and Merging","summary":" Model merging has recently emerged as a cost-efficient paradigm for\nmulti-task learning. Among current approaches, task arithmetic stands out for\nits simplicity and effectiveness. In this paper, we motivate the effectiveness\nof task vectors by linking them to multi-task gradients. We show that in a\nsingle-epoch scenario, task vectors are mathematically equivalent to the\ngradients obtained via gradient descent in a multi-task setting, and still\napproximate these gradients in subsequent epochs. Furthermore, we show that\ntask vectors perform optimally when equality is maintained, and their\neffectiveness is largely driven by the first epoch's gradient. Building on this\ninsight, we propose viewing model merging as a single step in an iterative\nprocess that Alternates between Tuning and Merging (ATM). This method acts as a\nbridge between model merging and multi-task gradient descent, achieving\nstate-of-the-art results with the same data and computational requirements. We\nextensively evaluate ATM across diverse settings, achieving up to 20% higher\naccuracy in computer vision and NLP tasks, compared to the best\nbaselines. Finally, we provide both empirical and theoretical support for its\neffectiveness, demonstrating increased orthogonality between task vectors and\nproving that ATM minimizes an upper bound on the loss obtained by jointly\nfinetuning all tasks.\n","authors":["Luca Zhou","Daniele Solombrino","Donato Crisostomi","Maria Sofia Bucarelli","Fabrizio Silvestri","Emanuele Rodolà"],"pdf_url":"https://arxiv.org/pdf/2411.03055v1.pdf","comment":"Main paper: 10 Pages, 11 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.03053v1","updated":"2024-11-05T12:39:21Z","published":"2024-11-05T12:39:21Z","title":"Gradient-Guided Conditional Diffusion Models for Private Image\n Reconstruction: Analyzing Adversarial Impacts of Differential Privacy and\n Denoising","summary":" We investigate the construction of gradient-guided conditional diffusion\nmodels for reconstructing private images, focusing on the adversarial interplay\nbetween differential privacy noise and the denoising capabilities of diffusion\nmodels. While current gradient-based reconstruction methods struggle with\nhigh-resolution images due to computational complexity and prior knowledge\nrequirements, we propose two novel methods that require minimal modifications\nto the diffusion model's generation process and eliminate the need for prior\nknowledge. Our approach leverages the strong image generation capabilities of\ndiffusion models to reconstruct private images starting from randomly generated\nnoise, even when a small amount of differentially private noise has been added\nto the gradients. 
We also conduct a comprehensive theoretical analysis of the\nimpact of differential privacy noise on the quality of reconstructed images,\nrevealing the relationship among noise magnitude, the architecture of attacked\nmodels, and the attacker's reconstruction capability. Additionally, extensive\nexperiments validate the effectiveness of our proposed methods and the accuracy\nof our theoretical findings, suggesting new directions for privacy risk\nauditing using conditional diffusion models.\n","authors":["Tao Huang","Jiayang Meng","Hong Chen","Guolong Zheng","Xu Yang","Xun Yi","Hua Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03047v1","updated":"2024-11-05T12:30:07Z","published":"2024-11-05T12:30:07Z","title":"GarVerseLOD: High-Fidelity 3D Garment Reconstruction from a Single\n In-the-Wild Image using a Dataset with Levels of Details","summary":" Neural implicit functions have brought impressive advances to the\nstate-of-the-art of clothed human digitization from multiple or even single\nimages. However, despite the progress, current arts still have difficulty\ngeneralizing to unseen images with complex cloth deformation and body poses. In\nthis work, we present GarVerseLOD, a new dataset and framework that paves the\nway to achieving unprecedented robustness in high-fidelity 3D garment\nreconstruction from a single unconstrained image. Inspired by the recent\nsuccess of large generative models, we believe that one key to addressing the\ngeneralization challenge lies in the quantity and quality of 3D garment data.\nTowards this end, GarVerseLOD collects 6,000 high-quality cloth models with\nfine-grained geometry details manually created by professional artists. In\naddition to the scale of training data, we observe that having disentangled\ngranularities of geometry can play an important role in boosting the\ngeneralization capability and inference accuracy of the learned model. We hence\ncraft GarVerseLOD as a hierarchical dataset with levels of details (LOD),\nspanning from detail-free stylized shape to pose-blended garment with\npixel-aligned details. This allows us to make this highly under-constrained\nproblem tractable by factorizing the inference into easier tasks, each narrowed\ndown with smaller searching space. To ensure GarVerseLOD can generalize well to\nin-the-wild images, we propose a novel labeling paradigm based on conditional\ndiffusion models to generate extensive paired images for each garment model\nwith high photorealism. We evaluate our method on a massive amount of\nin-the-wild images. Experimental results demonstrate that GarVerseLOD can\ngenerate standalone garment pieces with significantly better quality than prior\napproaches. Project page: https://garverselod.github.io/\n","authors":["Zhongjin Luo","Haolin Liu","Chenghong Li","Wanghao Du","Zirong Jin","Wanhu Sun","Yinyu Nie","Weikai Chen","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2411.03047v1.pdf","comment":"Project page: https://garverselod.github.io/"},{"id":"http://arxiv.org/abs/2411.03044v1","updated":"2024-11-05T12:27:24Z","published":"2024-11-05T12:27:24Z","title":"Evaluation of handwriting kinematics and pressure for differential\n diagnosis of Parkinson's disease","summary":" Objective: We present the PaHaW Parkinson's disease handwriting database,\nconsisting of handwriting samples from Parkinson's disease (PD) patients and\nhealthy controls. 
Our goal is to show that kinematic features and pressure\nfeatures in handwriting can be used for the differential diagnosis of PD.\nMethods and Material: The database contains records from 37 PD patients and 38\nhealthy controls performing eight different handwriting tasks. The tasks\ninclude drawing an Archimedean spiral, repetitively writing orthographically\nsimple syllables and words, and writing of a sentence. In addition to the\nconventional kinematic features related to the dynamics of handwriting, we\ninvestigated new pressure features based on the pressure exerted on the writing\nsurface. To discriminate between PD patients and healthy subjects, three\ndifferent classifiers were compared: K-nearest neighbors (K-NN), ensemble\nAdaBoost classifier, and support vector machines (SVM). Results: For predicting\nPD based on kinematic and pressure features of handwriting, the best performing\nmodel was SVM with classification accuracy of Pacc = 81.3% (sensitivity Psen =\n87.4% and specificity of Pspe = 80.9%). When evaluated separately, pressure\nfeatures proved to be relevant for PD diagnosis, yielding Pacc = 82.5% compared\nto Pacc = 75.4% using kinematic features. Conclusion: Experimental results\nshowed that an analysis of kinematic and pressure features during handwriting\ncan help assess subtle characteristics of handwriting and discriminate between\nPD patients and healthy controls.\n","authors":["Peter Drotár","Jiří Mekyska","Irena Rektorová","Lucia Masarová","Zdeněk Smékal","Marcos Faundez-Zanuy"],"pdf_url":"https://arxiv.org/pdf/2411.03044v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2410.00086v2","updated":"2024-11-05T12:25:32Z","published":"2024-09-30T17:56:27Z","title":"ACE: All-round Creator and Editor Following Instructions via Diffusion\n Transformer","summary":" Diffusion models have emerged as a powerful generative technology and have\nbeen found to be applicable in various scenarios. Most existing foundational\ndiffusion models are primarily designed for text-guided visual generation and\ndo not support multi-modal conditions, which are essential for many visual\nediting tasks. This limitation prevents these foundational diffusion models\nfrom serving as a unified model in the field of visual generation, like GPT-4\nin the natural language processing field. In this work, we propose ACE, an\nAll-round Creator and Editor, which achieves comparable performance compared to\nthose expert models in a wide range of visual generation tasks. To achieve this\ngoal, we first introduce a unified condition format termed Long-context\nCondition Unit (LCU), and propose a novel Transformer-based diffusion model\nthat uses LCU as input, aiming for joint training across various generation and\nediting tasks. Furthermore, we propose an efficient data collection approach to\naddress the issue of the absence of available training data. It involves\nacquiring pairwise images with synthesis-based or clustering-based pipelines\nand supplying these pairs with accurate textual instructions by leveraging a\nfine-tuned multi-modal large language model. To comprehensively evaluate the\nperformance of our model, we establish a benchmark of manually annotated pairs\ndata across a variety of visual generation tasks. 
The extensive experimental\nresults demonstrate the superiority of our model in visual generation fields.\nThanks to the all-in-one capabilities of our model, we can easily build a\nmulti-modal chat system that responds to any interactive request for image\ncreation using a single model to serve as the backend, avoiding the cumbersome\npipeline typically employed in visual agents. Code and models will be available\non the project page: https://ali-vilab.github.io/ace-page/.\n","authors":["Zhen Han","Zeyinzi Jiang","Yulin Pan","Jingfeng Zhang","Chaojie Mao","Chenwei Xie","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.00086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03041v1","updated":"2024-11-05T12:24:28Z","published":"2024-11-05T12:24:28Z","title":"Judge Like a Real Doctor: Dual Teacher Sample Consistency Framework for\n Semi-supervised Medical Image Classification","summary":" Semi-supervised learning (SSL) is a popular solution to alleviate the high\nannotation cost in medical image classification. As a main branch of SSL,\nconsistency regularization engages in imposing consensus between the\npredictions of a single sample from different views, termed as Absolute\nLocation consistency (AL-c). However, only AL-c may be insufficient. Just like\nwhen diagnosing a case in practice, besides the case itself, the doctor usually\nrefers to certain related trustworthy cases to make more reliable\ndecisions. Therefore, we argue that solely relying on AL-c may ignore the\nrelative differences across samples, which we interpret as relative locations,\nand only exploit limited information from one perspective. To address this\nissue, we propose a Sample Consistency Mean Teacher (SCMT) which not only\nincorporates AL-c but also enforces consistency between each\nsample's relative similarities to its related samples, called Relative Location\nconsistency (RL-c). AL-c and RL-c conduct consistency regularization from two\ndifferent perspectives, jointly extracting more diverse semantic information\nfor classification. On the other hand, due to the highly similar structures in\nmedical images, the sample distribution could be overly dense in feature space,\nmaking their relative locations susceptible to noise. To tackle this problem,\nwe further develop a Sample Scatter Mean Teacher (SSMT) by utilizing\ncontrastive learning to sparsify the sample distribution and obtain robust and\neffective relative locations. Extensive experiments on different datasets\ndemonstrate the superiority of our method.\n","authors":["Zhang Qixiang","Yang Yuxiang","Zu Chen","Zhang Jianjia","Wu Xi","Zhou Jiliu","Wang Yan"],"pdf_url":"https://arxiv.org/pdf/2411.03041v1.pdf","comment":"Accepted by IEEE Transactions on Emerging Topics in Computational\n Intelligence"},{"id":"http://arxiv.org/abs/2405.19572v2","updated":"2024-11-05T12:21:24Z","published":"2024-05-29T23:38:12Z","title":"Blind Image Restoration via Fast Diffusion Inversion","summary":" Image Restoration (IR) methods based on a pre-trained diffusion model have\ndemonstrated state-of-the-art performance. However, they have two fundamental\nlimitations: 1) they often assume that the degradation operator is completely\nknown and 2) they alter the diffusion sampling process, which may result in\nrestored images that do not lie on the data manifold. 
To address these\nissues, we propose Blind Image Restoration via fast Diffusion inversion (BIRD),\na blind IR method that jointly optimizes for the degradation model parameters\nand the restored image. To ensure that the restored images lie on the data\nmanifold, we propose a novel sampling technique on a pre-trained diffusion\nmodel. A key idea in our method is not to modify the reverse sampling, i.e., not\nto alter all the intermediate latents, once an initial noise is sampled. This\nis ultimately equivalent to casting the IR task as an optimization problem in\nthe space of the input noise. Moreover, to mitigate the computational cost\nassociated with inverting a fully unrolled diffusion model, we leverage the\ninherent capability of these models to skip ahead in the forward diffusion\nprocess using large time steps. We experimentally validate BIRD on several\nimage restoration tasks and show that it achieves state-of-the-art performance\non all of them. Our code is available at\nhttps://github.com/hamadichihaoui/BIRD.\n","authors":["Hamadi Chihaoui","Abdelhak Lemkhenter","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2405.19572v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03033v1","updated":"2024-11-05T12:10:02Z","published":"2024-11-05T12:10:02Z","title":"Rethinking Decoders for Transformer-based Semantic Segmentation:\n Compression is All You Need","summary":" State-of-the-art methods for Transformer-based semantic segmentation\ntypically adopt Transformer decoders that are used to extract additional\nembeddings from image embeddings via cross-attention, refine either or both\ntypes of embeddings via self-attention, and project image embeddings onto the\nadditional embeddings via dot-product. Despite their remarkable success, these\nempirical designs still lack theoretical justifications or interpretations,\nthus hindering potentially principled improvements. In this paper, we argue\nthat there are fundamental connections between semantic segmentation and\ncompression, especially between the Transformer decoders and Principal\nComponent Analysis (PCA). From such a perspective, we derive a white-box, fully\nattentional DEcoder for PrIncipled semantiC segmenTation (DEPICT), with the\ninterpretations as follows: 1) the self-attention operator refines image\nembeddings to construct an ideal principal subspace that aligns with the\nsupervision and retains most information; 2) the cross-attention operator seeks\nto find a low-rank approximation of the refined image embeddings, which is\nexpected to be a set of orthonormal bases of the principal subspace and\ncorresponds to the predefined classes; 3) the dot-product operation yields\ncompact representations of image embeddings as segmentation masks. Experiments\nconducted on the ADE20K dataset find that DEPICT consistently outperforms its\nblack-box counterpart, Segmenter, and it is lightweight and more robust.\n","authors":["Qishuai Wen","Chun-Guang Li"],"pdf_url":"https://arxiv.org/pdf/2411.03033v1.pdf","comment":"NeurIPS2024. 
Code:https://github.com/QishuaiWen/DEPICT/"},{"id":"http://arxiv.org/abs/2406.16658v3","updated":"2024-11-05T11:53:56Z","published":"2024-06-24T14:08:27Z","title":"Sampling Strategies in Bayesian Inversion: A Study of RTO and Langevin\n Methods","summary":" This paper studies two classes of sampling methods for the solution of\ninverse problems, namely Randomize-Then-Optimize (RTO), which is rooted in\nsensitivity analysis, and Langevin methods, which are rooted in the Bayesian\nframework. The two classes of methods correspond to different assumptions and\nyield samples from different target distributions. We highlight the main\nconceptual and theoretical differences between the two approaches and compare\nthem from a practical point of view by tackling two classical inverse problems\nin imaging: deblurring and inpainting. We show that the choice of the sampling\nmethod has a significant impact on the quality of the reconstruction and that\nthe RTO method is more robust to the choice of the parameters.\n","authors":["Remi Laumont","Yiqiu Dong","Martin Skovgaard Andersen"],"pdf_url":"https://arxiv.org/pdf/2406.16658v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03019v1","updated":"2024-11-05T11:42:26Z","published":"2024-11-05T11:42:26Z","title":"FEDLAD: Federated Evaluation of Deep Leakage Attacks and Defenses","summary":" Federated Learning is a privacy preserving decentralized machine learning\nparadigm designed to collaboratively train models across multiple clients by\nexchanging gradients to the server and keeping private data local.\nNevertheless, recent research has revealed that the security of Federated\nLearning is compromised, as private ground truth data can be recovered through\na gradient inversion technique known as Deep Leakage. While these attacks are\ncrafted with a focus on applications in Federated Learning, they generally are\nnot evaluated in realistic scenarios. This paper introduces the FEDLAD\nFramework (Federated Evaluation of Deep Leakage Attacks and Defenses), a\ncomprehensive benchmark for evaluating Deep Leakage attacks and defenses within\na realistic Federated context. By implementing a unified benchmark that\nencompasses multiple state-of-the-art Deep Leakage techniques and various\ndefense strategies, our framework facilitates the evaluation and comparison of\nthe efficacy of these methods across different datasets and training states.\nThis work highlights a crucial trade-off between privacy and model accuracy in\nFederated Learning and aims to advance the understanding of security challenges\nin decentralized machine learning systems, stimulate future research, and\nenhance reproducibility in evaluating Deep Leakage attacks and defenses.\n","authors":["Isaac Baglin","Xiatian Zhu","Simon Hadfield"],"pdf_url":"https://arxiv.org/pdf/2411.03019v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2407.09510v4","updated":"2024-11-05T11:41:40Z","published":"2024-06-17T11:43:38Z","title":"3DGS.zip: A survey on 3D Gaussian Splatting Compression Methods","summary":" 3D Gaussian Splatting (3DGS) has emerged as a cutting-edge technique for\nreal-time radiance field rendering, offering state-of-the-art performance in\nterms of both quality and speed. 3DGS models a scene as a collection of\nthree-dimensional Gaussians, or splats, with additional attributes optimized to\nconform to the scene's geometric and visual properties. Despite its advantages\nin rendering speed and image fidelity, 3DGS is limited by its significant\nstorage and memory demands. 
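Editor's note: for the RTO-versus-Langevin comparison above, here is a minimal NumPy sketch of the Langevin side (an unadjusted Langevin iteration driven by an assumed log-posterior gradient); the RTO alternative would instead solve a randomized optimization problem per sample. This is a generic illustration, not the paper's experimental code.

```python
import numpy as np

def unadjusted_langevin(grad_log_post, x0, step=1e-3, n_iter=5000, rng=None):
    """x_{k+1} = x_k + step * grad log pi(x_k) + sqrt(2*step) * xi_k."""
    rng = rng or np.random.default_rng(0)
    x = np.array(x0, dtype=float)
    samples = []
    for _ in range(n_iter):
        noise = rng.standard_normal(x.shape)
        x = x + step * grad_log_post(x) + np.sqrt(2.0 * step) * noise
        samples.append(x.copy())
    return np.array(samples)

# toy usage: sample a 1D Gaussian posterior N(1, 0.5^2)
grad_log = lambda x: -(x - 1.0) / 0.25
chain = unadjusted_langevin(grad_log, x0=np.zeros(1))
print(chain[1000:].mean(), chain[1000:].std())   # approx. 1.0 and 0.5
```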
These high demands make 3DGS impractical for mobile\ndevices or headsets, reducing its applicability in important areas of computer\ngraphics. To address these challenges and advance the practicality of 3DGS,\nthis survey provides a comprehensive and detailed examination of compression\nand compaction techniques developed to make 3DGS more efficient. We categorize\ncurrent approaches into compression techniques, which aim at achieving the\nhighest quality at minimal data size, and compaction techniques, which aim for\noptimal quality with the fewest Gaussians. We introduce the basic mathematical\nconcepts underlying the analyzed methods, as well as key implementation details\nand design choices. Our report thoroughly discusses similarities and\ndifferences among the methods, as well as their respective advantages and\ndisadvantages. We establish a consistent standard for comparing these methods\nbased on key performance metrics and datasets. Specifically, since these\nmethods have been developed in parallel and over a short period of time,\ncurrently, no comprehensive comparison exists. This survey, for the first time,\npresents a unified standard to evaluate 3DGS compression techniques. To\nfacilitate the continuous monitoring of emerging methodologies, we maintain a\ndedicated website that will be regularly updated with new techniques and\nrevisions of existing findings https://w-m.github.io/3dgs-compression-survey/ .\n","authors":["Milena T. Bagdasarian","Paul Knoll","Yi-Hsin Li","Florian Barthel","Anna Hilsmann","Peter Eisert","Wieland Morgenstern"],"pdf_url":"https://arxiv.org/pdf/2407.09510v4.pdf","comment":"3D Gaussian Splatting compression survey; 3DGS compression; new\n approaches added"},{"id":"http://arxiv.org/abs/2408.05964v3","updated":"2024-11-05T11:30:40Z","published":"2024-08-12T07:33:11Z","title":"Target Detection of Safety Protective Gear Using the Improved YOLOv5","summary":" In high-risk railway construction, personal protective equipment monitoring\nis critical but challenging due to small and frequently obstructed targets. We\npropose YOLO-EA, an innovative model that enhances safety measure detection by\nintegrating ECA into its backbone's convolutional layers, improving discernment\nof minuscule objects like hardhats. YOLO-EA further refines target recognition\nunder occlusion by replacing GIoU with EIoU loss. YOLO-EA's effectiveness was\nempirically substantiated using a dataset derived from real-world railway\nconstruction site surveillance footage. It outperforms YOLOv5, achieving 98.9%\nprecision and 94.7% recall, up 2.5% and 0.5% respectively, while maintaining\nreal-time performance at 70.774 fps. This highly efficient and precise YOLO-EA\nholds great promise for practical application in intricate construction\nscenarios, enforcing stringent safety compliance during complex railway\nconstruction projects.\n","authors":["Hao Liu","Xue Qin"],"pdf_url":"https://arxiv.org/pdf/2408.05964v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08565v3","updated":"2024-11-05T11:29:59Z","published":"2024-10-11T06:44:31Z","title":"Ocean-omni: To Understand the World with Omni-modality","summary":" The salient multimodal capabilities and interactive experience of GPT-4o\nhighlight its critical role in practical applications, yet it lacks a\nhigh-performing open-source counterpart. 
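Editor's note: the YOLO-EA entry above replaces the GIoU box loss with EIoU. Below is a small PyTorch sketch of EIoU as it is commonly formulated (an IoU term plus center-distance and width/height penalties normalized by the enclosing box); treat the exact formulation as an assumption rather than the paper's implementation.

```python
import torch

def eiou_loss(pred, target, eps=1e-7):
    """EIoU loss for boxes given as (x1, y1, x2, y2), shape (N, 4)."""
    # intersection over union
    lt = torch.max(pred[:, :2], target[:, :2])
    rb = torch.min(pred[:, 2:], target[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]
    area_p = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
    area_t = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
    iou = inter / (area_p + area_t - inter + eps)

    # smallest enclosing box
    enc_lt = torch.min(pred[:, :2], target[:, :2])
    enc_rb = torch.max(pred[:, 2:], target[:, 2:])
    cw = (enc_rb[:, 0] - enc_lt[:, 0]).clamp(min=eps)
    ch = (enc_rb[:, 1] - enc_lt[:, 1]).clamp(min=eps)

    # center-distance and width/height penalties
    cp = (pred[:, :2] + pred[:, 2:]) / 2
    ct = (target[:, :2] + target[:, 2:]) / 2
    rho2 = ((cp - ct) ** 2).sum(dim=1)
    dw2 = (pred[:, 2] - pred[:, 0] - (target[:, 2] - target[:, 0])) ** 2
    dh2 = (pred[:, 3] - pred[:, 1] - (target[:, 3] - target[:, 1])) ** 2

    return 1 - iou + rho2 / (cw ** 2 + ch ** 2) + dw2 / cw ** 2 + dh2 / ch ** 2

print(eiou_loss(torch.tensor([[10., 10., 50., 60.]]),
                torch.tensor([[12., 14., 48., 58.]])))
```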
In this paper, we introduce\nOcean-omni, the first open-source 7B Multimodal Large Language Model (MLLM)\nadept at concurrently processing and analyzing modalities of image, video,\naudio, and text, while delivering an advanced multimodal interactive experience\nand strong performance. We propose an effective multimodal training schema\nstarting with 7B model and proceeding through two stages of multimodal\nalignment and multitask fine-tuning across audio, image, video, and text modal.\nThis approach equips the language model with the ability to handle visual and\naudio data effectively. Demonstrating strong performance across various\nomni-modal and multimodal benchmarks, we aim for this contribution to serve as\na competitive baseline for the open-source community in advancing multimodal\nunderstanding and real-time interaction.\n","authors":["Yadong Li","Haoze Sun","Mingan Lin","Tianpeng Li","Guosheng Dong","Tao Zhang","Bowen Ding","Wei Song","Zhenglin Cheng","Yuqi Huo","Song Chen","Xu Li","Da Pan","Shusen Zhang","Xin Wu","Zheng Liang","Jun Liu","Tao Zhang","Keer Lu","Yaqi Zhao","Yanjun Shen","Fan Yang","Kaicheng Yu","Tao Lin","Jianhua Xu","Zenan Zhou","Weipeng Chen"],"pdf_url":"https://arxiv.org/pdf/2410.08565v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03013v1","updated":"2024-11-05T11:25:19Z","published":"2024-11-05T11:25:19Z","title":"CRT-Fusion: Camera, Radar, Temporal Fusion Using Motion Information for\n 3D Object Detection","summary":" Accurate and robust 3D object detection is a critical component in autonomous\nvehicles and robotics. While recent radar-camera fusion methods have made\nsignificant progress by fusing information in the bird's-eye view (BEV)\nrepresentation, they often struggle to effectively capture the motion of\ndynamic objects, leading to limited performance in real-world scenarios. In\nthis paper, we introduce CRT-Fusion, a novel framework that integrates temporal\ninformation into radar-camera fusion to address this challenge. Our approach\ncomprises three key modules: Multi-View Fusion (MVF), Motion Feature Estimator\n(MFE), and Motion Guided Temporal Fusion (MGTF). The MVF module fuses radar and\nimage features within both the camera view and bird's-eye view, thereby\ngenerating a more precise unified BEV representation. The MFE module conducts\ntwo simultaneous tasks: estimation of pixel-wise velocity information and BEV\nsegmentation. Based on the velocity and the occupancy score map obtained from\nthe MFE module, the MGTF module aligns and fuses feature maps across multiple\ntimestamps in a recurrent manner. By considering the motion of dynamic objects,\nCRT-Fusion can produce robust BEV feature maps, thereby improving detection\naccuracy and robustness. Extensive evaluations on the challenging nuScenes\ndataset demonstrate that CRT-Fusion achieves state-of-the-art performance for\nradar-camera-based 3D object detection. Our approach outperforms the previous\nbest method in terms of NDS by +1.7%, while also surpassing the leading\napproach in mAP by +1.4%. 
These significant improvements in both metrics\nshowcase the effectiveness of our proposed fusion strategy in enhancing the\nreliability and accuracy of 3D object detection.\n","authors":["Jisong Kim","Minjae Seong","Jun Won Choi"],"pdf_url":"https://arxiv.org/pdf/2411.03013v1.pdf","comment":"Accepted at NeurIPS2024"},{"id":"http://arxiv.org/abs/2411.02999v1","updated":"2024-11-05T11:00:55Z","published":"2024-11-05T11:00:55Z","title":"Precise Drive with VLM: First Prize Solution for PRCV 2024 Drive LM\n challenge","summary":" This technical report outlines the methodologies we applied for the PRCV\nChallenge, focusing on cognition and decision-making in driving scenarios. We\nemployed InternVL-2.0, a pioneering open-source multi-modal model, and enhanced\nit by refining both the model input and training methodologies. For the input\ndata, we strategically concatenated and formatted the multi-view images. It is\nworth mentioning that we utilized the coordinates of the original images\nwithout transformation. In terms of model training, we initially pre-trained\nthe model on publicly available autonomous driving scenario datasets to bolster\nits alignment capabilities of the challenge tasks, followed by fine-tuning on\nthe DriveLM-nuscenes Dataset. During the fine-tuning phase, we innovatively\nmodified the loss function to enhance the model's precision in predicting\ncoordinate values. These approaches ensure that our model possesses advanced\ncognitive and decision-making capabilities in driving scenarios. Consequently,\nour model achieved a score of 0.6064, securing the first prize on the\ncompetition's final results.\n","authors":["Bin Huang","Siyu Wang","Yuanpeng Chen","Yidan Wu","Hui Song","Zifan Ding","Jing Leng","Chengpeng Liang","Peng Xue","Junliang Zhang","Tiankun Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.02999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02997v1","updated":"2024-11-05T10:58:37Z","published":"2024-11-05T10:58:37Z","title":"PV-faultNet: Optimized CNN Architecture to detect defects resulting\n efficient PV production","summary":" The global shift towards renewable energy has pushed PV cell manufacturing as\na pivotal point as they are the fundamental building block of green energy.\nHowever, the manufacturing process is complex enough to lose its purpose due to\nprobable defects experienced during the time impacting the overall efficiency.\nHowever, at the moment, manual inspection is being conducted to detect the\ndefects that can cause bias, leading to time and cost inefficiency. Even if\nautomated solutions have also been proposed, most of them are\nresource-intensive, proving ineffective in production environments. In that\ncontext, this study presents PV-faultNet, a lightweight Convolutional Neural\nNetwork (CNN) architecture optimized for efficient and real-time defect\ndetection in photovoltaic (PV) cells, designed to be deployable on\nresource-limited production devices. Addressing computational challenges in\nindustrial PV manufacturing environments, the model includes only 2.92 million\nparameters, significantly reducing processing demands without sacrificing\naccuracy. Comprehensive data augmentation techniques were implemented to tackle\ndata scarcity, thus enhancing model generalization and maintaining a balance\nbetween precision and recall. 
The proposed model achieved high performance with\n91\\% precision, 89\\% recall, and a 90\\% F1 score, demonstrating its\neffectiveness for scalable quality control in PV production.\n","authors":["Eiffat E Zaman","Rahima Khanam"],"pdf_url":"https://arxiv.org/pdf/2411.02997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02992v1","updated":"2024-11-05T10:53:25Z","published":"2024-11-05T10:53:25Z","title":"Efficient and Effective Adaptation of Multimodal Foundation Models in\n Sequential Recommendation","summary":" Multimodal foundation models (MFMs) have revolutionized sequential\nrecommender systems through advanced representation learning. While\nParameter-efficient Fine-tuning (PEFT) is commonly used to adapt these models,\nstudies often prioritize parameter efficiency, neglecting GPU memory and\ntraining speed. To address this, we introduced the IISAN framework,\nsignificantly enhancing efficiency. However, IISAN was limited to symmetrical\nMFMs and identical text and image encoders, preventing the use of\nstate-of-the-art Large Language Models. To overcome this, we developed\nIISAN-Versa, a versatile plug-and-play architecture compatible with both\nsymmetrical and asymmetrical MFMs. IISAN-Versa employs a Decoupled PEFT\nstructure and utilizes both intra- and inter-modal adaptation. It effectively\nhandles asymmetry through a simple yet effective combination of group\nlayer-dropping and dimension transformation alignment. Our research\ndemonstrates that IISAN-Versa effectively adapts large text encoders, and we\nfurther identify a scaling effect where larger encoders generally perform\nbetter. IISAN-Versa also demonstrates strong versatility in our defined\nmultimodal scenarios, which include raw titles and captions generated from\nimages and videos. Additionally, IISAN-Versa achieved state-of-the-art\nperformance on the Microlens public benchmark. We will release our code and\ndatasets to support future research.\n","authors":["Junchen Fu","Xuri Ge","Xin Xin","Alexandros Karatzoglou","Ioannis Arapakis","Kaiwen Zheng","Yongxin Ni","Joemon M. Jose"],"pdf_url":"https://arxiv.org/pdf/2411.02992v1.pdf","comment":"The extension of IISAN in SIGIR2024"},{"id":"http://arxiv.org/abs/2308.02905v3","updated":"2024-11-05T10:51:30Z","published":"2023-08-05T15:54:06Z","title":"FASTER: A Font-Agnostic Scene Text Editing and Rendering Framework","summary":" Scene Text Editing (STE) is a challenging research problem, that primarily\naims towards modifying existing texts in an image while preserving the\nbackground and the font style of the original text. Despite its utility in\nnumerous real-world applications, existing style-transfer-based approaches have\nshown sub-par editing performance due to (1) complex image backgrounds, (2)\ndiverse font attributes, and (3) varying word lengths within the text. To\naddress such limitations, in this paper, we propose a novel font-agnostic scene\ntext editing and rendering framework, named FASTER, for simultaneously\ngenerating text in arbitrary styles and locations while preserving a natural\nand realistic appearance and structure. A combined fusion of target mask\ngeneration and style transfer units, with a cascaded self-attention mechanism\nhas been proposed to focus on multi-level text region edits to handle varying\nword lengths. 
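Editor's note: as a rough illustration of the kind of few-million-parameter CNN the PV-faultNet entry above targets for resource-limited defect detection, here is a generic PyTorch sketch built from depthwise-separable blocks; the layer sizes and single-channel input are assumptions, not the published architecture.

```python
import torch
import torch.nn as nn

class TinyDefectNet(nn.Module):
    """Generic lightweight binary classifier (defect / no defect)."""
    def __init__(self, num_classes=2):
        super().__init__()
        def block(cin, cout):
            # depthwise-separable convolution keeps the parameter count small
            return nn.Sequential(
                nn.Conv2d(cin, cin, 3, padding=1, groups=cin, bias=False),
                nn.Conv2d(cin, cout, 1, bias=False),
                nn.BatchNorm2d(cout), nn.ReLU(inplace=True), nn.MaxPool2d(2))
        self.features = nn.Sequential(block(1, 32), block(32, 64),
                                      block(64, 128), block(128, 256))
        self.head = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(),
                                  nn.Linear(256, num_classes))

    def forward(self, x):
        return self.head(self.features(x))

model = TinyDefectNet()
print(sum(p.numel() for p in model.parameters()))   # well under a few million
x = torch.randn(4, 1, 224, 224)                     # grayscale cell images assumed
print(model(x).shape)                               # torch.Size([4, 2])
```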
Extensive evaluation on a real-world database with further\nsubjective human evaluation study indicates the superiority of FASTER in both\nscene text editing and rendering tasks, in terms of model performance and\nefficiency. Our code will be released upon acceptance.\n","authors":["Alloy Das","Sanket Biswas","Prasun Roy","Subhankar Ghosh","Umapada Pal","Michael Blumenstein","Josep Lladós","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.02905v3.pdf","comment":"Accepted in WACV 2025"},{"id":"http://arxiv.org/abs/2411.02979v1","updated":"2024-11-05T10:41:45Z","published":"2024-11-05T10:41:45Z","title":"CAD-NeRF: Learning NeRFs from Uncalibrated Few-view Images by CAD Model\n Retrieval","summary":" Reconstructing from multi-view images is a longstanding problem in 3D vision,\nwhere neural radiance fields (NeRFs) have shown great potential and get\nrealistic rendered images of novel views. Currently, most NeRF methods either\nrequire accurate camera poses or a large number of input images, or even both.\nReconstructing NeRF from few-view images without poses is challenging and\nhighly ill-posed. To address this problem, we propose CAD-NeRF, a method\nreconstructed from less than 10 images without any known poses. Specifically,\nwe build a mini library of several CAD models from ShapeNet and render them\nfrom many random views. Given sparse-view input images, we run a model and pose\nretrieval from the library, to get a model with similar shapes, serving as the\ndensity supervision and pose initializations. Here we propose a multi-view pose\nretrieval method to avoid pose conflicts among views, which is a new and unseen\nproblem in uncalibrated NeRF methods. Then, the geometry of the object is\ntrained by the CAD guidance. The deformation of the density field and camera\nposes are optimized jointly. Then texture and density are trained and\nfine-tuned as well. All training phases are in self-supervised manners.\nComprehensive evaluations of synthetic and real images show that CAD-NeRF\nsuccessfully learns accurate densities with a large deformation from retrieved\nCAD models, showing the generalization abilities.\n","authors":["Xin Wen","Xuening Zhu","Renjiao Yi","Zhifeng Wang","Chenyang Zhu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2411.02979v1.pdf","comment":"The article has been accepted by Frontiers of Computer Science (FCS)"},{"id":"http://arxiv.org/abs/2305.11616v5","updated":"2024-11-05T10:41:42Z","published":"2023-05-19T11:47:51Z","title":"Diversifying Deep Ensembles: A Saliency Map Approach for Enhanced OOD\n Detection, Calibration, and Accuracy","summary":" Deep ensembles are capable of achieving state-of-the-art results in\nclassification and out-of-distribution (OOD) detection. However, their\neffectiveness is limited due to the homogeneity of learned patterns within\nensembles. To overcome this issue, our study introduces Saliency Diversified\nDeep Ensemble (SDDE), a novel approach that promotes diversity among ensemble\nmembers by leveraging saliency maps. Through incorporating saliency map\ndiversification, our method outperforms conventional ensemble techniques and\nimproves calibration in multiple classification and OOD detection tasks. 
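Editor's note: a toy PyTorch sketch of the saliency-diversification idea in the SDDE entry above: compute each ensemble member's input-gradient saliency and penalize pairwise similarity between members. The cosine-similarity measure and the way the penalty is combined with the usual per-member losses are assumptions.

```python
import torch
import torch.nn.functional as F

def saliency_diversity_penalty(models, x, y):
    """Mean pairwise cosine similarity between per-model saliency maps.

    models: list of classifiers; x: input batch; y: integer labels.
    Minimizing this alongside the per-model cross-entropy pushes ensemble
    members to rely on different input evidence.
    """
    sal = []
    for m in models:
        x_req = x.clone().requires_grad_(True)
        loss = F.cross_entropy(m(x_req), y)
        g, = torch.autograd.grad(loss, x_req, create_graph=True)
        sal.append(g.abs().flatten(1))          # (B, D) saliency per model
    penalty, pairs = 0.0, 0
    for i in range(len(sal)):
        for j in range(i + 1, len(sal)):
            penalty = penalty + F.cosine_similarity(sal[i], sal[j], dim=1).mean()
            pairs += 1
    return penalty / max(pairs, 1)

# toy usage with two tiny models
models = [torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(32 * 32, 10))
          for _ in range(2)]
x, y = torch.randn(4, 1, 32, 32), torch.randint(0, 10, (4,))
print(float(saliency_diversity_penalty(models, x, y)))
```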
In\nparticular, the proposed method achieves state-of-the-art OOD detection\nquality, calibration, and accuracy on multiple benchmarks, including\nCIFAR10/100 and large-scale ImageNet datasets.\n","authors":["Stanislav Dereka","Ivan Karpukhin","Maksim Zhdanov","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2305.11616v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10464v2","updated":"2024-11-05T10:32:02Z","published":"2023-12-16T14:46:24Z","title":"Identity Curvature Laplace Approximation for Improved\n Out-of-Distribution Detection","summary":" Uncertainty estimation is crucial in safety-critical applications, where\nrobust out-of-distribution (OOD) detection is essential. Traditional Bayesian\nmethods, though effective, are often hindered by high computational demands. As\nan alternative, Laplace approximation offers a more practical and efficient\napproach to uncertainty estimation. In this paper, we introduce the Identity\nCurvature Laplace Approximation (ICLA), a novel method that challenges the\nconventional posterior covariance formulation by using identity curvature and\noptimizing prior precision. This innovative design significantly enhances OOD\ndetection performance on well-known datasets such as CIFAR-10, CIFAR-100, and\nImageNet, while maintaining calibration scores. We attribute this improvement\nto the alignment issues between typical feature embeddings and curvature as\nmeasured by the Fisher information matrix. Our findings are further supported\nby demonstrating that incorporating Fisher penalty or sharpness-aware\nminimization techniques can greatly enhance the uncertainty estimation\ncapabilities of standard Laplace approximation.\n","authors":["Maksim Zhdanov","Stanislav Dereka","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2312.10464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01739v2","updated":"2024-11-05T10:23:00Z","published":"2024-11-04T01:42:41Z","title":"Not Just Object, But State: Compositional Incremental Learning without\n Forgetting","summary":" Most incremental learners excessively prioritize coarse classes of objects\nwhile neglecting various kinds of states (e.g. color and material) attached to\nthe objects. As a result, they are limited in the ability to reason\nfine-grained compositionality of state-object pairs. To remedy this limitation,\nwe propose a novel task called Compositional Incremental Learning\n(composition-IL), enabling the model to recognize state-object compositions as\na whole in an incremental learning fashion. Since the lack of suitable\nbenchmarks, we re-organize two existing datasets and make them tailored for\ncomposition-IL. Then, we propose a prompt-based Composition Incremental Learner\n(CompILer), to overcome the ambiguous composition boundary problem which\nchallenges composition-IL largely. Specifically, we exploit multi-pool prompt\nlearning, which is regularized by inter-pool prompt discrepancy and intra-pool\nprompt diversity. Besides, we devise object-injected state prompting by using\nobject prompts to guide the selection of state prompts. Furthermore, we fuse\nthe selected prompts by a generalized-mean strategy, to eliminate irrelevant\ninformation learned in the prompts. 
Extensive experiments on two datasets\nexhibit state-of-the-art performance achieved by CompILer.\n","authors":["Yanyi Zhang","Binglin Qiu","Qi Jia","Yu Liu","Ran He"],"pdf_url":"https://arxiv.org/pdf/2411.01739v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.21130v2","updated":"2024-11-05T10:22:35Z","published":"2024-10-28T15:31:47Z","title":"Extrapolating Prospective Glaucoma Fundus Images through Diffusion Model\n in Irregular Longitudinal Sequences","summary":" The utilization of longitudinal datasets for glaucoma progression prediction\noffers a compelling approach to support early therapeutic interventions.\nPredominant methodologies in this domain have primarily focused on the direct\nprediction of glaucoma stage labels from longitudinal datasets. However, such\nmethods may not adequately encapsulate the nuanced developmental trajectory of\nthe disease. To enhance the diagnostic acumen of medical practitioners, we\npropose a novel diffusion-based model to predict prospective images by\nextrapolating from existing longitudinal fundus images of patients. The\nmethodology delineated in this study distinctively leverages sequences of\nimages as inputs. Subsequently, a time-aligned mask is employed to select a\nspecific year for image generation. During the training phase, the time-aligned\nmask resolves the issue of irregular temporal intervals in longitudinal image\nsequence sampling. Additionally, we utilize a strategy of randomly masking a\nframe in the sequence to establish the ground truth. This methodology aids the\nnetwork in continuously acquiring knowledge regarding the internal\nrelationships among the sequences throughout the learning phase. Moreover, the\nintroduction of textual labels is instrumental in categorizing images generated\nwithin the sequence. The empirical findings from the conducted experiments\nindicate that our proposed model not only effectively generates longitudinal\ndata but also significantly improves the precision of downstream classification\ntasks.\n","authors":["Zhihao Zhao","Junjie Yang","Shahrooz Faghihroohi","Yinzheng Zhao","Daniel Zapp","Kai Huang","Nassir Navab","M. Ali Nasseri"],"pdf_url":"https://arxiv.org/pdf/2410.21130v2.pdf","comment":"Accepted at BIBM 2024"},{"id":"http://arxiv.org/abs/2411.02974v1","updated":"2024-11-05T10:21:21Z","published":"2024-11-05T10:21:21Z","title":"Region-Guided Attack on the Segment Anything Model (SAM)","summary":" The Segment Anything Model (SAM) is a cornerstone of image segmentation,\ndemonstrating exceptional performance across various applications, particularly\nin autonomous driving and medical imaging, where precise segmentation is\ncrucial. However, SAM is vulnerable to adversarial attacks that can\nsignificantly impair its functionality through minor input perturbations.\nTraditional techniques, such as FGSM and PGD, are often ineffective in\nsegmentation tasks due to their reliance on global perturbations that overlook\nspatial nuances. Recent methods like Attack-SAM-K and UAD have begun to address\nthese challenges, but they frequently depend on external cues and do not fully\nleverage the structural interdependencies within segmentation processes. This\nlimitation underscores the need for a novel adversarial strategy that exploits\nthe unique characteristics of segmentation tasks. In response, we introduce the\nRegion-Guided Attack (RGA), designed specifically for SAM. 
RGA utilizes a\nRegion-Guided Map (RGM) to manipulate segmented regions, enabling targeted\nperturbations that fragment large segments and expand smaller ones, resulting\nin erroneous outputs from SAM. Our experiments demonstrate that RGA achieves\nhigh success rates in both white-box and black-box scenarios, emphasizing the\nneed for robust defenses against such sophisticated attacks. RGA not only\nreveals SAM's vulnerabilities but also lays the groundwork for developing more\nresilient defenses against adversarial threats in image segmentation.\n","authors":["Xiaoliang Liu","Furao Shen","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.02974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02972v1","updated":"2024-11-05T10:16:14Z","published":"2024-11-05T10:16:14Z","title":"Exploring Seasonal Variability in the Context of Neural Radiance Fields\n for 3D Reconstruction on Satellite Imagery","summary":" In this work, the seasonal predictive capabilities of Neural Radiance Fields\n(NeRF) applied to satellite images are investigated. Focusing on the\nutilization of satellite data, the study explores how Sat-NeRF, a novel\napproach in computer vision, performs in predicting seasonal variations across\ndifferent months. Through comprehensive analysis and visualization, the study\nexamines the model's ability to capture and predict seasonal changes,\nhighlighting specific challenges and strengths. Results showcase the impact of\nthe sun direction on predictions, revealing nuanced details in seasonal\ntransitions, such as snow cover, color accuracy, and texture representation in\ndifferent landscapes. Given these results, we propose Planet-NeRF, an extension\nto Sat-NeRF capable of incorporating seasonal variability through a set of\nmonth embedding vectors. Comparative evaluations reveal that Planet-NeRF\noutperforms prior models in the case where seasonal changes are present. The\nextensive evaluation combined with the proposed method offers promising avenues\nfor future research in this domain.\n","authors":["Liv Kåreborn","Erica Ingerstad","Amanda Berg","Justus Karlsson","Leif Haglund"],"pdf_url":"https://arxiv.org/pdf/2411.02972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02969v1","updated":"2024-11-05T10:13:23Z","published":"2024-11-05T10:13:23Z","title":"Multi-modal NeRF Self-Supervision for LiDAR Semantic Segmentation","summary":" LiDAR Semantic Segmentation is a fundamental task in autonomous driving\nperception consisting of associating each LiDAR point to a semantic label.\nFully-supervised models have widely tackled this task, but they require labels\nfor each scan, which either limits their domain or requires impractical amounts\nof expensive annotations. Camera images, which are generally recorded alongside\nLiDAR pointclouds, can be processed by the widely available 2D foundation\nmodels, which are generic and dataset-agnostic. However, distilling knowledge\nfrom 2D data to improve LiDAR perception raises domain adaptation challenges.\nFor example, the classical perspective projection suffers from the parallax\neffect produced by the position shift between both sensors at their respective\ncapture times. We propose a Semi-Supervised Learning setup to leverage\nunlabeled LiDAR pointclouds alongside distilled knowledge from the camera\nimages. To self-supervise our model on the unlabeled scans, we add an auxiliary\nNeRF head and cast rays from the camera viewpoint over the unlabeled voxel\nfeatures. 
The NeRF head predicts densities and semantic logits at each sampled\nray location which are used for rendering pixel semantics. Concurrently, we\nquery the Segment-Anything (SAM) foundation model with the camera image to\ngenerate a set of unlabeled generic masks. We fuse the masks with the rendered\npixel semantics from LiDAR to produce pseudo-labels that supervise the pixel\npredictions. During inference, we drop the NeRF head and run our model with\nonly LiDAR. We show the effectiveness of our approach in three public LiDAR\nSemantic Segmentation benchmarks: nuScenes, SemanticKITTI and ScribbleKITTI.\n","authors":["Xavier Timoneda","Markus Herb","Fabian Duerr","Daniel Goehring","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2411.02969v1.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS) 2024"},{"id":"http://arxiv.org/abs/2406.15831v2","updated":"2024-11-05T09:56:56Z","published":"2024-06-22T12:24:49Z","title":"Shape2.5D: A Dataset of Texture-less Surfaces for Depth and Normals\n Estimation","summary":" Reconstructing texture-less surfaces poses unique challenges in computer\nvision, primarily due to the lack of specialized datasets that cater to the\nnuanced needs of depth and normals estimation in the absence of textural\ninformation. We introduce \"Shape2.5D,\" a novel, large-scale dataset designed to\naddress this gap. Comprising 1.17 million frames spanning over 39,772 3D models\nand 48 unique objects, our dataset provides depth and surface normal maps for\ntexture-less object reconstruction. The proposed dataset includes synthetic\nimages rendered with 3D modeling software to simulate various lighting\nconditions and viewing angles. It also includes a real-world subset comprising\n4,672 frames captured with a depth camera. Our comprehensive benchmarks\ndemonstrate the dataset's ability to support the development of algorithms that\nrobustly estimate depth and normals from RGB images and perform voxel\nreconstruction. Our open-source data generation pipeline allows the dataset to\nbe extended and adapted for future research. The dataset is publicly available\nat https://github.com/saifkhichi96/Shape25D.\n","authors":["Muhammad Saif Ullah Khan","Sankalp Sinha","Didier Stricker","Marcus Liwicki","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2406.15831v2.pdf","comment":"Accepted for publication in IEEE Access"},{"id":"http://arxiv.org/abs/2410.20084v2","updated":"2024-11-05T09:52:50Z","published":"2024-10-26T05:28:02Z","title":"UniVST: A Unified Framework for Training-free Localized Video Style\n Transfer","summary":" This paper presents UniVST, a unified framework for localized video style\ntransfer. It operates without the need for training, offering a distinct\nadvantage over existing methods that transfer style across entire videos. The\nendeavors of this paper comprise: (1) A point-matching mask propagation\nstrategy that leverages feature maps from the DDIM inversion. This streamlines\nthe model's architecture by obviating the need for tracking models. (2) An\nAdaIN-guided style transfer mechanism that operates at both the latent and\nattention levels. This balances content fidelity and style richness, mitigating\nthe loss of localized details commonly associated with direct video\nstylization. (3) A sliding window smoothing strategy that harnesses optical\nflow within the pixel representation and refines predicted noise to update the\nlatent space. 
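Editor's note: the NeRF-head description above (densities and semantic logits sampled along camera rays, then rendered into pixel semantics) reduces to standard alpha compositing. A minimal sketch follows, with ray sampling and the density/logit predictor left as assumed inputs.

```python
import torch

def render_pixel_semantics(densities, logits, deltas):
    """Alpha-composite per-sample semantic logits into per-ray (pixel) logits.

    densities: (R, S)    non-negative volume densities along each ray
    logits:    (R, S, C) semantic logits at each sampled ray location
    deltas:    (R, S)    distances between consecutive samples
    """
    alpha = 1.0 - torch.exp(-densities * deltas)                 # (R, S)
    trans = torch.cumprod(1.0 - alpha + 1e-10, dim=1)            # transmittance
    trans = torch.cat([torch.ones_like(trans[:, :1]), trans[:, :-1]], dim=1)
    weights = alpha * trans                                      # (R, S)
    return (weights.unsqueeze(-1) * logits).sum(dim=1)           # (R, C)

R, S, C = 1024, 64, 17   # rays, samples per ray, semantic classes (assumed)
pixel_logits = render_pixel_semantics(torch.rand(R, S), torch.randn(R, S, C),
                                      torch.full((R, S), 0.1))
print(pixel_logits.shape)  # torch.Size([1024, 17])
```

The rendered pixel logits can then be supervised against the SAM-derived pseudo-labels described above with an ordinary per-pixel classification loss.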
This significantly enhances temporal consistency and diminishes\nartifacts in video outputs. Our proposed UniVST has been validated to be\nsuperior to existing methods in quantitative and qualitative metrics. It\nadeptly addresses the challenges of preserving the primary object's style while\nensuring temporal consistency and detail preservation.\n","authors":["Quanjian Song","Mingbao Lin","Wengyi Zhan","Shuicheng Yan","Liujuan Cao"],"pdf_url":"https://arxiv.org/pdf/2410.20084v2.pdf","comment":"10 pages not including reference"},{"id":"http://arxiv.org/abs/2411.02951v1","updated":"2024-11-05T09:51:59Z","published":"2024-11-05T09:51:59Z","title":"LDPM: Towards undersampled MRI reconstruction with MR-VAE and Latent\n Diffusion Prior","summary":" Diffusion model, as a powerful generative model, has found a wide range of\napplications including MRI reconstruction. However, most existing diffusion\nmodel-based MRI reconstruction methods operate directly in pixel space, which\nmakes their optimization and inference computationally expensive. Latent\ndiffusion models were introduced to address this problem in natural image\nprocessing, but directly applying them to MRI reconstruction still faces many\nchallenges, including the lack of control over the generated results, the\nadaptability of Variational AutoEncoder (VAE) to MRI, and the exploration of\napplicable data consistency in latent space. To address these challenges, a\nLatent Diffusion Prior based undersampled MRI reconstruction (LDPM) method is\nproposed. A sketcher module is utilized to provide appropriate control and\nbalance the quality and fidelity of the reconstructed MR images. A VAE adapted\nfor MRI tasks (MR-VAE) is explored, which can serve as the backbone for future\nMR-related tasks. Furthermore, a variation of the DDIM sampler, called the\nDual-Stage Sampler, is proposed to achieve high-fidelity reconstruction in the\nlatent space. The proposed method achieves competitive results on fastMRI\ndatasets, and the effectiveness of each module is demonstrated in ablation\nexperiments.\n","authors":["Xingjian Tang","Jingwei Guan","Linge Li","Youmei Zhang","Mengye Lyu","Li Yan"],"pdf_url":"https://arxiv.org/pdf/2411.02951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02485v2","updated":"2024-11-05T09:46:45Z","published":"2024-06-04T16:54:28Z","title":"Stable-Pose: Leveraging Transformers for Pose-Guided Text-to-Image\n Generation","summary":" Controllable text-to-image (T2I) diffusion models have shown impressive\nperformance in generating high-quality visual content through the incorporation\nof various conditions. Current methods, however, exhibit limited performance\nwhen guided by skeleton human poses, especially in complex pose conditions such\nas side or rear perspectives of human figures. To address this issue, we\npresent Stable-Pose, a novel adapter model that introduces a coarse-to-fine\nattention masking strategy into a vision Transformer (ViT) to gain accurate\npose guidance for T2I models. Stable-Pose is designed to adeptly handle pose\nconditions within pre-trained Stable Diffusion, providing a refined and\nefficient way of aligning pose representation during image synthesis. We\nleverage the query-key self-attention mechanism of ViTs to explore the\ninterconnections among different anatomical parts in human pose skeletons.\nMasked pose images are used to smoothly refine the attention maps based on\ntarget pose-related features in a hierarchical manner, transitioning from\ncoarse to fine levels. 
Additionally, our loss function is formulated to\nallocate increased emphasis to the pose region, thereby augmenting the model's\nprecision in capturing intricate pose details. We assessed the performance of\nStable-Pose across five public datasets under a wide range of indoor and\noutdoor human pose scenarios. Stable-Pose achieved an AP score of 57.1 in the\nLAION-Human dataset, marking around 13% improvement over the established\ntechnique ControlNet. The project link and code is available at\nhttps://github.com/ai-med/StablePose.\n","authors":["Jiajun Wang","Morteza Ghahremani","Yitong Li","Björn Ommer","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2406.02485v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.02327v2","updated":"2024-11-05T09:43:59Z","published":"2024-11-04T17:50:36Z","title":"PPLLaVA: Varied Video Sequence Understanding With Prompt Guidance","summary":" The past year has witnessed the significant advancement of video-based large\nlanguage models. However, the challenge of developing a unified model for both\nshort and long video understanding remains unresolved. Most existing video LLMs\ncannot handle hour-long videos, while methods custom for long videos tend to be\nineffective for shorter videos and images. In this paper, we identify the key\nissue as the redundant content in videos. To address this, we propose a novel\npooling strategy that simultaneously achieves token compression and\ninstruction-aware visual feature aggregation. Our model is termed Prompt-guided\nPooling LLaVA, or PPLLaVA for short. Specifically, PPLLaVA consists of three\ncore components: the CLIP-based visual-prompt alignment that extracts visual\ninformation relevant to the user's instructions, the prompt-guided pooling that\ncompresses the visual sequence to arbitrary scales using convolution-style\npooling, and the clip context extension designed for lengthy prompt common in\nvisual dialogue. Moreover, our codebase also integrates the most advanced video\nDirect Preference Optimization (DPO) and visual interleave training. Extensive\nexperiments have validated the performance of our model. With superior\nthroughput and only 1024 visual context, PPLLaVA achieves better results on\nimage benchmarks as a video LLM, while achieving state-of-the-art performance\nacross various video benchmarks, excelling in tasks ranging from caption\ngeneration to multiple-choice questions, and handling video lengths from\nseconds to hours. Codes have been available at\nhttps://github.com/farewellthree/PPLLaVA.\n","authors":["Ruyang Liu","Haoran Tang","Haibo Liu","Yixiao Ge","Ying Shan","Chen Li","Jiankun Yang"],"pdf_url":"https://arxiv.org/pdf/2411.02327v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08773v4","updated":"2024-11-05T09:37:33Z","published":"2024-06-13T03:05:36Z","title":"DenoiseRep: Denoising Model for Representation Learning","summary":" The denoising model has been proven a powerful generative model but has\nlittle exploration of discriminative tasks. Representation learning is\nimportant in discriminative tasks, which is defined as \"learning\nrepresentations (or features) of the data that make it easier to extract useful\ninformation when building classifiers or other predictors\". 
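Editor's note: a hypothetical sketch of the prompt-guided pooling idea in the PPLLaVA entry above: weight visual tokens by their similarity to the instruction embedding, then compress the sequence to an arbitrary token budget with adaptive (convolution-style) pooling. The shapes, the dot-product relevance, and the rescaling are assumptions, not the released module.

```python
import torch
import torch.nn.functional as F

def prompt_guided_pool(visual_tokens, prompt_emb, out_len):
    """visual_tokens: (B, N, D) patch/frame tokens; prompt_emb: (B, D);
    out_len: number of visual tokens to keep after compression."""
    # instruction-aware weighting (CLIP-style dot-product relevance)
    rel = torch.einsum("bnd,bd->bn", visual_tokens, prompt_emb)
    rel = rel.softmax(dim=1).unsqueeze(-1)                    # (B, N, 1)
    weighted = visual_tokens * rel * visual_tokens.size(1)    # keep overall scale

    # compress the token sequence to an arbitrary length
    pooled = F.adaptive_avg_pool1d(weighted.transpose(1, 2), out_len)
    return pooled.transpose(1, 2)                             # (B, out_len, D)

B, N, D = 2, 4096, 1024
out = prompt_guided_pool(torch.randn(B, N, D), torch.randn(B, D), out_len=256)
print(out.shape)  # torch.Size([2, 256, 1024])
```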
In this paper, we\npropose a novel Denoising Model for Representation Learning (DenoiseRep) to\nimprove feature discrimination with joint feature extraction and denoising.\nDenoiseRep views each embedding layer in a backbone as a denoising layer,\nprocessing the cascaded embedding layers as if we are recursively denoise\nfeatures step-by-step. This unifies the frameworks of feature extraction and\ndenoising, where the former progressively embeds features from low-level to\nhigh-level, and the latter recursively denoises features step-by-step. After\nthat, DenoiseRep fuses the parameters of feature extraction and denoising\nlayers, and theoretically demonstrates its equivalence before and after the\nfusion, thus making feature denoising computation-free. DenoiseRep is a\nlabel-free algorithm that incrementally improves features but also\ncomplementary to the label if available. Experimental results on various\ndiscriminative vision tasks, including re-identification (Market-1501,\nDukeMTMC-reID, MSMT17, CUHK-03, vehicleID), image classification (ImageNet,\nUB200, Oxford-Pet, Flowers), object detection (COCO), image segmentation\n(ADE20K) show stability and impressive improvements. We also validate its\neffectiveness on the CNN (ResNet) and Transformer (ViT, Swin, Vmamda)\narchitectures.\n","authors":["Zhengrui Xu","Guan'an Wang","Xiaowen Huang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2406.08773v4.pdf","comment":"Accepted by NeurIPS 2024 (Oral)"},{"id":"http://arxiv.org/abs/2411.02935v1","updated":"2024-11-05T09:24:59Z","published":"2024-11-05T09:24:59Z","title":"Mapping Africa Settlements: High Resolution Urban and Rural Map by Deep\n Learning and Satellite Imagery","summary":" Accurate Land Use and Land Cover (LULC) maps are essential for understanding\nthe drivers of sustainable development, in terms of its complex\ninterrelationships between human activities and natural resources. However,\nexisting LULC maps often lack precise urban and rural classifications,\nparticularly in diverse regions like Africa. This study presents a novel\nconstruction of a high-resolution rural-urban map using deep learning\ntechniques and satellite imagery. We developed a deep learning model based on\nthe DeepLabV3 architecture, which was trained on satellite imagery from\nLandsat-8 and the ESRI LULC dataset, augmented with human settlement data from\nthe GHS-SMOD. The model utilizes semantic segmentation to classify land into\ndetailed categories, including urban and rural areas, at a 10-meter resolution.\nOur findings demonstrate that incorporating LULC along with urban and rural\nclassifications significantly enhances the model's ability to accurately\ndistinguish between urban, rural, and non-human settlement areas. Therefore,\nour maps can support more informed decision-making for policymakers,\nresearchers, and stakeholders. We release a continent wide urban-rural map,\ncovering the period 2016 and 2022.\n","authors":["Mohammad Kakooei","James Bailie","Albin Söderberg","Albin Becevic","Adel Daoud"],"pdf_url":"https://arxiv.org/pdf/2411.02935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12814v2","updated":"2024-11-05T09:09:12Z","published":"2024-10-01T08:52:46Z","title":"Leveraging generative models to characterize the failure conditions of\n image classifiers","summary":" We address in this work the question of identifying the failure conditions of\na given image classifier. 
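Editor's note: the settlement-mapping entry above trains a DeepLabV3 model for urban/rural land-cover segmentation. A minimal torchvision sketch of setting such a model up for a handful of LULC classes is below; the class count, 3-band input, and recent torchvision API (0.13+) are assumptions, and multispectral Landsat input would require adapting the first convolution.

```python
import torch
import torch.nn.functional as F
from torchvision.models.segmentation import deeplabv3_resnet50

NUM_CLASSES = 7  # assumed, e.g. urban, rural, water, crops, trees, bare, other
model = deeplabv3_resnet50(weights=None, weights_backbone=None,
                           num_classes=NUM_CLASSES)

x = torch.randn(2, 3, 256, 256)                   # 3-band composites assumed
out = model(x)["out"]                             # (2, NUM_CLASSES, 256, 256)
target = torch.randint(0, NUM_CLASSES, (2, 256, 256))
loss = F.cross_entropy(out, target)               # per-pixel classification
loss.backward()
print(out.shape, float(loss))
```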
To do so, we exploit the capacity of producing\ncontrollable distributions of high quality image data made available by recent\nGenerative Adversarial Networks (StyleGAN2): the failure conditions are\nexpressed as directions of strong performance degradation in the generative\nmodel latent space. This strategy of analysis is used to discover corner cases\nthat combine multiple sources of corruption, and to compare in more details the\nbehavior of different classifiers. The directions of degradation can also be\nrendered visually by generating data for better interpretability. Some\ndegradations such as image quality can affect all classes, whereas other ones\nsuch as shape are more class-specific. The approach is demonstrated on the\nMNIST dataset that has been completed by two sources of corruption: noise and\nblur, and shows a promising way to better understand and control the risks of\nexploiting Artificial Intelligence components for safety-critical applications.\n","authors":["Adrien LeCoz","Stéphane Herbin","Faouzi Adjed"],"pdf_url":"https://arxiv.org/pdf/2410.12814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02920v1","updated":"2024-11-05T09:08:46Z","published":"2024-11-05T09:08:46Z","title":"Domain Expansion and Boundary Growth for Open-Set Single-Source Domain\n Generalization","summary":" Open-set single-source domain generalization aims to use a single-source\ndomain to learn a robust model that can be generalized to unknown target\ndomains with both domain shifts and label shifts. The scarcity of the source\ndomain and the unknown data distribution of the target domain pose a great\nchallenge for domain-invariant feature learning and unknown class recognition.\nIn this paper, we propose a novel learning approach based on domain expansion\nand boundary growth to expand the scarce source samples and enlarge the\nboundaries across the known classes that indirectly broaden the boundary\nbetween the known and unknown classes. Specifically, we achieve domain\nexpansion by employing both background suppression and style augmentation on\nthe source data to synthesize new samples. Then we force the model to distill\nconsistent knowledge from the synthesized samples so that the model can learn\ndomain-invariant information. Furthermore, we realize boundary growth across\nclasses by using edge maps as an additional modality of samples when training\nmulti-binary classifiers. In this way, it enlarges the boundary between the\ninliers and outliers, and consequently improves the unknown class recognition\nduring open-set generalization. Extensive experiments show that our approach\ncan achieve significant improvements and reach state-of-the-art performance on\nseveral cross-domain image classification datasets.\n","authors":["Pengkun Jiao","Na Zhao","Jingjing Chen","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.02920v1.pdf","comment":"TMM 2024"},{"id":"http://arxiv.org/abs/2406.06079v2","updated":"2024-11-05T09:07:21Z","published":"2024-06-10T07:52:29Z","title":"Latent Representation Matters: Human-like Sketches in One-shot Drawing\n Tasks","summary":" Humans can effortlessly draw new categories from a single exemplar, a feat\nthat has long posed a challenge for generative models. However, this gap has\nstarted to close with recent advances in diffusion models. This one-shot\ndrawing task requires powerful inductive biases that have not been\nsystematically investigated. 
Here, we study how different inductive biases\nshape the latent space of Latent Diffusion Models (LDMs). Along with standard\nLDM regularizers (KL and vector quantization), we explore supervised\nregularizations (including classification and prototype-based representation)\nand contrastive inductive biases (using SimCLR and redundancy reduction\nobjectives). We demonstrate that LDMs with redundancy reduction and\nprototype-based regularizations produce near-human-like drawings (regarding\nboth samples' recognizability and originality) -- better mimicking human\nperception (as evaluated psychophysically). Overall, our results suggest that\nthe gap between humans and machines in one-shot drawings is almost closed.\n","authors":["Victor Boutin","Rishav Mukherji","Aditya Agrawal","Sabine Muzellec","Thomas Fel","Thomas Serre","Rufin VanRullen"],"pdf_url":"https://arxiv.org/pdf/2406.06079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13871v2","updated":"2024-11-05T09:07:14Z","published":"2024-10-02T12:14:31Z","title":"Explaining an image classifier with a generative model conditioned by\n uncertainty","summary":" We propose to condition a generative model by a given image classifier\nuncertainty in order to analyze and explain its behavior. Preliminary\nexperiments on synthetic data and a corrupted version of MNIST dataset\nillustrate the idea.\n","authors":["Adrien LeCoz","Stéphane Herbin","Faouzi Adjed"],"pdf_url":"https://arxiv.org/pdf/2410.13871v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02116v2","updated":"2024-11-05T08:35:14Z","published":"2024-11-04T14:29:28Z","title":"Advancements and limitations of LLMs in replicating human color-word\n associations","summary":" Color-word associations play a fundamental role in human cognition and design\napplications. Large Language Models (LLMs) have become widely available and\ndemonstrated intelligent behaviors in various benchmarks with natural\nconversation skills. However, their ability to replicate human color-word\nassociations remains understudied. We compared multiple generations of LLMs\n(from GPT-3 to GPT-4o) against human color-word associations using data\ncollected from over 10,000 Japanese participants, involving 17 colors and words\nfrom eight categories in Japanese. Our findings reveal a clear progression in\nLLM performance across generations, with GPT-4o achieving the highest accuracy\nin predicting the best voted word for each color and category. However, the\nhighest median performance was approximately 50% even for GPT-4o with visual\ninputs (chance level is 10%), and the performance levels varied significantly\nacross word categories and colors, indicating a failure to fully replicate\nhuman color-word associations. On the other hand, color discrimination ability\nestimated from our color-word association data showed that LLMs demonstrated\nhigh correlation with human color discrimination patterns, similarly to\nprevious studies. 
Our study highlights both the advancements in LLM\ncapabilities and their persistent limitations, suggesting differences in\nsemantic memory structures between humans and LLMs in representing color-word\nassociations.\n","authors":["Makoto Fukushima","Shusuke Eshita","Hiroshige Fukuhara"],"pdf_url":"https://arxiv.org/pdf/2411.02116v2.pdf","comment":"20 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2411.02902v1","updated":"2024-11-05T08:35:08Z","published":"2024-11-05T08:35:08Z","title":"Membership Inference Attacks against Large Vision-Language Models","summary":" Large vision-language models (VLLMs) exhibit promising capabilities for\nprocessing multi-modal tasks across various application scenarios. However,\ntheir emergence also raises significant data security concerns, given the\npotential inclusion of sensitive information, such as private photos and\nmedical records, in their training datasets. Detecting inappropriately used\ndata in VLLMs remains a critical and unresolved issue, mainly due to the lack\nof standardized datasets and suitable methodologies. In this study, we\nintroduce the first membership inference attack (MIA) benchmark tailored for\nvarious VLLMs to facilitate training data detection. Then, we propose a novel\nMIA pipeline specifically designed for token-level image detection. Lastly, we\npresent a new metric called MaxR\\'enyi-K%, which is based on the confidence of\nthe model output and applies to both text and image data. We believe that our\nwork can deepen the understanding and methodology of MIAs in the context of\nVLLMs. Our code and datasets are available at\nhttps://github.com/LIONS-EPFL/VL-MIA.\n","authors":["Zhan Li","Yongtao Wu","Yihang Chen","Francesco Tonin","Elias Abad Rocamora","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2411.02902v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.02890v1","updated":"2024-11-05T08:04:43Z","published":"2024-11-05T08:04:43Z","title":"Fried deconvolution","summary":" In this paper we present a new approach to deblur the effect of atmospheric\nturbulence in the case of long range imaging. Our method is based on an\nanalytical formulation, the Fried kernel, of the atmosphere modulation transfer\nfunction (MTF) and a framelet based deconvolution algorithm. An important\nparameter is the refractive index structure which requires specific\nmeasurements to be known. Then we propose a method which provides a good\nestimation of this parameter from the input blurred image. The final algorithms\nare very easy to implement and show very good results on both simulated blur\nand real images.\n","authors":["Jerome Gilles","Stanley Osher"],"pdf_url":"https://arxiv.org/pdf/2411.02890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02889v1","updated":"2024-11-05T08:04:29Z","published":"2024-11-05T08:04:29Z","title":"Turbulence stabilization","summary":" We recently developed a new approach to get a stabilized image from a\nsequence of frames acquired through atmospheric turbulence. The goal of this\nalgorihtm is to remove the geometric distortions due by the atmosphere\nmovements. This method is based on a variational formulation and is efficiently\nsolved by the use of Bregman iterations and the operator splitting method. In\nthis paper we propose to study the influence of the choice of the regularizing\nterm in the model. 
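Editor's note: the VL-MIA entry above scores membership with a confidence-based MaxRényi-K% metric. Without reproducing the paper's exact definition, here is a small NumPy sketch of the underlying ingredient, the Rényi entropy of a model's per-token output distribution; the aggregation over a fraction of token positions is an assumption for illustration only.

```python
import numpy as np

def renyi_entropy(p, alpha, eps=1e-12):
    """H_alpha(p) = log(sum_i p_i^alpha) / (1 - alpha); Shannon as alpha -> 1."""
    p = np.clip(np.asarray(p, dtype=float), eps, 1.0)
    p = p / p.sum()
    if np.isclose(alpha, 1.0):
        return float(-(p * np.log(p)).sum())
    return float(np.log((p ** alpha).sum()) / (1.0 - alpha))

def mean_top_fraction_entropy(token_dists, alpha=0.5, frac=0.1):
    """Assumed aggregation: average the entropies of the most confident
    (lowest-entropy) fraction of token positions."""
    ents = sorted(renyi_entropy(p, alpha) for p in token_dists)
    k = max(1, int(len(ents) * frac))
    return float(np.mean(ents[:k]))

# toy usage: 5 token positions over a 4-symbol vocabulary
dists = np.random.default_rng(0).dirichlet(np.ones(4), size=5)
print(mean_top_fraction_entropy(dists))
```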
Then we proposed to experiment some of the most used\nregularization constraints available in the litterature.\n","authors":["Yu Mao","Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.02889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02888v1","updated":"2024-11-05T08:02:44Z","published":"2024-11-05T08:02:44Z","title":"A Symmetric Dynamic Learning Framework for Diffeomorphic Medical Image\n Registration","summary":" Diffeomorphic image registration is crucial for various medical imaging\napplications because it can preserve the topology of the transformation. This\nstudy introduces DCCNN-LSTM-Reg, a learning framework that evolves dynamically\nand learns a symmetrical registration path by satisfying a specified control\nincrement system. This framework aims to obtain symmetric diffeomorphic\ndeformations between moving and fixed images. To achieve this, we combine deep\nlearning networks with diffeomorphic mathematical mechanisms to create a\ncontinuous and dynamic registration architecture, which consists of multiple\nSymmetric Registration (SR) modules cascaded on five different scales.\nSpecifically, our method first uses two U-nets with shared parameters to\nextract multiscale feature pyramids from the images. We then develop an\nSR-module comprising a sequential CNN-LSTM architecture to progressively\ncorrect the forward and reverse multiscale deformation fields using control\nincrement learning and the homotopy continuation technique. Through extensive\nexperiments on three 3D registration tasks, we demonstrate that our method\noutperforms existing approaches in both quantitative and qualitative\nevaluations.\n","authors":["Jinqiu Deng","Ke Chen","Mingke Li","Daoping Zhang","Chong Chen","Alejandro F. Frangi","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.02888v1.pdf","comment":"12 pages,7 figures"},{"id":"http://arxiv.org/abs/2411.02871v1","updated":"2024-11-05T07:26:24Z","published":"2024-11-05T07:26:24Z","title":"Enhancing Adversarial Robustness via Uncertainty-Aware Distributional\n Adversarial Training","summary":" Despite remarkable achievements in deep learning across various domains, its\ninherent vulnerability to adversarial examples still remains a critical concern\nfor practical deployment. Adversarial training has emerged as one of the most\neffective defensive techniques for improving model robustness against such\nmalicious inputs. However, existing adversarial training schemes often lead to\nlimited generalization ability against underlying adversaries with diversity\ndue to their overreliance on a point-by-point augmentation strategy by mapping\neach clean example to its adversarial counterpart during training. In addition,\nadversarial examples can induce significant disruptions in the statistical\ninformation w.r.t. the target model, thereby introducing substantial\nuncertainty and challenges to modeling the distribution of adversarial\nexamples. To circumvent these issues, in this paper, we propose a novel\nuncertainty-aware distributional adversarial training method, which enforces\nadversary modeling by leveraging both the statistical information of\nadversarial examples and its corresponding uncertainty estimation, with the\ngoal of augmenting the diversity of adversaries. 
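Editor's note: for context on the "point-by-point augmentation strategy" that the adversarial-training entry above argues is limiting, here is the standard PGD-based adversarial training step it refers to. This is a textbook sketch of the baseline, not the paper's distributional, uncertainty-aware method.

```python
import torch
import torch.nn.functional as F

def pgd_attack(model, x, y, eps=8 / 255, alpha=2 / 255, steps=10):
    """Map each clean example to an L-infinity adversarial counterpart."""
    x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1).detach()
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = F.cross_entropy(model(x_adv), y)
        grad, = torch.autograd.grad(loss, x_adv)
        x_adv = x_adv.detach() + alpha * grad.sign()
        x_adv = torch.min(torch.max(x_adv, x - eps), x + eps).clamp(0, 1)
    return x_adv.detach()

def adversarial_training_step(model, optimizer, x, y):
    """One point-wise step: train on each example's adversarial twin."""
    model.eval()
    x_adv = pgd_attack(model, x, y)
    model.train()
    optimizer.zero_grad()
    loss = F.cross_entropy(model(x_adv), y)
    loss.backward()
    optimizer.step()
    return float(loss)
```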
Considering the potentially\nnegative impact induced by aligning adversaries to misclassified clean\nexamples, we also refine the alignment reference based on the statistical\nproximity to clean examples during adversarial training, thereby reframing\nadversarial training within a distribution-to-distribution matching framework\ninteracted between the clean and adversarial domains. Furthermore, we design an\nintrospective gradient alignment approach via matching input gradients between\nthese domains without introducing external models. Extensive experiments across\nfour benchmark datasets and various network architectures demonstrate that our\napproach achieves state-of-the-art adversarial robustness and maintains natural\nperformance.\n","authors":["Junhao Dong","Xinghua Qu","Z. Jane Wang","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2411.02871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04802v3","updated":"2024-11-05T07:25:42Z","published":"2024-06-07T10:06:13Z","title":"Predictive Dynamic Fusion","summary":" Multimodal fusion is crucial in joint decision-making systems for rendering\nholistic judgments. Since multimodal data changes in open environments, dynamic\nfusion has emerged and achieved remarkable progress in numerous applications.\nHowever, most existing dynamic multimodal fusion methods lack theoretical\nguarantees and easily fall into suboptimal problems, yielding unreliability and\ninstability. To address this issue, we propose a Predictive Dynamic Fusion\n(PDF) framework for multimodal learning. We proceed to reveal the multimodal\nfusion from a generalization perspective and theoretically derive the\npredictable Collaborative Belief (Co-Belief) with Mono- and Holo-Confidence,\nwhich provably reduces the upper bound of generalization error. Accordingly, we\nfurther propose a relative calibration strategy to calibrate the predicted\nCo-Belief for potential uncertainty. Extensive experiments on multiple\nbenchmarks confirm our superiority. Our code is available at\nhttps://github.com/Yinan-Xia/PDF.\n","authors":["Bing Cao","Yinan Xia","Yi Ding","Changqing Zhang","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2406.04802v3.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2407.12735v3","updated":"2024-11-05T07:24:15Z","published":"2024-07-17T16:55:42Z","title":"EchoSight: Advancing Visual-Language Models with Wiki Knowledge","summary":" Knowledge-based Visual Question Answering (KVQA) tasks require answering\nquestions about images using extensive background knowledge. Despite\nsignificant advancements, generative models often struggle with these tasks due\nto the limited integration of external knowledge. In this paper, we introduce\nEchoSight, a novel multimodal Retrieval-Augmented Generation (RAG) framework\nthat enables large language models (LLMs) to answer visual questions requiring\nfine-grained encyclopedic knowledge. To strive for high-performing retrieval,\nEchoSight first searches wiki articles by using visual-only information,\nsubsequently, these candidate articles are further reranked according to their\nrelevance to the combined text-image query. This approach significantly\nimproves the integration of multimodal knowledge, leading to enhanced retrieval\noutcomes and more accurate VQA responses. 
Our experimental results on the\nEncyclopedic VQA and InfoSeek datasets demonstrate that EchoSight establishes\nnew state-of-the-art results in knowledge-based VQA, achieving an accuracy of\n41.8% on Encyclopedic VQA and 31.3% on InfoSeek.\n","authors":["Yibin Yan","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2407.12735v3.pdf","comment":"Technical Report; Project Page: https://go2heart.github.io/echosight"},{"id":"http://arxiv.org/abs/2411.02867v1","updated":"2024-11-05T07:16:32Z","published":"2024-11-05T07:16:32Z","title":"AtlasSeg: Atlas Prior Guided Dual-U-Net for Cortical Segmentation in\n Fetal Brain MRI","summary":" Accurate tissue segmentation in fetal brain MRI remains challenging due to\nthe dynamically changing anatomical anatomy and contrast during fetal\ndevelopment. To enhance segmentation accuracy throughout gestation, we\nintroduced AtlasSeg, a dual-U-shape convolution network incorporating\ngestational age (GA) specific information as guidance. By providing a publicly\navailable fetal brain atlas with segmentation label at the corresponding GA,\nAtlasSeg effectively extracted the contextual features of age-specific patterns\nin atlas branch and generated tissue segmentation in segmentation branch.\nMulti-scale attentive atlas feature fusions were constructed in all stages\nduring encoding and decoding, giving rise to a dual-U-shape network to assist\nfeature flow and information interactions between two branches. AtlasSeg\noutperformed six well-known segmentation networks in both our internal fetal\nbrain MRI dataset and the external FeTA dataset. Ablation experiments\ndemonstrate the efficiency of atlas guidance and the attention mechanism. The\nproposed AtlasSeg demonstrated superior segmentation performance against other\nconvolution networks with higher segmentation accuracy, and may facilitate\nfetal brain MRI analysis in large-scale fetal brain studies.\n","authors":["Haoan Xu","Tianshu Zheng","Xinyi Xu","Yao Shen","Jiwei Sun","Cong Sun","Guangbin Wang","Dan Wu"],"pdf_url":"https://arxiv.org/pdf/2411.02867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02861v1","updated":"2024-11-05T07:09:27Z","published":"2024-11-05T07:09:27Z","title":"Centerness-based Instance-aware Knowledge Distillation with Task-wise\n Mutual Lifting for Object Detection on Drone Imagery","summary":" Developing accurate and efficient detectors for drone imagery is challenging\ndue to the inherent complexity of aerial scenes. While some existing methods\naim to achieve high accuracy by utilizing larger models, their computational\ncost is prohibitive for drones. Recently, Knowledge Distillation (KD) has shown\npromising potential for maintaining satisfactory accuracy while significantly\ncompressing models in general object detection. Considering the advantages of\nKD, this paper presents the first attempt to adapt it to object detection on\ndrone imagery and addresses two intrinsic issues: (1) low foreground-background\nratio and (2) small instances and complex backgrounds, which lead to inadequate\ntraining, resulting insufficient distillation. Therefore, we propose a\ntask-wise Lightweight Mutual Lifting (Light-ML) module with a Centerness-based\nInstance-aware Distillation (CID) strategy. The Light-ML module mutually\nharmonizes the classification and localization branches by channel shuffling\nand convolution, integrating teacher supervision across different tasks during\nback-propagation, thus facilitating training the student model. 
The CID\nstrategy extracts valuable regions surrounding instances through the centerness\nof proposals, enhancing distillation efficacy. Experiments on the VisDrone,\nUAVDT, and COCO benchmarks demonstrate that the proposed approach promotes the\naccuracies of existing state-of-the-art KD methods with comparable\ncomputational requirements. Codes will be available upon acceptance.\n","authors":["Bowei Du","Zhixuan Liao","Yanan Zhang","Zhi Cai","Jiaxin Chen","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2411.02861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02860v1","updated":"2024-11-05T07:09:14Z","published":"2024-11-05T07:09:14Z","title":"Continual Audio-Visual Sound Separation","summary":" In this paper, we introduce a novel continual audio-visual sound separation\ntask, aiming to continuously separate sound sources for new classes while\npreserving performance on previously learned classes, with the aid of visual\nguidance. This problem is crucial for practical visually guided auditory\nperception as it can significantly enhance the adaptability and robustness of\naudio-visual sound separation models, making them more applicable for\nreal-world scenarios where encountering new sound sources is commonplace. The\ntask is inherently challenging as our models must not only effectively utilize\ninformation from both modalities in current tasks but also preserve their\ncross-modal association in old tasks to mitigate catastrophic forgetting during\naudio-visual continual learning. To address these challenges, we propose a\nnovel approach named ContAV-Sep (\\textbf{Cont}inual\n\\textbf{A}udio-\\textbf{V}isual Sound \\textbf{Sep}aration). ContAV-Sep presents\na novel Cross-modal Similarity Distillation Constraint (CrossSDC) to uphold the\ncross-modal semantic similarity through incremental tasks and retain previously\nacquired knowledge of semantic similarity in old models, mitigating the risk of\ncatastrophic forgetting. The CrossSDC can seamlessly integrate into the\ntraining process of different audio-visual sound separation frameworks.\nExperiments demonstrate that ContAV-Sep can effectively mitigate catastrophic\nforgetting and achieve significantly better performance compared to other\ncontinual learning baselines for audio-visual sound separation. Code is\navailable at: \\url{https://github.com/weiguoPian/ContAV-Sep_NeurIPS2024}.\n","authors":["Weiguo Pian","Yiyang Nan","Shijian Deng","Shentong Mo","Yunhui Guo","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2411.02860v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.02858v1","updated":"2024-11-05T07:02:25Z","published":"2024-11-05T07:02:25Z","title":"OLAF: A Plug-and-Play Framework for Enhanced Multi-object Multi-part\n Scene Parsing","summary":" Multi-object multi-part scene segmentation is a challenging task whose\ncomplexity scales exponentially with part granularity and number of scene\nobjects. To address the task, we propose a plug-and-play approach termed OLAF.\nFirst, we augment the input (RGB) with channels containing object-based\nstructural cues (fg/bg mask, boundary edge mask). We propose a weight\nadaptation technique which enables regular (RGB) pre-trained models to process\nthe augmented (5-channel) input in a stable manner during optimization. In\naddition, we introduce an encoder module termed LDF to provide low-level dense\nfeature guidance. 
This assists segmentation, particularly for smaller parts.\nOLAF enables significant mIoU gains of $\\mathbf{3.3}$ (Pascal-Parts-58),\n$\\mathbf{3.5}$ (Pascal-Parts-108) over the SOTA model. On the most challenging\nvariant (Pascal-Parts-201), the gain is $\\mathbf{4.0}$. Experimentally, we show\nthat OLAF's broad applicability enables gains across multiple architectures\n(CNN, U-Net, Transformer) and datasets. The code is available at\nolafseg.github.io\n","authors":["Pranav Gupta","Rishubh Singh","Pradeep Shenoy","Ravikiran Sarvadevabhatla"],"pdf_url":"https://arxiv.org/pdf/2411.02858v1.pdf","comment":"Accepted in The European Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2411.02855v1","updated":"2024-11-05T06:59:05Z","published":"2024-11-05T06:59:05Z","title":"Analyzing Poverty through Intra-Annual Time-Series: A Wavelet Transform\n Approach","summary":" Reducing global poverty is a key objective of the Sustainable Development\nGoals (SDGs). Achieving this requires high-frequency, granular data to capture\nneighborhood-level changes, particularly in data scarce regions such as low-\nand middle-income countries. To fill in the data gaps, recent computer vision\nmethods combining machine learning (ML) with earth observation (EO) data to\nimprove poverty estimation. However, while much progress have been made, they\noften omit intra-annual variations, which are crucial for estimating poverty in\nagriculturally dependent countries. We explored the impact of integrating\nintra-annual NDVI information with annual multi-spectral data on model\naccuracy. To evaluate our method, we created a simulated dataset using Landsat\nimagery and nighttime light data to evaluate EO-ML methods that use\nintra-annual EO data. Additionally, we evaluated our method against the\nDemographic and Health Survey (DHS) dataset across Africa. Our results indicate\nthat integrating specific NDVI-derived features with multi-spectral data\nprovides valuable insights for poverty analysis, emphasizing the importance of\nretaining intra-annual information.\n","authors":["Mohammad Kakooei","Klaudia Solska","Adel Daoud"],"pdf_url":"https://arxiv.org/pdf/2411.02855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01781v2","updated":"2024-11-05T06:55:19Z","published":"2024-11-04T04:14:39Z","title":"MSTA3D: Multi-scale Twin-attention for 3D Instance Segmentation","summary":" Recently, transformer-based techniques incorporating superpoints have become\nprevalent in 3D instance segmentation. However, they often encounter an\nover-segmentation problem, especially noticeable with large objects.\nAdditionally, unreliable mask predictions stemming from superpoint mask\nprediction further compound this issue. To address these challenges, we propose\na novel framework called MSTA3D. It leverages multi-scale feature\nrepresentation and introduces a twin-attention mechanism to effectively capture\nthem. 
Furthermore, MSTA3D integrates a box query with a box regularizer,\noffering a complementary spatial constraint alongside semantic queries.\nExperimental evaluations on ScanNetV2, ScanNet200 and S3DIS datasets\ndemonstrate that our approach surpasses state-of-the-art 3D instance\nsegmentation methods.\n","authors":["Duc Dang Trung Tran","Byeongkeun Kang","Yeejin Lee"],"pdf_url":"https://arxiv.org/pdf/2411.01781v2.pdf","comment":"14 pages, 9 figures, 7 tables, conference"},{"id":"http://arxiv.org/abs/2409.17612v2","updated":"2024-11-05T06:47:37Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic datasets that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense. Our code is available at\nhttps://github.com/AngusDujw/Diversity-Driven-Synthesis.https://github.com/AngusDujw/Diversity-Driven-Synthesis.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23775v3","updated":"2024-11-05T06:41:27Z","published":"2024-10-31T09:45:00Z","title":"In-Context LoRA for Diffusion Transformers","summary":" Recent research arXiv:2410.15027 has explored the use of diffusion\ntransformers (DiTs) for task-agnostic image generation by simply concatenating\nattention tokens across images. However, despite substantial computational\nresources, the fidelity of the generated images remains suboptimal. In this\nstudy, we reevaluate and streamline this framework by hypothesizing that\ntext-to-image DiTs inherently possess in-context generation capabilities,\nrequiring only minimal tuning to activate them. Through diverse task\nexperiments, we qualitatively demonstrate that existing text-to-image DiTs can\neffectively perform in-context generation without any tuning. Building on this\ninsight, we propose a remarkably simple pipeline to leverage the in-context\nabilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint\ncaptioning of multiple images, and (3) apply task-specific LoRA tuning using\nsmall datasets (e.g., 20~100 samples) instead of full-parameter tuning with\nlarge datasets. 
We name our models In-Context LoRA (IC-LoRA). This approach\nrequires no modifications to the original DiT models, only changes to the\ntraining data. Remarkably, our pipeline generates high-fidelity image sets that\nbetter adhere to prompts. While task-specific in terms of tuning data, our\nframework remains task-agnostic in architecture and pipeline, offering a\npowerful tool for the community and providing valuable insights for further\nresearch on product-level task-agnostic generation systems. We release our\ncode, data, and models at https://github.com/ali-vilab/In-Context-LoRA\n","authors":["Lianghua Huang","Wei Wang","Zhi-Fan Wu","Yupeng Shi","Huanzhang Dou","Chen Liang","Yutong Feng","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.23775v3.pdf","comment":"Tech report. Project page:\n https://ali-vilab.github.io/In-Context-LoRA-Page/"},{"id":"http://arxiv.org/abs/2407.09562v3","updated":"2024-11-05T06:35:52Z","published":"2024-07-03T10:21:07Z","title":"Edge AI-Enabled Chicken Health Detection Based on Enhanced FCOS-Lite and\n Knowledge Distillation","summary":" The utilization of AIoT technology has become a crucial trend in modern\npoultry management, offering the potential to optimize farming operations and\nreduce human workloads. This paper presents a real-time and compact edge-AI\nenabled detector designed to identify chickens and their healthy statuses using\nframes captured by a lightweight and intelligent camera equipped with an\nedge-AI enabled CMOS sensor. To ensure efficient deployment of the proposed\ncompact detector within the memory-constrained edge-AI enabled CMOS sensor, we\nemploy a FCOS-Lite detector leveraging MobileNet as the backbone. To mitigate\nthe issue of reduced accuracy in compact edge-AI detectors without incurring\nadditional inference costs, we propose a gradient weighting loss function as\nclassification loss and introduce CIOU loss function as localization loss.\nAdditionally, we propose a knowledge distillation scheme to transfer valuable\ninformation from a large teacher detector to the proposed FCOS-Lite detector,\nthereby enhancing its performance while preserving a compact model size.\nExperimental results demonstrate the proposed edge-AI enabled detector achieves\ncommendable performance metrics, including a mean average precision (mAP) of\n95.1$\\%$ and an F1-score of 94.2$\\%$, etc. Notably, the proposed detector can\nbe efficiently deployed and operates at a speed exceeding 20 FPS on the edge-AI\nenabled CMOS sensor, achieved through int8 quantization. That meets practical\ndemands for automated poultry health monitoring using lightweight intelligent\ncameras with low power consumption and minimal bandwidth costs.\n","authors":["Qiang Tong","Jinrui Wang","Wenshuang Yang","Songtao Wu","Wenqi Zhang","Chen Sun","Kuanhong Xu"],"pdf_url":"https://arxiv.org/pdf/2407.09562v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12629v4","updated":"2024-11-05T06:34:40Z","published":"2024-06-18T13:55:13Z","title":"SeTAR: Out-of-Distribution Detection with Selective Low-Rank\n Approximation","summary":" Out-of-distribution (OOD) detection is crucial for the safe deployment of\nneural networks. Existing CLIP-based approaches perform OOD detection by\ndevising novel scoring functions or sophisticated fine-tuning methods. In this\nwork, we propose SeTAR, a novel, training-free OOD detection method that\nleverages selective low-rank approximation of weight matrices in\nvision-language and vision-only models. 
SeTAR enhances OOD detection via\npost-hoc modification of the model's weight matrices using a simple greedy\nsearch algorithm. Based on SeTAR, we further propose SeTAR+FT, a fine-tuning\nextension optimizing model performance for OOD detection tasks. Extensive\nevaluations on ImageNet1K and Pascal-VOC benchmarks show SeTAR's superior\nperformance, reducing the relatively false positive rate by up to 18.95% and\n36.80% compared to zero-shot and fine-tuning baselines. Ablation studies\nfurther validate SeTAR's effectiveness, robustness, and generalizability across\ndifferent model backbones. Our work offers a scalable, efficient solution for\nOOD detection, setting a new state-of-the-art in this area.\n","authors":["Yixia Li","Boya Xiong","Guanhua Chen","Yun Chen"],"pdf_url":"https://arxiv.org/pdf/2406.12629v4.pdf","comment":"Accepted by NeurIPS 2024. Project page is live at\n https://SeTAR-OOD.github.io. Code are available at\n https://github.com/X1AOX1A/SeTAR"},{"id":"http://arxiv.org/abs/2411.02844v1","updated":"2024-11-05T06:34:19Z","published":"2024-11-05T06:34:19Z","title":"Correlation of Object Detection Performance with Visual Saliency and\n Depth Estimation","summary":" As object detection techniques continue to evolve, understanding their\nrelationships with complementary visual tasks becomes crucial for optimising\nmodel architectures and computational resources. This paper investigates the\ncorrelations between object detection accuracy and two fundamental visual\ntasks: depth prediction and visual saliency prediction. Through comprehensive\nexperiments using state-of-the-art models (DeepGaze IIE, Depth Anything,\nDPT-Large, and Itti's model) on COCO and Pascal VOC datasets, we find that\nvisual saliency shows consistently stronger correlations with object detection\naccuracy (mA$\\rho$ up to 0.459 on Pascal VOC) compared to depth prediction\n(mA$\\rho$ up to 0.283). Our analysis reveals significant variations in these\ncorrelations across object categories, with larger objects showing correlation\nvalues up to three times higher than smaller objects. These findings suggest\nincorporating visual saliency features into object detection architectures\ncould be more beneficial than depth information, particularly for specific\nobject categories. The observed category-specific variations also provide\ninsights for targeted feature engineering and dataset design improvements,\npotentially leading to more efficient and accurate object detection systems.\n","authors":["Matthias Bartolo","Dylan Seychell"],"pdf_url":"https://arxiv.org/pdf/2411.02844v1.pdf","comment":"Code Available at:\n https://github.com/mbar0075/Object-Detection-Correlation-Saliency-vs-Depth"},{"id":"http://arxiv.org/abs/2411.02843v1","updated":"2024-11-05T06:31:48Z","published":"2024-11-05T06:31:48Z","title":"Advances in Photoacoustic Imaging Reconstruction and Quantitative\n Analysis for Biomedical Applications","summary":" Photoacoustic imaging (PAI) represents an innovative biomedical imaging\nmodality that harnesses the advantages of optical resolution and acoustic\npenetration depth while ensuring enhanced safety. Despite its promising\npotential across a diverse array of preclinical and clinical applications, the\nclinical implementation of PAI faces significant challenges, including the\ntrade-off between penetration depth and spatial resolution, as well as the\ndemand for faster imaging speeds. 
This paper explores the fundamental\nprinciples underlying PAI, with a particular emphasis on three primary\nimplementations: photoacoustic computed tomography (PACT), photoacoustic\nmicroscopy (PAM), and photoacoustic endoscopy (PAE). We undertake a critical\nassessment of their respective strengths and practical limitations.\nFurthermore, recent developments in utilizing conventional or deep learning\n(DL) methodologies for image reconstruction and artefact mitigation across\nPACT, PAM, and PAE are outlined, demonstrating considerable potential to\nenhance image quality and accelerate imaging processes. Furthermore, this paper\nexamines the recent developments in quantitative analysis within PAI, including\nthe quantification of haemoglobin concentration, oxygen saturation, and other\nphysiological parameters within tissues. Finally, our discussion encompasses\ncurrent trends and future directions in PAI research while emphasizing the\ntransformative impact of deep learning on advancing PAI.\n","authors":["Lei Wang","Weiming Zeng","Kai Long","Rongfeng Lan","Li Liu","Wai Ting Siok","Nizhuan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.02843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02840v1","updated":"2024-11-05T06:23:44Z","published":"2024-11-05T06:23:44Z","title":"Test-Time Dynamic Image Fusion","summary":" The inherent challenge of image fusion lies in capturing the correlation of\nmulti-source images and comprehensively integrating effective information from\ndifferent sources. Most existing techniques fail to perform dynamic image\nfusion while notably lacking theoretical guarantees, leading to potential\ndeployment risks in this field. Is it possible to conduct dynamic image fusion\nwith a clear theoretical justification? In this paper, we give our solution\nfrom a generalization perspective. We proceed to reveal the generalized form of\nimage fusion and derive a new test-time dynamic image fusion paradigm. It\nprovably reduces the upper bound of generalization error. Specifically, we\ndecompose the fused image into multiple components corresponding to its source\ndata. The decomposed components represent the effective information from the\nsource data, thus the gap between them reflects the Relative Dominability (RD)\nof the uni-source data in constructing the fusion image. Theoretically, we\nprove that the key to reducing generalization error hinges on the negative\ncorrelation between the RD-based fusion weight and the uni-source\nreconstruction loss. Intuitively, RD dynamically highlights the dominant\nregions of each source and can be naturally converted to the corresponding\nfusion weight, achieving robust results. Extensive experiments and discussions\nwith in-depth analysis on multiple benchmarks confirm our findings and\nsuperiority. 
Our code is available at https://github.com/Yinan-Xia/TTD.\n","authors":["Bing Cao","Yinan Xia","Yi Ding","Changqing Zhang","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2411.02840v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2404.11031v3","updated":"2024-11-05T06:21:42Z","published":"2024-04-17T03:13:58Z","title":"TaCOS: Task-Specific Camera Optimization with Simulation","summary":" The performance of perception tasks is heavily influenced by imaging systems.\nHowever, designing cameras with high task performance is costly, requiring\nextensive camera knowledge and experimentation with physical hardware.\nAdditionally, cameras and perception tasks are mostly designed in isolation,\nwhereas recent methods that jointly design cameras and tasks have shown\nimproved performance. Therefore, we present a novel end-to-end optimization\napproach that co-designs cameras with specific vision tasks. This method\ncombines derivative-free and gradient-based optimizers to support both\ncontinuous and discrete camera parameters within manufacturing constraints. We\nleverage recent computer graphics techniques and physical camera\ncharacteristics to simulate the cameras in virtual environments, making the\ndesign process cost-effective. We validate our simulations against physical\ncameras and provide a procedurally generated virtual environment. Our\nexperiments demonstrate that our method designs cameras that outperform common\noff-the-shelf options, and more efficiently compared to the state-of-the-art\napproach, requiring only 2 minutes to design a camera on an example experiment\ncompared with 67 minutes for the competing method. Designed to support the\ndevelopment of cameras under manufacturing constraints, multiple cameras, and\nunconventional cameras, we believe this approach can advance the fully\nautomated design of cameras.\n","authors":["Chengyang Yan","Donald G. Dansereau"],"pdf_url":"https://arxiv.org/pdf/2404.11031v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16014v3","updated":"2024-11-05T06:13:07Z","published":"2023-12-26T11:49:23Z","title":"Passive Non-Line-of-Sight Imaging with Light Transport Modulation","summary":" Passive non-line-of-sight (NLOS) imaging has witnessed rapid development in\nrecent years, due to its ability to image objects that are out of sight. The\nlight transport condition plays an important role in this task since changing\nthe conditions will lead to different imaging models. Existing learning-based\nNLOS methods usually train independent models for different light transport\nconditions, which is computationally inefficient and impairs the practicality\nof the models. In this work, we propose NLOS-LTM, a novel passive NLOS imaging\nmethod that effectively handles multiple light transport conditions with a\nsingle network. We achieve this by inferring a latent light transport\nrepresentation from the projection image and using this representation to\nmodulate the network that reconstructs the hidden image from the projection\nimage. We train a light transport encoder together with a vector quantizer to\nobtain the light transport representation. To further regulate this\nrepresentation, we jointly learn both the reconstruction network and the\nreprojection network during training. A set of light transport modulation\nblocks is used to modulate the two jointly trained networks in a multi-scale\nway. Extensive experiments on a large-scale passive NLOS dataset demonstrate\nthe superiority of the proposed method. 
The code is available at\nhttps://github.com/JerryOctopus/NLOS-LTM.\n","authors":["Jiarui Zhang","Ruixu Geng","Xiaolong Du","Yan Chen","Houqiang Li","Yang Hu"],"pdf_url":"https://arxiv.org/pdf/2312.16014v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02833v1","updated":"2024-11-05T06:13:01Z","published":"2024-11-05T06:13:01Z","title":"Lost in Context: The Influence of Context on Feature Attribution Methods\n for Object Recognition","summary":" Contextual information plays a critical role in object recognition models\nwithin computer vision, where changes in context can significantly affect\naccuracy, underscoring models' dependence on contextual cues. This study\ninvestigates how context manipulation influences both model accuracy and\nfeature attribution, providing insights into the reliance of object recognition\nmodels on contextual information as understood through the lens of feature\nattribution methods.\n We employ a range of feature attribution techniques to decipher the reliance\nof deep neural networks on context in object recognition tasks. Using the\nImageNet-9 and our curated ImageNet-CS datasets, we conduct experiments to\nevaluate the impact of contextual variations, analyzed through feature\nattribution methods. Our findings reveal several key insights: (a) Correctly\nclassified images predominantly emphasize object volume attribution over\ncontext volume attribution. (b) The dependence on context remains relatively\nstable across different context modifications, irrespective of classification\naccuracy. (c) Context change exerts a more pronounced effect on model\nperformance than Context perturbations. (d) Surprisingly, context attribution\nin `no-information' scenarios is non-trivial. Our research moves beyond\ntraditional methods by assessing the implications of broad-level modifications\non object recognition, either in the object or its context.\n","authors":["Sayanta Adhikari","Rishav Kumar","Konda Reddy Mopuri","Rajalakshmi Pachamuthu"],"pdf_url":"https://arxiv.org/pdf/2411.02833v1.pdf","comment":"Published in ICVGIP 2024"},{"id":"http://arxiv.org/abs/2411.02319v2","updated":"2024-11-05T06:08:43Z","published":"2024-11-04T17:45:44Z","title":"GenXD: Generating Any 3D and 4D Scenes","summary":" Recent developments in 2D visual generation have been remarkably successful.\nHowever, 3D and 4D generation remain challenging in real-world applications due\nto the lack of large-scale 4D data and effective model design. In this paper,\nwe propose to jointly investigate general 3D and 4D generation by leveraging\ncamera and object movements commonly observed in daily life. Due to the lack of\nreal-world 4D data in the community, we first propose a data curation pipeline\nto obtain camera poses and object motion strength from videos. Based on this\npipeline, we introduce a large-scale real-world 4D scene dataset: CamVid-30K.\nBy leveraging all the 3D and 4D data, we develop our framework, GenXD, which\nallows us to produce any 3D or 4D scene. We propose multiview-temporal modules,\nwhich disentangle camera and object movements, to seamlessly learn from both 3D\nand 4D data. Additionally, GenXD employs masked latent conditions to support a\nvariety of conditioning views. GenXD can generate videos that follow the camera\ntrajectory as well as consistent 3D views that can be lifted into 3D\nrepresentations. 
We perform extensive evaluations across various real-world and\nsynthetic datasets, demonstrating GenXD's effectiveness and versatility\ncompared to previous methods in 3D and 4D generation.\n","authors":["Yuyang Zhao","Chung-Ching Lin","Kevin Lin","Zhiwen Yan","Linjie Li","Zhengyuan Yang","Jianfeng Wang","Gim Hee Lee","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.02319v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19582v2","updated":"2024-11-05T05:39:33Z","published":"2024-09-29T07:03:05Z","title":"Self-supervised Auxiliary Learning for Texture and Model-based Hybrid\n Robust and Fair Featuring in Face Analysis","summary":" In this work, we explore Self-supervised Learning (SSL) as an auxiliary task\nto blend the texture-based local descriptors into feature modelling for\nefficient face analysis. Combining a primary task and a self-supervised\nauxiliary task is beneficial for robust representation. Therefore, we used the\nSSL task of mask auto-encoder (MAE) as an auxiliary task to reconstruct texture\nfeatures such as local patterns along with the primary task for robust and\nunbiased face analysis. We experimented with our hypothesis on three major\nparadigms of face analysis: face attribute and face-based emotion analysis, and\ndeepfake detection. Our experiment results exhibit that better feature\nrepresentation can be gleaned from our proposed model for fair and bias-less\nface analysis.\n","authors":["Shukesh Reddy","Nishit Poddar","Srijan Das","Abhijit Das"],"pdf_url":"https://arxiv.org/pdf/2409.19582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02818v1","updated":"2024-11-05T05:36:17Z","published":"2024-11-05T05:36:17Z","title":"LiVOS: Light Video Object Segmentation with Gated Linear Matching","summary":" Semi-supervised video object segmentation (VOS) has been largely driven by\nspace-time memory (STM) networks, which store past frame features in a\nspatiotemporal memory to segment the current frame via softmax attention.\nHowever, STM networks face memory limitations due to the quadratic complexity\nof softmax matching, restricting their applicability as video length and\nresolution increase. To address this, we propose LiVOS, a lightweight memory\nnetwork that employs linear matching via linear attention, reformulating memory\nmatching into a recurrent process that reduces the quadratic attention matrix\nto a constant-size, spatiotemporal-agnostic 2D state. To enhance selectivity,\nwe introduce gated linear matching, where a data-dependent gate matrix is\nmultiplied with the state matrix to control what information to retain or\ndiscard. Experiments on diverse benchmarks demonstrated the effectiveness of\nour method. It achieved 64.8 J&F on MOSE and 85.1 J&F on DAVIS, surpassing all\nnon-STM methods and narrowing the gap with STM-based approaches. 
For longer and\nhigher-resolution videos, it matched STM-based methods with 53% less GPU memory\nand supports 4096p inference on a 32G consumer-grade GPU--a previously\ncost-prohibitive capability--opening the door for long and high-resolution\nvideo foundation models.\n","authors":["Qin Liu","Jianfeng Wang","Zhengyuan Yang","Linjie Li","Kevin Lin","Marc Niethammer","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.02818v1.pdf","comment":"Code&models: https://github.com/uncbiag/LiVOS"},{"id":"http://arxiv.org/abs/2411.02817v1","updated":"2024-11-05T05:30:39Z","published":"2024-11-05T05:30:39Z","title":"Conditional Vendi Score: An Information-Theoretic Approach to Diversity\n Evaluation of Prompt-based Generative Models","summary":" Text-conditioned generation models are commonly evaluated based on the\nquality of the generated data and its alignment with the input text prompt. On\nthe other hand, several applications of prompt-based generative models require\nsufficient diversity in the generated data to ensure the models' capability of\ngenerating image and video samples possessing a variety of features. However,\nmost existing diversity metrics are designed for unconditional generative\nmodels, and thus cannot distinguish the diversity arising from variations in\ntext prompts and that contributed by the generative model itself. In this work,\nour goal is to quantify the prompt-induced and model-induced diversity in\nsamples generated by prompt-based models. We propose an information-theoretic\napproach for internal diversity quantification, where we decompose the\nkernel-based entropy $H(X)$ of the generated data $X$ into the sum of the\nconditional entropy $H(X|T)$, given text variable $T$, and the mutual\ninformation $I(X; T)$ between the text and data variables. We introduce the\n\\emph{Conditional-Vendi} score based on $H(X|T)$ to quantify the internal\ndiversity of the model and the \\emph{Information-Vendi} score based on $I(X;\nT)$ to measure the statistical relevance between the generated data and text\nprompts. We provide theoretical results to statistically interpret these scores\nand relate them to the unconditional Vendi score. We conduct several numerical\nexperiments to show the correlation between the Conditional-Vendi score and the\ninternal diversity of text-conditioned generative models. The codebase is\navailable at\n\\href{https://github.com/mjalali/conditional-vendi}{https://github.com/mjalali/conditional-vendi}.\n","authors":["Mohammad Jalali","Azim Ospanov","Amin Gohari","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2411.02817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02816v1","updated":"2024-11-05T05:29:00Z","published":"2024-11-05T05:29:00Z","title":"ChatGPT in Research and Education: Exploring Benefits and Threats","summary":" In recent years, advanced artificial intelligence technologies, such as\nChatGPT, have significantly impacted various fields, including education and\nresearch. Developed by OpenAI, ChatGPT is a powerful language model that\npresents numerous opportunities for students and educators. It offers\npersonalized feedback, enhances accessibility, enables interactive\nconversations, assists with lesson preparation and evaluation, and introduces\nnew methods for teaching complex subjects. However, ChatGPT also poses\nchallenges to traditional education and research systems. 
These challenges\ninclude the risk of cheating on online exams, the generation of human-like text\nthat may compromise academic integrity, a potential decline in critical\nthinking skills, and difficulties in assessing the reliability of information\ngenerated by AI. This study examines both the opportunities and challenges\nChatGPT brings to education from the perspectives of students and educators.\nSpecifically, it explores the role of ChatGPT in helping students develop their\nsubjective skills. To demonstrate its effectiveness, we conducted several\nsubjective experiments using ChatGPT, such as generating solutions from\nsubjective problem descriptions. Additionally, surveys were conducted with\nstudents and teachers to gather insights into how ChatGPT supports subjective\nlearning and teaching. The results and analysis of these surveys are presented\nto highlight the impact of ChatGPT in this context.\n","authors":["Abu Saleh Musa Miah","Md Mahbubur Rahman Tusher","Md. Moazzem Hossain","Md Mamun Hossain","Md Abdur Rahim","Md Ekramul Hamid","Md. Saiful Islam","Jungpil Shin"],"pdf_url":"https://arxiv.org/pdf/2411.02816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23280v2","updated":"2024-11-05T05:28:46Z","published":"2024-10-30T17:57:21Z","title":"RelationBooth: Towards Relation-Aware Customized Object Generation","summary":" Customized image generation is crucial for delivering personalized content\nbased on user-provided image prompts, aligning large-scale text-to-image\ndiffusion models with individual needs. However, existing models often overlook\nthe relationships between customized objects in generated images. Instead, this\nwork addresses that gap by focusing on relation-aware customized image\ngeneration, which aims to preserve the identities from image prompts while\nmaintaining the predicate relations described in text prompts. Specifically, we\nintroduce RelationBooth, a framework that disentangles identity and relation\nlearning through a well-curated dataset. Our training data consists of\nrelation-specific images, independent object images containing identity\ninformation, and text prompts to guide relation generation. Then, we propose\ntwo key modules to tackle the two main challenges: generating accurate and\nnatural relations, especially when significant pose adjustments are required,\nand avoiding object confusion in cases of overlap. First, we introduce a\nkeypoint matching loss that effectively guides the model in adjusting object\nposes closely tied to their relationships. Second, we incorporate local\nfeatures from the image prompts to better distinguish between objects,\npreventing confusion in overlapping cases. Extensive results on three\nbenchmarks demonstrate the superiority of RelationBooth in generating precise\nrelations while preserving object identities across a diverse set of objects\nand relations. 
The source code and trained models will be made available to the\npublic.\n","authors":["Qingyu Shi","Lu Qi","Jianzong Wu","Jinbin Bai","Jingbo Wang","Yunhai Tong","Xiangtai Li","Ming-Husan Yang"],"pdf_url":"https://arxiv.org/pdf/2410.23280v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02815v1","updated":"2024-11-05T05:27:03Z","published":"2024-11-05T05:27:03Z","title":"Artificial Intelligence-Enhanced Couinaud Segmentation for Precision\n Liver Cancer Therapy","summary":" Precision therapy for liver cancer necessitates accurately delineating liver\nsub-regions to protect healthy tissue while targeting tumors, which is\nessential for reducing recurrence and improving survival rates. However, the\nsegmentation of hepatic segments, known as Couinaud segmentation, is\nchallenging due to indistinct sub-region boundaries and the need for extensive\nannotated datasets. This study introduces LiverFormer, a novel Couinaud\nsegmentation model that effectively integrates global context with low-level\nlocal features based on a 3D hybrid CNN-Transformer architecture. Additionally,\na registration-based data augmentation strategy is equipped to enhance the\nsegmentation performance with limited labeled data. Evaluated on CT images from\n123 patients, LiverFormer demonstrated high accuracy and strong concordance\nwith expert annotations across various metrics, allowing for enhanced treatment\nplanning for surgery and radiation therapy. It has great potential to reduces\ncomplications and minimizes potential damages to surrounding tissue, leading to\nimproved outcomes for patients undergoing complex liver cancer treatments.\n","authors":["Liang Qiu","Wenhao Chi","Xiaohan Xing","Praveenbalaji Rajendran","Mingjie Li","Yuming Jiang","Oscar Pastor-Serrano","Sen Yang","Xiyue Wang","Yuanfeng Ji","Qiang Wen"],"pdf_url":"https://arxiv.org/pdf/2411.02815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00393v2","updated":"2024-11-05T05:22:24Z","published":"2024-11-01T06:40:47Z","title":"Advantages of Neural Population Coding for Deep Learning","summary":" Scalar variables, e.g., the orientation of a shape in an image, are commonly\npredicted using a single output neuron in a neural network. In contrast, the\nmammalian cortex represents variables with a population of neurons. In this\npopulation code, each neuron is most active at its preferred value and shows\npartial activity for other values. Here, we investigate the benefit of using a\npopulation code for the output layer of a neural network. We compare population\ncodes against single-neuron outputs and one-hot vectors. First, we show\ntheoretically and in experiments with synthetic data that population codes\nimprove robustness to input noise in networks of stacked linear layers. Second,\nwe demonstrate the benefit of using population codes to encode ambiguous\noutputs, such as the pose of symmetric objects. Using the T-LESS dataset of\nfeature-less real-world objects, we show that population codes improve the\naccuracy of predicting 3D object orientation from image input.\n","authors":["Heiko Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2411.00393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20213v2","updated":"2024-11-05T05:21:57Z","published":"2024-03-29T14:50:43Z","title":"VHM: Versatile and Honest Vision Language Model for Remote Sensing Image\n Analysis","summary":" This paper develops a Versatile and Honest vision language Model (VHM) for\nremote sensing image analysis. 
VHM is built on a large-scale remote sensing\nimage-text dataset with rich-content captions (VersaD), and an honest\ninstruction dataset comprising both factual and deceptive questions (HnstD).\nUnlike prevailing remote sensing image-text datasets, in which image captions\nfocus on a few prominent objects and their relationships, VersaD captions\nprovide detailed information about image properties, object attributes, and the\noverall scene. This comprehensive captioning enables VHM to thoroughly\nunderstand remote sensing images and perform diverse remote sensing tasks.\nMoreover, different from existing remote sensing instruction datasets that only\ninclude factual questions, HnstD contains additional deceptive questions\nstemming from the non-existence of objects. This feature prevents VHM from\nproducing affirmative answers to nonsense queries, thereby ensuring its\nhonesty. In our experiments, VHM significantly outperforms various vision\nlanguage models on common tasks of scene classification, visual question\nanswering, and visual grounding. Additionally, VHM achieves competent\nperformance on several unexplored tasks, such as building vectorizing,\nmulti-label classification and honest question answering.\n","authors":["Chao Pang","Xingxing Weng","Jiang Wu","Jiayu Li","Yi Liu","Jiaxing Sun","Weijia Li","Shuai Wang","Litong Feng","Gui-Song Xia","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2403.20213v2.pdf","comment":"Equal contribution: Chao Pang, Xingxing Weng, Jiang Wu; Corresponding\n author: Gui-Song Xia, Conghui He"},{"id":"http://arxiv.org/abs/2402.10884v2","updated":"2024-11-05T05:13:13Z","published":"2024-02-16T18:42:08Z","title":"Multi-modal Preference Alignment Remedies Degradation of Visual\n Instruction Tuning on Language Models","summary":" Multi-modal large language models (MLLMs) are expected to support multi-turn\nqueries of interchanging image and text modalities in production. However, the\ncurrent MLLMs trained with visual-question-answering (VQA) datasets could\nsuffer from degradation, as VQA datasets lack the diversity and complexity of\nthe original text instruction datasets with which the underlying language model\nwas trained. To address this degradation, we first collect a lightweight,\n5k-sample VQA preference dataset where answers were annotated by Gemini for\nfive quality metrics in a granular fashion and investigate standard Supervised\nFine-tuning, rejection sampling, Direct Preference Optimization (DPO) and\nSteerLM algorithms. Our findings indicate that with DPO, we can surpass the\ninstruction-following capabilities of the language model, achieving a 6.73\nscore on MT-Bench, compared to Vicuna's 6.57 and LLaVA's 5.99. This enhancement\nin textual instruction-following capability correlates with boosted visual\ninstruction performance (+4.9\\% on MM-Vet, +6\\% on LLaVA-Bench), with minimal\nalignment tax on visual knowledge benchmarks compared to the previous RLHF\napproach. 
In conclusion, we propose a distillation-based multi-modal alignment\nmodel with fine-grained annotations on a small dataset that restores and boosts\nMLLM's language capability after visual instruction tuning.\n","authors":["Shengzhi Li","Rongyu Lin","Shichao Pei"],"pdf_url":"https://arxiv.org/pdf/2402.10884v2.pdf","comment":"Project code, model and data: https://github.com/findalexli/mllm-dpo"},{"id":"http://arxiv.org/abs/2411.02812v1","updated":"2024-11-05T05:04:12Z","published":"2024-11-05T05:04:12Z","title":"NEOviz: Uncertainty-Driven Visual Analysis of Asteroid Trajectories","summary":" We introduce NEOviz, an interactive visualization system designed to assist\nplanetary defense experts in the visual analysis of the movements of near-Earth\nobjects in the Solar System that might prove hazardous to Earth. Asteroids are\noften discovered using optical telescopes and their trajectories are calculated\nfrom images, resulting in an inherent asymmetric uncertainty in their position\nand velocity. Consequently, we typically cannot determine the exact trajectory\nof an asteroid, and an ensemble of trajectories must be generated to estimate\nan asteroid's movement over time. When propagating these ensembles over\ndecades, it is challenging to visualize the varying paths and determine their\npotential impact on Earth, which could cause catastrophic damage. NEOviz equips\nexperts with the necessary tools to effectively analyze the existing catalog of\nasteroid observations. In particular, we present a novel approach for\nvisualizing the 3D uncertainty region through which an asteroid travels, while\nproviding accurate spatial context in relation to system-critical\ninfrastructure such as Earth, the Moon, and artificial satellites. Furthermore,\nwe use NEOviz to visualize the divergence of asteroid trajectories, capturing\nhigh-variance events in an asteroid's orbital properties. For potential\nimpactors, we combine the 3D visualization with an uncertainty-aware impact map\nto illustrate the potential risks to human populations. NEOviz was developed\nwith continuous input from members of the planetary defense community through a\nparticipatory design process. It is exemplified in three real-world use cases\nand evaluated via expert feedback interviews.\n","authors":["Fangfei Lan","Malin Ejdbo","Joachim Moeyens","Bei Wang","Anders Ynnerman","Alexander Bock"],"pdf_url":"https://arxiv.org/pdf/2411.02812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02799v1","updated":"2024-11-05T04:20:06Z","published":"2024-11-05T04:20:06Z","title":"ERUP-YOLO: Enhancing Object Detection Robustness for Adverse Weather\n Condition by Unified Image-Adaptive Processing","summary":" We propose an image-adaptive object detection method for adverse weather\nconditions such as fog and low-light. Our framework employs differentiable\npreprocessing filters to perform image enhancement suitable for later-stage\nobject detections. Our framework introduces two differentiable filters: a\nB\\'ezier curve-based pixel-wise (BPW) filter and a kernel-based local (KBL)\nfilter. These filters unify the functions of classical image processing filters\nand improve performance of object detection. We also propose a domain-agnostic\ndata augmentation strategy using the BPW filter. Our method does not require\ndata-specific customization of the filter combinations, parameter ranges, and\ndata augmentation. 
We evaluate our proposed approach, called Enhanced\nRobustness by Unified Image Processing (ERUP)-YOLO, by applying it to the\nYOLOv3 detector. Experiments on adverse weather datasets demonstrate that our\nproposed filters match or exceed the expressiveness of conventional methods and\nour ERUP-YOLO achieved superior performance in a wide range of adverse weather\nconditions, including fog and low-light conditions.\n","authors":["Yuka Ogino","Yuho Shoji","Takahiro Toizumi","Atsushi Ito"],"pdf_url":"https://arxiv.org/pdf/2411.02799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02796v1","updated":"2024-11-05T04:10:59Z","published":"2024-11-05T04:10:59Z","title":"Specialized Foundation Models Struggle to Beat Supervised Baselines","summary":" Following its success for vision and text, the \"foundation model\" (FM)\nparadigm -- pretraining large models on massive data, then fine-tuning on\ntarget tasks -- has rapidly expanded to domains in the sciences, engineering,\nhealthcare, and beyond. Has this achieved what the original FMs accomplished,\ni.e. the supplanting of traditional supervised learning in their domains? To\nanswer we look at three modalities -- genomics, satellite imaging, and time\nseries -- with multiple recent FMs and compare them to a standard supervised\nlearning workflow: model development, hyperparameter tuning, and training, all\nusing only data from the target task. Across these three specialized domains,\nwe find that it is consistently possible to train simple supervised models --\nno more complicated than a lightly modified wide ResNet or UNet -- that match\nor even outperform the latest foundation models. Our work demonstrates that the\nbenefits of large-scale pretraining have yet to be realized in many specialized\nareas, reinforces the need to compare new FMs to strong, well-tuned baselines,\nand introduces two new, easy-to-use, open-source, and automated workflows for\ndoing so.\n","authors":["Zongzhe Xu","Ritvik Gupta","Wenduo Cheng","Alexander Shen","Junhong Shen","Ameet Talwalkar","Mikhail Khodak"],"pdf_url":"https://arxiv.org/pdf/2411.02796v1.pdf","comment":"The first two authors contributed equally. The order was determined\n by coin flip"},{"id":"http://arxiv.org/abs/2411.02794v1","updated":"2024-11-05T04:08:59Z","published":"2024-11-05T04:08:59Z","title":"Real-Time Text Detection with Similar Mask in Traffic, Industrial, and\n Natural Scenes","summary":" Texts on the intelligent transportation scene include mass information. Fully\nharnessing this information is one of the critical drivers for advancing\nintelligent transportation. Unlike the general scene, detecting text in\ntransportation has extra demand, such as a fast inference speed, except for\nhigh accuracy. Most existing real-time text detection methods are based on the\nshrink mask, which loses some geometry semantic information and needs complex\npost-processing. In addition, the previous method usually focuses on correct\noutput, which ignores feature correction and lacks guidance during the\nintermediate process. To this end, we propose an efficient multi-scene text\ndetector that contains an effective text representation similar mask (SM) and a\nfeature correction module (FCM). Unlike previous methods, the former aims to\npreserve the geometric information of the instances as much as possible. Its\npost-progressing saves 50$\\%$ of the time, accurately and efficiently\nreconstructing text contours. 
The latter encourages false positive features to\nmove away from the positive feature center, optimizing the predictions from the\nfeature level. Some ablation studies demonstrate the efficiency of the SM and\nthe effectiveness of the FCM. Moreover, the deficiency of existing traffic\ndatasets (such as the low-quality annotation or closed source data\nunavailability) motivated us to collect and annotate a traffic text dataset,\nwhich introduces motion blur. In addition, to validate the scene robustness of\nthe SM-Net, we conduct experiments on traffic, industrial, and natural scene\ndatasets. Extensive experiments verify it achieves (SOTA) performance on\nseveral benchmarks. The code and dataset are available at:\n\\url{https://github.com/fengmulin/SMNet}.\n","authors":["Xu Han","Junyu Gao","Chuang Yang","Yuan Yuan","Qi Wang"],"pdf_url":"https://arxiv.org/pdf/2411.02794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02793v1","updated":"2024-11-05T04:04:41Z","published":"2024-11-05T04:04:41Z","title":"Toward Robust Incomplete Multimodal Sentiment Analysis via Hierarchical\n Representation Learning","summary":" Multimodal Sentiment Analysis (MSA) is an important research area that aims\nto understand and recognize human sentiment through multiple modalities. The\ncomplementary information provided by multimodal fusion promotes better\nsentiment analysis compared to utilizing only a single modality. Nevertheless,\nin real-world applications, many unavoidable factors may lead to situations of\nuncertain modality missing, thus hindering the effectiveness of multimodal\nmodeling and degrading the model's performance. To this end, we propose a\nHierarchical Representation Learning Framework (HRLF) for the MSA task under\nuncertain missing modalities. Specifically, we propose a fine-grained\nrepresentation factorization module that sufficiently extracts valuable\nsentiment information by factorizing modality into sentiment-relevant and\nmodality-specific representations through crossmodal translation and sentiment\nsemantic reconstruction. Moreover, a hierarchical mutual information\nmaximization mechanism is introduced to incrementally maximize the mutual\ninformation between multi-scale representations to align and reconstruct the\nhigh-level semantics in the representations. Ultimately, we propose a\nhierarchical adversarial learning mechanism that further aligns and adapts the\nlatent distribution of sentiment-relevant representations to produce robust\njoint multimodal representations. 
Comprehensive experiments on three datasets\ndemonstrate that HRLF significantly improves MSA performance under uncertain\nmodality missing cases.\n","authors":["Mingcheng Li","Dingkang Yang","Yang Liu","Shunli Wang","Jiawei Chen","Shuaibing Wang","Jinjie Wei","Yue Jiang","Qingyao Xu","Xiaolu Hou","Mingyang Sun","Ziyun Qian","Dongliang Kou","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.02793v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.08695v2","updated":"2024-11-05T03:56:21Z","published":"2024-10-11T10:33:51Z","title":"Dynamic Multimodal Evaluation with Flexible Complexity by\n Vision-Language Bootstrapping","summary":" Large Vision-Language Models (LVLMs) have demonstrated remarkable\ncapabilities across multimodal tasks such as visual perception and reasoning,\nleading to good performance on various multimodal evaluation benchmarks.\nHowever, these benchmarks keep a static nature and overlap with the\npre-training data, resulting in fixed complexity constraints and data\ncontamination issues. This raises the concern regarding the validity of the\nevaluation. To address these two challenges, we introduce a dynamic multimodal\nevaluation protocol called Vision-Language Bootstrapping (VLB). VLB provides a\nrobust and comprehensive assessment for LVLMs with reduced data contamination\nand flexible complexity. To this end, VLB dynamically generates new visual\nquestion-answering samples through a multimodal bootstrapping module that\nmodifies both images and language, while ensuring that newly generated samples\nremain consistent with the original ones by a judge module. By composing\nvarious bootstrapping strategies, VLB offers dynamic variants of existing\nbenchmarks with diverse complexities, enabling the evaluation to co-evolve with\nthe ever-evolving capabilities of LVLMs. 
Extensive experimental results across\nmultiple benchmarks, including SEEDBench, MMBench, and MME, show that VLB\nsignificantly reduces data contamination and exposes performance limitations of\nLVLMs.\n","authors":["Yue Yang","Shuibai Zhang","Wenqi Shao","Kaipeng Zhang","Yi Bin","Yu Wang","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2410.08695v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02779v1","updated":"2024-11-05T03:44:54Z","published":"2024-11-05T03:44:54Z","title":"Advancing Recycling Efficiency: A Comparative Analysis of Deep Learning\n Models in Waste Classification","summary":" With the ongoing increase in the worldwide population and escalating\nconsumption habits, there's a surge in the amount of waste produced. The\nsituation poses considerable challenges for waste management and the\noptimization of recycling operations. The research tackles the pressing issue of\nwaste classification for recycling by analyzing various deep learning\nmodels, including Convolutional Neural Network (CNN), AlexNet, ResNet, ResNet50 plus\nSupport Vector Machine (SVM), and transformers, across a wide array of waste\ncategories. The research meticulously compares these models on several criteria,\nsuch as parameter settings, category accuracy, total accuracy, and model parameters,\nto establish a uniform evaluation criterion. This research presents a novel\nmethod that incorporates SVM with deep learning frameworks, particularly\nResNet50. The results indicate the method significantly boosts accuracy in\ncomplex waste categories. Moreover, the transformer model outshines others in\naverage accuracy, showcasing its aptitude for intricate classification tasks. To\nimprove performance in poorly performing categories, the research advocates for\nenlarging the dataset, employing data augmentation, and leveraging sophisticated\nmodels such as transformers, along with refining training methodologies. The\nresearch paves the way for future advancements in multi-category waste\nrecycling and underscores the pivotal role of deep learning in promoting\nenvironmental sustainability.\n","authors":["Zhanshan Qiao"],"pdf_url":"https://arxiv.org/pdf/2411.02779v1.pdf","comment":"Accepted by the 6th International Conference on Computing and Data\n Science (CONF-CDS 2024), 12 pages, 8 figures, references added"},{"id":"http://arxiv.org/abs/2407.11781v3","updated":"2024-11-05T03:43:07Z","published":"2024-07-16T14:38:13Z","title":"Sliding Gaussian ball adaptive growth (SlingBAG): point cloud-based\n iterative algorithm for large-scale 3D photoacoustic imaging","summary":" Large-scale 3D photoacoustic (PA) imaging has become increasingly important\nfor both clinical and pre-clinical applications. Limited by cost and system\ncomplexity, only systems with sparsely-distributed sensors can be widely\nimplemented, which calls for advanced reconstruction algorithms to reduce\nartifacts. However, the high memory and time consumption of traditional\niterative reconstruction (IR) algorithms is practically unacceptable for\nlarge-scale 3D PA imaging. Here, we propose a point cloud-based IR algorithm\nthat reduces memory consumption by several orders of magnitude, wherein the 3D PA scene is\nmodeled as a series of Gaussian-distributed spherical sources stored in the form of a\npoint cloud. 
During the IR process, not only are properties of each Gaussian\nsource, including its peak intensity (initial pressure value), standard\ndeviation (size) and mean (position) continuously optimized, but also each\nGaussian source itself adaptively undergoes destroying, splitting, and\nduplication along the gradient direction. This method, named the sliding\nGaussian ball adaptive growth (SlingBAG) algorithm, enables high-quality\nlarge-scale 3D PA reconstruction with fast iteration and extremely low memory\nusage. We validated SlingBAG algorithm in both simulation study and in vivo\nanimal experiments. The source code and data for SlingBAG, along with\nsupplementary materials and demonstration videos, are now available in the\nfollowing GitHub repository: https://github.com/JaegerCQ/SlingBAG.\n","authors":["Shuang Li","Yibing Wang","Jian Gao","Chulhong Kim","Seongwook Choi","Yu Zhang","Qian Chen","Yao Yao","Changhui Li"],"pdf_url":"https://arxiv.org/pdf/2407.11781v3.pdf","comment":"Added SlingBAG reconstruction of rat kidney and rat liver results;\n updated methods; added references"},{"id":"http://arxiv.org/abs/2411.02149v2","updated":"2024-11-05T03:41:28Z","published":"2024-11-04T15:06:57Z","title":"Improving Domain Generalization in Self-supervised Monocular Depth\n Estimation via Stabilized Adversarial Training","summary":" Learning a self-supervised Monocular Depth Estimation (MDE) model with great\ngeneralization remains significantly challenging. Despite the success of\nadversarial augmentation in the supervised learning generalization, naively\nincorporating it into self-supervised MDE models potentially causes\nover-regularization, suffering from severe performance degradation. In this\npaper, we conduct qualitative analysis and illuminate the main causes: (i)\ninherent sensitivity in the UNet-alike depth network and (ii) dual optimization\nconflict caused by over-regularization. To tackle these issues, we propose a\ngeneral adversarial training framework, named Stabilized Conflict-optimization\nAdversarial Training (SCAT), integrating adversarial data augmentation into\nself-supervised MDE methods to achieve a balance between stability and\ngeneralization. Specifically, we devise an effective scaling depth network that\ntunes the coefficients of long skip connection and effectively stabilizes the\ntraining process. Then, we propose a conflict gradient surgery strategy, which\nprogressively integrates the adversarial gradient and optimizes the model\ntoward a conflict-free direction. Extensive experiments on five benchmarks\ndemonstrate that SCAT can achieve state-of-the-art performance and\nsignificantly improve the generalization capability of existing self-supervised\nMDE methods.\n","authors":["Yuanqi Yao","Gang Wu","Kui Jiang","Siao Liu","Jian Kuai","Xianming Liu","Junjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.02149v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2411.02773v1","updated":"2024-11-05T03:34:53Z","published":"2024-11-05T03:34:53Z","title":"FedBlock: A Blockchain Approach to Federated Learning against Backdoor\n Attacks","summary":" Federated Learning (FL) is a machine learning method for training with\nprivate data locally stored in distributed machines without gathering them into\none place for central learning. Despite its promises, FL is prone to critical\nsecurity risks. First, because FL depends on a central server to aggregate\nlocal training models, this is a single point of failure. The server might\nfunction maliciously. 
Second, due to its distributed nature, FL might encounter\nbackdoor attacks by participating clients. They can poison the local model\nbefore submitting to the server. Either type of attack, on the server or the\nclient side, would severely degrade learning accuracy. We propose FedBlock, a\nnovel blockchain-based FL framework that addresses both of these security\nrisks. FedBlock is uniquely desirable in that it involves only smart contract\nprogramming, thus deployable atop any blockchain network. Our framework is\nsubstantiated with a comprehensive evaluation study using real-world datasets.\nIts robustness against backdoor attacks is competitive with the literature of\nFL backdoor defense. The latter, however, does not address the server risk as\nwe do.\n","authors":["Duong H. Nguyen","Phi L. Nguyen","Truong T. Nguyen","Hieu H. Pham","Duc A. Tran"],"pdf_url":"https://arxiv.org/pdf/2411.02773v1.pdf","comment":"This paper has been accepted as a full paper for the IEEE Special\n Session Federated Learning on Big Data 2024 (IEEE BigData 2024)"},{"id":"http://arxiv.org/abs/2202.08498v3","updated":"2024-11-05T03:29:58Z","published":"2022-02-17T08:03:48Z","title":"Mirror-Yolo: A Novel Attention Focus, Instance Segmentation and Mirror\n Detection Model","summary":" Mirrors can degrade the performance of computer vision models, but research\ninto detecting them is in the preliminary phase. YOLOv4 achieves phenomenal\nresults in terms of object detection accuracy and speed, but it still fails in\ndetecting mirrors. Thus, we propose Mirror-YOLO, which targets mirror\ndetection, containing a novel attention focus mechanism for features\nacquisition, a hypercolumn-stairstep approach to better fusion the feature\nmaps, and the mirror bounding polygons for instance segmentation. Compared to\nthe existing mirror detection networks and YOLO series, our proposed network\nachieves superior performance in average accuracy on our proposed mirror\ndataset and another state-of-art mirror dataset, which demonstrates the\nvalidity and effectiveness of Mirror-YOLO.\n","authors":["Fengze Li","Jieming Ma","Zhongbei Tian","Ji Ge","Hai-Ning Liang","Yungang Zhang","Tianxi Wen"],"pdf_url":"https://arxiv.org/pdf/2202.08498v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02768v1","updated":"2024-11-05T03:26:26Z","published":"2024-11-05T03:26:26Z","title":"One-Stage-TFS: Thai One-Stage Fingerspelling Dataset for Fingerspelling\n Recognition Frameworks","summary":" The Thai One-Stage Fingerspelling (One-Stage-TFS) dataset is a comprehensive\nresource designed to advance research in hand gesture recognition, explicitly\nfocusing on the recognition of Thai sign language. This dataset comprises 7,200\nimages capturing 15 one-stage consonant gestures performed by undergraduate\nstudents from Rajabhat Maha Sarakham University, Thailand. The contributors\ninclude both expert students from the Special Education Department with\nproficiency in Thai sign language and students from other departments without\nprior sign language experience. Images were collected between July and December\n2021 using a DSLR camera, with contributors demonstrating hand gestures against\nboth simple and complex backgrounds. The One-Stage-TFS dataset presents\nchallenges in detecting and recognizing hand gestures, offering opportunities\nto develop novel end-to-end recognition frameworks. 
Researchers can utilize\nthis dataset to explore deep learning methods, such as YOLO, EfficientDet,\nRetinaNet, and Detectron, for hand detection, followed by feature extraction\nand recognition using techniques like convolutional neural networks,\ntransformers, and adaptive feature fusion networks. The dataset is accessible\nvia the Mendeley Data repository and supports a wide range of applications in\ncomputer science, including deep learning, computer vision, and pattern\nrecognition, thereby encouraging further innovation and exploration in these\nfields.\n","authors":["Siriwiwat Lata","Sirawan Phiphitphatphaisit","Emmanuel Okafor","Olarik Surinta"],"pdf_url":"https://arxiv.org/pdf/2411.02768v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.02762v1","updated":"2024-11-05T03:14:36Z","published":"2024-11-05T03:14:36Z","title":"EcoCropsAID: Economic Crops Aerial Image Dataset for Land Use\n Classification","summary":" The EcoCropsAID dataset is a comprehensive collection of 5,400 aerial images\ncaptured between 2014 and 2018 using the Google Earth application. This dataset\nfocuses on five key economic crops in Thailand: rice, sugarcane, cassava,\nrubber, and longan. The images were collected at various crop growth stages:\nearly cultivation, growth, and harvest, resulting in significant variability\nwithin each category and similarities across different categories. These\nvariations, coupled with differences in resolution, color, and contrast\nintroduced by multiple remote imaging sensors, present substantial challenges\nfor land use classification. The dataset is an interdisciplinary resource that\nspans multiple research domains, including remote sensing, geoinformatics,\nartificial intelligence, and computer vision. The unique features of the\nEcoCropsAID dataset offer opportunities for researchers to explore novel\napproaches, such as extracting spatial and temporal features, developing deep\nlearning architectures, and implementing transformer-based models. The\nEcoCropsAID dataset provides a valuable platform for advancing research in land\nuse classification, with implications for optimizing agricultural practices and\nenhancing sustainable development. This study explicitly investigates the use\nof deep learning algorithms to classify economic crop areas in northeastern\nThailand, utilizing satellite imagery to address the challenges posed by\ndiverse patterns and similarities across categories.\n","authors":["Sangdaow Noppitak","Emmanuel Okafor","Olarik Surinta"],"pdf_url":"https://arxiv.org/pdf/2411.02762v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.07714v3","updated":"2024-11-05T02:59:08Z","published":"2024-09-12T02:50:04Z","title":"CollaMamba: Efficient Collaborative Perception with Cross-Agent\n Spatial-Temporal State Space Model","summary":" By sharing complementary perceptual information, multi-agent collaborative\nperception fosters a deeper understanding of the environment. Recent studies on\ncollaborative perception mostly utilize CNNs or Transformers to learn feature\nrepresentation and fusion in the spatial dimension, which struggle to handle\nlong-range spatial-temporal features under limited computing and communication\nresources. Holistically modeling the dependencies over extensive spatial areas\nand extended temporal frames is crucial to enhancing feature quality. To this\nend, we propose a resource efficient cross-agent spatial-temporal collaborative\nstate space model (SSM), named CollaMamba. 
Initially, we construct a\nfoundational backbone network based on spatial SSM. This backbone adeptly\ncaptures positional causal dependencies from both single-agent and cross-agent\nviews, yielding compact and comprehensive intermediate features while\nmaintaining linear complexity. Furthermore, we devise a history-aware feature\nboosting module based on temporal SSM, extracting contextual cues from extended\nhistorical frames to refine vague features while preserving low overhead.\nExtensive experiments across several datasets demonstrate that CollaMamba\noutperforms state-of-the-art methods, achieving higher model accuracy while\nreducing computational and communication overhead by up to 71.9% and 1/64,\nrespectively. This work pioneers the exploration of the Mamba's potential in\ncollaborative perception. The source code will be made available.\n","authors":["Yang Li","Quan Yuan","Guiyang Luo","Xiaoyuan Fu","Xuanhan Zhu","Yujia Yang","Rui Pan","Jinglin Li"],"pdf_url":"https://arxiv.org/pdf/2409.07714v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17157v2","updated":"2024-11-05T02:57:31Z","published":"2024-07-24T11:00:08Z","title":"Establishing Causal Relationship Between Whole Slide Image Predictions\n and Diagnostic Evidence Subregions in Deep Learning","summary":" Due to the lack of fine-grained annotation guidance, current Multiple\nInstance Learning (MIL) struggles to establish a robust causal relationship\nbetween Whole Slide Image (WSI) diagnosis and evidence sub-images, just like\nfully supervised learning. So many noisy images can undermine the network's\nprediction. The proposed Causal Inference Multiple Instance Learning (CI-MIL),\nuses out-of-distribution generalization to reduce the recognition confusion of\nsub-images by MIL network, without requiring pixelwise annotations.\nSpecifically, feature distillation is introduced to roughly identify the\nfeature representation of lesion patches. Then, in the random Fourier feature\nspace, these features are re-weighted to minimize the cross-correlation,\neffectively correcting the feature distribution deviation. These processes\nreduce the uncertainty when tracing the prediction results back to patches.\nPredicted diagnoses are more direct and reliable because the causal\nrelationship between them and diagnostic evidence images is more clearly\nrecognized by the network. Experimental results demonstrate that CI-MIL\noutperforms state-of-the-art methods, achieving 92.25% accuracy and 95.28% AUC\non the Camelyon16 dataset (breast cancer), while 94.29% accuracy and 98.07% AUC\non the TCGA-NSCLC dataset (non-small cell lung cancer). Additionally, CI-MIL\nexhibits superior interpretability, as its selected regions demonstrate high\nconsistency with ground truth annotations, promising more reliable diagnostic\nassistance for pathologists.\n","authors":["Tianhang Nan","Yong Ding","Hao Quan","Deliang Li","Lisha Li","Guanghong Zhao","Xiaoyu Cui"],"pdf_url":"https://arxiv.org/pdf/2407.17157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14746v4","updated":"2024-11-05T02:51:22Z","published":"2023-08-28T17:55:33Z","title":"CoVR-2: Automatic Data Construction for Composed Video Retrieval","summary":" Composed Image Retrieval (CoIR) has recently gained popularity as a task that\nconsiders both text and image queries together, to search for relevant images\nin a database. 
Most CoIR approaches require manually annotated datasets,\ncomprising image-text-image triplets, where the text describes a modification\nfrom the query image to the target image. However, manual curation of CoIR\ntriplets is expensive and prevents scalability. In this work, we instead\npropose a scalable automatic dataset creation methodology that generates\ntriplets given video-caption pairs, while also expanding the scope of the task\nto include composed video retrieval (CoVR). To this end, we mine paired videos\nwith a similar caption from a large database, and leverage a large language\nmodel to generate the corresponding modification text. Applying this\nmethodology to the extensive WebVid2M collection, we automatically construct\nour WebVid-CoVR dataset, resulting in 1.6 million triplets. Moreover, we\nintroduce a new benchmark for CoVR with a manually annotated evaluation set,\nalong with baseline results. We further validate that our methodology is\nequally applicable to image-caption pairs, by generating 3.3 million CoIR\ntraining triplets using the Conceptual Captions dataset. Our model builds on\nBLIP-2 pretraining, adapting it to composed video (or image) retrieval, and\nincorporates an additional caption retrieval loss to exploit extra supervision\nbeyond the triplet. We provide extensive ablations to analyze the design\nchoices on our new CoVR benchmark. Our experiments also demonstrate that\ntraining a CoVR model on our datasets effectively transfers to CoIR, leading to\nimproved state-of-the-art performance in the zero-shot setup on the CIRR,\nFashionIQ, and CIRCO benchmarks. Our code, datasets, and models are publicly\navailable at https://imagine.enpc.fr/~ventural/covr.\n","authors":["Lucas Ventura","Antoine Yang","Cordelia Schmid","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2308.14746v4.pdf","comment":"Appears in TPAMI 2024 (DOI: 10.1109/TPAMI.2024.3463799). Journal\n extension of the AAAI 2024 conference paper arXiv:2308.14746v3. Project page:\n https://imagine.enpc.fr/~ventural/covr/"},{"id":"http://arxiv.org/abs/2411.02753v1","updated":"2024-11-05T02:50:47Z","published":"2024-11-05T02:50:47Z","title":"Label Critic: Design Data Before Models","summary":" As medical datasets rapidly expand, creating detailed annotations of\ndifferent body structures becomes increasingly expensive and time-consuming. We\nconsider that requesting radiologists to create detailed annotations is\nunnecessarily burdensome and that pre-existing AI models can largely automate\nthis process. Following the spirit of \"don't use a sledgehammer on a nut\", we find\nthat, rather than creating annotations from scratch, radiologists only have to\nreview and edit errors if the Best-AI Labels have mistakes. To obtain the\nBest-AI Labels among multiple AI Labels, we developed an automatic tool, called\nLabel Critic, that can assess label quality through tireless pairwise\ncomparisons. Extensive experiments demonstrate that, when incorporated with our\ndeveloped Image-Prompt pairs, pre-existing Large Vision-Language Models (LVLM),\ntrained on natural images and texts, achieve 96.5% accuracy when choosing the\nbest label in a pair-wise comparison, without extra fine-tuning. By\ntransforming the manual annotation task (30-60 min/scan) into an automatic\ncomparison task (15 sec/scan), we effectively reduce the manual efforts\nrequired from radiologists by an order of magnitude. 
When the Best-AI Labels\nare sufficiently accurate (81% depending on body structures), they will be\ndirectly adopted as the gold-standard annotations for the dataset, with\nlower-quality AI Labels automatically discarded. Label Critic can also check\nthe label quality of a single AI Label with 71.8% accuracy when no alternatives\nare available for comparison, prompting radiologists to review and edit if the\nestimated quality is low (19% depending on body structures).\n","authors":["Pedro R. A. S. Bassi","Qilong Wu","Wenxuan Li","Sergio Decherchi","Andrea Cavalli","Alan Yuille","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.02753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02747v1","updated":"2024-11-05T02:33:25Z","published":"2024-11-05T02:33:25Z","title":"Efficient Feature Aggregation and Scale-Aware Regression for Monocular\n 3D Object Detection","summary":" Monocular 3D object detection has attracted great attention due to simplicity\nand low cost. Existing methods typically follow conventional 2D detection\nparadigms, first locating object centers and then predicting 3D attributes via\nneighboring features. However, these methods predominantly rely on progressive\ncross-scale feature aggregation and focus solely on local information, which\nmay result in a lack of global awareness and the omission of small-scale\nobjects. In addition, due to large variation in object scales across different\nscenes and depths, inaccurate receptive fields often lead to background noise\nand degraded feature representation. To address these issues, we introduces\nMonoASRH, a novel monocular 3D detection framework composed of Efficient Hybrid\nFeature Aggregation Module (EH-FAM) and Adaptive Scale-Aware 3D Regression Head\n(ASRH). Specifically, EH-FAM employs multi-head attention with a global\nreceptive field to extract semantic features for small-scale objects and\nleverages lightweight convolutional modules to efficiently aggregate visual\nfeatures across different scales. The ASRH encodes 2D bounding box dimensions\nand then fuses scale features with the semantic features aggregated by EH-FAM\nthrough a scale-semantic feature fusion module. The scale-semantic feature\nfusion module guides ASRH in learning dynamic receptive field offsets,\nincorporating scale priors into 3D position prediction for better\nscale-awareness. Extensive experiments on the KITTI and Waymo datasets\ndemonstrate that MonoASRH achieves state-of-the-art performance.\n","authors":["Yifan Wang","Xiaochen Yang","Fanqi Pu","Qingmin Liao","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2411.02747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04828v3","updated":"2024-11-05T02:32:06Z","published":"2024-09-07T13:41:37Z","title":"POINTS: Improving Your Vision-language Model with Affordable Strategies","summary":" In recent years, vision-language models have made significant strides,\nexcelling in tasks like optical character recognition and geometric\nproblem-solving. However, several critical issues remain: 1) Proprietary models\noften lack transparency about their architectures, while open-source models\nneed more detailed ablations of their training strategies. 2) Pre-training data\nin open-source works is under-explored, with datasets added empirically, making\nthe process cumbersome. 3) Fine-tuning often focuses on adding datasets,\nleading to diminishing returns. 
To address these issues, we propose the\nfollowing contributions: 1) We trained a robust baseline model using the latest\nadvancements in vision-language models, introducing effective improvements and\nconducting comprehensive ablation and validation for each technique. 2)\nInspired by recent work on large language models, we filtered pre-training data\nusing perplexity, selecting the lowest perplexity data for training. This\napproach allowed us to train on a curated 1M dataset, achieving competitive\nperformance. 3) During visual instruction tuning, we used model soup on\ndifferent datasets when adding more datasets yielded marginal improvements.\nThese innovations resulted in a 9B parameter model that performs competitively\nwith state-of-the-art models. Our strategies are efficient and lightweight,\nmaking them easily adoptable by the community.\n","authors":["Yuan Liu","Zhongyin Zhao","Ziyuan Zhuang","Le Tian","Xiao Zhou","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.04828v3.pdf","comment":"v2"},{"id":"http://arxiv.org/abs/2411.02745v1","updated":"2024-11-05T02:31:49Z","published":"2024-11-05T02:31:49Z","title":"Foundation AI Model for Medical Image Segmentation","summary":" Foundation models refer to artificial intelligence (AI) models that are\ntrained on massive amounts of data and demonstrate broad generalizability\nacross various tasks with high accuracy. These models offer versatile,\none-for-many or one-for-all solutions, eliminating the need for developing\ntask-specific AI models. Examples of such foundation models include the Chat\nGenerative Pre-trained Transformer (ChatGPT) and the Segment Anything Model\n(SAM). These models have been trained on millions to billions of samples and\nhave shown wide-ranging and accurate applications in numerous tasks such as\ntext processing (using ChatGPT) and natural image segmentation (using SAM). In\nmedical image segmentation - finding target regions in medical images - there\nis a growing need for these one-for-many or one-for-all foundation models. Such\nmodels could obviate the need to develop thousands of task-specific AI models,\nwhich is currently standard practice in the field. They can also be adapted to\ntasks with datasets too small for effective training. We discuss two paths to\nachieve foundation models for medical image segmentation and comment on\nprogress, challenges, and opportunities. One path is to adapt or fine-tune\nexisting models, originally developed for natural images, for use with medical\nimages. The second path entails building models from scratch, exclusively\ntraining on medical images.\n","authors":["Rina Bao","Erfan Darzi","Sheng He","Chuan-Heng Hsiao","Mohammad Arafat Hussain","Jingpeng Li","Atle Bjornerud","Ellen Grant","Yangming Ou"],"pdf_url":"https://arxiv.org/pdf/2411.02745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11190v3","updated":"2024-11-05T02:27:57Z","published":"2024-10-15T02:10:45Z","title":"Mini-Omni2: Towards Open-source GPT-4o with Vision, Speech and Duplex\n Capabilities","summary":" GPT-4o, an all-encompassing model, represents a milestone in the development\nof large multi-modal language models. It can understand visual, auditory, and\ntextual modalities, directly output audio, and support flexible duplex\ninteraction. 
Models from the open-source community often achieve some\nfunctionalities of GPT-4o, such as visual understanding and voice chat.\nNevertheless, training a unified model that incorporates all modalities is\nchallenging due to the complexities of multi-modal data, intricate model\narchitectures, and training processes. In this paper, we introduce Mini-Omni2,\na visual-audio assistant capable of providing real-time, end-to-end voice\nresponses to vision and audio queries. By integrating pretrained visual and\nauditory encoders, Mini-Omni2 maintains performance in individual modalities.\nWe propose a three-stage training process to align modalities, allowing the\nlanguage model to handle multi-modal inputs and outputs after training on a\nlimited dataset. For interaction, we introduce a command-based interruption\nmechanism, enabling more flexible interaction with users. To the best of our\nknowledge, Mini-Omni2 is one of the closest reproductions of GPT-4o, with a\nsimilar form of functionality, and we hope it can offer valuable insights for\nsubsequent research.\n","authors":["Zhifei Xie","Changqiao Wu"],"pdf_url":"https://arxiv.org/pdf/2410.11190v3.pdf","comment":"Technical report, work in progress. Demo and code:\n https://github.com/gpt-omni/mini-omni2"},{"id":"http://arxiv.org/abs/2410.12816v2","updated":"2024-11-05T02:27:30Z","published":"2024-10-01T09:33:45Z","title":"Rethinking Misalignment in Vision-Language Model Adaptation from a\n Causal Perspective","summary":" Foundational Vision-Language models such as CLIP have exhibited impressive\ngeneralization in downstream tasks. However, CLIP suffers from a two-level\nmisalignment issue, i.e., task misalignment and data misalignment, when\nadapting to specific tasks. Soft prompt tuning has mitigated the task\nmisalignment, yet the data misalignment remains a challenge. To analyze the\nimpacts of the data misalignment, we revisit the pre-training and adaptation\nprocesses of CLIP and develop a structural causal model. We discover that while\nwe expect to capture task-relevant information for downstream tasks accurately,\nthe task-irrelevant knowledge impacts the prediction results and hampers the\nmodeling of the true relationships between the images and the predicted\nclasses. As task-irrelevant knowledge is unobservable, we leverage the\nfront-door adjustment and propose Causality-Guided Semantic Decoupling and\nClassification (CDC) to mitigate the interference of task-irrelevant knowledge.\nSpecifically, we decouple semantics contained in the data of downstream tasks\nand perform classification based on each semantic. Furthermore, we employ the\nDempster-Shafer evidence theory to evaluate the uncertainty of each prediction\ngenerated by diverse semantics. Experiments conducted in multiple different\nsettings have consistently demonstrated the effectiveness of CDC.\n","authors":["Yanan Zhang","Jiangmeng Li","Lixiang Liu","Wenwen Qiang"],"pdf_url":"https://arxiv.org/pdf/2410.12816v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.17871v2","updated":"2024-11-05T02:26:51Z","published":"2024-05-28T06:44:13Z","title":"Seeing the Image: Prioritizing Visual Correlation by Contrastive\n Alignment","summary":" Existing image-text modality alignment in Vision Language Models (VLMs)\ntreats each text token equally in an autoregressive manner. 
Despite being\nsimple and effective, this method results in sub-optimal cross-modal alignment\nby over-emphasizing the text tokens that are less correlated with or even\ncontradictory with the input images. In this paper, we advocate for assigning\ndistinct contributions for each text token based on its visual correlation.\nSpecifically, we present by contrasting image inputs, the difference in\nprediction logits on each text token provides strong guidance of visual\ncorrelation. We therefore introduce Contrastive ALignment (CAL), a simple yet\neffective re-weighting strategy that prioritizes training visually correlated\ntokens. Our experimental results demonstrate that CAL consistently improves\ndifferent types of VLMs across different resolutions and model sizes on various\nbenchmark datasets. Importantly, our method incurs minimal additional\ncomputational overhead, rendering it highly efficient compared to alternative\ndata scaling strategies. Codes are available at\nhttps://github.com/foundation-multimodal-models/CAL.\n","authors":["Xin Xiao","Bohong Wu","Jiacong Wang","Chunyuan Li","Xun Zhou","Haoyuan Guo"],"pdf_url":"https://arxiv.org/pdf/2405.17871v2.pdf","comment":"NeurlPS 2024, Camera ready"},{"id":"http://arxiv.org/abs/2309.07918v5","updated":"2024-11-05T02:17:22Z","published":"2023-09-14T17:59:49Z","title":"Unified Human-Scene Interaction via Prompted Chain-of-Contacts","summary":" Human-Scene Interaction (HSI) is a vital component of fields like embodied AI\nand virtual reality. Despite advancements in motion quality and physical\nplausibility, two pivotal factors, versatile interaction control and the\ndevelopment of a user-friendly interface, require further exploration before\nthe practical application of HSI. This paper presents a unified HSI framework,\nUniHSI, which supports unified control of diverse interactions through language\ncommands. This framework is built upon the definition of interaction as Chain\nof Contacts (CoC): steps of human joint-object part pairs, which is inspired by\nthe strong correlation between interaction types and human-object contact\nregions. Based on the definition, UniHSI constitutes a Large Language Model\n(LLM) Planner to translate language prompts into task plans in the form of CoC,\nand a Unified Controller that turns CoC into uniform task execution. To\nfacilitate training and evaluation, we collect a new dataset named ScenePlan\nthat encompasses thousands of task plans generated by LLMs based on diverse\nscenarios. Comprehensive experiments demonstrate the effectiveness of our\nframework in versatile task execution and generalizability to real scanned\nscenes. The project page is at https://github.com/OpenRobotLab/UniHSI .\n","authors":["Zeqi Xiao","Tai Wang","Jingbo Wang","Jinkun Cao","Wenwei Zhang","Bo Dai","Dahua Lin","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2309.07918v5.pdf","comment":"A unified Human-Scene Interaction framework that supports versatile\n interactions through language commands.Project URL:\n https://xizaoqu.github.io/unihsi/ . Code:\n https://github.com/OpenRobotLab/UniHSI"},{"id":"http://arxiv.org/abs/2411.02733v1","updated":"2024-11-05T02:03:12Z","published":"2024-11-05T02:03:12Z","title":"DDFAV: Remote Sensing Large Vision Language Models Dataset and\n Evaluation Benchmark","summary":" With the rapid development of large vision language models (LVLMs), these\nmodels have shown excellent results in various multimodal tasks. 
Since LVLMs\nare prone to hallucinations and there are currently few datasets and evaluation\nmethods specifically designed for remote sensing, their performance is\ntypically poor when applied to remote sensing tasks. To address these issues,\nthis paper introduces a high quality remote sensing LVLMs dataset, DDFAV,\ncreated using data augmentation and data mixing strategies. Next, a training\ninstruction set is produced based on some high-quality remote sensing images\nselected from the proposed dataset. Finally, we develop a remote sensing LVLMs\nhallucination evaluation method RSPOPE based on the proposed dataset and\nevaluate the zero-shot capabilities of different LVLMs. Our proposed dataset,\ninstruction set, and evaluation method files are available at\nhttps://github.com/HaodongLi2024/rspope.\n","authors":["Haodong Li","Haicheng Qu","Xiaofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.02733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01469v2","updated":"2024-11-05T01:59:41Z","published":"2024-11-03T07:33:41Z","title":"Exploring PCA-based feature representations of image pixels via CNN to\n enhance food image segmentation","summary":" For open vocabulary recognition of ingredients in food images, segmenting the\ningredients is a crucial step. This paper proposes a novel approach that\nexplores PCA-based feature representations of image pixels using a\nconvolutional neural network (CNN) to enhance segmentation. An internal\nclustering metric based on the silhouette score is defined to evaluate the\nclustering quality of various pixel-level feature representations generated by\ndifferent feature maps derived from various CNN backbones. Using this metric,\nthe paper explores optimal feature representation selection and suitable\nclustering methods for ingredient segmentation. Additionally, it is found that\nprincipal component (PC) maps derived from concatenations of backbone feature\nmaps improve the clustering quality of pixel-level feature representations,\nresulting in stable segmentation outcomes. Notably, the number of selected\neigenvalues can be used as the number of clusters to achieve good segmentation\nresults. The proposed method performs well on the ingredient-labeled dataset\nFoodSeg103, achieving a mean Intersection over Union (mIoU) score of 0.5423.\nImportantly, the proposed method is unsupervised, and pixel-level feature\nrepresentations from backbones are not fine-tuned on specific datasets. This\ndemonstrates the flexibility, generalizability, and interpretability of the\nproposed method, while reducing the need for extensive labeled datasets.\n","authors":["Ying Dai"],"pdf_url":"https://arxiv.org/pdf/2411.01469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02724v1","updated":"2024-11-05T01:44:22Z","published":"2024-11-05T01:44:22Z","title":"TransUNext: towards a more advanced U-shaped framework for automatic\n vessel segmentation in the fundus image","summary":" Purpose: Automatic and accurate segmentation of fundus vessel images has\nbecome an essential prerequisite for computer-aided diagnosis of ophthalmic\ndiseases such as diabetes mellitus. The task of high-precision retinal vessel\nsegmentation still faces difficulties due to the low contrast between the\nbranch ends of retinal vessels and the background, the long and thin vessel\nspan, and the variable morphology of the optic disc and optic cup in fundus\nvessel images. 
Methods: We propose a more advanced U-shaped architecture for a\nhybrid Transformer and CNN: TransUNext, which integrates an Efficient\nSelf-attention Mechanism into the encoder and decoder of U-Net to capture both\nlocal features and global dependencies with minimal computational overhead.\nMeanwhile, the Global Multi-Scale Fusion (GMSF) module is further introduced to\nupgrade skip-connections, fuse high-level semantic and low-level detailed\ninformation, and eliminate high- and low-level semantic differences. Inspired\nby ConvNeXt, TransNeXt Block is designed to optimize the computational\ncomplexity of each base block in U-Net and avoid the information loss caused by\nthe compressed dimension when the information is converted between the feature\nspaces of different dimensions. Results: We evaluated the proposed method on\nfour public datasets DRIVE, STARE, CHASE-DB1, and HRF. In the experimental\nresults, the AUC (area under the ROC curve) values were 0.9867, 0.9869, 0.9910,\nand 0.9887, which exceeded the other state-of-the-art.\n","authors":["Xiang Li","Mingsi Liu","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2411.02724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02715v1","updated":"2024-11-05T01:27:25Z","published":"2024-11-05T01:27:25Z","title":"CIT: Rethinking Class-incremental Semantic Segmentation with a Class\n Independent Transformation","summary":" Class-incremental semantic segmentation (CSS) requires that a model learn to\nsegment new classes without forgetting how to segment previous ones: this is\ntypically achieved by distilling the current knowledge and incorporating the\nlatest data. However, bypassing iterative distillation by directly transferring\noutputs of initial classes to the current learning task is not supported in\nexisting class-specific CSS methods. Via Softmax, they enforce dependency\nbetween classes and adjust the output distribution at each learning step,\nresulting in a large probability distribution gap between initial and current\ntasks. We introduce a simple, yet effective Class Independent Transformation\n(CIT) that converts the outputs of existing semantic segmentation models into\nclass-independent forms with negligible cost or performance loss. By utilizing\nclass-independent predictions facilitated by CIT, we establish an accumulative\ndistillation framework, ensuring equitable incorporation of all class\ninformation. We conduct extensive experiments on various segmentation\narchitectures, including DeepLabV3, Mask2Former, and SegViTv2. 
Results from\nthese experiments show minimal task forgetting across different datasets, with\nless than 5% for ADE20K in the most challenging 11 task configurations and less\nthan 1% across all configurations for the PASCAL VOC 2012 dataset.\n","authors":["Jinchao Ge","Bowen Zhang","Akide Liu","Minh Hieu Phan","Qi Chen","Yangyang Shu","Yang Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.02715v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.02712v1","updated":"2024-11-05T01:24:37Z","published":"2024-11-05T01:24:37Z","title":"V-DPO: Mitigating Hallucination in Large Vision Language Models via\n Vision-Guided Direct Preference Optimization","summary":" Large vision-language models (LVLMs) suffer from hallucination, resulting in\nmisalignment between the output textual response and the input visual content.\nRecent research indicates that the over-reliance on the Large Language Model\n(LLM) backbone, as one cause of the LVLM hallucination, inherently introduces\nbias from language priors, leading to insufficient context attention to the\nvisual inputs.\n We tackle this issue of hallucination by mitigating such over-reliance\nthrough preference learning. We propose Vision-guided Direct Preference\nOptimization (V-DPO) to enhance visual context learning at training time. To\ninterpret the effectiveness and generalizability of V-DPO on different types of\ntraining data, we construct a synthetic dataset containing both response- and\nimage-contrast preference pairs, compared against existing human-annotated\nhallucination samples. Our approach achieves significant improvements compared\nwith baseline methods across various hallucination benchmarks. Our analysis\nindicates that V-DPO excels in learning from image-contrast preference data,\ndemonstrating its superior ability to elicit and understand nuances of visual\ncontext. Our code is publicly available at https://github.com/YuxiXie/V-DPO.\n","authors":["Yuxi Xie","Guanzhen Li","Xiao Xu","Min-Yen Kan"],"pdf_url":"https://arxiv.org/pdf/2411.02712v1.pdf","comment":"EMNLP 2024 Findings; 9 pages, 6 figures, 5 tables (16 pages, 8\n figures, 8 tables including references and appendices)"},{"id":"http://arxiv.org/abs/2411.02710v1","updated":"2024-11-05T01:13:34Z","published":"2024-11-05T01:13:34Z","title":"Full Field Digital Mammography Dataset from a Population Screening\n Program","summary":" Breast cancer presents the second largest cancer risk in the world to women.\nEarly detection of cancer has been shown to be effective in reducing mortality.\nPopulation screening programs schedule regular mammography imaging for\nparticipants, promoting early detection. Currently, such screening programs\nrequire manual reading. False-positive errors in the reading process\nunnecessarily leads to costly follow-up and patient anxiety. Automated methods\npromise to provide more efficient, consistent and effective reading. To\nfacilitate their development, a number of datasets have been created. With the\naim of specifically targeting population screening programs, we introduce\nNL-Breast-Screening, a dataset from a Canadian provincial screening program.\nThe dataset consists of 5997 mammography exams, each of which has four standard\nviews and is biopsy-confirmed. Cases where radiologist reading was a\nfalse-positive are identified. 
NL-Breast is made publicly available as a new\nresource to promote advances in automation for population screening programs.\n","authors":["Edward Kendall","Paraham Hajishafiezahramini","Matthew Hamilton","Gregory Doyle","Nancy Wadden","Oscar Meruvia-Pastor"],"pdf_url":"https://arxiv.org/pdf/2411.02710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07605v3","updated":"2024-11-05T01:11:08Z","published":"2024-03-12T12:44:34Z","title":"Optimizing Negative Prompts for Enhanced Aesthetics and Fidelity in\n Text-To-Image Generation","summary":" In text-to-image generation, using negative prompts, which describe\nundesirable image characteristics, can significantly boost image quality.\nHowever, producing good negative prompts is manual and tedious. To address\nthis, we propose NegOpt, a novel method for optimizing negative prompt\ngeneration toward enhanced image generation, using supervised fine-tuning and\nreinforcement learning. Our combined approach results in a substantial increase\nof 25% in Inception Score compared to other approaches and surpasses\nground-truth negative prompts from the test set. Furthermore, with NegOpt we\ncan preferentially optimize the metrics most important to us. Finally, we\nconstruct Negative Prompts DB\n(https://huggingface.co/datasets/mikeogezi/negopt_full), a publicly available\ndataset of negative prompts.\n","authors":["Michael Ogezi","Ning Shi"],"pdf_url":"https://arxiv.org/pdf/2403.07605v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02704v1","updated":"2024-11-05T01:02:51Z","published":"2024-11-05T01:02:51Z","title":"RT-Affordance: Affordances are Versatile Intermediate Representations\n for Robot Manipulation","summary":" We explore how intermediate policy representations can facilitate\ngeneralization by providing guidance on how to perform manipulation tasks.\nExisting representations such as language, goal images, and trajectory sketches\nhave been shown to be helpful, but these representations either do not provide\nenough context or provide over-specified context that yields less robust\npolicies. We propose conditioning policies on affordances, which capture the\npose of the robot at key stages of the task. Affordances offer expressive yet\nlightweight abstractions, are easy for users to specify, and facilitate\nefficient learning by transferring knowledge from large internet datasets. Our\nmethod, RT-Affordance, is a hierarchical model that first proposes an\naffordance plan given the task language, and then conditions the policy on this\naffordance plan to perform manipulation. Our model can flexibly bridge\nheterogeneous sources of supervision including large web datasets and robot\ntrajectories. We additionally train our model on cheap-to-collect in-domain\naffordance images, allowing us to learn new tasks without collecting any\nadditional costly robot trajectories. 
We show on a diverse set of novel tasks\nhow RT-Affordance exceeds the performance of existing methods by over 50%, and\nwe empirically demonstrate that affordances are robust to novel settings.\nVideos are available at https://snasiriany.me/rt-affordance\n","authors":["Soroush Nasiriany","Sean Kirmani","Tianli Ding","Laura Smith","Yuke Zhu","Danny Driess","Dorsa Sadigh","Ted Xiao"],"pdf_url":"https://arxiv.org/pdf/2411.02704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02697v1","updated":"2024-11-05T00:49:47Z","published":"2024-11-05T00:49:47Z","title":"Transferable polychromatic optical encoder for neural networks","summary":" Artificial neural networks (ANNs) have fundamentally transformed the field of\ncomputer vision, providing unprecedented performance. However, these ANNs for\nimage processing demand substantial computational resources, often hindering\nreal-time operation. In this paper, we demonstrate an optical encoder that can\nperform convolution simultaneously in three color channels during the image\ncapture, effectively implementing several initial convolutional layers of an\nANN. Such an optical encoding results in a ~24,000-fold reduction in\ncomputational operations, with state-of-the-art classification accuracy\n(~73.2%) in a free-space optical system. In addition, our analog optical encoder,\ntrained for CIFAR-10 data, can be transferred to the ImageNet subset, High-10,\nwithout any modifications, and still exhibits moderate accuracy. Our results\nevidence the potential of hybrid optical/digital computer vision systems in\nwhich the optical frontend can pre-process an ambient scene to reduce the\nenergy and latency of the whole computer vision system.\n","authors":["Minho Choi","Jinlin Xiang","Anna Wirth-Singh","Seung-Hwan Baek","Eli Shlizerman","Arka Majumdar"],"pdf_url":"https://arxiv.org/pdf/2411.02697v1.pdf","comment":"21 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2302.00290v4","updated":"2024-11-05T00:44:33Z","published":"2023-02-01T07:45:10Z","title":"MS-DETR: Multispectral Pedestrian Detection Transformer with Loosely\n Coupled Fusion and Modality-Balanced Optimization","summary":" Multispectral pedestrian detection is an important task for many\naround-the-clock applications, since the visible and thermal modalities can\nprovide complementary information especially under low light conditions. Due to\nthe presence of two modalities, misalignment and modality imbalance are the\nmost significant issues in multispectral pedestrian detection. In this paper,\nwe propose the MultiSpectral pedestrian DEtection TRansformer (MS-DETR) to fix the\nabove issues. MS-DETR consists of two modality-specific backbones and\nTransformer encoders, followed by a multi-modal Transformer decoder, and the\nvisible and thermal features are fused in the multi-modal Transformer decoder.\nTo better resist the misalignment between multi-modal images, we design a loosely\ncoupled fusion strategy by sparsely sampling some keypoints from multi-modal\nfeatures independently and fusing them with adaptively learned attention\nweights. Moreover, based on the insight that not only different modalities, but\nalso different pedestrian instances tend to have different confidence scores for the\nfinal detection, we further propose an instance-aware modality-balanced\noptimization strategy, which preserves visible and thermal decoder branches and\naligns their predicted slots through an instance-wise dynamic loss. 
Our\nend-to-end MS-DETR shows superior performance on the challenging KAIST, CVC-14\nand LLVIP benchmark datasets. The source code is available at\nhttps://github.com/YinghuiXing/MS-DETR.\n","authors":["Yinghui Xing","Shuo Yang","Song Wang","Shizhou Zhang","Guoqiang Liang","Xiuwei Zhang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.00290v4.pdf","comment":"The paper has been accepted by IEEE Transactions on Intelligent\n Transportation Systems"},{"id":"http://arxiv.org/abs/2205.14320v4","updated":"2024-11-05T00:38:29Z","published":"2022-05-28T03:32:56Z","title":"RIAV-MVS: Recurrent-Indexing an Asymmetric Volume for Multi-View Stereo","summary":" This paper presents a learning-based method for multi-view depth estimation\nfrom posed images. Our core idea is a \"learning-to-optimize\" paradigm that\niteratively indexes a plane-sweeping cost volume and regresses the depth map\nvia a convolutional Gated Recurrent Unit (GRU). Since the cost volume plays a\nparamount role in encoding the multi-view geometry, we aim to improve its\nconstruction both at pixel- and frame- levels. At the pixel level, we propose\nto break the symmetry of the Siamese network (which is typically used in MVS to\nextract image features) by introducing a transformer block to the reference\nimage (but not to the source images). Such an asymmetric volume allows the\nnetwork to extract global features from the reference image to predict its\ndepth map. Given potential inaccuracies in the poses between reference and\nsource images, we propose to incorporate a residual pose network to correct the\nrelative poses. This essentially rectifies the cost volume at the frame level.\nWe conduct extensive experiments on real-world MVS datasets and show that our\nmethod achieves state-of-the-art performance in terms of both within-dataset\nevaluation and cross-dataset generalization. Code available:\nhttps://github.com/oppo-us-research/riav-mvs.\n","authors":["Changjiang Cai","Pan Ji","Qingan Yan","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2205.14320v4.pdf","comment":"CVPR 2023. Code link added"},{"id":"http://arxiv.org/abs/2411.03561v1","updated":"2024-11-05T23:53:19Z","published":"2024-11-05T23:53:19Z","title":"Estimating Ego-Body Pose from Doubly Sparse Egocentric Video Data","summary":" We study the problem of estimating the body movements of a camera wearer from\negocentric videos. Current methods for ego-body pose estimation rely on\ntemporally dense sensor data, such as IMU measurements from spatially sparse\nbody parts like the head and hands. However, we propose that even temporally\nsparse observations, such as hand poses captured intermittently from egocentric\nvideos during natural or periodic hand movements, can effectively constrain\noverall body motion. Naively applying diffusion models to generate full-body\npose from head pose and sparse hand pose leads to suboptimal results. To\novercome this, we develop a two-stage approach that decomposes the problem into\ntemporal completion and spatial completion. First, our method employs masked\nautoencoders to impute hand trajectories by leveraging the spatiotemporal\ncorrelations between the head pose sequence and intermittent hand poses,\nproviding uncertainty estimates. Subsequently, we employ conditional diffusion\nmodels to generate plausible full-body motions based on these temporally dense\ntrajectories of the head and hands, guided by the uncertainty estimates from\nthe imputation. 
The effectiveness of our method was rigorously tested and\nvalidated through comprehensive experiments conducted on various HMD setup with\nAMASS and Ego-Exo4D datasets.\n","authors":["Seunggeun Chi","Pin-Hao Huang","Enna Sachdeva","Hengbo Ma","Karthik Ramani","Kwonjoon Lee"],"pdf_url":"https://arxiv.org/pdf/2411.03561v1.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.07832v5","updated":"2024-11-05T23:50:14Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets.\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Shyam Visweswaran","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03555v1","updated":"2024-11-05T23:28:57Z","published":"2024-11-05T23:28:57Z","title":"Object and Contact Point Tracking in Demonstrations Using 3D Gaussian\n Splatting","summary":" This paper introduces a method to enhance Interactive Imitation Learning\n(IIL) by extracting touch interaction points and tracking object movement from\nvideo demonstrations. The approach extends current IIL systems by providing\nrobots with detailed knowledge of both where and how to interact with objects,\nparticularly complex articulated ones like doors and drawers. By leveraging\ncutting-edge techniques such as 3D Gaussian Splatting and FoundationPose for\ntracking, this method allows robots to better understand and manipulate objects\nin dynamic environments. 
The research lays the foundation for more effective\ntask learning and execution in autonomous robotic systems.\n","authors":["Michael Büttner","Jonathan Francis","Helge Rhodin","Andrew Melnik"],"pdf_url":"https://arxiv.org/pdf/2411.03555v1.pdf","comment":"CoRL 2024, Workshop on Lifelong Learning for Home Robots, Munich,\n Germany"},{"id":"http://arxiv.org/abs/2411.03554v1","updated":"2024-11-05T23:26:10Z","published":"2024-11-05T23:26:10Z","title":"Benchmarking Vision Language Model Unlearning via Fictitious Facial\n Identity Dataset","summary":" Machine unlearning has emerged as an effective strategy for forgetting\nspecific information in the training data. However, with the increasing\nintegration of visual data, privacy concerns in Vision Language Models (VLMs)\nremain underexplored. To address this, we introduce Facial Identity Unlearning\nBenchmark (FIUBench), a novel VLM unlearning benchmark designed to robustly\nevaluate the effectiveness of unlearning algorithms under the Right to be\nForgotten setting. Specifically, we formulate the VLM unlearning task via\nconstructing the Fictitious Facial Identity VQA dataset and apply a two-stage\nevaluation pipeline that is designed to precisely control the sources of\ninformation and their exposure levels. In terms of evaluation, since VLM\nsupports various forms of ways to ask questions with the same semantic meaning,\nwe also provide robust evaluation metrics including membership inference\nattacks and carefully designed adversarial privacy attacks to evaluate the\nperformance of algorithms. Through the evaluation of four baseline VLM\nunlearning algorithms within FIUBench, we find that all methods remain limited\nin their unlearning performance, with significant trade-offs between model\nutility and forget quality. Furthermore, our findings also highlight the\nimportance of privacy attacks for robust evaluations. We hope FIUBench will\ndrive progress in developing more effective VLM unlearning algorithms.\n","authors":["Yingzi Ma","Jiongxiao Wang","Fei Wang","Siyuan Ma","Jiazhao Li","Xiujun Li","Furong Huang","Lichao Sun","Bo Li","Yejin Choi","Muhao Chen","Chaowei Xiao"],"pdf_url":"https://arxiv.org/pdf/2411.03554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03551v1","updated":"2024-11-05T23:11:26Z","published":"2024-11-05T23:11:26Z","title":"Enhancing Weakly Supervised Semantic Segmentation for Fibrosis via\n Controllable Image Generation","summary":" Fibrotic Lung Disease (FLD) is a severe condition marked by lung stiffening\nand scarring, leading to respiratory decline. High-resolution computed\ntomography (HRCT) is critical for diagnosing and monitoring FLD; however,\nfibrosis appears as irregular, diffuse patterns with unclear boundaries,\nleading to high inter-observer variability and time-intensive manual\nannotation. To tackle this challenge, we propose DiffSeg, a novel weakly\nsupervised semantic segmentation (WSSS) method that uses image-level\nannotations to generate pixel-level fibrosis segmentation, reducing the need\nfor fine-grained manual labeling. Additionally, our DiffSeg incorporates a\ndiffusion-based generative model to synthesize HRCT images with different\nlevels of fibrosis from healthy slices, enabling the generation of the\nfibrosis-injected slices and their paired fibrosis location. 
Experiments\nindicate that our method significantly improves the accuracy of pseudo masks\ngenerated by existing WSSS methods, greatly reducing the complexity of manual\nlabeling and enhancing the consistency of the generated masks.\n","authors":["Zhiling Yue","Yingying Fang","Liutao Yang","Nikhil Baid","Simon Walsh","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.03551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03531v1","updated":"2024-11-05T22:14:35Z","published":"2024-11-05T22:14:35Z","title":"Personalized Video Summarization by Multimodal Video Understanding","summary":" Video summarization techniques have been proven to improve the overall user\nexperience when it comes to accessing and comprehending video content. If the\nuser's preference is known, video summarization can identify significant\ninformation or relevant content from an input video, aiding them in obtaining\nthe necessary information or determining their interest in watching the\noriginal video. Adapting video summarization to various types of video and user\npreferences requires significant training data and expensive human labeling. To\nfacilitate such research, we proposed a new benchmark for video summarization\nthat captures various user preferences. Also, we present a pipeline called\nVideo Summarization with Language (VSL) for user-preferred video summarization\nthat is based on pre-trained visual language models (VLMs) to avoid the need to\ntrain a video summarization system on a large training dataset. The pipeline\ntakes both video and closed captioning as input and performs semantic analysis\nat the scene level by converting video frames into text. Subsequently, the\nuser's genre preference was used as the basis for selecting the pertinent\ntextual scenes. The experimental results demonstrate that our proposed pipeline\noutperforms current state-of-the-art unsupervised video summarization models.\nWe show that our method is more adaptable across different datasets compared to\nsupervised query-based video summarization models. In the end, the runtime\nanalysis demonstrates that our pipeline is more suitable for practical use when\nscaling up the number of user preferences and videos.\n","authors":["Brian Chen","Xiangyuan Zhao","Yingnan Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.03531v1.pdf","comment":"In Proceedings of CIKM 2024 Applied Research Track"},{"id":"http://arxiv.org/abs/2312.05269v3","updated":"2024-11-05T22:08:14Z","published":"2023-12-07T19:19:25Z","title":"LifelongMemory: Leveraging LLMs for Answering Queries in Long-form\n Egocentric Videos","summary":" In this paper we introduce LifelongMemory, a new framework for accessing\nlong-form egocentric videographic memory through natural language question\nanswering and retrieval. LifelongMemory generates concise video activity\ndescriptions of the camera wearer and leverages the zero-shot capabilities of\npretrained large language models to perform reasoning over long-form video\ncontext. Furthermore, LifelongMemory uses a confidence and explanation module\nto produce confident, high-quality, and interpretable answers. Our approach\nachieves state-of-the-art performance on the EgoSchema benchmark for question\nanswering and is highly competitive on the natural language query (NLQ)\nchallenge of Ego4D. 
Code is available at\nhttps://github.com/agentic-learning-ai-lab/lifelong-memory.\n","authors":["Ying Wang","Yanlai Yang","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2312.05269v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08798v2","updated":"2024-11-05T21:10:54Z","published":"2023-09-15T22:45:02Z","title":"D3: Data Diversity Design for Systematic Generalization in Visual\n Question Answering","summary":" Systematic generalization is a crucial aspect of intelligence, which refers\nto the ability to generalize to novel tasks by combining known subtasks and\nconcepts. One critical factor that has been shown to influence systematic\ngeneralization is the diversity of training data. However, diversity can be\ndefined in various ways, as data have many factors of variation. A more\ngranular understanding of how different aspects of data diversity affect\nsystematic generalization is lacking. We present new evidence in the problem of\nVisual Question Answering (VQA) that reveals that the diversity of simple tasks\n(i.e. tasks formed by a few subtasks and concepts) plays a key role in\nachieving systematic generalization. This implies that it may not be essential\nto gather a large and varied number of complex tasks, which could be costly to\nobtain. We demonstrate that this result is independent of the similarity\nbetween the training and testing data and applies to well-known families of\nneural network architectures for VQA (i.e. monolithic architectures and neural\nmodule networks). Additionally, we observe that neural module networks leverage\nall forms of data diversity we evaluated, while monolithic architectures\nrequire more extensive amounts of data to do so. These findings provide a first\nstep towards understanding the interactions between data diversity design,\nneural network architectures, and systematic generalization capabilities.\n","authors":["Amir Rahimi","Vanessa D'Amario","Moyuru Yamada","Kentaro Takemoto","Tomotake Sasaki","Xavier Boix"],"pdf_url":"https://arxiv.org/pdf/2309.08798v2.pdf","comment":"TMLR (https://openreview.net/forum?id=ZAin13msOp)"},{"id":"http://arxiv.org/abs/2411.03511v1","updated":"2024-11-05T21:08:19Z","published":"2024-11-05T21:08:19Z","title":"Beyond Complete Shapes: A quantitative Evaluation of 3D Shape Matching\n Algorithms","summary":" Finding correspondences between 3D shapes is an important and long-standing\nproblem in computer vision, graphics and beyond. While approaches based on\nmachine learning dominate modern 3D shape matching, almost all existing\n(learning-based) methods require that at least one of the involved shapes is\ncomplete. In contrast, the most challenging and arguably most practically\nrelevant setting of matching partially observed shapes, is currently\nunderexplored. One important factor is that existing datasets contain only a\nsmall number of shapes (typically below 100), which are unable to serve\ndata-hungry machine learning approaches, particularly in the unsupervised\nregime. In addition, the type of partiality present in existing datasets is\noften artificial and far from realistic. To address these limitations and to\nencourage research on these relevant settings, we provide a generic and\nflexible framework for the procedural generation of challenging partial shape\nmatching scenarios. Our framework allows for a virtually infinite generation of\npartial shape matching instances from a finite set of shapes with complete\ngeometry. 
Further, we manually create cross-dataset correspondences between\nseven existing (complete geometry) shape matching datasets, leading to a total\nof 2543 shapes. Based on this, we propose several challenging partial benchmark\nsettings, for which we evaluate respective state-of-the-art methods as\nbaselines.\n","authors":["Viktoria Ehm","Nafie El Amrani","Yizheng Xie","Lennart Bastian","Maolin Gao","Weikang Wang","Lu Sang","Dongliang Cao","Zorah Lähner","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2411.03511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04090v2","updated":"2024-11-05T20:51:06Z","published":"2024-06-06T14:01:28Z","title":"Interpretable Lightweight Transformer via Unrolling of Learned Graph\n Smoothness Priors","summary":" We build interpretable and lightweight transformer-like neural networks by\nunrolling iterative optimization algorithms that minimize graph smoothness\npriors -- the quadratic graph Laplacian regularizer (GLR) and the $\\ell_1$-norm\ngraph total variation (GTV) -- subject to an interpolation constraint. The\ncrucial insight is that a normalized signal-dependent graph learning module\namounts to a variant of the basic self-attention mechanism in conventional\ntransformers. Unlike \"black-box\" transformers that require learning of large\nkey, query and value matrices to compute scaled dot products as affinities and\nsubsequent output embeddings, resulting in huge parameter sets, our unrolled\nnetworks employ shallow CNNs to learn low-dimensional features per node to\nestablish pairwise Mahalanobis distances and construct sparse similarity\ngraphs. At each layer, given a learned graph, the target interpolated signal is\nsimply a low-pass filtered output derived from the minimization of an assumed\ngraph smoothness prior, leading to a dramatic reduction in parameter count.\nExperiments for two image interpolation applications verify the restoration\nperformance, parameter efficiency and robustness to covariate shift of our\ngraph-based unrolled networks compared to conventional transformers.\n","authors":["Tam Thuc Do","Parham Eftekhar","Seyed Alireza Hosseini","Gene Cheung","Philip Chou"],"pdf_url":"https://arxiv.org/pdf/2406.04090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03505v1","updated":"2024-11-05T20:42:23Z","published":"2024-11-05T20:42:23Z","title":"SynthSet: Generative Diffusion Model for Semantic Segmentation in\n Precision Agriculture","summary":" This paper introduces a methodology for generating synthetic annotated data\nto address data scarcity in semantic segmentation tasks within the precision\nagriculture domain. Utilizing Denoising Diffusion Probabilistic Models (DDPMs)\nand Generative Adversarial Networks (GANs), we propose a dual diffusion model\narchitecture for synthesizing realistic annotated agricultural data, without\nany human intervention. We employ super-resolution to enhance the phenotypic\ncharacteristics of the synthesized images and their coherence with the\ncorresponding generated masks. We showcase the utility of the proposed method\nfor wheat head segmentation. The high quality of synthesized data underscores\nthe effectiveness of the proposed methodology in generating image-mask pairs.\nFurthermore, models trained on our generated data exhibit promising performance\nwhen tested on an external, diverse dataset of real wheat fields. The results\nshow the efficacy of the proposed methodology for addressing data scarcity for\nsemantic segmentation tasks. 
Moreover, the proposed approach can be readily\nadapted for various segmentation tasks in precision agriculture and beyond.\n","authors":["Andrew Heschl","Mauricio Murillo","Keyhan Najafian","Farhad Maleki"],"pdf_url":"https://arxiv.org/pdf/2411.03505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03491v1","updated":"2024-11-05T20:16:15Z","published":"2024-11-05T20:16:15Z","title":"An Application-Agnostic Automatic Target Recognition System Using Vision\n Language Models","summary":" We present a novel Automatic Target Recognition (ATR) system using\nopen-vocabulary object detection and classification models. A primary advantage\nof this approach is that target classes can be defined just before runtime by a\nnon-technical end user, using either a few natural language text descriptions\nof the target, or a few image exemplars, or both. Nuances in the desired\ntargets can be expressed in natural language, which is useful for unique\ntargets with little or no training data. We also implemented a novel\ncombination of several techniques to improve performance, such as leveraging\nthe additional information in the sequence of overlapping frames to perform\ntubelet identification (i.e., sequential bounding box matching), bounding box\nre-scoring, and tubelet linking. Additionally, we developed a technique to\nvisualize the aggregate output of many overlapping frames as a mosaic of the\narea scanned during the aerial surveillance or reconnaissance, and a kernel\ndensity estimate (or heatmap) of the detected targets. We initially applied\nthis ATR system to the use case of detecting and clearing unexploded ordinance\non airfield runways and we are currently extending our research to other\nreal-world applications.\n","authors":["Anthony Palladino","Dana Gajewski","Abigail Aronica","Patryk Deptula","Alexander Hamme","Seiyoung C. Lee","Jeff Muri","Todd Nelling","Michael A. Riley","Brian Wong","Margaret Duff"],"pdf_url":"https://arxiv.org/pdf/2411.03491v1.pdf","comment":"Accepted to the Thirty-Seventh Annual Conference on Innovative\n Applications of Artificial Intelligence (IAAI-25)"},{"id":"http://arxiv.org/abs/2411.03480v1","updated":"2024-11-05T20:06:50Z","published":"2024-11-05T20:06:50Z","title":"Rainfall regression from C-band Synthetic Aperture Radar using\n Multi-Task Generative Adversarial Networks","summary":" This paper introduces a data-driven approach to estimate precipitation rates\nfrom Synthetic Aperture Radar (SAR) at a spatial resolution of 200 meters per\npixel. It addresses previous challenges related to the collocation of SAR and\nweather radar data, specifically the misalignment in collocations and the\nscarcity of rainfall examples under strong wind. To tackle these challenges,\nthe paper proposes a multi-objective formulation, introducing patch-level\ncomponents and an adversarial component. It exploits the full NEXRAD archive to\nlook for potential co-locations with Sentinel-1 data. 
With additional\nenhancements to the training procedure and the incorporation of additional\ninputs, the resulting model demonstrates improved accuracy in rainfall\nestimates and the ability to extend its performance to scenarios up to 15 m/s.\n","authors":["Aurélien Colin","Romain Husson"],"pdf_url":"https://arxiv.org/pdf/2411.03480v1.pdf","comment":"36 pages, 13 figures"},{"id":"http://arxiv.org/abs/2411.03475v1","updated":"2024-11-05T19:59:40Z","published":"2024-11-05T19:59:40Z","title":"Self Supervised Networks for Learning Latent Space Representations of\n Human Body Scans and Motions","summary":" This paper introduces self-supervised neural network models to tackle several\nfundamental problems in the field of 3D human body analysis and processing.\nFirst, we propose VariShaPE (Varifold Shape Parameter Estimator), a novel\narchitecture for the retrieval of latent space representations of body shapes\nand poses. This network offers a fast and robust method to estimate the\nembedding of arbitrary unregistered meshes into the latent space. Second, we\ncomplement the estimation of latent codes with MoGeN (Motion Geometry Network)\na framework that learns the geometry on the latent space itself. This is\nachieved by lifting the body pose parameter space into a higher dimensional\nEuclidean space in which body motion mini-sequences from a training set of 4D\ndata can be approximated by simple linear interpolation. Using the SMPL latent\nspace representation we illustrate how the combination of these network models,\nonce trained, can be used to perform a variety of tasks with very limited\ncomputational cost. This includes operations such as motion interpolation,\nextrapolation and transfer as well as random shape and pose generation.\n","authors":["Emmanuel Hartman","Nicolas Charon","Martin Bauer"],"pdf_url":"https://arxiv.org/pdf/2411.03475v1.pdf","comment":"23 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2401.08426v4","updated":"2024-11-05T19:57:19Z","published":"2024-01-16T15:11:29Z","title":"GD doesn't make the cut: Three ways that non-differentiability affects\n neural network training","summary":" This paper investigates the distinctions between gradient methods applied to\nnon-differentiable functions (NGDMs) and classical gradient descents (GDs)\ndesigned for differentiable functions. First, we demonstrate significant\ndifferences in the convergence properties of NGDMs compared to GDs, challenging\nthe applicability of the extensive neural network convergence literature based\non $L-smoothness$ to non-smooth neural networks. Next, we demonstrate the\nparadoxical nature of NGDM solutions for $L_{1}$-regularized problems, showing\nthat increasing the regularization penalty leads to an increase in the $L_{1}$\nnorm of optimal solutions in NGDMs. Consequently, we show that widely adopted\n$L_{1}$ penalization-based techniques for network pruning do not yield expected\nresults. Additionally, we dispel the common belief that optimization algorithms\nlike Adam and RMSProp perform similarly in non-differentiable contexts.\nFinally, we explore the Edge of Stability phenomenon, indicating its\ninapplicability even to Lipschitz continuous convex differentiable functions,\nleaving its relevance to non-convex non-differentiable neural networks\ninconclusive. 
Our analysis exposes misguided interpretations of NGDMs in widely\nreferenced papers and texts due to an overreliance on strong smoothness\nassumptions, emphasizing the necessity for a nuanced understanding of\nfoundational assumptions in the analysis of these systems.\n","authors":["Siddharth Krishna Kumar"],"pdf_url":"https://arxiv.org/pdf/2401.08426v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07079v2","updated":"2024-11-05T19:44:03Z","published":"2024-08-07T14:04:50Z","title":"Anatomical Foundation Models for Brain MRIs","summary":" Deep Learning (DL) in neuroimaging has become increasingly relevant for\ndetecting neurological conditions and neurodegenerative disorders. One of the\nmost predominant biomarkers in neuroimaging is represented by brain age, which\nhas been shown to be a good indicator for different conditions, such as\nAlzheimer's Disease. Using brain age for pretraining DL models in transfer\nlearning settings has also recently shown promising results, especially when\ndealing with data scarcity of different conditions. On the other hand,\nanatomical information of brain MRIs (e.g. cortical thickness) can provide\nimportant information for learning good representations that can be transferred\nto many downstream tasks. In this work, we propose AnatCL, an anatomical\nfoundation model for brain MRIs that i.) leverages anatomical information with\na weakly contrastive learning approach and ii.) achieves state-of-the-art\nperformances in many different downstream tasks. To validate our approach we\nconsider 12 different downstream tasks for diagnosis classification, and\nprediction of 10 different clinical assessment scores. Pretrained models can be\nfound at https://github.com/EIDOSLAB/AnatCL.\n","authors":["Carlo Alberto Barbano","Matteo Brunello","Benoit Dufumier","Marco Grangetto"],"pdf_url":"https://arxiv.org/pdf/2408.07079v2.pdf","comment":"12 pages; added source url"},{"id":"http://arxiv.org/abs/2411.03464v1","updated":"2024-11-05T19:35:10Z","published":"2024-11-05T19:35:10Z","title":"TopoTxR: A topology-guided deep convolutional network for breast\n parenchyma learning on DCE-MRIs","summary":" Characterization of breast parenchyma in dynamic contrast-enhanced magnetic\nresonance imaging (DCE-MRI) is a challenging task owing to the complexity of\nunderlying tissue structures. Existing quantitative approaches, like radiomics\nand deep learning models, lack explicit quantification of intricate and subtle\nparenchymal structures, including fibroglandular tissue. To address this, we\npropose a novel topological approach that explicitly extracts multi-scale\ntopological structures to better approximate breast parenchymal structures, and\nthen incorporates these structures into a deep-learning-based prediction model\nvia an attention mechanism. Our topology-informed deep learning model,\n\\emph{TopoTxR}, leverages topology to provide enhanced insights into tissues\ncritical for disease pathophysiology and treatment response. We empirically\nvalidate \\emph{TopoTxR} using the VICTRE phantom breast dataset, showing that\nthe topological structures extracted by our model effectively approximate the\nbreast parenchymal structures. We further demonstrate \\emph{TopoTxR}'s efficacy\nin predicting response to neoadjuvant chemotherapy. 
Our qualitative and\nquantitative analyses suggest differential topological behavior of breast\ntissue in treatment-na\\\"ive imaging, in patients who respond favorably to\ntherapy as achieving pathological complete response (pCR) versus those who do\nnot. In a comparative analysis with several baselines on the publicly available\nI-SPY 1 dataset (N=161, including 47 patients with pCR and 114 without) and the\nRutgers proprietary dataset (N=120, with 69 patients achieving pCR and 51 not),\n\\emph{TopoTxR} demonstrates a notable improvement, achieving a 2.6\\% increase\nin accuracy and a 4.6\\% enhancement in AUC compared to the state-of-the-art\nmethod.\n","authors":["Fan Wang","Zhilin Zou","Nicole Sakla","Luke Partyka","Nil Rawal","Gagandeep Singh","Wei Zhao","Haibin Ling","Chuan Huang","Prateek Prasanna","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.03464v1.pdf","comment":"22 pages, 8 figures, 8 tables, accepted by Medical Image Analysis (\n https://www.sciencedirect.com/science/article/abs/pii/S1361841524002986 )"},{"id":"http://arxiv.org/abs/2411.03456v1","updated":"2024-11-05T19:17:38Z","published":"2024-11-05T19:17:38Z","title":"BOston Neonatal Brain Injury Data for Hypoxic Ischemic Encephalopathy\n (BONBID-HIE): II. 2-year Neurocognitive Outcome and NICU Outcome","summary":" Hypoxic Ischemic Encephalopathy (HIE) affects approximately 1-5/1000 newborns\nglobally and leads to adverse neurocognitive outcomes in 30% to 50% of cases by\ntwo years of age. Despite therapeutic advances with Therapeutic Hypothermia\n(TH), prognosis remains challenging, highlighting the need for improved\nbiomarkers. This paper introduces the second release of the Boston Neonatal\nBrain Injury Dataset for Hypoxic-Ischemic Encephalopathy (BONBID-HIE), an\nopen-source, comprehensive MRI and clinical dataset featuring 237 patients,\nincluding NICU outcomes and 2-year neurocognitive outcomes from Massachusetts\nGeneral Hospital and Boston Children's Hospital.\n","authors":["Rina Bao","Yangming Ou"],"pdf_url":"https://arxiv.org/pdf/2411.03456v1.pdf","comment":"Data description for BONBID-HIE 2024 Challenge on MICCAI 2024"},{"id":"http://arxiv.org/abs/2411.02229v2","updated":"2024-11-05T19:06:16Z","published":"2024-11-04T16:21:00Z","title":"FewViewGS: Gaussian Splatting with Few View Matching and Multi-stage\n Training","summary":" The field of novel view synthesis from images has seen rapid advancements\nwith the introduction of Neural Radiance Fields (NeRF) and more recently with\n3D Gaussian Splatting. Gaussian Splatting became widely adopted due to its\nefficiency and ability to render novel views accurately. While Gaussian\nSplatting performs well when a sufficient amount of training images are\navailable, its unstructured explicit representation tends to overfit in\nscenarios with sparse input images, resulting in poor rendering performance. To\naddress this, we present a 3D Gaussian-based novel view synthesis method using\nsparse input images that can accurately render the scene from the viewpoints\nnot covered by the training images. We propose a multi-stage training scheme\nwith matching-based consistency constraints imposed on the novel views without\nrelying on pre-trained depth estimation or diffusion models. This is achieved\nby using the matches of the available training images to supervise the\ngeneration of the novel views sampled between the training frames with color,\ngeometry, and semantic losses. 
In addition, we introduce a locality preserving\nregularization for 3D Gaussians which removes rendering artifacts by preserving\nthe local color structure of the scene. Evaluation on synthetic and real-world\ndatasets demonstrates competitive or superior performance of our method in\nfew-shot novel view synthesis compared to existing state-of-the-art methods.\n","authors":["Ruihong Yin","Vladimir Yugay","Yue Li","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2411.02229v2.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2411.03445v1","updated":"2024-11-05T19:00:34Z","published":"2024-11-05T19:00:34Z","title":"Solving Trojan Detection Competitions with Linear Weight Classification","summary":" Neural networks can conceal malicious Trojan backdoors that allow a trigger\nto covertly change the model behavior. Detecting signs of these backdoors,\nparticularly without access to any triggered data, is the subject of ongoing\nresearch and open challenges. In one common formulation of the problem, we are\ngiven a set of clean and poisoned models and need to predict whether a given\ntest model is clean or poisoned. In this paper, we introduce a detector that\nworks remarkably well across many of the existing datasets and domains. It is\nobtained by training a binary classifier on a large number of models' weights\nafter performing a few different pre-processing steps including feature\nselection and standardization, reference model weights subtraction, and model\nalignment prior to detection. We evaluate this algorithm on a diverse set of\nTrojan detection benchmarks and domains and examine the cases where the\napproach is most and least effective.\n","authors":["Todd Huster","Peter Lin","Razvan Stefanescu","Emmanuel Ekwedike","Ritu Chadha"],"pdf_url":"https://arxiv.org/pdf/2411.03445v1.pdf","comment":"9 pages, 4 Figures"},{"id":"http://arxiv.org/abs/2411.03405v1","updated":"2024-11-05T18:39:25Z","published":"2024-11-05T18:39:25Z","title":"Fine-Grained Spatial and Verbal Losses for 3D Visual Grounding","summary":" 3D visual grounding consists of identifying the instance in a 3D scene which\nis referred by an accompanying language description. While several\narchitectures have been proposed within the commonly employed\ngrounding-by-selection framework, the utilized losses are comparatively\nunder-explored. In particular, most methods rely on a basic supervised\ncross-entropy loss on the predicted distribution over candidate instances,\nwhich fails to model both spatial relations between instances and the internal\nfine-grained word-level structure of the verbal referral. Sparse attempts to\nadditionally supervise verbal embeddings globally by learning the class of the\nreferred instance from the description or employing verbo-visual contrast to\nbetter separate instance embeddings do not fundamentally lift the\naforementioned limitations. Responding to these shortcomings, we introduce two\nnovel losses for 3D visual grounding: a visual-level offset loss on regressed\nvector offsets from each instance to the ground-truth referred instance and a\nlanguage-related span loss on predictions for the word-level span of the\nreferred instance in the description. 
In addition, we equip the verbo-visual\nfusion module of our new 3D visual grounding architecture AsphaltNet with a\ntop-down bidirectional attentive fusion block, which enables the supervisory\nsignals from our two losses to propagate to the respective converse branches of\nthe network and thus aid the latter to learn context-aware instance embeddings\nand grounding-aware verbal embeddings. AsphaltNet proposes novel auxiliary\nlosses to aid 3D visual grounding with competitive results compared to the\nstate-of-the-art on the ReferIt3D benchmark.\n","authors":["Sombit Dey","Ozan Unal","Christos Sakaridis","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2411.03405v1.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2411.03403v1","updated":"2024-11-05T18:38:42Z","published":"2024-11-05T18:38:42Z","title":"Enhancing Maritime Situational Awareness through End-to-End Onboard Raw\n Data Analysis","summary":" Satellite-based onboard data processing is crucial for time-sensitive\napplications requiring timely and efficient rapid response. Advances in edge\nartificial intelligence are shifting computational power from ground-based\ncenters to on-orbit platforms, transforming the\n\"sensing-communication-decision-feedback\" cycle and reducing latency from\nacquisition to delivery. The current research presents a framework addressing\nthe strict bandwidth, energy, and latency constraints of small satellites,\nfocusing on maritime monitoring. The study contributes three main innovations.\nFirstly, it investigates the application of deep learning techniques for direct\nship detection and classification from raw satellite imagery. By simplifying\nthe onboard processing chain, our approach facilitates direct analyses without\nrequiring computationally intensive steps such as calibration and\northo-rectification. Secondly, to address the scarcity of raw satellite data,\nwe introduce two novel datasets, VDS2Raw and VDV2Raw, which are derived from\nraw data from Sentinel-2 and Vegetation and Environment Monitoring New Micro\nSatellite (VENuS) missions, respectively, and enriched with Automatic\nIdentification System (AIS) records. Thirdly, we characterize the tasks'\noptimal single and multiple spectral band combinations through statistical and\nfeature-based analyses validated on both datasets. In sum, we demonstrate the\nfeasibility of the proposed method through a proof-of-concept on CubeSat-like\nhardware, confirming the models' potential for operational satellite-based\nmaritime monitoring.\n","authors":["Roberto Del Prete","Manuel Salvoldi","Domenico Barretta","Nicolas Longépé","Gabriele Meoni","Arnon Karnieli","Maria Daniela Graziano","Alfredo Renga"],"pdf_url":"https://arxiv.org/pdf/2411.03403v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2407.15668v2","updated":"2024-11-05T18:38:08Z","published":"2024-07-22T14:29:36Z","title":"SLVideo: A Sign Language Video Moment Retrieval Framework","summary":" SLVideo is a video moment retrieval system for Sign Language videos that\nincorporates facial expressions, addressing this gap in existing technology.\nThe system extracts embedding representations for the hand and face signs from\nvideo frames to capture the signs in their entirety, enabling users to search\nfor a specific sign language video segment with text queries. A collection of\neight hours of annotated Portuguese Sign Language videos is used as the\ndataset, and a CLIP model is used to generate the embeddings. The initial\nresults are promising in a zero-shot setting. 
In addition, SLVideo incorporates\na thesaurus that enables users to search for similar signs to those retrieved,\nusing the video segment embeddings, and also supports the edition and creation\nof video sign language annotations. Project web page:\nhttps://novasearch.github.io/SLVideo/\n","authors":["Gonçalo Vinagre Martins","João Magalhães","Afonso Quinaz","Carla Viegas","Sofia Cavaco"],"pdf_url":"https://arxiv.org/pdf/2407.15668v2.pdf","comment":"4 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2405.17705v3","updated":"2024-11-05T18:02:53Z","published":"2024-05-27T23:38:10Z","title":"DC-Gaussian: Improving 3D Gaussian Splatting for Reflective Dash Cam\n Videos","summary":" We present DC-Gaussian, a new method for generating novel views from\nin-vehicle dash cam videos. While neural rendering techniques have made\nsignificant strides in driving scenarios, existing methods are primarily\ndesigned for videos collected by autonomous vehicles. However, these videos are\nlimited in both quantity and diversity compared to dash cam videos, which are\nmore widely used across various types of vehicles and capture a broader range\nof scenarios. Dash cam videos often suffer from severe obstructions such as\nreflections and occlusions on the windshields, which significantly impede the\napplication of neural rendering techniques. To address this challenge, we\ndevelop DC-Gaussian based on the recent real-time neural rendering technique 3D\nGaussian Splatting (3DGS). Our approach includes an adaptive image\ndecomposition module to model reflections and occlusions in a unified manner.\nAdditionally, we introduce illumination-aware obstruction modeling to manage\nreflections and occlusions under varying lighting conditions. Lastly, we employ\na geometry-guided Gaussian enhancement strategy to improve rendering details by\nincorporating additional geometry priors. Experiments on self-captured and\npublic dash cam videos show that our method not only achieves state-of-the-art\nperformance in novel view synthesis, but also accurately reconstructing\ncaptured scenes getting rid of obstructions. See the project page for code,\ndata: https://linhanwang.github.io/dcgaussian/.\n","authors":["Linhan Wang","Kai Cheng","Shuo Lei","Shengkun Wang","Wei Yin","Chenyang Lei","Xiaoxiao Long","Chang-Tien Lu"],"pdf_url":"https://arxiv.org/pdf/2405.17705v3.pdf","comment":"10 pages,7 figures;project page:\n https://linhanwang.github.io/dcgaussian/; Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.15199v2","updated":"2024-11-05T16:40:01Z","published":"2024-05-24T04:10:34Z","title":"ODGEN: Domain-specific Object Detection Data Generation with Diffusion\n Models","summary":" Modern diffusion-based image generative models have made significant progress\nand become promising to enrich training data for the object detection task.\nHowever, the generation quality and the controllability for complex scenes\ncontaining multi-class objects and dense objects with occlusions remain\nlimited. This paper presents ODGEN, a novel method to generate high-quality\nimages conditioned on bounding boxes, thereby facilitating data synthesis for\nobject detection. Given a domain-specific object detection dataset, we first\nfine-tune a pre-trained diffusion model on both cropped foreground objects and\nentire images to fit target distributions. Then we propose to control the\ndiffusion model using synthesized visual prompts with spatial constraints and\nobject-wise textual descriptions. 
ODGEN exhibits robustness in handling complex\nscenes and specific domains. Further, we design a dataset synthesis pipeline to\nevaluate ODGEN on 7 domain-specific benchmarks to demonstrate its\neffectiveness. Adding training data generated by ODGEN improves up to 25.3%\nmAP@.50:.95 with object detectors like YOLOv5 and YOLOv7, outperforming prior\ncontrollable generative methods. In addition, we design an evaluation protocol\nbased on COCO-2014 to validate ODGEN in general domains and observe an\nadvantage up to 5.6% in mAP@.50:.95 against existing methods.\n","authors":["Jingyuan Zhu","Shiyu Li","Yuxuan Liu","Ping Huang","Jiulong Shan","Huimin Ma","Jian Yuan"],"pdf_url":"https://arxiv.org/pdf/2405.15199v2.pdf","comment":"Accepted by NeurIPS2024"}]},"2024-11-06T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.04112v1","updated":"2024-11-06T18:44:09Z","published":"2024-11-06T18:44:09Z","title":"Fed-EC: Bandwidth-Efficient Clustering-Based Federated Learning For\n Autonomous Visual Robot Navigation","summary":" Centralized learning requires data to be aggregated at a central server,\nwhich poses significant challenges in terms of data privacy and bandwidth\nconsumption. Federated learning presents a compelling alternative, however,\nvanilla federated learning methods deployed in robotics aim to learn a single\nglobal model across robots that works ideally for all. But in practice one\nmodel may not be well suited for robots deployed in various environments. This\npaper proposes Federated-EmbedCluster (Fed-EC), a clustering-based federated\nlearning framework that is deployed with vision based autonomous robot\nnavigation in diverse outdoor environments. The framework addresses the key\nfederated learning challenge of deteriorating model performance of a single\nglobal model due to the presence of non-IID data across real-world robots.\nExtensive real-world experiments validate that Fed-EC reduces the communication\nsize by 23x for each robot while matching the performance of centralized\nlearning for goal-oriented navigation and outperforms local learning. Fed-EC\ncan transfer previously learnt models to new robots that join the cluster.\n","authors":["Shreya Gummadi","Mateus V. Gasparino","Deepak Vasisht","Girish Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2411.04112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03294v2","updated":"2024-11-06T17:53:26Z","published":"2024-11-05T17:41:14Z","title":"Out-of-Distribution Recovery with Object-Centric Keypoint Inverse Policy\n For Visuomotor Imitation Learning","summary":" We propose an object-centric recovery policy framework to address the\nchallenges of out-of-distribution (OOD) scenarios in visuomotor policy\nlearning. Previous behavior cloning (BC) methods rely heavily on a large amount\nof labeled data coverage, failing in unfamiliar spatial states. Without relying\non extra data collection, our approach learns a recovery policy constructed by\nan inverse policy inferred from object keypoint manifold gradient in the\noriginal training data. The recovery policy serves as a simple add-on to any\nbase visuomotor BC policy, agnostic to a specific method, guiding the system\nback towards the training distribution to ensure task success even in OOD\nsituations. We demonstrate the effectiveness of our object-centric framework in\nboth simulation and real robot experiments, achieving an improvement of 77.7%\nover the base policy in OOD. 
Project Website:\nhttps://sites.google.com/view/ocr-penn\n","authors":["George Jiayuan Gao","Tianyu Li","Nadia Figueroa"],"pdf_url":"https://arxiv.org/pdf/2411.03294v2.pdf","comment":"Accepted for Spotlight (5 out of 21 papers) at CoRL 2024 Workshop on\n Lifelong Learning for Home Robots"},{"id":"http://arxiv.org/abs/2411.04073v1","updated":"2024-11-06T17:50:32Z","published":"2024-11-06T17:50:32Z","title":"Rescheduling after vehicle failures in the multi-depot rural postman\n problem with rechargeable and reusable vehicles","summary":" We present a centralized auction algorithm to solve the Multi-Depot Rural\nPostman Problem with Rechargeable and Reusable Vehicles (MD-RPP-RRV), focusing\non rescheduling arc routing after vehicle failures. The problem involves\nfinding heuristically obtained best feasible routes for multiple rechargeable\nand reusable vehicles with capacity constraints capable of performing multiple\ntrips from multiple depots, with the possibility of vehicle failures. Our\nalgorithm auctions the failed trips to active (non-failed) vehicles through\nlocal auctioning, modifying initial routes to handle dynamic vehicle failures\nefficiently. When a failure occurs, the algorithm searches for the best active\nvehicle to perform the failed trip and inserts the trip into that vehicle's\nroute, which avoids a complete rescheduling and reduces the computational\neffort. We compare the algorithm's solutions against offline optimal solutions\nobtained from solving a Mixed Integer Linear Programming (MILP) formulation\nusing the Gurobi solver; this formulation assumes that perfect information\nabout the vehicle failures and failure times is given. The results demonstrate\nthat the centralized auction algorithm produces solutions that are, in some\ncases, near optimal; moreover, the execution time for the proposed approach is\nmuch more consistent and is, for some instances, orders of magnitude less than\nthe execution time of the Gurobi solver. The theoretical analysis provides an\nupper bound for the competitive ratio and computational complexity of our\nalgorithm, offering a formal performance guarantee in dynamic failure\nscenarios.\n","authors":["Eashwar Sathyamurthy","Jeffrey W. Herrmann","Shapour Azarm"],"pdf_url":"https://arxiv.org/pdf/2411.04073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04056v1","updated":"2024-11-06T17:05:58Z","published":"2024-11-06T17:05:58Z","title":"Problem Space Transformations for Generalisation in Behavioural Cloning","summary":" The combination of behavioural cloning and neural networks has driven\nsignificant progress in robotic manipulation. As these algorithms may require a\nlarge number of demonstrations for each task of interest, they remain\nfundamentally inefficient in complex scenarios. This issue is aggravated when\nthe system is treated as a black-box, ignoring its physical properties. This\nwork characterises widespread properties of robotic manipulation, such as pose\nequivariance and locality. 
We empirically demonstrate that transformations\narising from each of these properties allow neural policies trained with\nbehavioural cloning to better generalise to out-of-distribution problem\ninstances.\n","authors":["Kiran Doshi","Marco Bagatella","Stelian Coros"],"pdf_url":"https://arxiv.org/pdf/2411.04056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08774v3","updated":"2024-11-06T16:57:36Z","published":"2024-02-13T20:16:31Z","title":"LDTrack: Dynamic People Tracking by Service Robots using Diffusion\n Models","summary":" Tracking of dynamic people in cluttered and crowded human-centered\nenvironments is a challenging robotics problem due to the presence of\nintraclass variations including occlusions, pose deformations, and lighting\nvariations. This paper introduces a novel deep learning architecture, using\nconditional latent diffusion models, the Latent Diffusion Track (LDTrack), for\ntracking multiple dynamic people under intraclass variations. By uniquely\nutilizing conditional latent diffusion models to capture temporal person\nembeddings, our architecture can adapt to appearance changes of people over\ntime. We incorporated a latent feature encoder network which enables the\ndiffusion process to operate within a high-dimensional latent space to allow\nfor the extraction and spatial-temporal refinement of such rich features as\nperson appearance, motion, location, identity, and contextual information.\nExtensive experiments demonstrate the effectiveness of LDTrack over other\nstate-of-the-art tracking methods in cluttered and crowded human-centered\nenvironments under intraclass variations. Namely, the results show our method\noutperforms existing deep learning robotic people tracking methods in both\ntracking accuracy and tracking precision with statistical significance.\nAdditionally, a comprehensive multi-object tracking comparison study was\nperformed against the state-of-the-art methods in urban environments,\ndemonstrating the generalizability of LDTrack. An ablation study was performed\nto validate the design choices of LDTrack.\n","authors":["Angus Fung","Beno Benhabib","Goldie Nejat"],"pdf_url":"https://arxiv.org/pdf/2402.08774v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04050v1","updated":"2024-11-06T16:57:36Z","published":"2024-11-06T16:57:36Z","title":"Memorized action chunking with Transformers: Imitation learning for\n vision-based tissue surface scanning","summary":" Optical sensing technologies are emerging technologies used in cancer\nsurgeries to ensure the complete removal of cancerous tissue. While point-wise\nassessment has many potential applications, incorporating automated large area\nscanning would enable holistic tissue sampling. However, such scanning tasks\nare challenging due to their long-horizon dependency and the requirement for\nfine-grained motion. To address these issues, we introduce Memorized Action\nChunking with Transformers (MACT), an intuitive yet efficient imitation\nlearning method for tissue surface scanning tasks. It utilizes a sequence of\npast images as historical information to predict near-future action sequences.\nIn addition, hybrid temporal-spatial positional embeddings were employed to\nfacilitate learning. In various simulation settings, MACT demonstrated\nsignificant improvements in contour scanning and area scanning over the\nbaseline model. In real-world testing, with only 50 demonstration trajectories,\nMACT surpassed the baseline model by achieving a 60-80% success rate on all\nscanning tasks. 
Our findings suggest that MACT is a promising model for\nadaptive scanning in surgical settings.\n","authors":["Bochen Yang","Kaizhong Deng","Christopher J Peters","George Mylonas","Daniel S. Elson"],"pdf_url":"https://arxiv.org/pdf/2411.04050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22997v2","updated":"2024-11-06T16:57:03Z","published":"2024-10-30T13:22:55Z","title":"A Comparison of Prompt Engineering Techniques for Task Planning and\n Execution in Service Robotics","summary":" Recent advances in LLM have been instrumental in autonomous robot control and\nhuman-robot interaction by leveraging their vast general knowledge and\ncapabilities to understand and reason across a wide range of tasks and\nscenarios. Previous works have investigated various prompt engineering\ntechniques for improving the performance of LLM to accomplish tasks, while\nothers have proposed methods that utilize LLMs to plan and execute tasks based\non the available functionalities of a given robot platform. In this work, we\nconsider both lines of research by comparing prompt engineering techniques and\ncombinations thereof within the application of high-level task planning and\nexecution in service robotics. We define a diverse set of tasks and a simple\nset of functionalities in simulation, and measure task completion accuracy and\nexecution time for several state-of-the-art models.\n","authors":["Jonas Bode","Bastian Pätzold","Raphael Memmesheimer","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2410.22997v2.pdf","comment":"6 pages, 3 figures, 2 tables, to be published in the 2024 IEEE-RAS\n International Conference on Humanoid Robots, We make our code, including all\n prompts, available at https://github.com/AIS-Bonn/Prompt_Engineering"},{"id":"http://arxiv.org/abs/2411.04046v1","updated":"2024-11-06T16:51:30Z","published":"2024-11-06T16:51:30Z","title":"Design and control of a robotic payload stabilization mechanism for\n rocket flights","summary":" The use of parallel manipulators in aerospace engineering has gained\nsignificant attention due to their ability to provide improved stability and\nprecision. This paper presents the design, control, and analysis of 'STEWIE',\nwhich is a three-degree-of-freedom (DoF) parallel manipulator robot developed\nby members of the thrustMIT rocketry team, as a payload stabilization mechanism\nfor their sounding rocket, 'Altair'. The goal of the robot was to demonstrate\nthe attitude control of the parallel plate against the continuous change in\norientation experienced by the rocket during its flight, stabilizing the\npayloads. At the same time, the high gravitational forces (G-forces) and\nvibrations experienced by the sounding rocket are counteracted. A novel design\nof the mechanism, inspired by a standard Stewart platform, is proposed which\nwas down-scaled to fit inside a 4U CubeSat within its space constraints. The\nrobot uses three micro servo motors to actuate the links that control the\nalignment of the parallel plate. In addition to the actuation mechanism, a\nrobust control system for its manipulation was developed for the robot. The\nrobot represents a significant advancement in the field of space robotics in\nthe aerospace industry by demonstrating the successful implementation of\ncomplex robotic mechanisms in small, confined spaces such as CubeSats, which\nare standard form factors for large payloads in the aerospace industry.\n","authors":["Utkarsh Anand","Diya Parekh","Thakur Pranav G. Singh","Hrishikesh S. Yadav","Ramya S. 
Moorthy","Srinivas G"],"pdf_url":"https://arxiv.org/pdf/2411.04046v1.pdf","comment":"For code and design files, refer to\n https://github.com/utkarshanand140/Stewie-Robot"},{"id":"http://arxiv.org/abs/2402.02989v3","updated":"2024-11-06T16:33:29Z","published":"2024-02-05T13:27:41Z","title":"DexDiffuser: Generating Dexterous Grasps with Diffusion Models","summary":" We introduce DexDiffuser, a novel dexterous grasping method that generates,\nevaluates, and refines grasps on partial object point clouds. DexDiffuser\nincludes the conditional diffusion-based grasp sampler DexSampler and the\ndexterous grasp evaluator DexEvaluator. DexSampler generates high-quality\ngrasps conditioned on object point clouds by iterative denoising of randomly\nsampled grasps. We also introduce two grasp refinement strategies:\nEvaluator-Guided Diffusion (EGD) and Evaluator-based Sampling Refinement (ESR).\nThe experiment results demonstrate that DexDiffuser consistently outperforms\nthe state-of-the-art multi-finger grasp generation method FFHNet with an, on\naverage, 9.12% and 19.44% higher grasp success rate in simulation and real\nrobot experiments, respectively. Supplementary materials are available at\nhttps://yulihn.github.io/DexDiffuser_page/\n","authors":["Zehang Weng","Haofei Lu","Danica Kragic","Jens Lundell"],"pdf_url":"https://arxiv.org/pdf/2402.02989v3.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2301.06668v3","updated":"2024-11-06T16:23:12Z","published":"2023-01-17T02:39:22Z","title":"UMIRobot: An Open-{Software, Hardware} Low-Cost Robotic Manipulator for\n Education","summary":" Robot teleoperation has been studied for the past 70 years and is relevant in\nmany contexts, such as in the handling of hazardous materials and telesurgery.\nThe COVID19 pandemic has rekindled interest in this topic, but the existing\nrobotic education kits fall short of being suitable for teleoperated robotic\nmanipulator learning. In addition, the global restrictions of motion motivated\nlarge investments in online/hybrid education. In this work, a newly developed\nrobotics education kit and its ecosystem are presented which is used as the\nbackbone of an online/hybrid course in teleoperated robots. The students are\ndivided into teams. Each team designs, fabricates (3D printing and assembling),\nand implements a control strategy for a master device and gripper. Coupling\nthose with the UMIRobot, provided as a kit, the students compete in a\nteleoperation challenge. The kit is low cost (< 100USD), which allows\nhigher-learning institutions to provide one kit per student and they can learn\nin a risk-free environment. As of now, 73 such kits have been assembled and\nsent to course participants in eight countries. As major success stories, we\nshow an example of gripper and master designed for the proposed course. In\naddition, we show a teleoperated task between Japan and Bangladesh executed by\ncourse participants. Design files, videos, source code, and more information\nare available at https://mmmarinho.github.io/UMIRobot/\n","authors":["Murilo M. Marinho","Hung-Ching Lin","Jiawei Zhao"],"pdf_url":"https://arxiv.org/pdf/2301.06668v3.pdf","comment":"Accepted on IROS 2023, 8 pages. 
Fixed a few typos"},{"id":"http://arxiv.org/abs/2411.04006v1","updated":"2024-11-06T15:44:59Z","published":"2024-11-06T15:44:59Z","title":"Select2Plan: Training-Free ICL-Based Planning through VQA and Memory\n Retrieval","summary":" This study explores the potential of off-the-shelf Vision-Language Models\n(VLMs) for high-level robot planning in the context of autonomous navigation.\nIndeed, while most of existing learning-based approaches for path planning\nrequire extensive task-specific training/fine-tuning, we demonstrate how such\ntraining can be avoided for most practical cases. To do this, we introduce\nSelect2Plan (S2P), a novel training-free framework for high-level robot\nplanning which completely eliminates the need for fine-tuning or specialised\ntraining. By leveraging structured Visual Question-Answering (VQA) and\nIn-Context Learning (ICL), our approach drastically reduces the need for data\ncollection, requiring a fraction of the task-specific data typically used by\ntrained models, or even relying only on online data. Our method facilitates the\neffective use of a generally trained VLM in a flexible and cost-efficient way,\nand does not require additional sensing except for a simple monocular camera.\nWe demonstrate its adaptability across various scene types, context sources,\nand sensing setups. We evaluate our approach in two distinct scenarios:\ntraditional First-Person View (FPV) and infrastructure-driven Third-Person View\n(TPV) navigation, demonstrating the flexibility and simplicity of our method.\nOur technique significantly enhances the navigational capabilities of a\nbaseline VLM of approximately 50% in TPV scenario, and is comparable to trained\nmodels in the FPV one, with as few as 20 demonstrations.\n","authors":["Davide Buoso","Luke Robinson","Giuseppe Averta","Philip Torr","Tim Franzmeyer","Daniele De Martini"],"pdf_url":"https://arxiv.org/pdf/2411.04006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04005v1","updated":"2024-11-06T15:44:10Z","published":"2024-11-06T15:44:10Z","title":"Object-Centric Dexterous Manipulation from Human Motion Data","summary":" Manipulating objects to achieve desired goal states is a basic but important\nskill for dexterous manipulation. Human hand motions demonstrate proficient\nmanipulation capability, providing valuable data for training robots with\nmulti-finger hands. Despite this potential, substantial challenges arise due to\nthe embodiment gap between human and robot hands. In this work, we introduce a\nhierarchical policy learning framework that uses human hand motion data for\ntraining object-centric dexterous robot manipulation. At the core of our method\nis a high-level trajectory generative model, learned with a large-scale human\nhand motion capture dataset, to synthesize human-like wrist motions conditioned\non the desired object goal states. Guided by the generated wrist motions, deep\nreinforcement learning is further used to train a low-level finger controller\nthat is grounded in the robot's embodiment to physically interact with the\nobject to achieve the goal. Through extensive evaluation across 10 household\nobjects, our approach not only demonstrates superior performance but also\nshowcases generalization capability to novel object geometries and goal states.\nFurthermore, we transfer the learned policies from simulation to a real-world\nbimanual dexterous robot system, further demonstrating its applicability in\nreal-world scenarios. 
Project website:\nhttps://cypypccpy.github.io/obj-dex.github.io/.\n","authors":["Yuanpei Chen","Chen Wang","Yaodong Yang","C. Karen Liu"],"pdf_url":"https://arxiv.org/pdf/2411.04005v1.pdf","comment":"20 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.03990v1","updated":"2024-11-06T15:30:42Z","published":"2024-11-06T15:30:42Z","title":"ET-SEED: Efficient Trajectory-Level SE(3) Equivariant Diffusion Policy","summary":" Imitation learning, e.g., diffusion policy, has been proven effective in\nvarious robotic manipulation tasks. However, extensive demonstrations are\nrequired for policy robustness and generalization. To reduce the demonstration\nreliance, we leverage spatial symmetry and propose ET-SEED, an efficient\ntrajectory-level SE(3) equivariant diffusion model for generating action\nsequences in complex robot manipulation tasks. Further, previous equivariant\ndiffusion models require the per-step equivariance in the Markov process,\nmaking it difficult to learn policy under such strong constraints. We\ntheoretically extend equivariant Markov kernels and simplify the condition of\nequivariant diffusion process, thereby significantly improving training\nefficiency for trajectory-level SE(3) equivariant diffusion policy in an\nend-to-end manner. We evaluate ET-SEED on representative robotic manipulation\ntasks, involving rigid body, articulated and deformable object. Experiments\ndemonstrate superior data efficiency and manipulation proficiency of our\nproposed method, as well as its ability to generalize to unseen configurations\nwith only a few demonstrations. Website: https://et-seed.github.io/\n","authors":["Chenrui Tie","Yue Chen","Ruihai Wu","Boxuan Dong","Zeyi Li","Chongkai Gao","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2411.03990v1.pdf","comment":"Accept to CoRL 2024 Workshop on X-Embodiment Robot Learning"},{"id":"http://arxiv.org/abs/2309.13171v2","updated":"2024-11-06T14:46:04Z","published":"2023-09-22T20:17:11Z","title":"Robust Perception-Informed Navigation using PAC-NMPC with a Learned\n Value Function","summary":" Nonlinear model predictive control (NMPC) is typically restricted to short,\nfinite horizons to limit the computational burden of online optimization. As a\nresult, global planning frameworks are frequently necessary to avoid local\nminima when using NMPC for navigation in complex environments. By contrast,\nreinforcement learning (RL) can generate policies that minimize the expected\ncost over an infinite-horizon and can often avoid local minima, even when\noperating only on current sensor measurements. However, these learned policies\nare usually unable to provide performance guarantees (e.g., on collision\navoidance), especially when outside of the training distribution. In this\npaper, we augment Probably Approximately Correct NMPC (PAC-NMPC), a\nsampling-based stochastic NMPC algorithm capable of providing statistical\nguarantees of performance and safety, with an approximate perception-dependent\nvalue function trained via RL. We demonstrate in simulation that our algorithm\ncan improve the long-term behavior of PAC-NMPC while outperforming other\napproaches with regards to safety for both planar car dynamics and more\ncomplex, high-dimensional fixed-wing aerial vehicle dynamics. 
We also\ndemonstrate that, even when our value function is trained in simulation, our\nalgorithm can successfully achieve statistically safe navigation on hardware\nusing a 1/10th scale rally car in cluttered real-world environments using only\ncurrent sensor information.\n","authors":["Adam Polevoy","Mark Gonzales","Marin Kobilarov","Joseph Moore"],"pdf_url":"https://arxiv.org/pdf/2309.13171v2.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.03951v1","updated":"2024-11-06T14:33:30Z","published":"2024-11-06T14:33:30Z","title":"Continuous-Time State Estimation Methods in Robotics: A Survey","summary":" Accurate, efficient, and robust state estimation is more important than ever\nin robotics as the variety of platforms and complexity of tasks continue to\ngrow. Historically, discrete-time filters and smoothers have been the dominant\napproach, in which the estimated variables are states at discrete sample times.\nThe paradigm of continuous-time state estimation proposes an alternative\nstrategy by estimating variables that express the state as a continuous\nfunction of time, which can be evaluated at any query time. Not only can this\nbenefit downstream tasks such as planning and control, but it also\nsignificantly increases estimator performance and flexibility, as well as\nreduces sensor preprocessing and interfacing complexity. Despite this,\ncontinuous-time methods remain underutilized, potentially because they are less\nwell-known within robotics. To remedy this, this work presents a unifying\nformulation of these methods and the most exhaustive literature review to date,\nsystematically categorizing prior work by methodology, application, state\nvariables, historical context, and theoretical contribution to the field. By\nsurveying splines and Gaussian processes together and contextualizing works\nfrom other research domains, this work identifies and analyzes open problems in\ncontinuous-time state estimation and suggests new research directions.\n","authors":["William Talbot","Julian Nubert","Turcan Tuna","Cesar Cadena","Frederike Dümbgen","Jesus Tordesillas","Timothy D. Barfoot","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2411.03951v1.pdf","comment":"Submitted to IEEE Transactions on Robotics (T-RO)"},{"id":"http://arxiv.org/abs/2410.06192v2","updated":"2024-11-06T14:29:21Z","published":"2024-10-08T16:54:44Z","title":"Hibikino-Musashi@Home 2024 Team Description Paper","summary":" This paper provides an overview of the techniques employed by\nHibikino-Musashi@Home, which intends to participate in the domestic standard\nplatform league. The team has developed a dataset generator for training a\nrobot vision system and an open-source development environment running on a\nHuman Support Robot simulator.\n The large language model powered task planner selects appropriate primitive\nskills to perform the task requested by users. 
The team aims to design a home\nservice robot that can assist humans in their homes and continuously attends\ncompetitions to evaluate and improve the developed system.\n","authors":["Kosei Isomoto","Akinobu Mizutani","Fumiya Matsuzaki","Hikaru Sato","Ikuya Matsumoto","Kosei Yamao","Takuya Kawabata","Tomoya Shiba","Yuga Yano","Atsuki Yokota","Daiju Kanaoka","Hiromasa Yamaguchi","Kazuya Murai","Kim Minje","Lu Shen","Mayo Suzuka","Moeno Anraku","Naoki Yamaguchi","Satsuki Fujimatsu","Shoshi Tokuno","Tadataka Mizo","Tomoaki Fujino","Yuuki Nakadera","Yuka Shishido","Yusuke Nakaoka","Yuichiro Tanaka","Takashi Morie","Hakaru Tamukoh"],"pdf_url":"https://arxiv.org/pdf/2410.06192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03845v2","updated":"2024-11-06T14:11:05Z","published":"2024-06-06T08:23:22Z","title":"Open Problem: Active Representation Learning","summary":" In this work, we introduce the concept of Active Representation Learning, a\nnovel class of problems that intertwines exploration and representation\nlearning within partially observable environments. We extend ideas from Active\nSimultaneous Localization and Mapping (active SLAM), and translate them to\nscientific discovery problems, exemplified by adaptive microscopy. We explore\nthe need for a framework that derives exploration skills from representations\nthat are in some sense actionable, aiming to enhance the efficiency and\neffectiveness of data collection and model building in the natural sciences.\n","authors":["Nikola Milosevic","Gesine Müller","Jan Huisken","Nico Scherf"],"pdf_url":"https://arxiv.org/pdf/2406.03845v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03928v1","updated":"2024-11-06T14:03:49Z","published":"2024-11-06T14:03:49Z","title":"DEIO: Deep Event Inertial Odometry","summary":" Event cameras are bio-inspired, motion-activated sensors that demonstrate\nimpressive potential in handling challenging situations, such as motion blur\nand high-dynamic range. Despite their promise, existing event-based\nsimultaneous localization and mapping (SLAM) approaches exhibit limited\nperformance in real-world applications. On the other hand, state-of-the-art\nSLAM approaches incorporate deep neural networks for better robustness and\napplicability. However, there is a lack of research on fusing learning-based\nevent SLAM methods with IMU, which could be indispensable to push\nevent-based SLAM to large-scale, low-texture or complex scenarios. In this\npaper, we propose DEIO, the first monocular deep event-inertial odometry\nframework that combines a learning-based method with traditional nonlinear\ngraph-based optimization. Specifically, we tightly integrate a trainable\nevent-based differentiable bundle adjustment (e-DBA) with the IMU\npre-integration in a factor graph which employs keyframe-based sliding window\noptimization. Numerical experiments on nine public challenge datasets show that\nour method can achieve superior performance compared with the image-based and\nevent-based benchmarks. 
The source code is available at:\nhttps://github.com/arclab-hku/DEIO.\n","authors":["Weipeng Guan","Fuling Lin","Peiyu Chen","Peng Lu"],"pdf_url":"https://arxiv.org/pdf/2411.03928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04815v3","updated":"2024-11-06T13:24:41Z","published":"2024-06-07T10:35:29Z","title":"Skill-aware Mutual Information Optimisation for Generalisation in\n Reinforcement Learning","summary":" Meta-Reinforcement Learning (Meta-RL) agents can struggle to operate across\ntasks with varying environmental features that require different optimal skills\n(i.e., different modes of behaviour). Using context encoders based on\ncontrastive learning to enhance the generalisability of Meta-RL agents is now\nwidely studied but faces challenges such as the requirement for a large sample\nsize, also referred to as the $\\log$-$K$ curse. To improve RL generalisation to\ndifferent tasks, we first introduce Skill-aware Mutual Information (SaMI), an\noptimisation objective that aids in distinguishing context embeddings according\nto skills, thereby equipping RL agents with the ability to identify and execute\ndifferent skills across tasks. We then propose Skill-aware Noise Contrastive\nEstimation (SaNCE), a $K$-sample estimator used to optimise the SaMI objective.\nWe provide a framework for equipping an RL agent with SaNCE in practice and\nconduct experimental validation on modified MuJoCo and Panda-gym benchmarks. We\nempirically find that RL agents that learn by maximising SaMI achieve\nsubstantially improved zero-shot generalisation to unseen tasks. Additionally,\nthe context encoder trained with SaNCE demonstrates greater robustness to a\nreduction in the number of available samples, thus possessing the potential to\novercome the $\\log$-$K$ curse.\n","authors":["Xuehui Yu","Mhairi Dunion","Xin Li","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2406.04815v3.pdf","comment":"The Thirty-eighth Annual Conference on Neural Information Processing\n Systems (NeurIPS), 2024"},{"id":"http://arxiv.org/abs/2411.03873v1","updated":"2024-11-06T12:40:59Z","published":"2024-11-06T12:40:59Z","title":"Biomechanics-Aware Trajectory Optimization for Navigation during Robotic\n Physiotherapy","summary":" Robotic devices hold promise for aiding patients in orthopedic\nrehabilitation. However, current robotic-assisted physiotherapy methods\nstruggle including biomechanical metrics in their control algorithms, crucial\nfor safe and effective therapy. This paper introduces BATON, a\nBiomechanics-Aware Trajectory Optimization approach to robotic Navigation of\nhuman musculoskeletal loads. The method integrates a high-fidelity\nmusculoskeletal model of the human shoulder into real-time control of\nrobot-patient interaction during rotator cuff tendon rehabilitation. We extract\nskeletal dynamics and tendon loading information from an OpenSim shoulder model\nto solve an optimal control problem, generating strain-minimizing trajectories.\nTrajectories were realized on a healthy subject by an impedance-controlled\nrobot while estimating the state of the subject's shoulder. Target poses were\nprescribed to design personalized rehabilitation across a wide range of\nshoulder motion avoiding high-strain areas. BATON was designed with real-time\ncapabilities, enabling continuous trajectory replanning to address unforeseen\nvariations in tendon strain, such as those from changing muscle activation of\nthe subject.\n","authors":["Italo Belli","J. 
Micah Prendergast","Ajay Seth","Luka Peternel"],"pdf_url":"https://arxiv.org/pdf/2411.03873v1.pdf","comment":"13 pages, 9 figures, under review"},{"id":"http://arxiv.org/abs/2111.08248v3","updated":"2024-11-06T12:20:10Z","published":"2021-11-16T06:14:31Z","title":"Active Vapor-Based Robotic Wiper","summary":" This paper presents a method for estimating normals of mirrors and\ntransparent objects challenging for cameras to recognize. We propose spraying\nwater vapor onto mirror or transparent surfaces to create a diffuse reflective\nsurface. Using an ultrasonic humidifier on a robotic arm, we apply water vapor\nto the target object's surface, forming a cross-shaped misted area. This\ncreates partially diffuse reflective surfaces, enabling the camera to detect\nthe target object's surface. Adjusting the gripper-mounted camera viewpoint\nmaximizes the extracted misted area's appearance in the image, allowing normal\nestimation of the target surface. Experiments show the method's effectiveness,\nwith RMSEs of azimuth estimation for mirrors and transparent glass at\napproximately 4.2 and 5.8 degrees, respectively. Our robot experiments\ndemonstrated that our robotic wiper can perform contact-force-regulated wiping\nmotions to clean a transparent window, akin to human performance.\n","authors":["Takuya Kiyokawa","Hiroki Katayama","Jun Takamatsu","Kensuke Harada"],"pdf_url":"https://arxiv.org/pdf/2111.08248v3.pdf","comment":"4 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.03838v1","updated":"2024-11-06T11:18:13Z","published":"2024-11-06T11:18:13Z","title":"Fundamental Three-Dimensional Configuration of Wire-Wound Muscle-Tendon\n Complex Drive","summary":" For robots to become more versatile and expand their areas of application,\ntheir bodies need to be suitable for contact with the environment. When the\nhuman body comes into contact with the environment, it is possible for it to\ncontinue to move even if the positional relationship between muscles or the\nshape of the muscles changes. We have already focused on the effect of\ngeometric deformation of muscles and proposed a drive system called wire-wound\nMuscle-Tendon Complex (ww-MTC), an extension of the wire drive system. Our\nprevious study using a robot with a two-dimensional configuration demonstrated\nseveral advantages: reduced wire loosening, interference, and wear; improved\nrobustness during environmental contact; and a muscular appearance. However,\nthis design had some problems, such as excessive muscle expansion that hindered\ninter-muscle movement, and confinement to planar motion. In this study, we\ndevelop the ww-MTC into a three-dimensional shape. We present a fundamental\nconstruction method for a muscle exterior that expands gently and can be\ncontacted over its entire surface. 
We also apply the three-dimensional ww-MTC\nto a 2-axis 3-muscle robot, and confirm that the robot can continue to move\nwhile adapting to its environment.\n","authors":["Yoshimoto Ribayashi","Yuta Sahara","Shogo Sawaguchi","Kazuhiro Miyama","Akihiro Miki","Kento Kawaharazuka","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2411.03838v1.pdf","comment":"Accepted at Humanoids2024, website -\n https://sites.google.com/view/yoshimoto-ribayashi/projects, YouTube -\n https://youtu.be/EDeAqg7aAb4"},{"id":"http://arxiv.org/abs/2411.03817v1","updated":"2024-11-06T10:35:11Z","published":"2024-11-06T10:35:11Z","title":"From Novice to Expert: LLM Agent Policy Optimization via Step-wise\n Reinforcement Learning","summary":" The outstanding capabilities of large language models (LLMs) render them a\ncrucial component in various autonomous agent systems. While traditional\nmethods depend on the inherent knowledge of LLMs without fine-tuning, more\nrecent approaches have shifted toward the reinforcement learning strategy to\nfurther enhance agents' ability to solve complex interactive tasks with\nenvironments and tools. However, previous approaches are constrained by the\nsparse reward issue, where existing datasets solely provide a final scalar\nreward for each multi-step reasoning chain, potentially leading to\nineffectiveness and inefficiency in policy learning. In this paper, we\nintroduce StepAgent, which utilizes step-wise reward to optimize the agent's\nreinforcement learning process. Inheriting the spirit of novice-to-expert\ntheory, we first compare the actions of the expert and the agent to\nautomatically generate intermediate rewards for fine-grained optimization.\nAdditionally, we propose implicit-reward and inverse reinforcement learning\ntechniques to facilitate agent reflection and policy adjustment. Further\ntheoretical analysis demonstrates that the action distribution of the agent can\nconverge toward the expert action distribution over multiple training cycles.\nExperimental results across various datasets indicate that StepAgent\noutperforms existing baseline methods.\n","authors":["Zhirui Deng","Zhicheng Dou","Yutao Zhu","Ji-Rong Wen","Ruibin Xiong","Mang Wang","Weipeng Chen"],"pdf_url":"https://arxiv.org/pdf/2411.03817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03815v1","updated":"2024-11-06T10:33:01Z","published":"2024-11-06T10:33:01Z","title":"How to Drawjectory? -- Trajectory Planning using Programming by\n Demonstration","summary":" A flight trajectory defines how exactly a quadrocopter moves in the\nthree-dimensional space from one position to another. Automatic flight\ntrajectory planning faces challenges such as high computational effort and a\nlack of precision. Hence, when low computational effort or precise control is\nrequired, programming the flight route trajectory manually might be preferable.\nHowever, this requires in-depth knowledge of how to accurately plan flight\ntrajectories in three-dimensional space. We propose planning quadrocopter\nflight trajectories manually using the Programming by Demonstration (PbD)\napproach -- simply drawing the trajectory in the three-dimensional space by\nhand. This simplifies the planning process and reduces the level of in-depth\nknowledge required.\n We implemented the approach in the context of the Quadcopter Lab at Ulm\nUniversity. 
In order to evaluate our approach, we compare the precision and\naccuracy of the trajectories drawn by a user using our approach as well as the\nrequired time with those manually programmed using a domain specific language.\nThe evaluation shows that the Drawjectory workflow is, on average, 78.7 seconds\nfaster without a significant loss of precision, shown by an average deviation\n6.67 cm.\n","authors":["Leonhard Alkewitz","Timo Zuccarello","Alexander Raschke","Matthias Tichy"],"pdf_url":"https://arxiv.org/pdf/2411.03815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09080v2","updated":"2024-11-06T10:23:59Z","published":"2024-04-13T20:55:15Z","title":"Safe Reinforcement Learning on the Constraint Manifold: Theory and\n Applications","summary":" Integrating learning-based techniques, especially reinforcement learning,\ninto robotics is promising for solving complex problems in unstructured\nenvironments. However, most existing approaches are trained in well-tuned\nsimulators and subsequently deployed on real robots without online fine-tuning.\nIn this setting, extensive engineering is required to mitigate the sim-to-real\ngap, which can be challenging for complex systems. Instead, learning with\nreal-world interaction data offers a promising alternative: it not only\neliminates the need for a fine-tuned simulator but also applies to a broader\nrange of tasks where accurate modeling is unfeasible. One major problem for\non-robot reinforcement learning is ensuring safety, as uncontrolled exploration\ncan cause catastrophic damage to the robot or the environment. Indeed, safety\nspecifications, often represented as constraints, can be complex and\nnon-linear, making safety challenging to guarantee in learning systems. In this\npaper, we show how we can impose complex safety constraints on learning-based\nrobotics systems in a principled manner, both from theoretical and practical\npoints of view. Our approach is based on the concept of the Constraint\nManifold, representing the set of safe robot configurations. Exploiting\ndifferential geometry techniques, i.e., the tangent space, we can construct a\nsafe action space, allowing learning agents to sample arbitrary actions while\nensuring safety. We demonstrate the method's effectiveness in a real-world\nRobot Air Hockey task, showing that our method can handle high-dimensional\ntasks with complex constraints. Videos of the real robot experiments are\navailable on the project website (https://puzeliu.github.io/TRO-ATACOM).\n","authors":["Puze Liu","Haitham Bou-Ammar","Jan Peters","Davide Tateo"],"pdf_url":"https://arxiv.org/pdf/2404.09080v2.pdf","comment":"19 pages; sumitted to IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2410.23643v2","updated":"2024-11-06T08:41:50Z","published":"2024-10-31T05:29:30Z","title":"SceneComplete: Open-World 3D Scene Completion in Complex Real World\n Environments for Robot Manipulation","summary":" Careful robot manipulation in every-day cluttered environments requires an\naccurate understanding of the 3D scene, in order to grasp and place objects\nstably and reliably and to avoid mistakenly colliding with other objects. In\ngeneral, we must construct such a 3D interpretation of a complex scene based on\nlimited input, such as a single RGB-D image. We describe SceneComplete, a\nsystem for constructing a complete, segmented, 3D model of a scene from a\nsingle view. 
It provides a novel pipeline for composing general-purpose\npretrained perception modules (vision-language, segmentation, image-inpainting,\nimage-to-3D, and pose-estimation) to obtain high-accuracy results. We\ndemonstrate its accuracy and effectiveness with respect to ground-truth models\nin a large benchmark dataset and show that its accurate whole-object\nreconstruction enables robust grasp proposal generation, including for a\ndexterous hand. Project website - https://scenecomplete.github.io/\n","authors":["Aditya Agarwal","Gaurav Singh","Bipasha Sen","Tomás Lozano-Pérez","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2410.23643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03747v1","updated":"2024-11-06T08:22:34Z","published":"2024-11-06T08:22:34Z","title":"Observability-Aware Control for Cooperatively Localizing Quadrotor UAVs","summary":" Cooperatively Localizing robots should seek optimal control strategies to\nmaximize precision of position estimation and ensure safety in flight.\nObservability-Aware Trajectory Optimization has strong potential to address\nthis issue, but no concrete link between observability and precision has been\nproven yet. In this paper, we prove that improvement in positioning precision\ninherently follows from optimizing observability. Based on this finding, we\ndevelop an Observability-Aware Control principle to generate\nobservability-optimal control strategies. We implement this principle in a\nModel Predictive Control framework, and we verify it on a team of quadrotor\nUnmanned Aerial Vehicles comprising a follower vehicle localizing itself by\ntracking a leader vehicle in both simulations and real-world flight tests. Our\nresults demonstrate that maximizing observability contributed to improving\nglobal positioning precision for the quadrotor team.\n","authors":["H S Helson Go","Ching Lok Chong","Longhao Qian","Hugh H. -T. Liu"],"pdf_url":"https://arxiv.org/pdf/2411.03747v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2209.05824v2","updated":"2024-11-06T07:23:52Z","published":"2022-09-13T09:00:58Z","title":"CPnP: Consistent Pose Estimator for Perspective-n-Point Problem with\n Bias Elimination","summary":" The Perspective-n-Point (PnP) problem has been widely studied in both\ncomputer vision and photogrammetry societies. With the development of feature\nextraction techniques, a large number of feature points might be available in a\nsingle shot. It is promising to devise a consistent estimator, i.e., the\nestimate can converge to the true camera pose as the number of points\nincreases. To this end, we propose a consistent PnP solver, named \\emph{CPnP},\nwith bias elimination. Specifically, linear equations are constructed from the\noriginal projection model via measurement model modification and variable\nelimination, based on which a closed-form least-squares solution is obtained.\nWe then analyze and subtract the asymptotic bias of this solution, resulting in\na consistent estimate. Additionally, Gauss-Newton (GN) iterations are executed\nto refine the consistent solution. Our proposed estimator is efficient in terms\nof computations -- it has $O(n)$ computational complexity. 
Experimental tests\non both synthetic data and real images show that our proposed estimator is\nsuperior to some well-known ones for images with dense visual features, in\nterms of estimation precision and computing time.\n","authors":["Guangyang Zeng","Shiyu Chen","Biqiang Mu","Guodong Shi","Junfeng Wu"],"pdf_url":"https://arxiv.org/pdf/2209.05824v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03706v1","updated":"2024-11-06T07:08:41Z","published":"2024-11-06T07:08:41Z","title":"3DGS-CD: 3D Gaussian Splatting-based Change Detection for Physical\n Object Rearrangement","summary":" We present 3DGS-CD, the first 3D Gaussian Splatting (3DGS)-based method for\ndetecting physical object rearrangements in 3D scenes. Our approach estimates\n3D object-level changes by comparing two sets of unaligned images taken at\ndifferent times. Leveraging 3DGS's novel view rendering and EfficientSAM's\nzero-shot segmentation capabilities, we detect 2D object-level changes, which\nare then associated and fused across views to estimate 3D changes. Our method\ncan detect changes in cluttered environments using sparse post-change images\nwithin as little as 18s, using as few as a single new image. It does not rely\non depth input, user instructions, object classes, or object models -- An\nobject is recognized simply if it has been re-arranged. Our approach is\nevaluated on both public and self-collected real-world datasets, achieving up\nto 14% higher accuracy and three orders of magnitude faster performance\ncompared to the state-of-the-art radiance-field-based change detection method.\nThis significant performance boost enables a broad range of downstream\napplications, where we highlight three key use cases: object reconstruction,\nrobot workspace reset, and 3DGS model update. Our code and data will be made\navailable at https://github.com/520xyxyzq/3DGS-CD.\n","authors":["Ziqi Lu","Jianbo Ye","John Leonard"],"pdf_url":"https://arxiv.org/pdf/2411.03706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03702v1","updated":"2024-11-06T06:58:17Z","published":"2024-11-06T06:58:17Z","title":"Graph-Based Multi-Modal Sensor Fusion for Autonomous Driving","summary":" The growing demand for robust scene understanding in mobile robotics and\nautonomous driving has highlighted the importance of integrating multiple\nsensing modalities. By combining data from diverse sensors like cameras and\nLIDARs, fusion techniques can overcome the limitations of individual sensors,\nenabling a more complete and accurate perception of the environment. We\nintroduce a novel approach to multi-modal sensor fusion, focusing on developing\na graph-based state representation that supports critical decision-making\nprocesses in autonomous driving. We present a Sensor-Agnostic Graph-Aware\nKalman Filter [3], the first online state estimation technique designed to fuse\nmulti-modal graphs derived from noisy multi-sensor data. The estimated\ngraph-based state representations serve as a foundation for advanced\napplications like Multi-Object Tracking (MOT), offering a comprehensive\nframework for enhancing the situational awareness and safety of autonomous\nsystems. We validate the effectiveness of our proposed framework through\nextensive experiments conducted on both synthetic and real-world driving\ndatasets (nuScenes). Our results showcase an improvement in MOTA and a\nreduction in estimated position errors (MOTP) and identity switches (IDS) for\ntracked objects using the SAGA-KF. 
Furthermore, we highlight the capability of\nsuch a framework to develop methods that can leverage heterogeneous information\n(like semantic objects and geometric structures) from various sensing\nmodalities, enabling a more holistic approach to scene understanding and\nenhancing the safety and effectiveness of autonomous systems.\n","authors":["Depanshu Sani","Saket Anand"],"pdf_url":"https://arxiv.org/pdf/2411.03702v1.pdf","comment":"An extended abstract accepted at Young Researchers' Symposium, ICVGIP\n '24. This extended abstract contains the following: 1. Short summary of our\n work, SAGA-KF, accepted at ICPR'24. 2. A proposal that was awarded the\n Qualcomm Innovation Fellowship'24"},{"id":"http://arxiv.org/abs/2411.03682v1","updated":"2024-11-06T06:06:07Z","published":"2024-11-06T06:06:07Z","title":"LEGATO: Cross-Embodiment Imitation Using a Grasping Tool","summary":" Cross-embodiment imitation learning enables policies trained on specific\nembodiments to transfer across different robots, unlocking the potential for\nlarge-scale imitation learning that is both cost-effective and highly reusable.\nThis paper presents LEGATO, a cross-embodiment imitation learning framework for\nvisuomotor skill transfer across varied kinematic morphologies. We introduce a\nhandheld gripper that unifies action and observation spaces, allowing tasks to\nbe defined consistently across robots. Using this gripper, we train visuomotor\npolicies via imitation learning, applying a motion-invariant transformation to\ncompute the training loss. Gripper motions are then retargeted into\nhigh-degree-of-freedom whole-body motions using inverse kinematics for\ndeployment across diverse embodiments. Our evaluations in simulation and\nreal-robot experiments highlight the framework's effectiveness in learning and\ntransferring visuomotor skills across various robots. More information can be\nfound at the project page: https://ut-hcrl.github.io/LEGATO.\n","authors":["Mingyo Seo","H. Andy Park","Shenli Yuan","Yuke Zhu","Luis Sentis"],"pdf_url":"https://arxiv.org/pdf/2411.03682v1.pdf","comment":"Submitted to RA-L"},{"id":"http://arxiv.org/abs/2411.03669v1","updated":"2024-11-06T05:08:49Z","published":"2024-11-06T05:08:49Z","title":"Imagined Potential Games: A Framework for Simulating, Learning and\n Evaluating Interactive Behaviors","summary":" Interacting with human agents in complex scenarios presents a significant\nchallenge for robotic navigation, particularly in environments that necessitate\nboth collision avoidance and collaborative interaction, such as indoor spaces.\nUnlike static or predictably moving obstacles, human behavior is inherently\ncomplex and unpredictable, stemming from dynamic interactions with other\nagents. Existing simulation tools frequently fail to adequately model such\nreactive and collaborative behaviors, impeding the development and evaluation\nof robust social navigation strategies. This paper introduces a novel framework\nutilizing distributed potential games to simulate human-like interactions in\nhighly interactive scenarios. Within this framework, each agent imagines a\nvirtual cooperative game with others based on its estimation. 
We demonstrate\nthis formulation can facilitate the generation of diverse and realistic\ninteraction patterns in a configurable manner across various scenarios.\nAdditionally, we have developed a gym-like environment leveraging our\ninteractive agent model to facilitate the learning and evaluation of\ninteractive navigation algorithms.\n","authors":["Lingfeng Sun","Yixiao Wang","Pin-Yun Hung","Changhao Wang","Xiang Zhang","Zhuo Xu","Masayoshi Tomizuka"],"pdf_url":"https://arxiv.org/pdf/2411.03669v1.pdf","comment":"13 pages, 10 figures. arXiv admin note: substantial text overlap with\n arXiv:2310.01614"},{"id":"http://arxiv.org/abs/2411.03660v1","updated":"2024-11-06T04:41:58Z","published":"2024-11-06T04:41:58Z","title":"Development of a Practical Articulated Wheeled In-pipe Robot for Both\n 3-4 in Force Main Inspection of Sewer Pipes","summary":" This paper reports a practical articulated wheeled in-pipe inspection robot\n\"AIRo-7.1\" which is waterproof and dustproof, and can adapt to 3 to 4 in inner\ndiameters. The joint torque can be adjusted by a PWM open-loop control. The\nmiddle joint angle can be controlled by a position feedback control system\nwhile the other two joints are bent by torsional springs. Thanks to this simple\nand high-density design, not only downsizing of the robot but also a wide range\nof adaptive inner diameters were achieved. However, the relationship between\nthe actual middle joint torque value and the PWM duty ratio should be pre-known\nbecause the reducer used in AIRo-7.1 was designed by ourselves. Therefore,\npreliminary experiments were conducted to clarify the relationship between\nthem. To examine the adaptive movement, experiments were conducted in both 3 in and 4 in\npipes with vertical, bend, and diameter-change sections. Finally, a field experiment\nwas also conducted. From the results, high adaptability to different inner\ndiameters of pipes and slippery environments was confirmed, although the waterproofing\nand dustproofing were not working perfectly.\n","authors":["Kenya Murata","Atsushi Kakogawa"],"pdf_url":"https://arxiv.org/pdf/2411.03660v1.pdf","comment":"The Twenty-Ninth International Symposium on Artificial Life and\n Robotics 2024 (AROB 29th 2024), The Ninth International Symposium on\n BioComplexity 2024 (ISBC 9th 2024), The Seventh International Symposium on\n Swarm Behavior and Bio-Inspired Robotics 2024 (SWARM 7th 2024) B-Con Plaza,\n Beppu, Japan and ONLINE, January 24-26, 2024"},{"id":"http://arxiv.org/abs/2209.02849v2","updated":"2024-11-06T04:07:16Z","published":"2022-09-06T23:19:14Z","title":"Adaptive Complexity Model Predictive Control","summary":" This work introduces a formulation of model predictive control (MPC) which\nadaptively reasons about the complexity of the model based on the task while\nmaintaining feasibility and stability guarantees. Existing MPC implementations\noften handle computational complexity by shortening prediction horizons or\nsimplifying models, both of which can result in instability. Inspired by\nrelated approaches in behavioral economics, motion planning, and biomechanics,\nour method solves MPC problems with a simple model for dynamics and constraints\nover regions of the horizon where such a model is feasible and a complex model\nwhere it is not. The approach leverages an interleaving of planning and\nexecution to iteratively identify these regions, which can be safely simplified\nif they satisfy an exact template/anchor relationship. 
We show that this method\ndoes not compromise the stability and feasibility properties of the system, and\nmeasure performance in simulation experiments on a quadrupedal robot executing\nagile behaviors over terrains of interest. We find that this adaptive method\nenables more agile motion and expands the range of executable tasks compared to\nfixed-complexity implementations.\n","authors":["Joseph Norby","Ardalan Tajbakhsh","Yanhao Yang","Aaron M. Johnson"],"pdf_url":"https://arxiv.org/pdf/2209.02849v2.pdf","comment":"Published in Transactions on Robotics"},{"id":"http://arxiv.org/abs/2110.00675v4","updated":"2024-11-06T03:29:03Z","published":"2021-10-01T23:03:21Z","title":"Contraction Theory for Nonlinear Stability Analysis and Learning-based\n Control: A Tutorial Overview","summary":" Contraction theory is an analytical tool to study differential dynamics of a\nnon-autonomous (i.e., time-varying) nonlinear system under a contraction metric\ndefined with a uniformly positive definite matrix, the existence of which\nresults in a necessary and sufficient characterization of incremental\nexponential stability of multiple solution trajectories with respect to each\nother. By using a squared differential length as a Lyapunov-like function, its\nnonlinear stability analysis boils down to finding a suitable contraction\nmetric that satisfies a stability condition expressed as a linear matrix\ninequality, indicating that many parallels can be drawn between well-known\nlinear systems theory and contraction theory for nonlinear systems.\nFurthermore, contraction theory takes advantage of a superior robustness\nproperty of exponential stability used in conjunction with the comparison\nlemma. This yields much-needed safety and stability guarantees for neural\nnetwork-based control and estimation schemes, without resorting to a more\ninvolved method of using uniform asymptotic stability for input-to-state\nstability. Such distinctive features permit systematic construction of a\ncontraction metric via convex optimization, thereby obtaining an explicit\nexponential bound on the distance between a time-varying target trajectory and\nsolution trajectories perturbed externally due to disturbances and learning\nerrors. The objective of this paper is therefore to present a tutorial overview\nof contraction theory and its advantages in nonlinear stability analysis of\ndeterministic and stochastic systems, with an emphasis on deriving formal\nrobustness and stability guarantees for various learning-based and data-driven\nautomatic control methods. In particular, we provide a detailed review of\ntechniques for finding contraction metrics and associated control and\nestimation laws using deep neural networks.\n","authors":["Hiroyasu Tsukamoto","Soon-Jo Chung","Jean-Jacques E. Slotine"],"pdf_url":"https://arxiv.org/pdf/2110.00675v4.pdf","comment":"Annual Reviews in Control, Accepted, Oct. 1st"},{"id":"http://arxiv.org/abs/2410.21845v2","updated":"2024-11-06T03:14:14Z","published":"2024-10-29T08:12:20Z","title":"Precise and Dexterous Robotic Manipulation via Human-in-the-Loop\n Reinforcement Learning","summary":" Reinforcement learning (RL) holds great promise for enabling autonomous\nacquisition of complex robotic manipulation skills, but realizing this\npotential in real-world settings has been challenging. 
We present a\nhuman-in-the-loop vision-based RL system that demonstrates impressive\nperformance on a diverse set of dexterous manipulation tasks, including dynamic\nmanipulation, precision assembly, and dual-arm coordination. Our approach\nintegrates demonstrations and human corrections, efficient RL algorithms, and\nother system-level design choices to learn policies that achieve near-perfect\nsuccess rates and fast cycle times within just 1 to 2.5 hours of training. We\nshow that our method significantly outperforms imitation learning baselines and\nprior RL approaches, with an average 2x improvement in success rate and 1.8x\nfaster execution. Through extensive experiments and analysis, we provide\ninsights into the effectiveness of our approach, demonstrating how it learns\nrobust, adaptive policies for both reactive and predictive control strategies.\nOur results suggest that RL can indeed learn a wide range of complex\nvision-based manipulation policies directly in the real world within practical\ntraining times. We hope this work will inspire a new generation of learned\nrobotic manipulation techniques, benefiting both industrial applications and\nresearch advancements. Videos and code are available at our project website\nhttps://hil-serl.github.io/.\n","authors":["Jianlan Luo","Charles Xu","Jeffrey Wu","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2410.21845v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03619v1","updated":"2024-11-06T02:24:27Z","published":"2024-11-06T02:24:27Z","title":"Real-Time Safe Bipedal Robot Navigation using Linear Discrete Control\n Barrier Functions","summary":" Safe navigation in real-time is an essential task for humanoid robots in\nreal-world deployment. Since humanoid robots are inherently underactuated\nthanks to unilateral ground contacts, a path is considered safe if it is\nobstacle-free and respects the robot's physical limitations and underlying\ndynamics. Existing approaches often decouple path planning from gait control\ndue to the significant computational challenge caused by the full-order robot\ndynamics. In this work, we develop a unified, safe path and gait planning\nframework that can be evaluated online in real-time, allowing the robot to\nnavigate clustered environments while sustaining stable locomotion. Our\napproach uses the popular Linear Inverted Pendulum (LIP) model as a template\nmodel to represent walking dynamics. It incorporates heading angles in the\nmodel to evaluate kinematic constraints essential for physically feasible gaits\nproperly. In addition, we leverage discrete control barrier functions (DCBF)\nfor obstacle avoidance, ensuring that the subsequent foot placement provides a\nsafe navigation path within clustered environments. To guarantee real-time\ncomputation, we use a novel approximation of the DCBF to produce linear DCBF\n(LDCBF) constraints. We validate the proposed approach in simulation using a\nDigit robot in randomly generated environments. The results demonstrate that\nour approach can generate safe gaits for a non-trivial humanoid robot to\nnavigate environments with randomly generated obstacles in real-time.\n","authors":["Chengyang Peng","Victor Paredes","Guillermo A. 
Castillo","Ayonga Hereid"],"pdf_url":"https://arxiv.org/pdf/2411.03619v1.pdf","comment":"7 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.03614v1","updated":"2024-11-06T02:16:26Z","published":"2024-11-06T02:16:26Z","title":"Robot Swarming over the internet","summary":" This paper considers cooperative control of robots involving two different\ntestbed systems in remote locations with communication on the internet. This\nprovides us the capability to exchange robots status like positions, velocities\nand directions needed for the swarming algorithm. The results show that all\nrobots properly follow some leader defined one of the testbeds. Measurement of\ndata exchange rates show no loss of packets, and average transfer delays stay\nwithin tolerance limits for practical applications. In our knowledge, the\nnovelty of this paper concerns this kind of control over a large network like\ninternet.\n","authors":["Will Ferenc","Hannah Kastein","Lauren Lieu","Ryan Wilson","Yuan Rick Huang","Jerome Gilles","Andrea L. Bertozzi","Balaji R. Sharma","Baisravan HomChaudhuri","Subramanian Ramakrishnan","Manish Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.03614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03610v1","updated":"2024-11-06T02:05:44Z","published":"2024-11-06T02:05:44Z","title":"LCP-Fusion: A Neural Implicit SLAM with Enhanced Local Constraints and\n Computable Prior","summary":" Recently the dense Simultaneous Localization and Mapping (SLAM) based on\nneural implicit representation has shown impressive progress in hole filling\nand high-fidelity mapping. Nevertheless, existing methods either heavily rely\non known scene bounds or suffer inconsistent reconstruction due to drift in\npotential loop-closure regions, or both, which can be attributed to the\ninflexible representation and lack of local constraints. In this paper, we\npresent LCP-Fusion, a neural implicit SLAM system with enhanced local\nconstraints and computable prior, which takes the sparse voxel octree structure\ncontaining feature grids and SDF priors as hybrid scene representation,\nenabling the scalability and robustness during mapping and tracking. To enhance\nthe local constraints, we propose a novel sliding window selection strategy\nbased on visual overlap to address the loop-closure, and a practical warping\nloss to constrain relative poses. Moreover, we estimate SDF priors as coarse\ninitialization for implicit features, which brings additional explicit\nconstraints and robustness, especially when a light but efficient adaptive\nearly ending is adopted. Experiments demonstrate that our method achieve better\nlocalization accuracy and reconstruction consistency than existing RGB-D\nimplicit SLAM, especially in challenging real scenes (ScanNet) as well as\nself-captured scenes with unknown scene bounds. The code is available at\nhttps://github.com/laliwang/LCP-Fusion.\n","authors":["Jiahui Wang","Yinan Deng","Yi Yang","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2411.03610v1.pdf","comment":"Accepted by 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2411.03591v1","updated":"2024-11-06T01:11:39Z","published":"2024-11-06T01:11:39Z","title":"vMF-Contact: Uncertainty-aware Evidential Learning for Probabilistic\n Contact-grasp in Noisy Clutter","summary":" Grasp learning in noisy environments, such as occlusions, sensor noise, and\nout-of-distribution (OOD) objects, poses significant challenges. 
Recent\nlearning-based approaches focus primarily on capturing aleatoric uncertainty\nfrom inherent data noise. The epistemic uncertainty, which represents the OOD\nrecognition, is often addressed by ensembles with multiple forward paths,\nlimiting real-time application. In this paper, we propose an uncertainty-aware\napproach for 6-DoF grasp detection using evidential learning to comprehensively\ncapture both uncertainties in real-world robotic grasping. As a key\ncontribution, we introduce vMF-Contact, a novel architecture for learning\nhierarchical contact grasp representations with probabilistic modeling of\ndirectional uncertainty as von Mises-Fisher (vMF) distribution. To achieve\nthis, we derive and analyze the theoretical formulation of the second-order\nobjective on the posterior parametrization, providing formal guarantees for the\nmodel's ability to quantify uncertainty and improve grasp prediction\nperformance. Moreover, we enhance feature expressiveness by applying partial\npoint reconstructions as an auxiliary task, improving the comprehension of\nuncertainty quantification as well as the generalization to unseen objects. In\nthe real-world experiments, our method demonstrates a significant improvement\nby 39% in the overall clearance rate compared to the baselines. Video is under\nhttps://www.youtube.com/watch?v=4aQsrDgdV8Y&t=12s\n","authors":["Yitian Shi","Edgar Welte","Maximilian Gilles","Rania Rayyes"],"pdf_url":"https://arxiv.org/pdf/2411.03591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03581v1","updated":"2024-11-06T00:40:53Z","published":"2024-11-06T00:40:53Z","title":"Can Robotic Cues Manipulate Human Decisions? Exploring Consensus\n Building via Bias-Controlled Non-linear Opinion Dynamics and Robotic Eye Gaze\n Mediated Interaction in Human-Robot Teaming","summary":" Although robots are becoming more advanced with human-like anthropomorphic\nfeatures and decision-making abilities to improve collaboration, the active\nintegration of humans into this process remains under-explored. This article\npresents the first experimental study exploring decision-making interactions\nbetween humans and robots with visual cues from robotic eyes, which can\ndynamically influence human opinion formation. The cues generated by robotic\neyes gradually guide human decisions towards alignment with the robot's\nchoices. Both human and robot decision-making processes are modeled as\nnon-linear opinion dynamics with evolving biases. To examine these opinion\ndynamics under varying biases, we conduct numerical parametric and equilibrium\ncontinuation analyses using tuned parameters designed explicitly for the\npresented human-robot interaction experiment. Furthermore, to facilitate the\ntransition from disagreement to agreement, we introduced a human opinion\nobservation algorithm integrated with the formation of the robot's opinion,\nwhere the robot's behavior is controlled based on its formed opinion. The\nalgorithms developed aim to enhance human involvement in consensus building,\nfostering effective collaboration between humans and robots. Experiments with\n51 participants (N = 51) show that human-robot teamwork can be improved by\nguiding human decisions using robotic cues. 
Finally, we provide detailed\ninsights on the effects of trust, cognitive load, and participant demographics\non decision-making based on user feedback and post-experiment interviews.\n","authors":["Rajul Kumar","Adam Bhatti","Ningshi Yao"],"pdf_url":"https://arxiv.org/pdf/2411.03581v1.pdf","comment":"35 pages, 14 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.04125v1","updated":"2024-11-06T18:59:41Z","published":"2024-11-06T18:59:41Z","title":"Community Forensics: Using Thousands of Generators to Train Fake Image\n Detectors","summary":" One of the key challenges of detecting AI-generated images is spotting images\nthat have been created by previously unseen generative models. We argue that\nthe limited diversity of the training data is a major obstacle to addressing\nthis problem, and we propose a new dataset that is significantly larger and\nmore diverse than prior work. As part of creating this dataset, we\nsystematically download thousands of text-to-image latent diffusion models and\nsample images from them. We also collect images from dozens of popular open\nsource and commercial models. The resulting dataset contains 2.7M images that\nhave been sampled from 4803 different models. These images collectively capture\na wide range of scene content, generator architectures, and image processing\nsettings. Using this dataset, we study the generalization abilities of fake\nimage detectors. Our experiments suggest that detection performance improves as\nthe number of models in the training set increases, even when these models have\nsimilar architectures. We also find that detection performance improves as the\ndiversity of the models increases, and that our trained detectors generalize\nbetter than those trained on other datasets.\n","authors":["Jeongsoo Park","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2411.04125v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2407.10964v2","updated":"2024-11-06T18:58:03Z","published":"2024-07-15T17:58:42Z","title":"No Train, all Gain: Self-Supervised Gradients Improve Deep Frozen\n Representations","summary":" This paper introduces FUNGI, Features from UNsupervised GradIents, a method\nto enhance the features of transformer encoders by leveraging self-supervised\ngradients. Our method is simple: given any pretrained model, we first compute\ngradients from various self-supervised objectives for each input. These\ngradients are projected to a lower dimension and then concatenated with the\nmodel's output embedding. The resulting features are evaluated on k-nearest\nneighbor classification over 11 datasets from vision, 5 from natural language\nprocessing, and 2 from audio. Across backbones spanning various sizes and\npretraining strategies, FUNGI features provide consistent performance\nimprovements over the embeddings. We also show that using FUNGI features can\nbenefit linear classification, clustering and image retrieval, and that they\nsignificantly improve the retrieval-based in-context scene understanding\nabilities of pretrained models, for example improving upon DINO by +17% for\nsemantic segmentation - without any training.\n","authors":["Walter Simoncini","Spyros Gidaris","Andrei Bursuc","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2407.10964v2.pdf","comment":"NeurIPS 2024. 
Code available at\n https://github.com/WalterSimoncini/fungivision"},{"id":"http://arxiv.org/abs/2411.04112v1","updated":"2024-11-06T18:44:09Z","published":"2024-11-06T18:44:09Z","title":"Fed-EC: Bandwidth-Efficient Clustering-Based Federated Learning For\n Autonomous Visual Robot Navigation","summary":" Centralized learning requires data to be aggregated at a central server,\nwhich poses significant challenges in terms of data privacy and bandwidth\nconsumption. Federated learning presents a compelling alternative, however,\nvanilla federated learning methods deployed in robotics aim to learn a single\nglobal model across robots that works ideally for all. But in practice one\nmodel may not be well suited for robots deployed in various environments. This\npaper proposes Federated-EmbedCluster (Fed-EC), a clustering-based federated\nlearning framework that is deployed with vision based autonomous robot\nnavigation in diverse outdoor environments. The framework addresses the key\nfederated learning challenge of deteriorating model performance of a single\nglobal model due to the presence of non-IID data across real-world robots.\nExtensive real-world experiments validate that Fed-EC reduces the communication\nsize by 23x for each robot while matching the performance of centralized\nlearning for goal-oriented navigation and outperforms local learning. Fed-EC\ncan transfer previously learnt models to new robots that join the cluster.\n","authors":["Shreya Gummadi","Mateus V. Gasparino","Deepak Vasisht","Girish Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2411.04112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19863v4","updated":"2024-11-06T18:29:38Z","published":"2024-03-28T22:17:19Z","title":"DeNetDM: Debiasing by Network Depth Modulation","summary":" Neural networks trained on biased datasets tend to inadvertently learn\nspurious correlations, hindering generalization. We formally prove that (1)\nsamples that exhibit spurious correlations lie on a lower rank manifold\nrelative to the ones that do not; and (2) the depth of a network acts as an\nimplicit regularizer on the rank of the attribute subspace that is encoded in\nits representations. Leveraging these insights, we present DeNetDM, a novel\ndebiasing method that uses network depth modulation as a way of developing\nrobustness to spurious correlations. Using a training paradigm derived from\nProduct of Experts, we create both biased and debiased branches with deep and\nshallow architectures and then distill knowledge to produce the target debiased\nmodel. Our method requires no bias annotations or explicit data augmentation\nwhile performing on par with approaches that require either or both. We\ndemonstrate that DeNetDM outperforms existing debiasing techniques on both\nsynthetic and real-world datasets by 5\\%. 
The project page is available at\nhttps://vssilpa.github.io/denetdm/.\n","authors":["Silpa Vadakkeeveetil Sreelatha","Adarsh Kappiyath","Abhra Chaudhuri","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2403.19863v4.pdf","comment":"Camera-ready version : NeurIPS 2024, * indicates these authors\n contributed equally"},{"id":"http://arxiv.org/abs/2411.04097v1","updated":"2024-11-06T18:25:00Z","published":"2024-11-06T18:25:00Z","title":"RaVL: Discovering and Mitigating Spurious Correlations in Fine-Tuned\n Vision-Language Models","summary":" Fine-tuned vision-language models (VLMs) often capture spurious correlations\nbetween image features and textual attributes, resulting in degraded zero-shot\nperformance at test time. Existing approaches for addressing spurious\ncorrelations (i) primarily operate at the global image-level rather than\nintervening directly on fine-grained image features and (ii) are predominantly\ndesigned for unimodal settings. In this work, we present RaVL, which takes a\nfine-grained perspective on VLM robustness by discovering and mitigating\nspurious correlations using local image features rather than operating at the\nglobal image level. Given a fine-tuned VLM, RaVL first discovers spurious\ncorrelations by leveraging a region-level clustering approach to identify\nprecise image features contributing to zero-shot classification errors. Then,\nRaVL mitigates the identified spurious correlation with a novel region-aware\nloss function that enables the VLM to focus on relevant regions and ignore\nspurious relationships during fine-tuning. We evaluate RaVL on 654 VLMs with\nvarious model architectures, data domains, and learned spurious correlations.\nOur results show that RaVL accurately discovers (191% improvement over the\nclosest baseline) and mitigates (8.2% improvement on worst-group image\nclassification accuracy) spurious correlations. Qualitative evaluations on\ngeneral-domain and medical-domain VLMs confirm our findings.\n","authors":["Maya Varma","Jean-Benoit Delbrouck","Zhihong Chen","Akshay Chaudhari","Curtis Langlotz"],"pdf_url":"https://arxiv.org/pdf/2411.04097v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.16147v3","updated":"2024-11-06T18:08:23Z","published":"2024-09-23T00:11:30Z","title":"Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with\n Enhanced Generalization and Personalization Abilities","summary":" Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant\npotential for modeling 3D head avatars, providing greater flexibility than\nmesh-based methods and more efficient rendering compared to NeRF-based\napproaches. Despite these advancements, the creation of controllable 3DGS-based\nhead avatars remains time-intensive, often requiring tens of minutes to hours.\nTo expedite this process, we here introduce the \"Gaussian Deja-vu\" framework,\nwhich first obtains a generalized model of the head avatar and then\npersonalizes the result. The generalized model is trained on large 2D\n(synthetic and real) image datasets. This model provides a well-initialized 3D\nGaussian head that is further refined using a monocular video to achieve the\npersonalized head avatar. For personalizing, we propose learnable\nexpression-aware rectification blendmaps to correct the initial 3D Gaussians,\nensuring rapid convergence without the reliance on neural networks. Experiments\ndemonstrate that the proposed method meets its objectives. 
It outperforms\nstate-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as\nwell as reduces training time consumption to at least a quarter of the existing\nmethods, producing the avatar in minutes.\n","authors":["Peizhi Yan","Rabab Ward","Qiang Tang","Shan Du"],"pdf_url":"https://arxiv.org/pdf/2409.16147v3.pdf","comment":"11 pages, Accepted by WACV 2025 in Round 1"},{"id":"http://arxiv.org/abs/2411.04079v1","updated":"2024-11-06T17:57:43Z","published":"2024-11-06T17:57:43Z","title":"Textual Decomposition Then Sub-motion-space Scattering for\n Open-Vocabulary Motion Generation","summary":" Text-to-motion generation is a crucial task in computer vision, which\ngenerates the target 3D motion by the given text. The existing annotated\ndatasets are limited in scale, resulting in most existing methods overfitting\nto the small datasets and unable to generalize to the motions of the open\ndomain. Some methods attempt to solve the open-vocabulary motion generation\nproblem by aligning to the CLIP space or using the Pretrain-then-Finetuning\nparadigm. However, the current annotated dataset's limited scale only allows\nthem to achieve mapping from sub-text-space to sub-motion-space, instead of\nmapping between full-text-space and full-motion-space (full mapping), which is\nthe key to attaining open-vocabulary motion generation. To this end, this paper\nproposes to leverage the atomic motion (simple body part motions over a short\ntime period) as an intermediate representation, and leverage two orderly\ncoupled steps, i.e., Textual Decomposition and Sub-motion-space Scattering, to\naddress the full mapping problem. For Textual Decomposition, we design a\nfine-grained description conversion algorithm, and combine it with the\ngeneralization ability of a large language model to convert any given motion\ntext into atomic texts. Sub-motion-space Scattering learns the compositional\nprocess from atomic motions to the target motions, to make the learned\nsub-motion-space scattered to form the full-motion-space. For a given motion of\nthe open domain, it transforms the extrapolation into interpolation and thereby\nsignificantly improves generalization. Our network, $DSO$-Net, combines textual\n$d$ecomposition and sub-motion-space $s$cattering to solve the\n$o$pen-vocabulary motion generation. Extensive experiments demonstrate that our\nDSO-Net achieves significant improvements over the state-of-the-art methods on\nopen-vocabulary motion generation. Code is available at\nhttps://vankouf.github.io/DSONet/.\n","authors":["Ke Fan","Jiangning Zhang","Ran Yi","Jingyu Gong","Yabiao Wang","Yating Wang","Xin Tan","Chengjie Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2411.04079v1.pdf","comment":"project page: https://vankouf.github.io/DSONet/"},{"id":"http://arxiv.org/abs/2411.04077v1","updated":"2024-11-06T17:55:37Z","published":"2024-11-06T17:55:37Z","title":"H-POPE: Hierarchical Polling-based Probing Evaluation of Hallucinations\n in Large Vision-Language Models","summary":" By leveraging both texts and images, large vision language models (LVLMs)\nhave shown significant progress in various multi-modal tasks. Nevertheless,\nthese models often suffer from hallucinations, e.g., they exhibit\ninconsistencies between the visual input and the textual output. To address\nthis, we propose H-POPE, a coarse-to-fine-grained benchmark that systematically\nassesses hallucination in object existence and attributes. 
Our evaluation shows\nthat models are prone to hallucinations on object existence, and even more so\non fine-grained attributes. We further investigate whether these models rely on\nvisual input to formulate the output texts.\n","authors":["Nhi Pham","Michael Schott"],"pdf_url":"https://arxiv.org/pdf/2411.04077v1.pdf","comment":"Poster at https://sites.google.com/berkeley.edu/bb-stat/home"},{"id":"http://arxiv.org/abs/2411.04059v1","updated":"2024-11-06T17:11:44Z","published":"2024-11-06T17:11:44Z","title":"Pseudo-labeling with Keyword Refining for Few-Supervised Video\n Captioning","summary":" Video captioning generate a sentence that describes the video content.\nExisting methods always require a number of captions (\\eg, 10 or 20) per video\nto train the model, which is quite costly. In this work, we explore the\npossibility of using only one or very few ground-truth sentences, and introduce\na new task named few-supervised video captioning. Specifically, we propose a\nfew-supervised video captioning framework that consists of lexically\nconstrained pseudo-labeling module and keyword-refined captioning module.\nUnlike the random sampling in natural language processing that may cause\ninvalid modifications (\\ie, edit words), the former module guides the model to\nedit words using some actions (\\eg, copy, replace, insert, and delete) by a\npretrained token-level classifier, and then fine-tunes candidate sentences by a\npretrained language model. Meanwhile, the former employs the repetition\npenalized sampling to encourage the model to yield concise pseudo-labeled\nsentences with less repetition, and selects the most relevant sentences upon a\npretrained video-text model. Moreover, to keep semantic consistency between\npseudo-labeled sentences and video content, we develop the transformer-based\nkeyword refiner with the video-keyword gated fusion strategy to emphasize more\non relevant words. Extensive experiments on several benchmarks demonstrate the\nadvantages of the proposed approach in both few-supervised and fully-supervised\nscenarios. The code implementation is available at\nhttps://github.com/mlvccn/PKG_VidCap\n","authors":["Ping Li","Tao Wang","Xinkui Zhao","Xianghua Xu","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2411.04059v1.pdf","comment":"12 figures, Accepted in Pattern Recognition"},{"id":"http://arxiv.org/abs/2410.23247v2","updated":"2024-11-06T17:07:53Z","published":"2024-10-30T17:30:35Z","title":"bit2bit: 1-bit quanta video reconstruction via self-supervised photon\n prediction","summary":" Quanta image sensors, such as SPAD arrays, are an emerging sensor technology,\nproducing 1-bit arrays representing photon detection events over exposures as\nshort as a few nanoseconds. In practice, raw data are post-processed using\nheavy spatiotemporal binning to create more useful and interpretable images at\nthe cost of degrading spatiotemporal resolution. In this work, we propose\nbit2bit, a new method for reconstructing high-quality image stacks at the\noriginal spatiotemporal resolution from sparse binary quanta image data.\nInspired by recent work on Poisson denoising, we developed an algorithm that\ncreates a dense image sequence from sparse binary photon data by predicting the\nphoton arrival location probability distribution. However, due to the binary\nnature of the data, we show that the assumption of a Poisson distribution is\ninadequate. Instead, we model the process with a Bernoulli lattice process from\nthe truncated Poisson. 
This leads to the proposal of a novel self-supervised\nsolution based on a masked loss function. We evaluate our method using both\nsimulated and real data. On simulated data from a conventional video, we\nachieve 34.35 mean PSNR with extremely photon-sparse binary input (<0.06\nphotons per pixel per frame). We also present a novel dataset containing a wide\nrange of real SPAD high-speed videos under various challenging imaging\nconditions. The scenes cover strong/weak ambient light, strong motion,\nultra-fast events, etc., which will be made available to the community, on\nwhich we demonstrate the promise of our approach. Both reconstruction quality\nand throughput substantially surpass the state-of-the-art methods (e.g., Quanta\nBurst Photography (QBP)). Our approach significantly enhances the visualization\nand usability of the data, enabling the application of existing analysis\ntechniques.\n","authors":["Yehe Liu","Alexander Krull","Hector Basevi","Ales Leonardis","Michael W. Jenkins"],"pdf_url":"https://arxiv.org/pdf/2410.23247v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04055v1","updated":"2024-11-06T16:59:51Z","published":"2024-11-06T16:59:51Z","title":"Multi-branch Spatio-Temporal Graph Neural Network For Efficient Ice\n Layer Thickness Prediction","summary":" Understanding spatio-temporal patterns in polar ice layers is essential for\ntracking changes in ice sheet balance and assessing ice dynamics. While\nconvolutional neural networks are widely used in learning ice layer patterns\nfrom raw echogram images captured by airborne snow radar sensors, noise in the\nechogram images prevents researchers from getting high-quality results.\nInstead, we focus on geometric deep learning using graph neural networks,\naiming to build a spatio-temporal graph neural network that learns from\nthickness information of the top ice layers and predicts for deeper layers. In\nthis paper, we developed a novel multi-branch spatio-temporal graph neural\nnetwork that used the GraphSAGE framework for spatio features learning and a\ntemporal convolution operation to capture temporal changes, enabling different\nbranches of the network to be more specialized and focusing on a single\nlearning task. We found that our proposed multi-branch network can consistently\noutperform the current fused spatio-temporal graph neural network in both\naccuracy and efficiency.\n","authors":["Zesheng Liu","Maryam Rahnemoonfar"],"pdf_url":"https://arxiv.org/pdf/2411.04055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08774v3","updated":"2024-11-06T16:57:36Z","published":"2024-02-13T20:16:31Z","title":"LDTrack: Dynamic People Tracking by Service Robots using Diffusion\n Models","summary":" Tracking of dynamic people in cluttered and crowded human-centered\nenvironments is a challenging robotics problem due to the presence of\nintraclass variations including occlusions, pose deformations, and lighting\nvariations. This paper introduces a novel deep learning architecture, using\nconditional latent diffusion models, the Latent Diffusion Track (LDTrack), for\ntracking multiple dynamic people under intraclass variations. By uniquely\nutilizing conditional latent diffusion models to capture temporal person\nembeddings, our architecture can adapt to appearance changes of people over\ntime. 
We incorporated a latent feature encoder network which enables the\ndiffusion process to operate within a high-dimensional latent space to allow\nfor the extraction and spatial-temporal refinement of such rich features as\nperson appearance, motion, location, identity, and contextual information.\nExtensive experiments demonstrate the effectiveness of LDTrack over other\nstate-of-the-art tracking methods in cluttered and crowded human-centered\nenvironments under intraclass variations. Namely, the results show our method\noutperforms existing deep learning robotic people tracking methods in both\ntracking accuracy and tracking precision with statistical significance.\nAdditionally, a comprehensive multi-object tracking comparison study was\nperformed against the state-of-the-art methods in urban environments,\ndemonstrating the generalizability of LDTrack. An ablation study was performed\nto validate the design choices of LDTrack.\n","authors":["Angus Fung","Beno Benhabib","Goldie Nejat"],"pdf_url":"https://arxiv.org/pdf/2402.08774v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16749v2","updated":"2024-11-06T16:55:39Z","published":"2024-05-27T01:38:30Z","title":"DMPlug: A Plug-in Method for Solving Inverse Problems with Diffusion\n Models","summary":" Pretrained diffusion models (DMs) have recently been popularly used in\nsolving inverse problems (IPs). The existing methods mostly interleave\niterative steps in the reverse diffusion process and iterative steps to bring\nthe iterates closer to satisfying the measurement constraint. However, such\ninterleaving methods struggle to produce final results that look like natural\nobjects of interest (i.e., manifold feasibility) and fit the measurement (i.e.,\nmeasurement feasibility), especially for nonlinear IPs. Moreover, their\ncapabilities to deal with noisy IPs with unknown types and levels of\nmeasurement noise are unknown. In this paper, we advocate viewing the reverse\nprocess in DMs as a function and propose a novel plug-in method for solving IPs\nusing pretrained DMs, dubbed DMPlug. DMPlug addresses the issues of manifold\nfeasibility and measurement feasibility in a principled manner, and also shows\ngreat potential for being robust to unknown types and levels of noise. Through\nextensive experiments across various IP tasks, including two linear and three\nnonlinear IPs, we demonstrate that DMPlug consistently outperforms\nstate-of-the-art methods, often by large margins especially for nonlinear IPs.\nThe code is available at https://github.com/sun-umn/DMPlug.\n","authors":["Hengkang Wang","Xu Zhang","Taihui Li","Yuxiang Wan","Tiancong Chen","Ju Sun"],"pdf_url":"https://arxiv.org/pdf/2405.16749v2.pdf","comment":"Published in NeurIPS 2024\n (https://openreview.net/forum?id=81IFFsfQUj)"},{"id":"http://arxiv.org/abs/2410.05969v2","updated":"2024-11-06T16:28:58Z","published":"2024-10-08T12:16:30Z","title":"Deep neural network-based detection of counterfeit products from\n smartphone images","summary":" Counterfeit products such as drugs and vaccines as well as luxury items such\nas high-fashion handbags, watches, jewelry, garments, and cosmetics, represent\nsignificant direct losses of revenue to legitimate manufacturers and vendors,\nas well as indirect costs to societies at large. We present the world's first\npurely computer-vision-based system to combat such counterfeiting-one that does\nnot require special security tags or other alterations to the products or\nmodifications to supply chain tracking. 
Our deep neural network system shows\nhigh accuracy on branded garments from our first manufacturer tested (99.71%\nafter 3.06% rejections) using images captured under natural, weakly controlled\nconditions, such as in retail stores, customs checkpoints, warehouses, and\noutdoors. Our system, suitably transfer trained on a small number of fake and\ngenuine articles, should find application in additional product categories as\nwell, for example fashion accessories, perfume boxes, medicines, and more.\n","authors":["Hugo Garcia-Cotte","Dorra Mellouli","Abdul Rehman","Li Wang","David G. Stork"],"pdf_url":"https://arxiv.org/pdf/2410.05969v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17537v3","updated":"2024-11-06T15:56:04Z","published":"2024-05-27T17:57:48Z","title":"CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale","summary":" Measuring biodiversity is crucial for understanding ecosystem health. While\nprior works have developed machine learning models for taxonomic classification\nof photographic images and DNA separately, in this work, we introduce a\nmultimodal approach combining both, using CLIP-style contrastive learning to\nalign images, barcode DNA, and text-based representations of taxonomic labels\nin a unified embedding space. This allows for accurate classification of both\nknown and unknown insect species without task-specific fine-tuning, leveraging\ncontrastive learning for the first time to fuse DNA and image data. Our method\nsurpasses previous single-modality approaches in accuracy by over 8% on\nzero-shot learning tasks, showcasing its effectiveness in biodiversity studies.\n","authors":["ZeMing Gong","Austin T. Wang","Xiaoliang Huo","Joakim Bruslund Haurum","Scott C. Lowe","Graham W. Taylor","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2405.17537v3.pdf","comment":"25 pages with 11 figures"},{"id":"http://arxiv.org/abs/2411.04008v1","updated":"2024-11-06T15:47:18Z","published":"2024-11-06T15:47:18Z","title":"Aligning Characteristic Descriptors with Images for Human-Expert-like\n Explainability","summary":" In mission-critical domains such as law enforcement and medical diagnosis,\nthe ability to explain and interpret the outputs of deep learning models is\ncrucial for ensuring user trust and supporting informed decision-making.\nDespite advancements in explainability, existing methods often fall short in\nproviding explanations that mirror the depth and clarity of those given by\nhuman experts. Such expert-level explanations are essential for the dependable\napplication of deep learning models in law enforcement and medical contexts.\nAdditionally, we recognize that most explanations in real-world scenarios are\ncommunicated primarily through natural language. Addressing these needs, we\npropose a novel approach that utilizes characteristic descriptors to explain\nmodel decisions by identifying their presence in images, thereby generating\nexpert-like explanations. Our method incorporates a concept bottleneck layer\nwithin the model architecture, which calculates the similarity between image\nand descriptor encodings to deliver inherent and faithful explanations. Through\nexperiments in face recognition and chest X-ray diagnosis, we demonstrate that\nour approach offers a significant contrast over existing techniques, which are\noften limited to the use of saliency maps. 
We believe our approach represents a\nsignificant step toward making deep learning systems more accountable,\ntransparent, and trustworthy in the critical domains of face recognition and\nmedical diagnosis.\n","authors":["Bharat Chandra Yalavarthi","Nalini Ratha"],"pdf_url":"https://arxiv.org/pdf/2411.04008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04004v1","updated":"2024-11-06T15:43:51Z","published":"2024-11-06T15:43:51Z","title":"Synomaly Noise and Multi-Stage Diffusion: A Novel Approach for\n Unsupervised Anomaly Detection in Ultrasound Imaging","summary":" Ultrasound (US) imaging is widely used in routine clinical practice due to\nits advantages of being radiation-free, cost-effective, and portable. However,\nthe low reproducibility and quality of US images, combined with the scarcity of\nexpert-level annotation, make the training of fully supervised segmentation\nmodels challenging. To address these issues, we propose a novel unsupervised\nanomaly detection framework based on a diffusion model that incorporates a\nsynthetic anomaly (Synomaly) noise function and a multi-stage diffusion\nprocess. Synomaly noise introduces synthetic anomalies into healthy images\nduring training, allowing the model to effectively learn anomaly removal. The\nmulti-stage diffusion process is introduced to progressively denoise images,\npreserving fine details while improving the quality of anomaly-free\nreconstructions. The generated high-fidelity counterfactual healthy images can\nfurther enhance the interpretability of the segmentation models, as well as\nprovide a reliable baseline for evaluating the extent of anomalies and\nsupporting clinical decision-making. Notably, the unsupervised anomaly\ndetection model is trained purely on healthy images, eliminating the need for\nanomalous training samples and pixel-level annotations. We validate the\nproposed approach on carotid US, brain MRI, and liver CT datasets. The\nexperimental results demonstrate that the proposed framework outperforms\nexisting state-of-the-art unsupervised anomaly detection methods, achieving\nperformance comparable to fully supervised segmentation models in the US\ndataset. Additionally, ablation studies underline the importance of\nhyperparameter selection for Synomaly noise and the effectiveness of the\nmulti-stage diffusion process in enhancing model performance.\n","authors":["Yuan Bi","Lucie Huang","Ricarda Clarenbach","Reza Ghotbi","Angelos Karlas","Nassir Navab","Zhongliang Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.04004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03993v1","updated":"2024-11-06T15:34:57Z","published":"2024-11-06T15:34:57Z","title":"Local vs distributed representations: What is the right basis for\n interpretability?","summary":" Much of the research on the interpretability of deep neural networks has\nfocused on studying the visual features that maximally activate individual\nneurons. However, recent work has cast doubts on the usefulness of such local\nrepresentations for understanding the behavior of deep neural networks because\nindividual neurons tend to respond to multiple unrelated visual patterns, a\nphenomenon referred to as \"superposition\". A promising alternative to\ndisentangle these complex patterns is learning sparsely distributed vector\nrepresentations from entire network layers, as the resulting basis vectors\nseemingly encode single identifiable visual patterns consistently. 
Thus, one\nwould expect the resulting code to align better with human perceivable visual\npatterns, but supporting evidence remains, at best, anecdotal. To fill this\ngap, we conducted three large-scale psychophysics experiments collected from a\npool of 560 participants. Our findings provide (i) strong evidence that\nfeatures obtained from sparse distributed representations are easier to\ninterpret by human observers and (ii) that this effect is more pronounced in\nthe deepest layers of a neural network. Complementary analyses also reveal that\n(iii) features derived from sparse distributed representations contribute more\nto the model's decision. Overall, our results highlight that distributed\nrepresentations constitute a superior basis for interpretability, underscoring\na need for the field to move beyond the interpretation of local neural codes in\nfavor of sparsely distributed ones.\n","authors":["Julien Colin","Lore Goetschalckx","Thomas Fel","Victor Boutin","Jay Gopal","Thomas Serre","Nuria Oliver"],"pdf_url":"https://arxiv.org/pdf/2411.03993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03990v1","updated":"2024-11-06T15:30:42Z","published":"2024-11-06T15:30:42Z","title":"ET-SEED: Efficient Trajectory-Level SE(3) Equivariant Diffusion Policy","summary":" Imitation learning, e.g., diffusion policy, has been proven effective in\nvarious robotic manipulation tasks. However, extensive demonstrations are\nrequired for policy robustness and generalization. To reduce the demonstration\nreliance, we leverage spatial symmetry and propose ET-SEED, an efficient\ntrajectory-level SE(3) equivariant diffusion model for generating action\nsequences in complex robot manipulation tasks. Further, previous equivariant\ndiffusion models require the per-step equivariance in the Markov process,\nmaking it difficult to learn policy under such strong constraints. We\ntheoretically extend equivariant Markov kernels and simplify the condition of\nequivariant diffusion process, thereby significantly improving training\nefficiency for trajectory-level SE(3) equivariant diffusion policy in an\nend-to-end manner. We evaluate ET-SEED on representative robotic manipulation\ntasks, involving rigid body, articulated and deformable object. Experiments\ndemonstrate superior data efficiency and manipulation proficiency of our\nproposed method, as well as its ability to generalize to unseen configurations\nwith only a few demonstrations. Website: https://et-seed.github.io/\n","authors":["Chenrui Tie","Yue Chen","Ruihai Wu","Boxuan Dong","Zeyi Li","Chongkai Gao","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2411.03990v1.pdf","comment":"Accept to CoRL 2024 Workshop on X-Embodiment Robot Learning"},{"id":"http://arxiv.org/abs/2411.03982v1","updated":"2024-11-06T15:19:24Z","published":"2024-11-06T15:19:24Z","title":"ReEdit: Multimodal Exemplar-Based Image Editing with Diffusion Models","summary":" Modern Text-to-Image (T2I) Diffusion models have revolutionized image editing\nby enabling the generation of high-quality photorealistic images. While the de\nfacto method for performing edits with T2I models is through text instructions,\nthis approach non-trivial due to the complex many-to-many mapping between\nnatural language and images. In this work, we address exemplar-based image\nediting -- the task of transferring an edit from an exemplar pair to a content\nimage(s). 
We propose ReEdit, a modular and efficient end-to-end framework that\ncaptures edits in both text and image modalities while ensuring the fidelity of\nthe edited image. We validate the effectiveness of ReEdit through extensive\ncomparisons with state-of-the-art baselines and sensitivity analyses of key\ndesign choices. Our results demonstrate that ReEdit consistently outperforms\ncontemporary approaches both qualitatively and quantitatively. Additionally,\nReEdit boasts high practical applicability, as it does not require any\ntask-specific optimization and is four times faster than the next best\nbaseline.\n","authors":["Ashutosh Srivastava","Tarun Ram Menta","Abhinav Java","Avadhoot Jadhav","Silky Singh","Surgan Jandial","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2411.03982v1.pdf","comment":"First three authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2411.03976v1","updated":"2024-11-06T15:13:31Z","published":"2024-11-06T15:13:31Z","title":"HRDecoder: High-Resolution Decoder Network for Fundus Image Lesion\n Segmentation","summary":" High resolution is crucial for precise segmentation in fundus images, yet\nhandling high-resolution inputs incurs considerable GPU memory costs, with\ndiminishing performance gains as overhead increases. To address this issue\nwhile tackling the challenge of segmenting tiny objects, recent studies have\nexplored local-global fusion methods. These methods preserve fine details using\nlocal regions and capture long-range context information from downscaled global\nimages. However, the necessity of multiple forward passes inevitably incurs\nsignificant computational overhead, adversely affecting inference speed. In\nthis paper, we propose HRDecoder, a simple High-Resolution Decoder network for\nfundus lesion segmentation. It integrates a high-resolution representation\nlearning module to capture fine-grained local features and a high-resolution\nfusion module to fuse multi-scale predictions. Our method effectively improves\nthe overall segmentation accuracy of fundus lesions while consuming reasonable\nmemory and computational overhead, and maintaining satisfying inference speed.\nExperimental results on the IDRID and DDR datasets demonstrate the\neffectiveness of our method. Code is available at\nhttps://github.com/CVIU-CSU/HRDecoder.\n","authors":["Ziyuan Ding","Yixiong Liang","Shichao Kan","Qing Liu"],"pdf_url":"https://arxiv.org/pdf/2411.03976v1.pdf","comment":"11 pages, 3 figures, accepted by MICCAI 2024, the revised version"},{"id":"http://arxiv.org/abs/2407.17952v2","updated":"2024-11-06T14:58:17Z","published":"2024-07-25T11:16:37Z","title":"BetterDepth: Plug-and-Play Diffusion Refiner for Zero-Shot Monocular\n Depth Estimation","summary":" By training over large-scale datasets, zero-shot monocular depth estimation\n(MDE) methods show robust performance in the wild but often suffer from\ninsufficient detail. Although recent diffusion-based MDE approaches exhibit a\nsuperior ability to extract details, they struggle in geometrically complex\nscenes that challenge their geometry prior, trained on less diverse 3D data. To\nleverage the complementary merits of both worlds, we propose BetterDepth to\nachieve geometrically correct affine-invariant MDE while capturing fine\ndetails. 
Specifically, BetterDepth is a conditional diffusion-based refiner\nthat takes the prediction from pre-trained MDE models as depth conditioning, in\nwhich the global depth layout is well-captured, and iteratively refines details\nbased on the input image. For the training of such a refiner, we propose global\npre-alignment and local patch masking methods to ensure BetterDepth remains\nfaithful to the depth conditioning while learning to add fine-grained scene\ndetails. With efficient training on small-scale synthetic datasets, BetterDepth\nachieves state-of-the-art zero-shot MDE performance on diverse public datasets\nand on in-the-wild scenes. Moreover, BetterDepth can improve the performance of\nother MDE models in a plug-and-play manner without further re-training.\n","authors":["Xiang Zhang","Bingxin Ke","Hayko Riemenschneider","Nando Metzger","Anton Obukhov","Markus Gross","Konrad Schindler","Christopher Schroers"],"pdf_url":"https://arxiv.org/pdf/2407.17952v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.00738v3","updated":"2024-11-06T14:45:58Z","published":"2024-08-01T17:35:58Z","title":"Virchow2: Scaling Self-Supervised Mixed Magnification Models in\n Pathology","summary":" Foundation models are rapidly being developed for computational pathology\napplications. However, it remains an open question which factors are most\nimportant for downstream performance with data scale and diversity, model size,\nand training algorithm all playing a role. In this work, we propose algorithmic\nmodifications, tailored for pathology, and we present the result of scaling\nboth data and model size, surpassing previous studies in both dimensions. We\nintroduce three new models: Virchow2, a 632 million parameter vision\ntransformer, Virchow2G, a 1.9 billion parameter vision transformer, and\nVirchow2G Mini, a 22 million parameter distillation of Virchow2G, each trained\nwith 3.1 million histopathology whole slide images, with diverse tissues,\noriginating institutions, and stains. We achieve state of the art performance\non 12 tile-level tasks, as compared to the top performing competing models. Our\nresults suggest that data diversity and domain-specific methods can outperform\nmodels that only scale in the number of parameters, but, on average,\nperformance benefits from the combination of domain-specific methods, data\nscale, and model scale.\n","authors":["Eric Zimmermann","Eugene Vorontsov","Julian Viret","Adam Casson","Michal Zelechowski","George Shaikovski","Neil Tenenholtz","James Hall","David Klimstra","Razik Yousfi","Thomas Fuchs","Nicolo Fusi","Siqi Liu","Kristen Severson"],"pdf_url":"https://arxiv.org/pdf/2408.00738v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03960v1","updated":"2024-11-06T14:45:41Z","published":"2024-11-06T14:45:41Z","title":"Face Reconstruction from Face Embeddings using Adapter to a Face\n Foundation Model","summary":" Face recognition systems extract embedding vectors from face images and use\nthese embeddings to verify or identify individuals. Face reconstruction attack\n(also known as template inversion) refers to reconstructing face images from\nface embeddings and using the reconstructed face image to enter a face\nrecognition system. In this paper, we propose to use a face foundation model to\nreconstruct face images from the embeddings of a blackbox face recognition\nmodel. The foundation model is trained with 42M images to generate face images\nfrom the facial embeddings of a fixed face recognition model. 
We propose to use\nan adapter to translate target embeddings into the embedding space of the\nfoundation model. The generated images are evaluated on different face\nrecognition models and different datasets, demonstrating the effectiveness of\nour method to translate embeddings of different face recognition models. We\nalso evaluate the transferability of reconstructed face images when attacking\ndifferent face recognition models. Our experimental results show that our\nreconstructed face images outperform previous reconstruction attacks against\nface recognition models.\n","authors":["Hatef Otroshi Shahreza","Anjith George","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2411.03960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03959v1","updated":"2024-11-06T14:45:16Z","published":"2024-11-06T14:45:16Z","title":"Energy Score-based Pseudo-Label Filtering and Adaptive Loss for\n Imbalanced Semi-supervised SAR target recognition","summary":" Automatic target recognition (ATR) is an important use case for synthetic\naperture radar (SAR) image interpretation. Recent years have seen significant\nadvancements in SAR ATR technology based on semi-supervised learning. However,\nexisting semi-supervised SAR ATR algorithms show low recognition accuracy in\nthe case of class imbalance. This work offers a non-balanced semi-supervised\nSAR target recognition approach using dynamic energy scores and adaptive loss.\nFirst, an energy score-based method is developed to dynamically select\nunlabeled samples near to the training distribution as pseudo-labels during\ntraining, assuring pseudo-label reliability in long-tailed distribution\ncircumstances. Secondly, loss functions suitable for class imbalances are\nproposed, including adaptive margin perception loss and adaptive hard triplet\nloss, the former offsets inter-class confusion of classifiers, alleviating the\nimbalance issue inherent in pseudo-label generation. The latter effectively\ntackles the model's preference for the majority class by focusing on complex\ndifficult samples during training. Experimental results on extremely imbalanced\nSAR datasets demonstrate that the proposed method performs well under the dual\nconstraints of scarce labels and data imbalance, effectively overcoming the\nmodel bias caused by data imbalance and achieving high-precision target\nrecognition.\n","authors":["Xinzheng Zhang","Yuqing Luo","Guopeng Li"],"pdf_url":"https://arxiv.org/pdf/2411.03959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07724v2","updated":"2024-11-06T14:29:36Z","published":"2024-04-11T13:16:47Z","title":"Applying Guidance in a Limited Interval Improves Sample and Distribution\n Quality in Diffusion Models","summary":" Guidance is a crucial technique for extracting the best performance out of\nimage-generating diffusion models. Traditionally, a constant guidance weight\nhas been applied throughout the sampling chain of an image. We show that\nguidance is clearly harmful toward the beginning of the chain (high noise\nlevels), largely unnecessary toward the end (low noise levels), and only\nbeneficial in the middle. We thus restrict it to a specific range of noise\nlevels, improving both the inference speed and result quality. This limited\nguidance interval improves the record FID in ImageNet-512 significantly, from\n1.81 to 1.40. We show that it is quantitatively and qualitatively beneficial\nacross different sampler parameters, network architectures, and datasets,\nincluding the large-scale setting of Stable Diffusion XL. 
We thus suggest\nexposing the guidance interval as a hyperparameter in all diffusion models that\nuse guidance.\n","authors":["Tuomas Kynkäänniemi","Miika Aittala","Tero Karras","Samuli Laine","Timo Aila","Jaakko Lehtinen"],"pdf_url":"https://arxiv.org/pdf/2404.07724v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.09786v2","updated":"2024-11-06T14:22:28Z","published":"2024-07-13T06:53:39Z","title":"Self-supervised 3D Point Cloud Completion via Multi-view Adversarial\n Learning","summary":" In real-world scenarios, scanned point clouds are often incomplete due to\nocclusion issues. The task of self-supervised point cloud completion involves\nreconstructing missing regions of these incomplete objects without the\nsupervision of complete ground truth. Current self-supervised methods either\nrely on multiple views of partial observations for supervision or overlook the\nintrinsic geometric similarity that can be identified and utilized from the\ngiven partial point clouds. In this paper, we propose MAL-SPC, a framework that\neffectively leverages both object-level and category-specific geometric\nsimilarities to complete missing structures. Our MAL-SPC does not require any\n3D complete supervision and only necessitates a single partial point cloud for\neach object. Specifically, we first introduce a Pattern Retrieval Network to\nretrieve similar position and curvature patterns between the partial input and\nthe predicted shape, then leverage these similarities to densify and refine the\nreconstructed results. Additionally, we render the reconstructed complete shape\ninto multi-view depth maps and design an adversarial learning module to learn\nthe geometry of the target shape from category-specific single-view depth\nimages. To achieve anisotropic rendering, we design a density-aware radius\nestimation algorithm to improve the quality of the rendered images. Our MAL-SPC\nyields the best results compared to current state-of-the-art methods.We will\nmake the source code publicly available at \\url{https://github.com/ltwu6/malspc\n","authors":["Lintai Wu","Xianjing Cheng","Yong Xu","Huanqiang Zeng","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2407.09786v2.pdf","comment":"14 pages,10 figures"},{"id":"http://arxiv.org/abs/2411.03926v1","updated":"2024-11-06T13:57:53Z","published":"2024-11-06T13:57:53Z","title":"Act in Collusion: A Persistent Distributed Multi-Target Backdoor in\n Federated Learning","summary":" Federated learning, a novel paradigm designed to protect data privacy, is\nvulnerable to backdoor attacks due to its distributed nature. Current research\noften designs attacks based on a single attacker with a single backdoor,\noverlooking more realistic and complex threats in federated learning. We\npropose a more practical threat model for federated learning: the distributed\nmulti-target backdoor. In this model, multiple attackers control different\nclients, embedding various triggers and targeting different classes,\ncollaboratively implanting backdoors into the global model via central\naggregation. Empirical validation shows that existing methods struggle to\nmaintain the effectiveness of multiple backdoors in the global model. Our key\ninsight is that similar backdoor triggers cause parameter conflicts and\ninjecting new backdoors disrupts gradient directions, significantly weakening\nsome backdoors performance. 
To solve this, we propose a Distributed\nMulti-Target Backdoor Attack (DMBA), ensuring efficiency and persistence of\nbackdoors from different malicious clients. To avoid parameter conflicts, we\ndesign a multi-channel dispersed frequency trigger strategy to maximize trigger\ndifferences. To mitigate gradient interference, we introduce backdoor replay in\nlocal training to neutralize conflicting gradients. Extensive validation shows\nthat 30 rounds after the attack, Attack Success Rates of three different\nbackdoors from various clients remain above 93%. The code will be made publicly\navailable after the review period.\n","authors":["Tao Liu","Wu Yang","Chen Xu","Jiguang Lv","Huanran Wang","Yuhang Zhang","Shuchun Xu","Dapeng Man"],"pdf_url":"https://arxiv.org/pdf/2411.03926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07001v4","updated":"2024-11-06T13:56:28Z","published":"2024-05-11T12:33:46Z","title":"ChartInsights: Evaluating Multimodal Large Language Models for Low-Level\n Chart Question Answering","summary":" Chart question answering (ChartQA) tasks play a critical role in interpreting\nand extracting insights from visualization charts. While recent advancements in\nmultimodal large language models (MLLMs) like GPT-4o have shown promise in\nhigh-level ChartQA tasks, such as chart captioning, their effectiveness in\nlow-level ChartQA tasks (e.g., identifying correlations) remains underexplored.\nIn this paper, we address this gap by evaluating MLLMs on low-level ChartQA\nusing a newly curated dataset, ChartInsights, which consists of 22,347 (chart,\ntask, query, answer) covering 10 data analysis tasks across 7 chart types. We\nsystematically evaluate 19 advanced MLLMs, including 12 open-source and 7\nclosed-source models. The average accuracy rate across these models is 39.8%,\nwith GPT-4o achieving the highest accuracy at 69.17%. To further explore the\nlimitations of MLLMs in low-level ChartQA, we conduct experiments that alter\nvisual elements of charts (e.g., changing color schemes, adding image noise) to\nassess their impact on the task effectiveness. Furthermore, we propose a new\ntextual prompt strategy, Chain-of-Charts, tailored for low-level ChartQA tasks,\nwhich boosts performance by 14.41%, achieving an accuracy of 83.58%. Finally,\nincorporating a visual prompt strategy that directs attention to relevant\nvisual elements further improves accuracy to 84.32%.\n","authors":["Yifan Wu","Lutao Yan","Leixian Shen","Yunhai Wang","Nan Tang","Yuyu Luo"],"pdf_url":"https://arxiv.org/pdf/2405.07001v4.pdf","comment":"EMNLP 2024 Conference Paper"},{"id":"http://arxiv.org/abs/2411.03924v1","updated":"2024-11-06T13:54:26Z","published":"2024-11-06T13:54:26Z","title":"Self-supervised Representation Learning for Cell Event Recognition\n through Time Arrow Prediction","summary":" The spatio-temporal nature of live-cell microscopy data poses challenges in\nthe analysis of cell states which is fundamental in bioimaging. Deep-learning\nbased segmentation or tracking methods rely on large amount of high quality\nannotations to work effectively. In this work, we explore an alternative\nsolution: using feature maps obtained from self-supervised representation\nlearning (SSRL) on time arrow prediction (TAP) for the downstream supervised\ntask of cell event recognition. We demonstrate through extensive experiments\nand analysis that this approach can achieve better performance with limited\nannotation compared to models trained from end to end using fully supervised\napproach. 
Our analysis also provides insight into applications of the SSRL\nusing TAP in live-cell microscopy.\n","authors":["Cangxiong Chen","Vinay P. Namboodiri","Julia E. Sero"],"pdf_url":"https://arxiv.org/pdf/2411.03924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06629v4","updated":"2024-11-06T13:29:57Z","published":"2023-10-10T13:48:18Z","title":"EViT: An Eagle Vision Transformer with Bi-Fovea Self-Attention","summary":" Owing to advancements in deep learning technology, Vision Transformers (ViTs)\nhave demonstrated impressive performance in various computer vision tasks.\nNonetheless, ViTs still face some challenges, such as high computational\ncomplexity and the absence of desirable inductive biases. To alleviate these\nissues, {the potential advantages of combining eagle vision with ViTs are\nexplored. We summarize a Bi-Fovea Visual Interaction (BFVI) structure inspired\nby the unique physiological and visual characteristics of eagle eyes. A novel\nBi-Fovea Self-Attention (BFSA) mechanism and Bi-Fovea Feedforward Network\n(BFFN) are proposed based on this structural design approach, which can be used\nto mimic the hierarchical and parallel information processing scheme of the\nbiological visual cortex, enabling networks to learn feature representations of\ntargets in a coarse-to-fine manner. Furthermore, a Bionic Eagle Vision (BEV)\nblock is designed as the basic building unit based on the BFSA mechanism and\nBFFN. By stacking BEV blocks, a unified and efficient family of pyramid\nbackbone networks called Eagle Vision Transformers (EViTs) is developed.\nExperimental results show that EViTs exhibit highly competitive performance in\nvarious computer vision tasks, such as image classification, object detection\nand semantic segmentation. Compared with other approaches, EViTs have\nsignificant advantages, especially in terms of performance and computational\nefficiency. Code is available at https://github.com/nkusyl/EViT\n","authors":["Yulong Shi","Mingwei Sun","Yongshuai Wang","Jiahao Ma","Zengqiang Chen"],"pdf_url":"https://arxiv.org/pdf/2310.06629v4.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.00393v3","updated":"2024-11-06T13:25:42Z","published":"2024-11-01T06:40:47Z","title":"Advantages of Neural Population Coding for Deep Learning","summary":" Scalar variables, e.g., the orientation of a shape in an image, are commonly\npredicted using a single output neuron in a neural network. In contrast, the\nmammalian cortex represents variables with a population of neurons. In this\npopulation code, each neuron is most active at its preferred value and shows\npartial activity for other values. Here, we investigate the benefit of using a\npopulation code for the output layer of a neural network. We compare population\ncodes against single-neuron outputs and one-hot vectors. First, we show\ntheoretically and in experiments with synthetic data that population codes\nimprove robustness to input noise in networks of stacked linear layers. Second,\nwe demonstrate the benefit of using population codes to encode ambiguous\noutputs, such as the pose of symmetric objects. 
Using the T-LESS dataset of\nfeature-less real-world objects, we show that population codes improve the\naccuracy of predicting 3D object orientation from image input.\n","authors":["Heiko Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2411.00393v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03055v2","updated":"2024-11-06T13:24:10Z","published":"2024-11-05T12:42:42Z","title":"ATM: Improving Model Merging by Alternating Tuning and Merging","summary":" Model merging has recently emerged as a cost-efficient paradigm for\nmulti-task learning. Among current approaches, task arithmetic stands out for\nits simplicity and effectiveness. In this paper, we motivate the effectiveness\nof task vectors by linking them to multi-task gradients. We show that in a\nsingle-epoch scenario, task vectors are mathematically equivalent to the\ngradients obtained via gradient descent in a multi-task setting, and still\napproximate these gradients in subsequent epochs. Furthermore, we show that\ntask vectors perform optimally when equality is maintained, and their\neffectiveness is largely driven by the first epoch's gradient. Building on this\ninsight, we propose viewing model merging as a single step in an iterative\nprocess that Alternates between Tuning and Merging (ATM). This method acts as a\nbridge between model merging and multi-task gradient descent, achieving\nstate-of-the-art results with the same data and computational requirements. We\nextensively evaluate ATM across diverse settings, achieving up to 20% higher\naccuracy in computer vision and NLP tasks, compared to the best baselines.\nFinally, we provide both empirical and theoretical support for its\neffectiveness, demonstrating increased orthogonality between task vectors and\nproving that ATM minimizes an upper bound on the loss obtained by jointly\nfinetuning all tasks.\n","authors":["Luca Zhou","Daniele Solombrino","Donato Crisostomi","Maria Sofia Bucarelli","Fabrizio Silvestri","Emanuele Rodolà"],"pdf_url":"https://arxiv.org/pdf/2411.03055v2.pdf","comment":"Main paper: 10 Pages, 11 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.08240v3","updated":"2024-11-06T13:03:20Z","published":"2024-09-12T17:39:23Z","title":"IFAdapter: Instance Feature Control for Grounded Text-to-Image\n Generation","summary":" While Text-to-Image (T2I) diffusion models excel at generating visually\nappealing images of individual instances, they struggle to accurately position\nand control the features generation of multiple instances. The Layout-to-Image\n(L2I) task was introduced to address the positioning challenges by\nincorporating bounding boxes as spatial control signals, but it still falls\nshort in generating precise instance features. In response, we propose the\nInstance Feature Generation (IFG) task, which aims to ensure both positional\naccuracy and feature fidelity in generated instances. To address the IFG task,\nwe introduce the Instance Feature Adapter (IFAdapter). The IFAdapter enhances\nfeature depiction by incorporating additional appearance tokens and utilizing\nan Instance Semantic Map to align instance-level features with spatial\nlocations. The IFAdapter guides the diffusion process as a plug-and-play\nmodule, making it adaptable to various community models. For evaluation, we\ncontribute an IFG benchmark and develop a verification pipeline to objectively\ncompare models' abilities to generate instances with accurate positioning and\nfeatures. 
Experimental results demonstrate that IFAdapter outperforms other\nmodels in both quantitative and qualitative evaluations.\n","authors":["Yinwei Wu","Xianpan Zhou","Bing Ma","Xuefeng Su","Kai Ma","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08240v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01853v2","updated":"2024-11-06T12:27:27Z","published":"2024-11-04T07:07:31Z","title":"GVKF: Gaussian Voxel Kernel Functions for Highly Efficient Surface\n Reconstruction in Open Scenes","summary":" In this paper we present a novel method for efficient and effective 3D\nsurface reconstruction in open scenes. Existing Neural Radiance Fields (NeRF)\nbased works typically require extensive training and rendering time due to the\nadopted implicit representations. In contrast, 3D Gaussian splatting (3DGS)\nuses an explicit and discrete representation, hence the reconstructed surface\nis built by the huge number of Gaussian primitives, which leads to excessive\nmemory consumption and rough surface details in sparse Gaussian areas. To\naddress these issues, we propose Gaussian Voxel Kernel Functions (GVKF), which\nestablish a continuous scene representation based on discrete 3DGS through\nkernel regression. The GVKF integrates fast 3DGS rasterization and highly\neffective scene implicit representations, achieving high-fidelity open scene\nsurface reconstruction. Experiments on challenging scene datasets demonstrate\nthe efficiency and effectiveness of our proposed GVKF, featuring with high\nreconstruction quality, real-time rendering speed, significant savings in\nstorage and training memory consumption.\n","authors":["Gaochao Song","Chong Cheng","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01853v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03862v1","updated":"2024-11-06T12:14:23Z","published":"2024-11-06T12:14:23Z","title":"ROBIN: Robust and Invisible Watermarks for Diffusion Models with\n Adversarial Optimization","summary":" Watermarking generative content serves as a vital tool for authentication,\nownership protection, and mitigation of potential misuse. Existing watermarking\nmethods face the challenge of balancing robustness and concealment. They\nempirically inject a watermark that is both invisible and robust and passively\nachieve concealment by limiting the strength of the watermark, thus reducing\nthe robustness. In this paper, we propose to explicitly introduce a watermark\nhiding process to actively achieve concealment, thus allowing the embedding of\nstronger watermarks. To be specific, we implant a robust watermark in an\nintermediate diffusion state and then guide the model to hide the watermark in\nthe final generated image. We employ an adversarial optimization algorithm to\nproduce the optimal hiding prompt guiding signal for each watermark. The prompt\nembedding is optimized to minimize artifacts in the generated image, while the\nwatermark is optimized to achieve maximum strength. The watermark can be\nverified by reversing the generation process. 
Experiments on various diffusion\nmodels demonstrate the watermark remains verifiable even under significant\nimage tampering and shows superior invisibility compared to other\nstate-of-the-art robust watermarking methods.\n","authors":["Huayang Huang","Yu Wu","Qian Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03862v1.pdf","comment":"Accept to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03861v1","updated":"2024-11-06T12:14:11Z","published":"2024-11-06T12:14:11Z","title":"FedRISE: Rating Induced Sign Election of Gradients for Byzantine\n Tolerant Federated Aggregation","summary":" One of the most common defense strategies against model poisoning in\nfederated learning is to employ a robust aggregator mechanism that makes the\ntraining more resilient. Many of the existing Byzantine robust aggregators\nprovide theoretical guarantees and are empirically effective against certain\ncategories of attacks. However, we observe that certain high-strength attacks\ncan subvert the aggregator and collapse the training. In addition, most\naggregators require identifying tolerant settings to converge. Impact of\nattacks becomes more pronounced when the number of Byzantines is near-majority,\nand becomes harder to evade if the attacker is omniscient with access to data,\nhonest updates and aggregation methods. Motivated by these observations, we\ndevelop a robust aggregator called FedRISE for cross-silo FL that is consistent\nand less susceptible to poisoning updates by an omniscient attacker. The\nproposed method explicitly determines the optimal direction of each gradient\nthrough a sign-voting strategy that uses variance-reduced sparse gradients. We\nargue that vote weighting based on the cosine similarity of raw gradients is\nmisleading, and we introduce a sign-based gradient valuation function that\nignores the gradient magnitude. We compare our method against 8 robust\naggregators under 6 poisoning attacks on 3 datasets and architectures. Our\nresults show that existing robust aggregators collapse for at least some\nattacks under severe settings, while FedRISE demonstrates better robustness\nbecause of a stringent gradient inclusion formulation.\n","authors":["Joseph Geo Benjamin","Mothilal Asokan","Mohammad Yaqub","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2411.03861v1.pdf","comment":"This is a work under submission/review process"},{"id":"http://arxiv.org/abs/2408.13800v3","updated":"2024-11-06T12:10:54Z","published":"2024-08-25T10:42:07Z","title":"BCDNet: A Fast Residual Neural Network For Invasive Ductal Carcinoma\n Detection","summary":" It is of great significance to diagnose Invasive Ductal Carcinoma (IDC) in\nearly stage, which is the most common subtype of breast cancer. Although the\npowerful models in the Computer-Aided Diagnosis (CAD) systems provide promising\nresults, it is still difficult to integrate them into other medical devices or\nuse them without sufficient computation resource. In this paper, we propose\nBCDNet, which firstly upsamples the input image by the residual block and use\nsmaller convolutional block and a special MLP to learn features. 
BCDNet is\nproofed to effectively detect IDC in histopathological RGB images with an\naverage accuracy of 91.6% and reduce training consumption effectively compared\nto ResNet 50 and ViT-B-16.\n","authors":["Yujia Lin","Aiwei Lian","Mingyu Liao","Shuangjie Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.13800v3.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.03313v2","updated":"2024-11-06T12:07:08Z","published":"2024-11-05T18:58:15Z","title":"Classification Done Right for Vision-Language Pre-Training","summary":" We introduce SuperClass, a super simple classification method for\nvision-language pre-training on image-text data. Unlike its contrastive\ncounterpart CLIP who contrast with a text encoder, SuperClass directly utilizes\ntokenized raw text as supervised classification labels, without the need for\nadditional text filtering or selection. Due to the absence of the text encoding\nas contrastive target, SuperClass does not require a text encoder and does not\nneed to maintain a large batch size as CLIP does. SuperClass demonstrated\nsuperior performance on various downstream tasks, including classic computer\nvision benchmarks and vision language downstream tasks. We further explored the\nscaling behavior of SuperClass on model size, training length, or data size,\nand reported encouraging results and comparisons to CLIP.\nhttps://github.com/x-cls/superclass\n","authors":["Zilong Huang","Qinghao Ye","Bingyi Kang","Jiashi Feng","Haoqi Fan"],"pdf_url":"https://arxiv.org/pdf/2411.03313v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2404.05997v2","updated":"2024-11-06T12:06:03Z","published":"2024-04-09T04:04:50Z","title":"Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis","summary":" The black-box nature of deep learning models has raised concerns about their\ninterpretability for successful deployment in real-world clinical applications.\nTo address the concerns, eXplainable Artificial Intelligence (XAI) aims to\nprovide clear and understandable explanations of the decision-making process.\nIn the medical domain, concepts such as attributes of lesions or abnormalities\nserve as key evidence for deriving diagnostic results. Existing concept-based\nmodels mainly depend on concepts that appear independently and require\nfine-grained concept annotations such as bounding boxes. However, a medical\nimage usually contains multiple concepts, and the fine-grained concept\nannotations are difficult to acquire. In this paper, we aim to interpret\nrepresentations in deep neural networks by aligning the axes of the latent\nspace with known concepts of interest. We propose a novel Concept-Attention\nWhitening (CAW) framework for interpretable skin lesion diagnosis. CAW is\ncomprised of a disease diagnosis branch and a concept alignment branch. In the\nformer branch, we train a convolutional neural network (CNN) with an inserted\nCAW layer to perform skin lesion diagnosis. The CAW layer decorrelates features\nand aligns image features to conceptual meanings via an orthogonal matrix. In\nthe latter branch, the orthogonal matrix is calculated under the guidance of\nthe concept attention mask. We particularly introduce a weakly-supervised\nconcept mask generator that only leverages coarse concept labels for filtering\nlocal regions that are relevant to certain concepts, improving the optimization\nof the orthogonal matrix. 
Extensive experiments on two public skin lesion\ndiagnosis datasets demonstrated that CAW not only enhanced interpretability but\nalso maintained a state-of-the-art diagnostic performance.\n","authors":["Junlin Hou","Jilan Xu","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05997v2.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2410.11666v3","updated":"2024-11-06T12:00:44Z","published":"2024-10-15T14:53:07Z","title":"Degradation Oriented and Regularized Network for Blind Depth\n Super-Resolution","summary":" Recent RGB-guided depth super-resolution methods have achieved impressive\nperformance under the assumption of fixed and known degradation (e.g., bicubic\ndownsampling). However, in real-world scenarios, captured depth data often\nsuffer from unconventional and unknown degradation due to sensor limitations\nand complex imaging environments (e.g., low reflective surfaces, varying\nillumination). Consequently, the performance of these methods significantly\ndeclines when real-world degradation deviate from their assumptions. In this\npaper, we propose the Degradation Oriented and Regularized Network (DORNet), a\nnovel framework designed to adaptively address unknown degradation in\nreal-world scenes through implicit degradation representations. Our approach\nbegins with the development of a self-supervised degradation learning strategy,\nwhich models the degradation representations of low-resolution depth data using\nrouting selection-based degradation regularization. To facilitate effective\nRGB-D fusion, we further introduce a degradation-oriented feature\ntransformation module that selectively propagates RGB content into the depth\ndata based on the learned degradation priors. Extensive experimental results on\nboth real and synthetic datasets demonstrate the superiority of our DORNet in\nhandling unknown degradation, outperforming existing methods. The code is\navailable at https://github.com/yanzq95/DORNet.\n","authors":["Zhengxue Wang","Zhiqiang Yan","Jinshan Pan","Guangwei Gao","Kai Zhang","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2410.11666v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.03855v1","updated":"2024-11-06T11:57:55Z","published":"2024-11-06T11:57:55Z","title":"MambaPEFT: Exploring Parameter-Efficient Fine-Tuning for Mamba","summary":" An ecosystem of Transformer-based models has been established by building\nlarge models with extensive data. Parameter-efficient fine-tuning (PEFT) is a\ncrucial technology for deploying these models to downstream tasks with minimal\ncost while achieving effective performance. Recently, Mamba, a State Space\nModel (SSM)-based model, has attracted attention as a potential alternative to\nTransformers. While many large-scale Mamba-based models have been proposed,\nefficiently adapting pre-trained Mamba-based models to downstream tasks remains\nunexplored. In this paper, we conduct an exploratory analysis of PEFT methods\nfor Mamba. We investigate the effectiveness of existing PEFT methods for\nTransformers when applied to Mamba. We also modify these methods to better\nalign with the Mamba architecture. Additionally, we propose new Mamba-specific\nPEFT methods that leverage the distinctive structure of Mamba. Our experiments\nindicate that PEFT performs more effectively for Mamba than Transformers.\nLastly, we demonstrate how to effectively combine multiple PEFT methods and\nprovide a framework that outperforms previous works. 
To ensure reproducibility,\nwe will release the code after publication.\n","authors":["Masakazu Yoshimura","Teruaki Hayashi","Yota Maeda"],"pdf_url":"https://arxiv.org/pdf/2411.03855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11124v2","updated":"2024-11-06T11:40:50Z","published":"2024-01-20T05:31:47Z","title":"Cross-Task Affinity Learning for Multitask Dense Scene Predictions","summary":" Multitask learning (MTL) has become prominent for its ability to predict\nmultiple tasks jointly, achieving better per-task performance with fewer\nparameters than single-task learning. Recently, decoder-focused architectures\nhave significantly improved multitask performance by refining task predictions\nusing features from related tasks. However, most refinement methods struggle to\nefficiently capture both local and long-range dependencies between\ntask-specific representations and cross-task patterns. In this paper, we\nintroduce the Cross-Task Affinity Learning (CTAL) module, a lightweight\nframework that enhances task refinement in multitask networks. CTAL effectively\ncaptures local and long-range cross-task interactions by optimizing task\naffinity matrices for parameter-efficient grouped convolutions without concern\nfor information loss. Our results demonstrate state-of-the-art MTL performance\nfor both CNN and transformer backbones, using significantly fewer parameters\nthan single-task learning. Our code is publicly available at\nhttps://github.com/Armanfard-Lab/EMA-Net.\n","authors":["Dimitrios Sinodinos","Narges Armanfard"],"pdf_url":"https://arxiv.org/pdf/2401.11124v2.pdf","comment":"Accepted for publication at the IEEE Winter Conference on\n Applications of Computer Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2411.03835v1","updated":"2024-11-06T11:14:49Z","published":"2024-11-06T11:14:49Z","title":"An Edge Computing-Based Solution for Real-Time Leaf Disease\n Classification using Thermal Imaging","summary":" Deep learning (DL) technologies can transform agriculture by improving crop\nhealth monitoring and management, thus improving food safety. In this paper, we\nexplore the potential of edge computing for real-time classification of leaf\ndiseases using thermal imaging. We present a thermal image dataset for plant\ndisease classification and evaluate deep learning models, including\nInceptionV3, MobileNetV1, MobileNetV2, and VGG-16, on resource-constrained\ndevices like the Raspberry Pi 4B. Using pruning and quantization-aware\ntraining, these models achieve inference times up to 1.48x faster on Edge TPU\nMax for VGG16, and up to 2.13x faster with precision reduction on Intel NCS2\nfor MobileNetV1, compared to high-end GPUs like the RTX 3090, while maintaining\nstate-of-the-art accuracy.\n","authors":["Públio Elon Correa da Silva","Jurandy Almeida"],"pdf_url":"https://arxiv.org/pdf/2411.03835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03831v1","updated":"2024-11-06T11:03:34Z","published":"2024-11-06T11:03:34Z","title":"An Enhancement of Haar Cascade Algorithm Applied to Face Recognition for\n Gate Pass Security","summary":" This study is focused on enhancing the Haar Cascade Algorithm to decrease the\nfalse positive and false negative rate in face matching and face detection to\nincrease the accuracy rate even under challenging conditions. 
The face\nrecognition library was implemented with Haar Cascade Algorithm in which the\n128-dimensional vectors representing the unique features of a face are encoded.\nA subprocess was applied where the grayscale image from Haar Cascade was\nconverted to RGB to improve the face encoding. Logical process and face\nfiltering are also used to decrease non-face detection. The Enhanced Haar\nCascade Algorithm produced a 98.39% accuracy rate (21.39% increase), 63.59%\nprecision rate, 98.30% recall rate, and 72.23% in F1 Score. In comparison, the\nHaar Cascade Algorithm achieved a 46.70% to 77.00% accuracy rate, 44.15%\nprecision rate, 98.61% recall rate, and 47.01% in F1 Score. Both algorithms\nused the Confusion Matrix Test with 301,950 comparisons using the same dataset\nof 550 images. The 98.39% accuracy rate shows a significant decrease in false\npositive and false negative rates in facial recognition. Face matching and face\ndetection are more accurate in images with complex backgrounds, lighting\nvariations, and occlusions, or even those with similar attributes.\n","authors":["Clarence A. Antipona","Romeo R. Magsino","Raymund M. Dioses","Khatalyn E. Mata"],"pdf_url":"https://arxiv.org/pdf/2411.03831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03829v1","updated":"2024-11-06T11:03:02Z","published":"2024-11-06T11:03:02Z","title":"Generalize or Detect? Towards Robust Semantic Segmentation Under\n Multiple Distribution Shifts","summary":" In open-world scenarios, where both novel classes and domains may exist, an\nideal segmentation model should detect anomaly classes for safety and\ngeneralize to new domains. However, existing methods often struggle to\ndistinguish between domain-level and semantic-level distribution shifts,\nleading to poor out-of-distribution (OOD) detection or domain generalization\nperformance. In this work, we aim to equip the model to generalize effectively\nto covariate-shift regions while precisely identifying semantic-shift regions.\nTo achieve this, we design a novel generative augmentation method to produce\ncoherent images that incorporate both anomaly (or novel) objects and various\ncovariate shifts at both image and object levels. Furthermore, we introduce a\ntraining strategy that recalibrates uncertainty specifically for semantic\nshifts and enhances the feature extractor to align features associated with\ndomain shifts. We validate the effectiveness of our method across benchmarks\nfeaturing both semantic and domain shifts. Our method achieves state-of-the-art\nperformance across all benchmarks for both OOD detection and domain\ngeneralization. Code is available at\nhttps://github.com/gaozhitong/MultiShiftSeg.\n","authors":["Zhitong Gao","Bingnan Li","Mathieu Salzmann","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2411.03829v1.pdf","comment":"Published in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03823v1","updated":"2024-11-06T10:44:15Z","published":"2024-11-06T10:44:15Z","title":"Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM\n Data Contamination","summary":" The rapid progression of multimodal large language models (MLLMs) has\ndemonstrated superior performance on various multimodal benchmarks. However,\nthe issue of data contamination during training creates challenges in\nperformance evaluation and comparison. While numerous methods exist for\ndetecting dataset contamination in large language models (LLMs), they are less\neffective for MLLMs due to their various modalities and multiple training\nphases. 
In this study, we introduce a multimodal data contamination detection\nframework, MM-Detect, designed for MLLMs. Our experimental results indicate\nthat MM-Detect is sensitive to varying degrees of contamination and can\nhighlight significant performance improvements due to leakage of the training\nset of multimodal benchmarks. Furthermore, we explore the possibility of\ncontamination originating from the pre-training phase of LLMs used by MLLMs and\nthe fine-tuning phase of MLLMs, offering new insights into the stages at which\ncontamination may be introduced.\n","authors":["Dingjie Song","Sicheng Lai","Shunian Chen","Lichao Sun","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03819v1","updated":"2024-11-06T10:39:00Z","published":"2024-11-06T10:39:00Z","title":"SA3DIP: Segment Any 3D Instance with Potential 3D Priors","summary":" The proliferation of 2D foundation models has sparked research into adapting\nthem for open-world 3D instance segmentation. Recent methods introduce a\nparadigm that leverages superpoints as geometric primitives and incorporates 2D\nmulti-view masks from the Segment Anything Model (SAM) as merging guidance,\nachieving outstanding zero-shot instance segmentation results. However, the\nlimited use of 3D priors restricts the segmentation performance. Previous\nmethods calculate the 3D superpoints solely based on normals estimated from\nspatial coordinates, resulting in under-segmentation for instances with similar\ngeometry. Besides, the heavy reliance on SAM and hand-crafted algorithms in 2D\nspace suffers from over-segmentation due to SAM's inherent part-level\nsegmentation tendency. To address these issues, we propose SA3DIP, a novel\nmethod for Segmenting Any 3D Instances via exploiting potential 3D Priors.\nSpecifically, on one hand, we generate complementary 3D primitives based on\nboth geometric and textural priors, which reduces the initial errors that\naccumulate in subsequent procedures. On the other hand, we introduce\nsupplemental constraints from the 3D space by using a 3D detector to guide a\nfurther merging process. Furthermore, we notice a considerable portion of\nlow-quality ground truth annotations in the ScanNetV2 benchmark, which affects\nfair evaluation. Thus, we present ScanNetV2-INS with complete ground truth\nlabels and supplement additional instances for 3D class-agnostic instance\nsegmentation. Experimental evaluations on various 2D-3D datasets demonstrate\nthe effectiveness and robustness of our approach. Our code and proposed\nScanNetV2-INS dataset are available HERE.\n","authors":["Xi Yang","Xu Gu","Xingyilang Yin","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2411.03819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03807v1","updated":"2024-11-06T10:07:46Z","published":"2024-11-06T10:07:46Z","title":"GS2Pose: Two-stage 6D Object Pose Estimation Guided by Gaussian\n Splatting","summary":" This paper proposes a new method for accurate and robust 6D pose estimation\nof novel objects, named GS2Pose. By introducing 3D Gaussian splatting, GS2Pose\ncan utilize the reconstruction results without requiring a high-quality CAD\nmodel, which means it only requires segmented RGBD images as input.\nSpecifically, GS2Pose employs a two-stage structure consisting of coarse\nestimation followed by refined estimation. In the coarse stage, a lightweight\nU-Net network with a polarization attention mechanism, called Pose-Net, is\ndesigned. 
By using the 3DGS model for supervised training, Pose-Net can\ngenerate NOCS images to compute a coarse pose. In the refinement stage, GS2Pose\nformulates a pose regression algorithm following the idea of reprojection or\nBundle Adjustment (BA), referred to as GS-Refiner. By leveraging Lie algebra to\nextend 3DGS, GS-Refiner obtains a pose-differentiable rendering pipeline that\nrefines the coarse pose by comparing the input images with the rendered images.\nGS-Refiner also selectively updates parameters in the 3DGS model to achieve\nenvironmental adaptation, thereby enhancing the algorithm's robustness and\nflexibility to illuminative variation, occlusion, and other challenging\ndisruptive factors. GS2Pose was evaluated through experiments conducted on the\nLineMod dataset, where it was compared with similar algorithms, yielding highly\ncompetitive results. The code for GS2Pose will soon be released on GitHub.\n","authors":["Jilan Mei","Junbo Li","Cai Meng"],"pdf_url":"https://arxiv.org/pdf/2411.03807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17459v2","updated":"2024-11-06T09:50:06Z","published":"2024-09-26T01:34:42Z","title":"TFS-NeRF: Template-Free NeRF for Semantic 3D Reconstruction of Dynamic\n Scene","summary":" Despite advancements in Neural Implicit models for 3D surface reconstruction,\nhandling dynamic environments with arbitrary rigid, non-rigid, or deformable\nentities remains challenging. Many template-based methods are entity-specific,\nfocusing on humans, while generic reconstruction methods adaptable to such\ndynamic scenes often require additional inputs like depth or optical flow or\nrely on pre-trained image features for reasonable outcomes. These methods\ntypically use latent codes to capture frame-by-frame deformations. In contrast,\nsome template-free methods bypass these requirements and adopt traditional LBS\n(Linear Blend Skinning) weights for a detailed representation of deformable\nobject motions, although they involve complex optimizations leading to lengthy\ntraining times. To this end, as a remedy, this paper introduces TFS-NeRF, a\ntemplate-free 3D semantic NeRF for dynamic scenes captured from sparse or\nsingle-view RGB videos, featuring interactions among various entities and more\ntime-efficient than other LBS-based approaches. Our framework uses an\nInvertible Neural Network (INN) for LBS prediction, simplifying the training\nprocess. By disentangling the motions of multiple entities and optimizing\nper-entity skinning weights, our method efficiently generates accurate,\nsemantically separable geometries. Extensive experiments demonstrate that our\napproach produces high-quality reconstructions of both deformable and\nnon-deformable objects in complex interactions, with improved training\nefficiency compared to existing methods.\n","authors":["Sandika Biswas","Qianyi Wu","Biplab Banerjee","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2409.17459v2.pdf","comment":"Accepted in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.15306v3","updated":"2024-11-06T09:49:31Z","published":"2024-05-24T07:48:35Z","title":"DeTikZify: Synthesizing Graphics Programs for Scientific Figures and\n Sketches with TikZ","summary":" Creating high-quality scientific figures can be time-consuming and\nchallenging, even though sketching ideas on paper is relatively easy.\nFurthermore, recreating existing figures that are not stored in formats\npreserving semantic information is equally complex. 
To tackle this problem, we\nintroduce DeTikZify, a novel multimodal language model that automatically\nsynthesizes scientific figures as semantics-preserving TikZ graphics programs\nbased on sketches and existing figures. To achieve this, we create three new\ndatasets: DaTikZv2, the largest TikZ dataset to date, containing over 360k\nhuman-created TikZ graphics; SketchFig, a dataset that pairs hand-drawn\nsketches with their corresponding scientific figures; and MetaFig, a collection\nof diverse scientific figures and associated metadata. We train DeTikZify on\nMetaFig and DaTikZv2, along with synthetically generated sketches learned from\nSketchFig. We also introduce an MCTS-based inference algorithm that enables\nDeTikZify to iteratively refine its outputs without the need for additional\ntraining. Through both automatic and human evaluation, we demonstrate that\nDeTikZify outperforms commercial Claude 3 and GPT-4V in synthesizing TikZ\nprograms, with the MCTS algorithm effectively boosting its performance. We make\nour code, models, and datasets publicly available.\n","authors":["Jonas Belouadi","Simone Paolo Ponzetto","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2405.15306v3.pdf","comment":"Accepted at NeurIPS 2024 (spotlight); Project page:\n https://github.com/potamides/DeTikZify"},{"id":"http://arxiv.org/abs/2411.03795v1","updated":"2024-11-06T09:39:52Z","published":"2024-11-06T09:39:52Z","title":"VQA$^2$: Visual Question Answering for Video Quality Assessment","summary":" The advent and proliferation of large multi-modal models (LMMs) have\nintroduced a new paradigm to video-related computer vision fields, including\ntraining and inference methods based on visual question answering (VQA). These\nmethods enable models to handle multiple downstream tasks robustly. Video\nQuality Assessment (VQA), a classic field in low-level visual quality\nevaluation, originally focused on quantitative video quality scoring. However,\ndriven by advances in LMMs, it is now evolving towards more comprehensive\nvisual quality understanding tasks. Visual question answering has significantly\nimproved low-level visual evaluation within the image domain recently. However,\nrelated work is almost nonexistent in the video domain, leaving substantial\nroom for improvement. To address this gap, we introduce the VQA2 Instruction\nDataset, the first visual question answering instruction dataset that focuses\nentirely on video quality assessment, and based on it, we propose the VQA2\nseries models. The VQA2 Instruction Dataset consists of three stages and covers\nvarious video types, containing 157,735 instruction question-answer pairs,\nincluding both manually annotated and synthetic data. We conduct extensive\nexperiments on both video quality scoring and video quality understanding\ntasks. Results demonstrate that the VQA2 series models achieve state-of-the-art\n(SOTA) performance in quality scoring tasks, and their performance in visual\nquality question answering surpasses the renowned GPT-4o. 
Additionally, our\nfinal model, the VQA2-Assistant, performs well across both scoring and\nquestion-answering tasks, validating its versatility.\n","authors":["Ziheng Jia","Zicheng Zhang","Jiaying Qian","Haoning Wu","Wei Sun","Chunyi Li","Xiaohong Liu","Weisi Lin","Guangtao Zhai","Xiongkuo Min"],"pdf_url":"https://arxiv.org/pdf/2411.03795v1.pdf","comment":"10 pages 3 figures"},{"id":"http://arxiv.org/abs/2411.03794v1","updated":"2024-11-06T09:39:25Z","published":"2024-11-06T09:39:25Z","title":"Harmformer: Harmonic Networks Meet Transformers for Continuous\n Roto-Translation Equivariance","summary":" CNNs exhibit inherent equivariance to image translation, leading to efficient\nparameter and data usage, faster learning, and improved robustness. The concept\nof translation equivariant networks has been successfully extended to rotation\ntransformation using group convolution for discrete rotation groups and\nharmonic functions for the continuous rotation group encompassing $360^\\circ$.\nWe explore the compatibility of the SA mechanism with full rotation\nequivariance, in contrast to previous studies that focused on discrete\nrotation. We introduce the Harmformer, a harmonic transformer with a\nconvolutional stem that achieves equivariance for both translation and\ncontinuous rotation. Accompanied by an end-to-end equivariance proof, the\nHarmformer not only outperforms previous equivariant transformers, but also\ndemonstrates inherent stability under any continuous rotation, even without\nseeing rotated samples during training.\n","authors":["Tomáš Karella","Adam Harmanec","Jan Kotera","Jan Blažek","Filip Šroubek"],"pdf_url":"https://arxiv.org/pdf/2411.03794v1.pdf","comment":"Appears in NeurIPS 2024 Workshop on Symmetry and Geometry in Neural\n Representations"},{"id":"http://arxiv.org/abs/2404.03202v5","updated":"2024-11-06T09:26:23Z","published":"2024-04-04T05:10:26Z","title":"OmniGS: Fast Radiance Field Reconstruction using Omnidirectional\n Gaussian Splatting","summary":" Photorealistic reconstruction relying on 3D Gaussian Splatting has shown\npromising potential in various domains. However, the current 3D Gaussian\nSplatting system only supports radiance field reconstruction using undistorted\nperspective images. In this paper, we present OmniGS, a novel omnidirectional\nGaussian splatting system, to take advantage of omnidirectional images for fast\nradiance field reconstruction. Specifically, we conduct a theoretical analysis\nof spherical camera model derivatives in 3D Gaussian Splatting. According to\nthe derivatives, we then implement a new GPU-accelerated omnidirectional\nrasterizer that directly splats 3D Gaussians onto the equirectangular screen\nspace for omnidirectional image rendering. We realize differentiable\noptimization of the omnidirectional radiance field without the requirement of\ncube-map rectification or tangent-plane approximation. Extensive experiments\nconducted in egocentric and roaming scenarios demonstrate that our method\nachieves state-of-the-art reconstruction quality and high rendering speed using\nomnidirectional images. 
The code will be publicly available.\n","authors":["Longwei Li","Huajian Huang","Sai-Kit Yeung","Hui Cheng"],"pdf_url":"https://arxiv.org/pdf/2404.03202v5.pdf","comment":"8 pages, 6 figures, accepted by WACV 2025, project page:\n https://liquorleaf.github.io/research/OmniGS/"},{"id":"http://arxiv.org/abs/2411.03223v2","updated":"2024-11-06T09:10:46Z","published":"2024-11-05T16:12:12Z","title":"Beyond Grid Data: Exploring Graph Neural Networks for Earth Observation","summary":" Earth Observation (EO) data analysis has been significantly revolutionized by\ndeep learning (DL), with applications typically limited to grid-like data\nstructures. Graph Neural Networks (GNNs) emerge as an important innovation,\npropelling DL into the non-Euclidean domain. Naturally, GNNs can effectively\ntackle the challenges posed by diverse modalities, multiple sensors, and the\nheterogeneous nature of EO data. To introduce GNNs in the related domains, our\nreview begins by offering fundamental knowledge on GNNs. Then, we summarize the\ngeneric problems in EO, to which GNNs can offer potential solutions. Following\nthis, we explore a broad spectrum of GNNs' applications to scientific problems\nin Earth systems, covering areas such as weather and climate analysis, disaster\nmanagement, air quality monitoring, agriculture, land cover classification,\nhydrological process modeling, and urban modeling. The rationale behind\nadopting GNNs in these fields is explained, alongside methodologies for\norganizing graphs and designing favorable architectures for various tasks.\nFurthermore, we highlight methodological challenges of implementing GNNs in\nthese domains and possible solutions that could guide future research. While\nacknowledging that GNNs are not a universal solution, we conclude the paper by\ncomparing them with other popular architectures like transformers and analyzing\ntheir potential synergies.\n","authors":["Shan Zhao","Zhaiyu Chen","Zhitong Xiong","Yilei Shi","Sudipan Saha","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.03223v2.pdf","comment":"Accepted for publication in Geoscience and Remote Sensing Magazine\n (GRSM)"},{"id":"http://arxiv.org/abs/2404.09633v2","updated":"2024-11-06T09:04:35Z","published":"2024-04-15T10:05:36Z","title":"In-Context Translation: Towards Unifying Image Recognition, Processing,\n and Generation","summary":" We propose In-Context Translation (ICT), a general learning framework to\nunify visual recognition (e.g., semantic segmentation), low-level image\nprocessing (e.g., denoising), and conditional image generation (e.g.,\nedge-to-image synthesis). Thanks to unification, ICT significantly reduces the\ninherent inductive bias that comes with designing models for specific tasks,\nand it maximizes mutual enhancement across similar tasks. However, the\nunification across a large number of tasks is non-trivial due to various data\nformats and training pipelines. To this end, ICT introduces two designs.\nFirstly, it standardizes input-output data of different tasks into RGB image\npairs, e.g., semantic segmentation data pairs an RGB image with its\nsegmentation mask in the same RGB format. This turns different tasks into a\ngeneral translation task between two RGB images. Secondly, it standardizes the\ntraining of different tasks into a general in-context learning, where\n\"in-context\" means the input comprises an example input-output pair of the\ntarget task and a query image. The learning objective is to generate the\n\"missing\" data paired with the query. 
The implicit translation process is thus\nbetween the query and the generated image. In experiments, ICT unifies ten\nvision tasks and showcases impressive performance on their respective\nbenchmarks. Notably, ICT performs well across three major categories of\ncomputer vision tasks, while its two competitors (Painter and PromptDiffusion)\nare only effective in at most two of these task categories. In addition,\ncompared to its competitors, ICT trained on only 4 RTX 3090 GPUs is shown to be\nmore efficient and less costly in training.\n","authors":["Han Xue","Qianru Sun","Li Song","Wenjun Zhang","Zhiwu Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09633v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03758v1","updated":"2024-11-06T08:33:07Z","published":"2024-11-06T08:33:07Z","title":"Sub-DM: Subspace Diffusion Model with Orthogonal Decomposition for MRI\n Reconstruction","summary":" Diffusion model-based approaches recently achieved remarkable success in MRI\nreconstruction, but integration into clinical routine remains challenging due\nto their time-consuming convergence. This phenomenon is particularly notable\nwhen the conventional diffusion process is applied directly to k-space data\nwithout considering the inherent properties of k-space sampling, which limits\nk-space learning efficiency and image reconstruction quality. To tackle these\nchallenges, we introduce a subspace diffusion model with orthogonal\ndecomposition, a method (referred to as Sub-DM) that restricts the diffusion\nprocess via projections onto a subspace as the k-space data distribution evolves\ntoward noise. In particular, the subspace diffusion model circumvents the\ninference challenges posed by the complex and high-dimensional characteristics\nof k-space data, and the highly compact subspace ensures that the diffusion process\nrequires only a few simple iterations to produce accurate prior information.\nFurthermore, the orthogonal decomposition strategy based on the wavelet transform\nprevents information loss during the migration of the vanilla diffusion\nprocess to the subspace. Since the strategy is approximately reversible, the\nentire process can be reversed. As a result, it allows the\ndiffusion processes in different spaces to refine models through a mutual\nfeedback mechanism, enabling the learning of accurate priors even when dealing\nwith complex k-space data. Comprehensive experiments on different datasets\nclearly demonstrate the superiority of Sub-DM over state-of-the-art\nmethods in terms of reconstruction speed and quality.\n","authors":["Yu Guan","Qinrong Cai","Wei Li","Qiuyun Fan","Dong Liang","Qiegen Liu"],"pdf_url":"https://arxiv.org/pdf/2411.03758v1.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2410.13824v3","updated":"2024-11-06T08:29:22Z","published":"2024-10-17T17:48:54Z","title":"Harnessing Webpage UIs for Text-Rich Visual Understanding","summary":" Text-rich visual understanding-the ability to process environments where\ndense textual content is integrated with visuals-is crucial for multimodal\nlarge language models (MLLMs) to interact effectively with structured\nenvironments. To enhance this capability, we propose synthesizing general\nmultimodal instructions from webpage UIs using text-based large language models\n(LLMs). Despite lacking direct visual input, text-based LLMs are able to\nprocess structured text representations from webpage accessibility trees. These\ninstructions are then paired with UI screenshots to train multimodal models. 
We\nintroduce MultiUI, a dataset containing 7.3 million samples from 1 million\nwebsites, covering diverse multimodal tasks and UI layouts. Models trained on\nMultiUI not only excel in web UI tasks-achieving up to a 48% improvement on\nVisualWebBench and a 19.1% boost in element accuracy on a web agent dataset\nMind2Web-but also generalize surprisingly well to non-web UI tasks and even to\nnon-UI domains, such as document understanding, OCR, and chart interpretation.\nThese results highlight the broad applicability of web UI data for advancing\ntext-rich visual understanding across various scenarios.\n","authors":["Junpeng Liu","Tianyue Ou","Yifan Song","Yuxiao Qu","Wai Lam","Chenyan Xiong","Wenhu Chen","Graham Neubig","Xiang Yue"],"pdf_url":"https://arxiv.org/pdf/2410.13824v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03752v1","updated":"2024-11-06T08:27:49Z","published":"2024-11-06T08:27:49Z","title":"Deferred Poisoning: Making the Model More Vulnerable via Hessian\n Singularization","summary":" Recent studies have shown that deep learning models are very vulnerable to\npoisoning attacks. Many defense methods have been proposed to address this\nissue. However, traditional poisoning attacks are not as threatening as\ncommonly believed. This is because they often cause differences in how the\nmodel performs on the training set compared to the validation set. Such\ninconsistency can alert defenders that their data has been poisoned, allowing\nthem to take the necessary defensive actions. In this paper, we introduce a\nmore threatening type of poisoning attack called the Deferred Poisoning Attack.\nThis new attack allows the model to function normally during the training and\nvalidation phases but makes it very sensitive to evasion attacks or even\nnatural noise. We achieve this by ensuring the poisoned model's loss function\nhas a similar value as a normally trained model at each input sample but with a\nlarge local curvature. A similar model loss ensures that there is no obvious\ninconsistency between the training and validation accuracy, demonstrating high\nstealthiness. On the other hand, the large curvature implies that a small\nperturbation may cause a significant increase in model loss, leading to\nsubstantial performance degradation, which reflects a worse robustness. We\nfulfill this purpose by making the model have singular Hessian information at\nthe optimal point via our proposed Singularization Regularization term. We have\nconducted both theoretical and empirical analyses of the proposed method and\nvalidated its effectiveness through experiments on image classification tasks.\nFurthermore, we have confirmed the hazards of this form of poisoning attack\nunder more general scenarios using natural noise, offering a new perspective\nfor research in the field of security.\n","authors":["Yuhao He","Jinyu Tian","Xianwei Zheng","Li Dong","Yuanman Li","Leo Yu Zhang","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.03752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17331v2","updated":"2024-11-06T08:22:33Z","published":"2024-07-24T14:54:16Z","title":"Multi-label Cluster Discrimination for Visual Representation Learning","summary":" Contrastive Language Image Pre-training (CLIP) has recently demonstrated\nsuccess across various tasks due to superior feature representation empowered\nby image-text contrastive learning. However, the instance discrimination method\nused by CLIP can hardly encode the semantic structure of training data. 
To\nhandle this limitation, cluster discrimination has been proposed through\niterative cluster assignment and classification. Nevertheless, most cluster\ndiscrimination approaches only define a single pseudo-label for each image,\nneglecting multi-label signals in the image. In this paper, we propose a novel\nMulti-Label Cluster Discrimination method named MLCD to enhance representation\nlearning. In the clustering step, we first cluster the large-scale LAION-400M\ndataset into one million centers based on off-the-shelf embedding features.\nConsidering that natural images frequently contain multiple visual objects or\nattributes, we select the multiple closest centers as auxiliary class labels.\nIn the discrimination step, we design a novel multi-label classification loss,\nwhich elegantly separates losses from positive classes and negative classes,\nand alleviates ambiguity on decision boundary. We validate the proposed\nmulti-label cluster discrimination method with experiments on different scales\nof models and pre-training datasets. Experimental results show that our method\nachieves state-of-the-art performance on multiple downstream tasks including\nlinear probe, zero-shot classification, and image-text retrieval. Code and\nmodels have been released at https://github.com/deepglint/unicom .\n","authors":["Xiang An","Kaicheng Yang","Xiangzi Dai","Ziyong Feng","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2407.17331v2.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2411.03745v1","updated":"2024-11-06T08:22:00Z","published":"2024-11-06T08:22:00Z","title":"Homotopy Continuation Made Easy: Regression-based Online Simulation of\n Starting Problem-Solution Pairs","summary":" While automatically generated polynomial elimination templates have sparked\ngreat progress in the field of 3D computer vision, there remain many problems\nfor which the degree of the constraints or the number of unknowns leads to\nintractability. In recent years, homotopy continuation has been introduced as a\nplausible alternative. However, the method currently depends on expensive\nparallel tracking of all possible solutions in the complex domain, or a\nclassification network for starting problem-solution pairs trained over a\nlimited set of real-world examples. Our innovation consists of employing a\nregression network trained in simulation to directly predict a solution from\ninput correspondences, followed by an online simulator that invents a\nconsistent problem-solution pair. Subsequently, homotopy continuation is\napplied to track that single solution back to the original problem. 
We apply\nthis elegant combination to generalized camera resectioning, and also introduce\na new solution to the challenging generalized relative pose and scale problem.\nAs demonstrated, the proposed method successfully compensates the raw error\ncommitted by the regressor alone, and leads to state-of-the-art efficiency and\nsuccess rates while running on CPU resources, only.\n","authors":["Xinyue Zhang","Zijia Dai","Wanting Xu","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2411.03745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03730v1","updated":"2024-11-06T07:51:19Z","published":"2024-11-06T07:51:19Z","title":"NeurIPS 2023 Competition: Privacy Preserving Federated Learning Document\n VQA","summary":" The Privacy Preserving Federated Learning Document VQA (PFL-DocVQA)\ncompetition challenged the community to develop provably private and\ncommunication-efficient solutions in a federated setting for a real-life use\ncase: invoice processing. The competition introduced a dataset of real invoice\ndocuments, along with associated questions and answers requiring information\nextraction and reasoning over the document images. Thereby, it brings together\nresearchers and expertise from the document analysis, privacy, and federated\nlearning communities. Participants fine-tuned a pre-trained, state-of-the-art\nDocument Visual Question Answering model provided by the organizers for this\nnew domain, mimicking a typical federated invoice processing setup. The base\nmodel is a multi-modal generative language model, and sensitive information\ncould be exposed through either the visual or textual input modality.\nParticipants proposed elegant solutions to reduce communication costs while\nmaintaining a minimum utility threshold in track 1 and to protect all\ninformation from each document provider using differential privacy in track 2.\nThe competition served as a new testbed for developing and testing private\nfederated learning methods, simultaneously raising awareness about privacy\nwithin the document image analysis and recognition community. Ultimately, the\ncompetition analysis provides best practices and recommendations for\nsuccessfully running privacy-focused federated learning challenges in the\nfuture.\n","authors":["Marlon Tobaben","Mohamed Ali Souibgui","Rubèn Tito","Khanh Nguyen","Raouf Kerkouche","Kangsoo Jung","Joonas Jälkö","Lei Kang","Andrey Barsky","Vincent Poulain d'Andecy","Aurélie Joseph","Aashiq Muhamed","Kevin Kuo","Virginia Smith","Yusuke Yamasaki","Takumi Fukami","Kenta Niwa","Iifan Tyou","Hiro Ishii","Rio Yokota","Ragul N","Rintu Kutum","Josep Llados","Ernest Valveny","Antti Honkela","Mario Fritz","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2411.03730v1.pdf","comment":"27 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.03729v1","updated":"2024-11-06T07:48:30Z","published":"2024-11-06T07:48:30Z","title":"Relation Learning and Aggregate-attention for Multi-person Motion\n Prediction","summary":" Multi-person motion prediction is an emerging and intricate task with broad\nreal-world applications. Unlike single person motion prediction, it considers\nnot just the skeleton structures or human trajectories but also the\ninteractions between others. Previous methods use various networks to achieve\nimpressive predictions but often overlook that the joints relations within an\nindividual (intra-relation) and interactions among groups (inter-relation) are\ndistinct types of representations. 
These methods often lack explicit\nrepresentation of inter- and intra-relations, and inevitably introduce undesired\ndependencies. To address this issue, we introduce a new collaborative framework\nfor multi-person motion prediction that explicitly models these relations: a\nGCN-based network for intra-relations and a novel reasoning network for\ninter-relations. Moreover, we propose a novel plug-and-play aggregation module\ncalled the Interaction Aggregation Module (IAM), which employs an\naggregate-attention mechanism to seamlessly integrate these relations.\nExperiments indicate that the module can also be applied to other dual-path\nmodels. Extensive experiments on the 3DPW, 3DPW-RC, CMU-Mocap, MuPoTS-3D, as\nwell as synthesized datasets Mix1 & Mix2 (9 to 15 persons), demonstrate that\nour method achieves state-of-the-art performance.\n","authors":["Kehua Qu","Rui Ding","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2411.03729v1.pdf","comment":"Submitted to IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2411.03728v1","updated":"2024-11-06T07:46:34Z","published":"2024-11-06T07:46:34Z","title":"Efficient Fourier Filtering Network with Contrastive Learning for\n UAV-based Unaligned Bi-modal Salient Object Detection","summary":" Unmanned aerial vehicle (UAV)-based bi-modal salient object detection (BSOD)\naims to segment salient objects in a scene utilizing complementary cues in\nunaligned RGB and thermal image pairs. However, the high computational expense\nof existing UAV-based BSOD models limits their applicability to real-world UAV\ndevices. To address this problem, we propose an efficient Fourier filter\nnetwork with contrastive learning that achieves both real-time and accurate\nperformance. Specifically, we first design a semantic contrastive alignment\nloss to align the two modalities at the semantic level, which facilitates\nmutual refinement in a parameter-free way. Second, inspired by the fast Fourier\ntransform that obtains global relevance in linear complexity, we propose\nsynchronized alignment fusion, which aligns and fuses bi-modal features in the\nchannel and spatial dimensions by a hierarchical filtering mechanism. Our\nproposed model, AlignSal, reduces the number of parameters by 70.0%, decreases\nthe floating point operations by 49.4%, and increases the inference speed by\n152.5% compared to the cutting-edge BSOD model (i.e., MROS). Extensive\nexperiments on the UAV RGB-T 2400 and three weakly aligned datasets demonstrate\nthat AlignSal achieves both real-time inference speed and better performance\nand generalizability compared to sixteen state-of-the-art BSOD models across\nmost evaluation metrics. In addition, our ablation studies further verify\nAlignSal's potential in boosting the performance of existing aligned BSOD\nmodels on UAV-based unaligned data. The code is available at:\nhttps://github.com/JoshuaLPF/AlignSal.\n","authors":["Pengfei Lyu","Pak-Hei Yeung","Xiufei Cheng","Xiaosheng Yu","Chengdong Wu","Jagath C. 
Rajapakse"],"pdf_url":"https://arxiv.org/pdf/2411.03728v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.03725v1","updated":"2024-11-06T07:44:04Z","published":"2024-11-06T07:44:04Z","title":"PX2Tooth: Reconstructing the 3D Point Cloud Teeth from a Single\n Panoramic X-ray","summary":" Reconstructing the 3D anatomical structures of the oral cavity, which\noriginally reside in the cone-beam CT (CBCT), from a single 2D Panoramic\nX-ray(PX) remains a critical yet challenging task, as it can effectively reduce\nradiation risks and treatment costs during the diagnostic in digital dentistry.\nHowever, current methods are either error-prone or only trained/evaluated on\nsmall-scale datasets (less than 50 cases), resulting in compromised\ntrustworthiness. In this paper, we propose PX2Tooth, a novel approach to\nreconstruct 3D teeth using a single PX image with a two-stage framework. First,\nwe design the PXSegNet to segment the permanent teeth from the PX images,\nproviding clear positional, morphological, and categorical information for each\ntooth. Subsequently, we design a novel tooth generation network (TGNet) that\nlearns to transform random point clouds into 3D teeth. TGNet integrates the\nsegmented patch information and introduces a Prior Fusion Module (PFM) to\nenhance the generation quality, especially in the root apex region. Moreover,\nwe construct a dataset comprising 499 pairs of CBCT and Panoramic X-rays.\nExtensive experiments demonstrate that PX2Tooth can achieve an Intersection\nover Union (IoU) of 0.793, significantly surpassing previous methods,\nunderscoring the great potential of artificial intelligence in digital\ndentistry.\n","authors":["Wen Ma","Huikai Wu","Zikai Xiao","Yang Feng","Jian Wu","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2411.03725v1.pdf","comment":"Ma W, Wu H, Xiao Z, et al. PX2Tooth: Reconstructing the 3D Point\n Cloud Teeth from a Single Panoramic X-Ray[C]//International Conference on\n Medical Image Computing and Computer-Assisted Intervention. Cham: Springer\n Nature Switzerland, 2024: 411-421"},{"id":"http://arxiv.org/abs/2411.03724v1","updated":"2024-11-06T07:43:40Z","published":"2024-11-06T07:43:40Z","title":"Estimation of Psychosocial Work Environment Exposures Through Video\n Object Detection. Proof of Concept Using CCTV Footage","summary":" This paper examines the use of computer vision algorithms to estimate aspects\nof the psychosocial work environment using CCTV footage. We present a proof of\nconcept for a methodology that detects and tracks people in video footage and\nestimates interactions between customers and employees by estimating their\nposes and calculating the duration of their encounters. We propose a pipeline\nthat combines existing object detection and tracking algorithms (YOLOv8 and\nDeepSORT) with pose estimation algorithms (BlazePose) to estimate the number of\ncustomers and employees in the footage as well as the duration of their\nencounters. We use a simple rule-based approach to classify the interactions as\npositive, neutral or negative based on three different criteria: distance,\nduration and pose. The proposed methodology is tested on a small dataset of\nCCTV footage. While the data is quite limited in particular with respect to the\nquality of the footage, we have chosen this case as it represents a typical\nsetting where the method could be applied. 
The results show that the object\ndetection and tracking part of the pipeline has a reasonable performance on the\ndataset with a high degree of recall and reasonable accuracy. At this stage,\nthe pose estimation is still too limited to fully detect the type of interactions\ndue to difficulties in tracking employees in the footage. We conclude that the\nmethod is a promising alternative to self-reported measures of the psychosocial\nwork environment and could be used in future studies to obtain external\nobservations of the work environment.\n","authors":["Claus D. Hansen","Thuy Hai Le","David Campos"],"pdf_url":"https://arxiv.org/pdf/2411.03724v1.pdf","comment":"11 pages, 9 figures, presented at IWOAR 9th International Workshop on\n Sensor-Based Activity Recognition and Artificial Intelligence, September\n 26-27, Potsdam, Germany"},{"id":"http://arxiv.org/abs/2411.03723v1","updated":"2024-11-06T07:40:27Z","published":"2024-11-06T07:40:27Z","title":"Zero-shot Dynamic MRI Reconstruction with Global-to-local Diffusion\n Model","summary":" Diffusion models have recently demonstrated considerable advancement in the\ngeneration and reconstruction of magnetic resonance imaging (MRI) data. These\nmodels exhibit great potential in handling unsampled data and reducing noise,\nhighlighting their promise as generative models. However, their application in\ndynamic MRI remains relatively underexplored. This is primarily due to the\nsubstantial amount of fully-sampled data typically required for training, which\nis difficult to obtain in dynamic MRI due to its spatio-temporal complexity and\nhigh acquisition costs. To address this challenge, we propose a dynamic MRI\nreconstruction method based on a time-interleaved acquisition scheme, termed\nthe Global-to-local Diffusion Model. Specifically, fully encoded\nfull-resolution reference data are constructed by merging under-sampled k-space\ndata from adjacent time frames, generating two distinct bulk training datasets\nfor global and local models. The global-to-local diffusion framework\nalternately optimizes global information and local image details, enabling\nzero-shot reconstruction. Extensive experiments demonstrate that the proposed\nmethod performs well in terms of noise reduction and detail preservation,\nachieving reconstruction quality comparable to that of supervised approaches.\n","authors":["Yu Guan","Kunlong Zhang","Qi Qi","Dong Wang","Ziwen Ke","Shaoyu Wang","Dong Liang","Qiegen Liu"],"pdf_url":"https://arxiv.org/pdf/2411.03723v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.03717v1","updated":"2024-11-06T07:30:34Z","published":"2024-11-06T07:30:34Z","title":"These Maps Are Made by Propagation: Adapting Deep Stereo Networks to\n Road Scenarios with Decisive Disparity Diffusion","summary":" Stereo matching has emerged as a cost-effective solution for road surface 3D\nreconstruction, garnering significant attention towards improving both\ncomputational efficiency and accuracy. This article introduces decisive\ndisparity diffusion (D3Stereo), marking the first exploration of dense deep\nfeature matching that adapts pre-trained deep convolutional neural networks\n(DCNNs) to previously unseen road scenarios. A pyramid of cost volumes is\ninitially created using various levels of learned representations.\nSubsequently, a novel recursive bilateral filtering algorithm is employed to\naggregate these costs. 
A key innovation of D3Stereo lies in its alternating\ndecisive disparity diffusion strategy, wherein intra-scale diffusion is\nemployed to complete sparse disparity images, while inter-scale inheritance\nprovides valuable prior information for higher resolutions. Extensive\nexperiments conducted on our created UDTIRI-Stereo and Stereo-Road datasets\nunderscore the effectiveness of D3Stereo strategy in adapting pre-trained DCNNs\nand its superior performance compared to all other explicit programming-based\nalgorithms designed specifically for road surface 3D reconstruction. Additional\nexperiments conducted on the Middlebury dataset with backbone DCNNs pre-trained\non the ImageNet database further validate the versatility of D3Stereo strategy\nin tackling general stereo matching problems.\n","authors":["Chuang-Wei Liu","Yikang Zhang","Qijun Chen","Ioannis Pitas","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2411.03717v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.03714v1","updated":"2024-11-06T07:28:57Z","published":"2024-11-06T07:28:57Z","title":"Explaining Human Activity Recognition with SHAP: Validating Insights\n with Perturbation and Quantitative Measures","summary":" In Human Activity Recognition (HAR), understanding the intricacy of body\nmovements within high-risk applications is essential. This study uses SHapley\nAdditive exPlanations (SHAP) to explain the decision-making process of Graph\nConvolution Networks (GCNs) when classifying activities with skeleton data. We\nemploy SHAP to explain two real-world datasets: one for cerebral palsy (CP)\nclassification and the widely used NTU RGB+D 60 action recognition dataset. To\ntest the explanation, we introduce a novel perturbation approach that modifies\nthe model's edge importance matrix, allowing us to evaluate the impact of\nspecific body key points on prediction outcomes. To assess the fidelity of our\nexplanations, we employ informed perturbation, targeting body key points\nidentified as important by SHAP and comparing them against random perturbation\nas a control condition. This perturbation enables a judgment on whether the\nbody key points are truly influential or non-influential based on the SHAP\nvalues. Results on both datasets show that body key points identified as\nimportant through SHAP have the largest influence on the accuracy, specificity,\nand sensitivity metrics. Our findings highlight that SHAP can provide granular\ninsights into the input feature contribution to the prediction outcome of GCNs\nin HAR tasks. This demonstrates the potential for more interpretable and\ntrustworthy models in high-stakes applications like healthcare or\nrehabilitation.\n","authors":["Felix Tempel","Espen Alexander F. Ihlen","Lars Adde","Inga Strümke"],"pdf_url":"https://arxiv.org/pdf/2411.03714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05824v2","updated":"2024-11-06T07:23:52Z","published":"2022-09-13T09:00:58Z","title":"CPnP: Consistent Pose Estimator for Perspective-n-Point Problem with\n Bias Elimination","summary":" The Perspective-n-Point (PnP) problem has been widely studied in both\ncomputer vision and photogrammetry societies. With the development of feature\nextraction techniques, a large number of feature points might be available in a\nsingle shot. It is promising to devise a consistent estimator, i.e., the\nestimate can converge to the true camera pose as the number of points\nincreases. To this end, we propose a consistent PnP solver, named \\emph{CPnP},\nwith bias elimination. 
Specifically, linear equations are constructed from the\noriginal projection model via measurement model modification and variable\nelimination, based on which a closed-form least-squares solution is obtained.\nWe then analyze and subtract the asymptotic bias of this solution, resulting in\na consistent estimate. Additionally, Gauss-Newton (GN) iterations are executed\nto refine the consistent solution. Our proposed estimator is efficient in terms\nof computations -- it has $O(n)$ computational complexity. Experimental tests\non both synthetic data and real images show that our proposed estimator is\nsuperior to some well-known ones for images with dense visual features, in\nterms of estimation precision and computing time.\n","authors":["Guangyang Zeng","Shiyu Chen","Biqiang Mu","Guodong Shi","Junfeng Wu"],"pdf_url":"https://arxiv.org/pdf/2209.05824v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03707v1","updated":"2024-11-06T07:11:15Z","published":"2024-11-06T07:11:15Z","title":"Fine-Tuning Vision-Language Model for Automated Engineering Drawing\n Information Extraction","summary":" Geometric Dimensioning and Tolerancing (GD&T) plays a critical role in\nmanufacturing by defining acceptable variations in part features to ensure\ncomponent quality and functionality. However, extracting GD&T information from\n2D engineering drawings is a time-consuming and labor-intensive task, often\nrelying on manual efforts or semi-automated tools. To address these challenges,\nthis study proposes an automated and computationally efficient GD&T extraction\nmethod by fine-tuning Florence-2, an open-source vision-language model (VLM).\nThe model is trained on a dataset of 400 drawings with ground truth annotations\nprovided by domain experts. For comparison, two state-of-the-art closed-source\nVLMs, GPT-4o and Claude-3.5-Sonnet, are evaluated on the same dataset. All\nmodels are assessed using precision, recall, F1-score, and hallucination\nmetrics. Due to the computational cost and impracticality of fine-tuning large\nclosed-source VLMs for domain-specific tasks, GPT-4o and Claude-3.5-Sonnet are\nevaluated in a zero-shot setting. In contrast, Florence-2, a smaller model with\n0.23 billion parameters, is optimized through full-parameter fine-tuning across\nthree distinct experiments, each utilizing datasets augmented to different\nlevels. The results show that Florence-2 achieves a 29.95% increase in\nprecision, a 37.75% increase in recall, a 52.40% improvement in F1-score, and a\n43.15% reduction in hallucination rate compared to the best-performing\nclosed-source model. These findings highlight the effectiveness of fine-tuning\nsmaller, open-source VLMs like Florence-2, offering a practical and efficient\nsolution for automated GD&T extraction to support downstream manufacturing\ntasks.\n","authors":["Muhammad Tayyab Khan","Lequn Chen","Ye Han Ng","Wenhe Feng","Nicholas Yew Jin Tan","Seung Ki Moon"],"pdf_url":"https://arxiv.org/pdf/2411.03707v1.pdf","comment":"Paper has been submitted to the 9th International Conference on\n Innovation in Artificial Intelligence (ICIAI 2025)"},{"id":"http://arxiv.org/abs/2403.20213v3","updated":"2024-11-06T07:09:03Z","published":"2024-03-29T14:50:43Z","title":"VHM: Versatile and Honest Vision Language Model for Remote Sensing Image\n Analysis","summary":" This paper develops a Versatile and Honest vision language Model (VHM) for\nremote sensing image analysis. 
VHM is built on a large-scale remote sensing\nimage-text dataset with rich-content captions (VersaD), and an honest\ninstruction dataset comprising both factual and deceptive questions (HnstD).\nUnlike prevailing remote sensing image-text datasets, in which image captions\nfocus on a few prominent objects and their relationships, VersaD captions\nprovide detailed information about image properties, object attributes, and the\noverall scene. This comprehensive captioning enables VHM to thoroughly\nunderstand remote sensing images and perform diverse remote sensing tasks.\nMoreover, different from existing remote sensing instruction datasets that only\ninclude factual questions, HnstD contains additional deceptive questions\nstemming from the non-existence of objects. This feature prevents VHM from\nproducing affirmative answers to nonsense queries, thereby ensuring its\nhonesty. In our experiments, VHM significantly outperforms various vision\nlanguage models on common tasks of scene classification, visual question\nanswering, and visual grounding. Additionally, VHM achieves competent\nperformance on several unexplored tasks, such as building vectorizing,\nmulti-label classification and honest question answering. We will release the\ncode, data and model weights at https://github.com/opendatalab/VHM .\n","authors":["Chao Pang","Xingxing Weng","Jiang Wu","Jiayu Li","Yi Liu","Jiaxing Sun","Weijia Li","Shuai Wang","Litong Feng","Gui-Song Xia","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2403.20213v3.pdf","comment":"Equal contribution: Chao Pang, Xingxing Weng, Jiang Wu; Corresponding\n author: Gui-Song Xia, Conghui He"},{"id":"http://arxiv.org/abs/2403.05408v2","updated":"2024-11-06T07:08:58Z","published":"2024-03-08T16:06:54Z","title":"FedFMS: Exploring Federated Foundation Models for Medical Image\n Segmentation","summary":" Medical image segmentation is crucial for clinical diagnosis. The\nSegmentation Anything Model (SAM) serves as a powerful foundation model for\nvisual segmentation and can be adapted for medical image segmentation. However,\nmedical imaging data typically contain privacy-sensitive information, making it\nchallenging to train foundation models with centralized storage and sharing. To\ndate, there are few foundation models tailored for medical image deployment\nwithin the federated learning framework, and the segmentation performance, as\nwell as the efficiency of communication and training, remain unexplored. In\nresponse to these issues, we developed Federated Foundation models for Medical\nimage Segmentation (FedFMS), which includes the Federated SAM (FedSAM) and a\ncommunication and training-efficient Federated SAM with Medical SAM Adapter\n(FedMSA). Comprehensive experiments on diverse datasets are conducted to\ninvestigate the performance disparities between centralized training and\nfederated learning across various configurations of FedFMS. The experiments\nrevealed that FedFMS could achieve performance comparable to models trained via\ncentralized training methods while maintaining privacy. 
Furthermore, FedMSA\ndemonstrated the potential to enhance communication and training efficiency.\nOur model implementation codes are available at\nhttps://github.com/LIU-YUXI/FedFMS.\n","authors":["Yuxi Liu","Guibo Luo","Yuesheng Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.05408v2.pdf","comment":"Accepted by MICCAI'2024"},{"id":"http://arxiv.org/abs/2411.03706v1","updated":"2024-11-06T07:08:41Z","published":"2024-11-06T07:08:41Z","title":"3DGS-CD: 3D Gaussian Splatting-based Change Detection for Physical\n Object Rearrangement","summary":" We present 3DGS-CD, the first 3D Gaussian Splatting (3DGS)-based method for\ndetecting physical object rearrangements in 3D scenes. Our approach estimates\n3D object-level changes by comparing two sets of unaligned images taken at\ndifferent times. Leveraging 3DGS's novel view rendering and EfficientSAM's\nzero-shot segmentation capabilities, we detect 2D object-level changes, which\nare then associated and fused across views to estimate 3D changes. Our method\ncan detect changes in cluttered environments using sparse post-change images\nwithin as little as 18s, using as few as a single new image. It does not rely\non depth input, user instructions, object classes, or object models -- An\nobject is recognized simply if it has been re-arranged. Our approach is\nevaluated on both public and self-collected real-world datasets, achieving up\nto 14% higher accuracy and three orders of magnitude faster performance\ncompared to the state-of-the-art radiance-field-based change detection method.\nThis significant performance boost enables a broad range of downstream\napplications, where we highlight three key use cases: object reconstruction,\nrobot workspace reset, and 3DGS model update. Our code and data will be made\navailable at https://github.com/520xyxyzq/3DGS-CD.\n","authors":["Ziqi Lu","Jianbo Ye","John Leonard"],"pdf_url":"https://arxiv.org/pdf/2411.03706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03702v1","updated":"2024-11-06T06:58:17Z","published":"2024-11-06T06:58:17Z","title":"Graph-Based Multi-Modal Sensor Fusion for Autonomous Driving","summary":" The growing demand for robust scene understanding in mobile robotics and\nautonomous driving has highlighted the importance of integrating multiple\nsensing modalities. By combining data from diverse sensors like cameras and\nLIDARs, fusion techniques can overcome the limitations of individual sensors,\nenabling a more complete and accurate perception of the environment. We\nintroduce a novel approach to multi-modal sensor fusion, focusing on developing\na graph-based state representation that supports critical decision-making\nprocesses in autonomous driving. We present a Sensor-Agnostic Graph-Aware\nKalman Filter [3], the first online state estimation technique designed to fuse\nmulti-modal graphs derived from noisy multi-sensor data. The estimated\ngraph-based state representations serve as a foundation for advanced\napplications like Multi-Object Tracking (MOT), offering a comprehensive\nframework for enhancing the situational awareness and safety of autonomous\nsystems. We validate the effectiveness of our proposed framework through\nextensive experiments conducted on both synthetic and real-world driving\ndatasets (nuScenes). Our results showcase an improvement in MOTA and a\nreduction in estimated position errors (MOTP) and identity switches (IDS) for\ntracked objects using the SAGA-KF. 
Furthermore, we highlight the capability of\nsuch a framework to develop methods that can leverage heterogeneous information\n(like semantic objects and geometric structures) from various sensing\nmodalities, enabling a more holistic approach to scene understanding and\nenhancing the safety and effectiveness of autonomous systems.\n","authors":["Depanshu Sani","Saket Anand"],"pdf_url":"https://arxiv.org/pdf/2411.03702v1.pdf","comment":"An extended abstract accepted at Young Researchers' Symposium, ICVGIP\n '24. This extended abstract contains the following: 1. Short summary of our\n work, SAGA-KF, accepted at ICPR'24. 2. A proposal that was awarded the\n Qualcomm Innovation Fellowship'24"},{"id":"http://arxiv.org/abs/2411.02188v3","updated":"2024-11-06T06:38:47Z","published":"2024-11-04T15:42:22Z","title":"Digi2Real: Bridging the Realism Gap in Synthetic Data Face Recognition\n via Foundation Models","summary":" The accuracy of face recognition systems has improved significantly in the\npast few years, thanks to the large amount of data collected and the\nadvancement in neural network architectures. However, these large-scale\ndatasets are often collected without explicit consent, raising ethical and\nprivacy concerns. To address this, there have been proposals to use synthetic\ndatasets for training face recognition models. Yet, such models still rely on\nreal data to train the generative models and generally exhibit inferior\nperformance compared to those trained on real datasets. One of these datasets,\nDigiFace, uses a graphics pipeline to generate different identities and\ndifferent intra-class variations without using real data in training the\nmodels. However, the performance of this approach is poor on face recognition\nbenchmarks, possibly due to the lack of realism in the images generated from\nthe graphics pipeline. In this work, we introduce a novel framework for realism\ntransfer aimed at enhancing the realism of synthetically generated face images.\nOur method leverages the large-scale face foundation model, and we adapt the\npipeline for realism enhancement. By integrating the controllable aspects of\nthe graphics pipeline with our realism enhancement technique, we generate a\nlarge amount of realistic variations-combining the advantages of both\napproaches. Our empirical evaluations demonstrate that models trained using our\nenhanced dataset significantly improve the performance of face recognition\nsystems over the baseline. The source code and datasets will be made available\npublicly: https://www.idiap.ch/paper/digi2real\n","authors":["Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2411.02188v3.pdf","comment":"The dataset would be available here:\n https://www.idiap.ch/paper/digi2real"},{"id":"http://arxiv.org/abs/2411.03696v1","updated":"2024-11-06T06:34:27Z","published":"2024-11-06T06:34:27Z","title":"OccLoff: Learning Optimized Feature Fusion for 3D Occupancy Prediction","summary":" 3D semantic occupancy prediction is crucial for finely representing the\nsurrounding environment, which is essential for ensuring the safety in\nautonomous driving. Existing fusion-based occupancy methods typically involve\nperforming a 2D-to-3D view transformation on image features, followed by\ncomputationally intensive 3D operations to fuse these with LiDAR features,\nleading to high computational costs and reduced accuracy. 
Moreover, current\nresearch on occupancy prediction predominantly focuses on designing specific\nnetwork architectures, often tailored to particular models, with limited\nattention given to the more fundamental aspect of semantic feature learning.\nThis gap hinders the development of more transferable methods that could\nenhance the performance of various occupancy models. To address these\nchallenges, we propose OccLoff, a framework that Learns to Optimize Feature\nFusion for 3D occupancy prediction. Specifically, we introduce a sparse fusion\nencoder with entropy masks that directly fuses 3D and 2D features, improving\nmodel accuracy while reducing computational overhead. Additionally, we propose\na transferable proxy-based loss function and an adaptive hard sample weighting\nalgorithm, which enhance the performance of several state-of-the-art methods.\nExtensive evaluations on the nuScenes and SemanticKITTI benchmarks demonstrate\nthe superiority of our framework, and ablation studies confirm the\neffectiveness of each proposed module.\n","authors":["Ji Zhang","Yiran Ding","Zixin Liu"],"pdf_url":"https://arxiv.org/pdf/2411.03696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03695v1","updated":"2024-11-06T06:33:55Z","published":"2024-11-06T06:33:55Z","title":"AMNCutter: Affinity-Attention-Guided Multi-View Normalized Cutter for\n Unsupervised Surgical Instrument Segmentation","summary":" Surgical instrument segmentation (SIS) is pivotal for robotic-assisted\nminimally invasive surgery, assisting surgeons by identifying surgical\ninstruments in endoscopic video frames. Recent unsupervised surgical instrument\nsegmentation (USIS) methods primarily rely on pseudo-labels derived from\nlow-level features such as color and optical flow, but these methods show\nlimited effectiveness and generalizability in complex and unseen endoscopic\nscenarios. In this work, we propose a label-free unsupervised model featuring a\nnovel module named Multi-View Normalized Cutter (m-NCutter). Different from\nprevious USIS works, our model is trained using a graph-cutting loss function\nthat leverages patch affinities for supervision, eliminating the need for\npseudo-labels. The framework adaptively determines which affinities from which\nlevels should be prioritized. Therefore, the low- and high-level features and\ntheir affinities are effectively integrated to train a label-free unsupervised\nmodel, showing superior effectiveness and generalization ability. We conduct\ncomprehensive experiments across multiple SIS datasets to validate our\napproach's state-of-the-art (SOTA) performance, robustness, and exceptional\npotential as a pre-trained model. Our code is released at\nhttps://github.com/MingyuShengSMY/AMNCutter.\n","authors":["Mingyu Sheng","Jianan Fan","Dongnan Liu","Ron Kikinis","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2411.03695v1.pdf","comment":"This paper was accepted by the 2025 IEEE Winter Conference on\n Applications of Computer Vision (WACV)"},{"id":"http://arxiv.org/abs/2411.03688v1","updated":"2024-11-06T06:14:24Z","published":"2024-11-06T06:14:24Z","title":"Where Do We Stand with Implicit Neural Representations? A Technical and\n Performance Survey","summary":" Implicit Neural Representations (INRs) have emerged as a paradigm in\nknowledge representation, offering exceptional flexibility and performance\nacross a diverse range of applications. 
INRs leverage multilayer perceptrons\n(MLPs) to model data as continuous implicit functions, providing critical\nadvantages such as resolution independence, memory efficiency, and\ngeneralisation beyond discretised data structures. Their ability to solve\ncomplex inverse problems makes them particularly effective for tasks including\naudio reconstruction, image representation, 3D object reconstruction, and\nhigh-dimensional data synthesis. This survey provides a comprehensive review of\nstate-of-the-art INR methods, introducing a clear taxonomy that categorises\nthem into four key areas: activation functions, position encoding, combined\nstrategies, and network structure optimisation. We rigorously analyse their\ncritical properties, such as full differentiability, smoothness, compactness,\nand adaptability to varying resolutions while also examining their strengths\nand limitations in addressing locality biases and capturing fine details. Our\nexperimental comparison offers new insights into the trade-offs between\ndifferent approaches, showcasing the capabilities and challenges of the latest\nINR techniques across various tasks. In addition to identifying areas where\ncurrent methods excel, we highlight key limitations and potential avenues for\nimprovement, such as developing more expressive activation functions, enhancing\npositional encoding mechanisms, and improving scalability for complex,\nhigh-dimensional data. This survey serves as a roadmap for researchers,\noffering practical guidance for future exploration in the field of INRs. We aim\nto foster new methodologies by outlining promising research directions for INRs\nand applications.\n","authors":["Amer Essakine","Yanqi Cheng","Chun-Wun Cheng","Lipei Zhang","Zhongying Deng","Lei Zhu","Carola-Bibiane Schönlieb","Angelica I Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2411.03688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04315v4","updated":"2024-11-06T05:35:40Z","published":"2023-11-07T19:41:19Z","title":"A Data Perspective on Enhanced Identity Preservation for Diffusion\n Personalization","summary":" Large text-to-image models have revolutionized the ability to generate\nimagery using natural language. However, particularly unique or personal visual\nconcepts, such as pets and furniture, will not be captured by the original\nmodel. This has led to interest in how to personalize a text-to-image model.\nDespite significant progress, this task remains a formidable challenge,\nparticularly in preserving the subject's identity. Most researchers attempt to\naddress this issue by modifying model architectures. These methods are capable\nof keeping the subject structure and color but fail to preserve identity\ndetails. Towards this issue, our approach takes a data-centric perspective. We\nintroduce a novel regularization dataset generation strategy on both the text\nand image level. This strategy enables the model to preserve fine details of\nthe desired subjects, such as text and logos. Our method is\narchitecture-agnostic and can be flexibly applied on various text-to-image\nmodels. 
We show on established benchmarks that our data-centric approach forms\nthe new state of the art in terms of identity preservation and text alignment.\n","authors":["Xingzhe He","Zhiwen Cao","Nicholas Kolkin","Lantao Yu","Kun Wan","Helge Rhodin","Ratheesh Kalarot"],"pdf_url":"https://arxiv.org/pdf/2311.04315v4.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2410.13147v5","updated":"2024-11-06T05:18:04Z","published":"2024-10-17T02:04:57Z","title":"Utilizing Large Language Models in an iterative paradigm with Domain\n feedback for Zero-shot Molecule optimization","summary":" Molecule optimization is a critical task in drug discovery to optimize\ndesired properties of a given molecule through chemical modification. Despite\nLarge Language Models (LLMs) holding the potential to efficiently simulate this\ntask by using natural language to direct the optimization, straightforwardly\nutilizing shows limited performance. In this work, we facilitate utilizing LLMs\nin an iterative paradigm by proposing a simple yet highly effective domain\nfeedback provider, namely $\\text{Re}^3$DF. In detail, $\\text{Re}^3$DF harnesses\nan external toolkit, RDKit, to handle the molecule hallucination, if the\nmodified molecule is chemically invalid. Otherwise, its desired properties are\ncomputed and compared to the original one, establishing reliable domain\nfeedback with correct direction and distance towards the objective, followed by\na retrieved example, to explicitly guide the LLM to refine the modified\nmolecule. We conduct experiments across both single- and multi-property\nobjectives with 2 thresholds, where $\\text{Re}^3$DF shows significant\nimprovements. Particularly, for 20 single-property objectives, $\\text{Re}^3$DF\nenhances Hit ratio by 16.95% and 20.76% under loose and strict thresholds,\nrespectively. For 32 multi-property objectives, $\\text{Re}^3$DF enhances Hit\nratio by 6.04% and 5.25%.\n","authors":["Khiem Le","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2410.13147v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09774v3","updated":"2024-11-06T05:16:59Z","published":"2024-09-15T15:46:03Z","title":"Generalizing Alignment Paradigm of Text-to-Image Generation with\n Preferences through $f$-divergence Minimization","summary":" Direct Preference Optimization (DPO) has recently expanded its successful\napplication from aligning large language models (LLMs) to aligning\ntext-to-image models with human preferences, which has generated considerable\ninterest within the community. However, we have observed that these approaches\nrely solely on minimizing the reverse Kullback-Leibler divergence during\nalignment process between the fine-tuned model and the reference model,\nneglecting the incorporation of other divergence constraints. In this study, we\nfocus on extending reverse Kullback-Leibler divergence in the alignment\nparadigm of text-to-image models to $f$-divergence, which aims to garner better\nalignment performance as well as good generation diversity. We provide the\ngeneralized formula of the alignment paradigm under the $f$-divergence\ncondition and thoroughly analyze the impact of different divergence constraints\non alignment process from the perspective of gradient fields. 
We conduct\ncomprehensive evaluation on image-text alignment performance, human value\nalignment performance and generation diversity performance under different\ndivergence constraints, and the results indicate that alignment based on\nJensen-Shannon divergence achieves the best trade-off among them. The option of\ndivergence employed for aligning text-to-image models significantly impacts the\ntrade-off between alignment performance (especially human value alignment) and\ngeneration diversity, which highlights the necessity of selecting an\nappropriate divergence for practical applications.\n","authors":["Haoyuan Sun","Bo Xia","Yongzhe Chang","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2409.09774v3.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2411.03672v1","updated":"2024-11-06T05:11:25Z","published":"2024-11-06T05:11:25Z","title":"Towards 3D Semantic Scene Completion for Autonomous Driving: A\n Meta-Learning Framework Empowered by Deformable Large-Kernel Attention and\n Mamba Model","summary":" Semantic scene completion (SSC) is essential for achieving comprehensive\nperception in autonomous driving systems. However, existing SSC methods often\noverlook the high deployment costs in real-world applications. Traditional\narchitectures, such as 3D Convolutional Neural Networks (3D CNNs) and\nself-attention mechanisms, face challenges in efficiently capturing long-range\ndependencies within 3D voxel grids, limiting their effectiveness. To address\nthese issues, we introduce MetaSSC, a novel meta-learning-based framework for\nSSC that leverages deformable convolution, large-kernel attention, and the\nMamba (D-LKA-M) model. Our approach begins with a voxel-based semantic\nsegmentation (SS) pretraining task, aimed at exploring the semantics and\ngeometry of incomplete regions while acquiring transferable meta-knowledge.\nUsing simulated cooperative perception datasets, we supervise the perception\ntraining of a single vehicle using aggregated sensor data from multiple nearby\nconnected autonomous vehicles (CAVs), generating richer and more comprehensive\nlabels. This meta-knowledge is then adapted to the target domain through a\ndual-phase training strategy that does not add extra model parameters, enabling\nefficient deployment. To further enhance the model's capability in capturing\nlong-sequence relationships within 3D voxel grids, we integrate Mamba blocks\nwith deformable convolution and large-kernel attention into the backbone\nnetwork. Extensive experiments demonstrate that MetaSSC achieves\nstate-of-the-art performance, significantly outperforming competing models\nwhile also reducing deployment costs.\n","authors":["Yansong Qu","Zilin Huang","Zihao Sheng","Tiantian Chen","Sikai Chen"],"pdf_url":"https://arxiv.org/pdf/2411.03672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02077v4","updated":"2024-11-06T05:11:24Z","published":"2024-07-02T09:11:17Z","title":"Hierarchical Temporal Context Learning for Camera-based Semantic Scene\n Completion","summary":" Camera-based 3D semantic scene completion (SSC) is pivotal for predicting\ncomplicated 3D layouts with limited 2D image observations. The existing\nmainstream solutions generally leverage temporal information by roughly\nstacking history frames to supplement the current frame, such straightforward\ntemporal modeling inevitably diminishes valid clues and increases learning\ndifficulty. 
To address this problem, we present HTCL, a novel Hierarchical\nTemporal Context Learning paradigm for improving camera-based semantic scene\ncompletion. The primary innovation of this work involves decomposing temporal\ncontext learning into two hierarchical steps: (a) cross-frame affinity\nmeasurement and (b) affinity-based dynamic refinement. Firstly, to separate\ncritical relevant context from redundant information, we introduce the pattern\naffinity with scale-aware isolation and multiple independent learners for\nfine-grained contextual correspondence modeling. Subsequently, to dynamically\ncompensate for incomplete observations, we adaptively refine the feature\nsampling locations based on initially identified locations with high affinity\nand their neighboring relevant regions. Our method ranks $1^{st}$ on the\nSemanticKITTI benchmark and even surpasses LiDAR-based methods in terms of mIoU\non the OpenOccupancy benchmark. Our code is available on\nhttps://github.com/Arlo0o/HTCL.\n","authors":["Bohan Li","Jiajun Deng","Wenyao Zhang","Zhujin Liang","Dalong Du","Xin Jin","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2407.02077v4.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2411.03670v1","updated":"2024-11-06T05:09:34Z","published":"2024-11-06T05:09:34Z","title":"Touchstone Benchmark: Are We on the Right Way for Evaluating AI\n Algorithms for Medical Segmentation?","summary":" How can we test AI performance? This question seems trivial, but it isn't.\nStandard benchmarks often have problems such as in-distribution and small-size\ntest sets, oversimplified metrics, unfair comparisons, and short-term outcome\npressure. As a consequence, good performance on standard benchmarks does not\nguarantee success in real-world scenarios. To address these problems, we\npresent Touchstone, a large-scale collaborative segmentation benchmark of 9\ntypes of abdominal organs. This benchmark is based on 5,195 training CT scans\nfrom 76 hospitals around the world and 5,903 testing CT scans from 11\nadditional hospitals. This diverse test set enhances the statistical\nsignificance of benchmark results and rigorously evaluates AI algorithms across\nvarious out-of-distribution scenarios. We invited 14 inventors of 19 AI\nalgorithms to train their algorithms, while our team, as a third party,\nindependently evaluated these algorithms on three test sets. In addition, we\nalso evaluated pre-existing AI frameworks--which, differing from algorithms,\nare more flexible and can support different algorithms--including MONAI from\nNVIDIA, nnU-Net from DKFZ, and numerous other open-source frameworks. We are\ncommitted to expanding this benchmark to encourage more innovation of AI\nalgorithms for the medical domain.\n","authors":["Pedro R. A. S. Bassi","Wenxuan Li","Yucheng Tang","Fabian Isensee","Zifu Wang","Jieneng Chen","Yu-Cheng Chou","Yannick Kirchhoff","Maximilian Rokuss","Ziyan Huang","Jin Ye","Junjun He","Tassilo Wald","Constantin Ulrich","Michael Baumgartner","Saikat Roy","Klaus H. Maier-Hein","Paul Jaeger","Yiwen Ye","Yutong Xie","Jianpeng Zhang","Ziyang Chen","Yong Xia","Zhaohu Xing","Lei Zhu","Yousef Sadegheih","Afshin Bozorgpour","Pratibha Kumari","Reza Azad","Dorit Merhof","Pengcheng Shi","Ting Ma","Yuxin Du","Fan Bai","Tiejun Huang","Bo Zhao","Haonan Wang","Xiaomeng Li","Hanxue Gu","Haoyu Dong","Jichen Yang","Maciej A. Mazurowski","Saumya Gupta","Linshan Wu","Jiaxin Zhuang","Hao Chen","Holger Roth","Daguang Xu","Matthew B. Blaschko","Sergio Decherchi","Andrea Cavalli","Alan L. 
Yuille","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.03670v1.pdf","comment":"Accepted to NeurIPS-2024"},{"id":"http://arxiv.org/abs/2409.13941v2","updated":"2024-11-06T05:05:12Z","published":"2024-09-20T23:04:21Z","title":"TalkMosaic: Interactive PhotoMosaic with Multi-modal LLM Q&A\n Interactions","summary":" We use images of cars of a wide range of varieties to compose an image of an\nanimal such as a bird or a lion for the theme of environmental protection to\nmaximize the information about cars in a single composed image and to raise the\nawareness about environmental challenges. We present a novel way of image\ninteraction with an artistically-composed photomosaic image, in which a simple\noperation of \"click and display\" is used to demonstrate the interactive switch\nbetween a tile image in a photomosaic image and the corresponding original car\nimage, which will be automatically saved on the Desktop. We build a multimodal\ncustom GPT named TalkMosaic by incorporating car images information and the\nrelated knowledge to ChatGPT. By uploading the original car image to\nTalkMosaic, we can ask questions about the given car image and get the\ncorresponding answers efficiently and effectively such as where to buy the tire\nin the car image that satisfies high environmental standards. We give an\nin-depth analysis on how to speed up the inference of multimodal LLM using\nsparse attention and quantization techniques with presented probabilistic\nFlashAttention (PrFlashAttention) and Staircase Adaptive Quantization (SAQ)\nmethods. The implemented prototype demonstrates the feasibility and\neffectiveness of the presented approach.\n","authors":["Kevin Li","Fulu Li"],"pdf_url":"https://arxiv.org/pdf/2409.13941v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.14789v2","updated":"2024-11-06T04:41:48Z","published":"2024-08-27T05:31:30Z","title":"Revisiting Surgical Instrument Segmentation Without Human Intervention:\n A Graph Partitioning View","summary":" Surgical instrument segmentation (SIS) on endoscopic images stands as a\nlong-standing and essential task in the context of computer-assisted\ninterventions for boosting minimally invasive surgery. Given the recent surge\nof deep learning methodologies and their data-hungry nature, training a neural\npredictive model based on massive expert-curated annotations has been\ndominating and served as an off-the-shelf approach in the field, which could,\nhowever, impose prohibitive burden to clinicians for preparing fine-grained\npixel-wise labels corresponding to the collected surgical video frames. In this\nwork, we propose an unsupervised method by reframing the video frame\nsegmentation as a graph partitioning problem and regarding image pixels as\ngraph nodes, which is significantly different from the previous efforts. A\nself-supervised pre-trained model is firstly leveraged as a feature extractor\nto capture high-level semantic features. Then, Laplacian matrixs are computed\nfrom the features and are eigendecomposed for graph partitioning. On the \"deep\"\neigenvectors, a surgical video frame is meaningfully segmented into different\nmodules such as tools and tissues, providing distinguishable semantic\ninformation like locations, classes, and relations. The segmentation problem\ncan then be naturally tackled by applying clustering or threshold on the\neigenvectors. Extensive experiments are conducted on various datasets (e.g.,\nEndoVis2017, EndoVis2018, UCL, etc.) for different clinical endpoints. 
Across\nall the challenging scenarios, our method demonstrates outstanding performance\nand robustness higher than unsupervised state-of-the-art (SOTA) methods. The\ncode is released at https://github.com/MingyuShengSMY/GraphClusteringSIS.git.\n","authors":["Mingyu Sheng","Jianan Fan","Dongnan Liu","Ron Kikinis","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2408.14789v2.pdf","comment":"This paper is accepted by The 32nd ACM International Conference on\n Multimedia (ACM MM 2024) Workshop on Multimedia Computing for Health and\n Medicine (MCHM)"},{"id":"http://arxiv.org/abs/2406.15735v3","updated":"2024-11-06T03:53:13Z","published":"2024-06-22T04:56:16Z","title":"Identifying and Solving Conditional Image Leakage in Image-to-Video\n Diffusion Model","summary":" Diffusion models have obtained substantial progress in image-to-video\ngeneration. However, in this paper, we find that these models tend to generate\nvideos with less motion than expected. We attribute this to the issue called\nconditional image leakage, where the image-to-video diffusion models (I2V-DMs)\ntend to over-rely on the conditional image at large time steps. We further\naddress this challenge from both inference and training aspects. First, we\npropose to start the generation process from an earlier time step to avoid the\nunreliable large-time steps of I2V-DMs, as well as an initial noise\ndistribution with optimal analytic expressions (Analytic-Init) by minimizing\nthe KL divergence between it and the actual marginal distribution to bridge the\ntraining-inference gap. Second, we design a time-dependent noise distribution\n(TimeNoise) for the conditional image during training, applying higher noise\nlevels at larger time steps to disrupt it and reduce the model's dependency on\nit. We validate these general strategies on various I2V-DMs on our collected\nopen-domain image benchmark and the UCF101 dataset. Extensive results show that\nour methods outperform baselines by producing higher motion scores with lower\nerrors while maintaining image alignment and temporal consistency, thereby\nyielding superior overall performance and enabling more accurate motion\ncontrol. The project page: \\url{https://cond-image-leak.github.io/}.\n","authors":["Min Zhao","Hongzhou Zhu","Chendong Xiang","Kaiwen Zheng","Chongxuan Li","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.15735v3.pdf","comment":"NeurIPS 2024. Project page: https://cond-image-leak.github.io/"},{"id":"http://arxiv.org/abs/2411.01797v2","updated":"2024-11-06T03:45:13Z","published":"2024-11-04T04:45:45Z","title":"AIWR: Aerial Image Water Resource Dataset for Segmentation Analysis","summary":" Effective water resource management is crucial in agricultural regions like\nnortheastern Thailand, where limited water retention in sandy soils poses\nsignificant challenges. In response to this issue, the Aerial Image Water\nResource (AIWR) dataset was developed, comprising 800 aerial images focused on\nnatural and artificial water bodies in this region. The dataset was created\nusing Bing Maps and follows the standards of the Fundamental Geographic Data\nSet (FGDS). It includes ground truth annotations validated by experts in remote\nsensing, making it an invaluable resource for researchers in geoinformatics,\ncomputer vision, and artificial intelligence. The AIWR dataset presents\nconsiderable challenges, such as segmentation due to variations in the size,\ncolor, shape, and similarity of water bodies, which often resemble other land\nuse categories. 
The objective of the proposed dataset is to explore advanced\nAI-driven methods for water body segmentation, addressing the unique challenges\nposed by the dataset complexity and limited size. This dataset and related\nresearch contribute to the development of novel algorithms for water\nmanagement, supporting sustainable agricultural practices in regions facing\nsimilar challenges.\n","authors":["Sangdaow Noppitak","Emmanuel Okafor","Olarik Surinta"],"pdf_url":"https://arxiv.org/pdf/2411.01797v2.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.03638v1","updated":"2024-11-06T03:30:46Z","published":"2024-11-06T03:30:46Z","title":"Adaptive Stereo Depth Estimation with Multi-Spectral Images Across All\n Lighting Conditions","summary":" Depth estimation under adverse conditions remains a significant challenge.\nRecently, multi-spectral depth estimation, which integrates both visible light\nand thermal images, has shown promise in addressing this issue. However,\nexisting algorithms struggle with precise pixel-level feature matching,\nlimiting their ability to fully exploit geometric constraints across different\nspectra. To address this, we propose a novel framework incorporating stereo\ndepth estimation to enforce accurate geometric constraints. In particular, we\ntreat the visible light and thermal images as a stereo pair and utilize a\nCross-modal Feature Matching (CFM) Module to construct a cost volume for\npixel-level matching. To mitigate the effects of poor lighting on stereo\nmatching, we introduce Degradation Masking, which leverages robust monocular\nthermal depth estimation in degraded regions. Our method achieves\nstate-of-the-art (SOTA) performance on the Multi-Spectral Stereo (MS2) dataset,\nwith qualitative evaluations demonstrating high-quality depth maps under\nvarying lighting conditions.\n","authors":["Zihan Qin","Jialei Xu","Wenbo Zhao","Junjun Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2411.03638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03637v1","updated":"2024-11-06T03:28:06Z","published":"2024-11-06T03:28:06Z","title":"Structure Consistent Gaussian Splatting with Matching Prior for Few-shot\n Novel View Synthesis","summary":" Despite the substantial progress of novel view synthesis, existing methods,\neither based on the Neural Radiance Fields (NeRF) or more recently 3D Gaussian\nSplatting (3DGS), suffer significant degradation when the input becomes sparse.\nNumerous efforts have been introduced to alleviate this problem, but they still\nstruggle to synthesize satisfactory results efficiently, especially in the\nlarge scene. In this paper, we propose SCGaussian, a Structure Consistent\nGaussian Splatting method using matching priors to learn 3D consistent scene\nstructure. Considering the high interdependence of Gaussian attributes, we\noptimize the scene structure in two folds: rendering geometry and, more\nimportantly, the position of Gaussian primitives, which is hard to be directly\nconstrained in the vanilla 3DGS due to the non-structure property. To achieve\nthis, we present a hybrid Gaussian representation. Besides the ordinary\nnon-structure Gaussian primitives, our model also consists of ray-based\nGaussian primitives that are bound to matching rays and whose optimization of\ntheir positions is restricted along the ray. Thus, we can utilize the matching\ncorrespondence to directly enforce the position of these Gaussian primitives to\nconverge to the surface points where rays intersect. 
Extensive experiments on\nforward-facing, surrounding, and complex large scenes show the effectiveness of\nour approach with state-of-the-art performance and high efficiency. Code is\navailable at https://github.com/prstrive/SCGaussian.\n","authors":["Rui Peng","Wangze Xu","Luyang Tang","Liwei Liao","Jianbo Jiao","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03637v1.pdf","comment":"NeurIPS 2024 Accepted"},{"id":"http://arxiv.org/abs/2410.21872v2","updated":"2024-11-06T02:52:47Z","published":"2024-10-29T09:08:57Z","title":"Advancing Efficient Brain Tumor Multi-Class Classification -- New\n Insights from the Vision Mamba Model in Transfer Learning","summary":" Early and accurate diagnosis of brain tumors is crucial for improving patient\nsurvival rates. However, the detection and classification of brain tumors are\nchallenging due to their diverse types and complex morphological\ncharacteristics. This study investigates the application of pre-trained models\nfor brain tumor classification, with a particular focus on deploying the Mamba\nmodel. We fine-tuned several mainstream transfer learning models and applied\nthem to the multi-class classification of brain tumors. By comparing these\nmodels to those trained from scratch, we demonstrated the significant\nadvantages of transfer learning, especially in the medical imaging field, where\nannotated data is often limited. Notably, we introduced the Vision Mamba (Vim),\na novel network architecture, and applied it for the first time in brain tumor\nclassification, achieving exceptional classification accuracy. Experimental\nresults indicate that the Vim model achieved 100% classification accuracy on an\nindependent test set, emphasizing its potential for tumor classification tasks.\nThese findings underscore the effectiveness of transfer learning in brain tumor\nclassification and reveal that, compared to existing state-of-the-art models,\nthe Vim model is lightweight, efficient, and highly accurate, offering a new\nperspective for clinical applications. Furthermore, the framework proposed in\nthis study for brain tumor classification, based on transfer learning and the\nVision Mamba model, is broadly applicable to other medical imaging\nclassification problems.\n","authors":["Yinyi Lai","Anbo Cao","Yuan Gao","Jiaqi Shang","Zongyu Li","Jia Guo"],"pdf_url":"https://arxiv.org/pdf/2410.21872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03628v1","updated":"2024-11-06T02:50:30Z","published":"2024-11-06T02:50:30Z","title":"StreamingBench: Assessing the Gap for MLLMs to Achieve Streaming Video\n Understanding","summary":" The rapid development of Multimodal Large Language Models (MLLMs) has\nexpanded their capabilities from image comprehension to video understanding.\nHowever, most of these MLLMs focus primarily on offline video comprehension,\nnecessitating extensive processing of all video frames before any queries can\nbe made. This presents a significant gap compared to the human ability to\nwatch, listen, think, and respond to streaming inputs in real time,\nhighlighting the limitations of current MLLMs. In this paper, we introduce\nStreamingBench, the first comprehensive benchmark designed to evaluate the\nstreaming video understanding capabilities of MLLMs. StreamingBench assesses\nthree core aspects of streaming video understanding: (1) real-time visual\nunderstanding, (2) omni-source understanding, and (3) contextual understanding.\nThe benchmark consists of 18 tasks, featuring 900 videos and 4,500\nhuman-curated QA pairs. 
Each video features five questions presented at\ndifferent time points to simulate a continuous streaming scenario. We conduct\nexperiments on StreamingBench with 13 open-source and proprietary MLLMs and\nfind that even the most advanced proprietary MLLMs like Gemini 1.5 Pro and\nGPT-4o perform significantly below human-level streaming video understanding\ncapabilities. We hope our work can facilitate further advancements for MLLMs,\nempowering them to approach human-level video comprehension and interaction in\nmore realistic scenarios.\n","authors":["Junming Lin","Zheng Fang","Chi Chen","Zihao Wan","Fuwen Luo","Peng Li","Yang Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2411.03628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03618v1","updated":"2024-11-06T02:23:38Z","published":"2024-11-06T02:23:38Z","title":"Cross Feature Fusion of Fundus Image and Generated Lesion Map for\n Referable Diabetic Retinopathy Classification","summary":" Diabetic Retinopathy (DR) is a primary cause of blindness, necessitating\nearly detection and diagnosis. This paper focuses on referable DR\nclassification to enhance the applicability of the proposed method in clinical\npractice. We develop an advanced cross-learning DR classification method\nleveraging transfer learning and cross-attention mechanisms. The proposed\nmethod employs the Swin U-Net architecture to segment lesion maps from DR\nfundus images. The Swin U-Net segmentation model, enriched with DR lesion\ninsights, is transferred to generate a lesion map. Both the fundus image and\nits segmented lesion map are used as complementary inputs for the\nclassification model. A cross-attention mechanism is deployed to improve the\nmodel's ability to capture fine-grained details from the input pairs. Our\nexperiments, utilizing two public datasets, FGADR and EyePACS, demonstrate a\nsuperior accuracy of 94.6%, surpassing current state-of-the-art methods by\n4.4%. To this end, we aim for the proposed method to be seamlessly integrated\ninto clinical workflows, enhancing accuracy and efficiency in identifying\nreferable DR.\n","authors":["Dahyun Mok","Junghyun Bum","Le Duc Tai","Hyunseung Choo"],"pdf_url":"https://arxiv.org/pdf/2411.03618v1.pdf","comment":"ACCV 2024 accepted"},{"id":"http://arxiv.org/abs/2406.16473v2","updated":"2024-11-06T02:17:05Z","published":"2024-06-24T09:25:02Z","title":"D2SP: Dynamic Dual-Stage Purification Framework for Dual Noise\n Mitigation in Vision-based Affective Recognition","summary":" The contemporary state-of-the-art of Dynamic Facial Expression Recognition\n(DFER) technology facilitates remarkable progress by deriving emotional\nmappings of facial expressions from video content, underpinned by training on\nvoluminous datasets. Yet, the DFER datasets encompass a substantial volume of\nnoise data. Noise arises from low-quality captures that defy logical labeling,\nand instances that suffer from mislabeling due to annotation bias, engendering\ntwo principal types of uncertainty: the uncertainty regarding data usability\nand the uncertainty concerning label reliability. Addressing the two types of\nuncertainty, we have meticulously crafted a two-stage framework aiming at\n\\textbf{S}eeking \\textbf{C}ertain data \\textbf{I}n extensive \\textbf{U}ncertain\ndata (SCIU). This initiative aims to purge the DFER datasets of these\nuncertainties, thereby ensuring that only clean, verified data is employed in\ntraining processes. 
To mitigate the issue of low-quality samples, we introduce\nthe Coarse-Grained Pruning (CGP) stage, which assesses sample weights and\nprunes those deemed unusable due to their low weight. For samples with\nincorrect annotations, the Fine-Grained Correction (FGC) stage evaluates\nprediction stability to rectify mislabeled data. Moreover, SCIU is conceived as\na universally compatible, plug-and-play framework, tailored to integrate\nseamlessly with prevailing DFER methodologies. Rigorous experiments across\nprevalent DFER datasets and against numerous benchmark methods substantiates\nSCIU's capacity to markedly elevate performance metrics.\n","authors":["Haoran Wang","Xinji Mai","Zeng Tao","Xuan Tong","Junxiong Lin","Yan Wang","Jiawen Yu","Boyang Wang","Shaoqi Yan","Qing Zhao","Ziheng Zhou","Shuyong Gao","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.16473v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03615v1","updated":"2024-11-06T02:16:34Z","published":"2024-11-06T02:16:34Z","title":"ADMIRE: a locally adaptive single-image, non-uniformity correction and\n denoising algorithm: application to uncooled IR camera","summary":" We propose a new way to correct for the non-uniformity (NU) and the noise in\nuncooled infrared-type images. This method works on static images, needs no\nregistration, no camera motion and no model for the non uniformity. The\nproposed method uses an hybrid scheme including an automatic locally-adaptive\ncontrast adjustment and a state-of-the-art image denoising method. It permits\nto correct for a fully non-linear NU and the noise efficiently using only one\nimage. We compared it with total variation on real raw and simulated NU\ninfrared images. The strength of this approach lies in its simplicity, low\ncomputational cost. It needs no test-pattern or calibration and produces no\n\"ghost-artefact\".\n","authors":["Yohann Tendero","Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.03615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03610v1","updated":"2024-11-06T02:05:44Z","published":"2024-11-06T02:05:44Z","title":"LCP-Fusion: A Neural Implicit SLAM with Enhanced Local Constraints and\n Computable Prior","summary":" Recently the dense Simultaneous Localization and Mapping (SLAM) based on\nneural implicit representation has shown impressive progress in hole filling\nand high-fidelity mapping. Nevertheless, existing methods either heavily rely\non known scene bounds or suffer inconsistent reconstruction due to drift in\npotential loop-closure regions, or both, which can be attributed to the\ninflexible representation and lack of local constraints. In this paper, we\npresent LCP-Fusion, a neural implicit SLAM system with enhanced local\nconstraints and computable prior, which takes the sparse voxel octree structure\ncontaining feature grids and SDF priors as hybrid scene representation,\nenabling the scalability and robustness during mapping and tracking. To enhance\nthe local constraints, we propose a novel sliding window selection strategy\nbased on visual overlap to address the loop-closure, and a practical warping\nloss to constrain relative poses. Moreover, we estimate SDF priors as coarse\ninitialization for implicit features, which brings additional explicit\nconstraints and robustness, especially when a light but efficient adaptive\nearly ending is adopted. 
Experiments demonstrate that our method achieve better\nlocalization accuracy and reconstruction consistency than existing RGB-D\nimplicit SLAM, especially in challenging real scenes (ScanNet) as well as\nself-captured scenes with unknown scene bounds. The code is available at\nhttps://github.com/laliwang/LCP-Fusion.\n","authors":["Jiahui Wang","Yinan Deng","Yi Yang","Yufeng Yue"],"pdf_url":"https://arxiv.org/pdf/2411.03610v1.pdf","comment":"Accepted by 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2411.03576v1","updated":"2024-11-06T00:34:26Z","published":"2024-11-06T00:34:26Z","title":"Hybrid Attention for Robust RGB-T Pedestrian Detection in Real-World\n Conditions","summary":" Multispectral pedestrian detection has gained significant attention in recent\nyears, particularly in autonomous driving applications. To address the\nchallenges posed by adversarial illumination conditions, the combination of\nthermal and visible images has demonstrated its advantages. However, existing\nfusion methods rely on the critical assumption that the RGB-Thermal (RGB-T)\nimage pairs are fully overlapping. These assumptions often do not hold in\nreal-world applications, where only partial overlap between images can occur\ndue to sensors configuration. Moreover, sensor failure can cause loss of\ninformation in one modality. In this paper, we propose a novel module called\nthe Hybrid Attention (HA) mechanism as our main contribution to mitigate\nperformance degradation caused by partial overlap and sensor failure, i.e. when\nat least part of the scene is acquired by only one sensor. We propose an\nimproved RGB-T fusion algorithm, robust against partial overlap and sensor\nfailure encountered during inference in real-world applications. We also\nleverage a mobile-friendly backbone to cope with resource constraints in\nembedded systems. We conducted experiments by simulating various partial\noverlap and sensor failure scenarios to evaluate the performance of our\nproposed method. The results demonstrate that our approach outperforms\nstate-of-the-art methods, showcasing its superiority in handling real-world\nchallenges.\n","authors":["Arunkumar Rathinam","Leo Pauly","Abd El Rahman Shabayek","Wassim Rharbaoui","Anis Kacem","Vincent Gaudillière","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2411.03576v1.pdf","comment":"Accepted for publication in IEEE Robotics and Automation Letters,\n October 2024"},{"id":"http://arxiv.org/abs/2411.03569v1","updated":"2024-11-06T00:17:36Z","published":"2024-11-06T00:17:36Z","title":"Towards Personalized Federated Learning via Comprehensive Knowledge\n Distillation","summary":" Federated learning is a distributed machine learning paradigm designed to\nprotect data privacy. However, data heterogeneity across various clients\nresults in catastrophic forgetting, where the model rapidly forgets previous\nknowledge while acquiring new knowledge. To address this challenge,\npersonalized federated learning has emerged to customize a personalized model\nfor each client. However, the inherent limitation of this mechanism is its\nexcessive focus on personalization, potentially hindering the generalization of\nthose models. In this paper, we present a novel personalized federated learning\nmethod that uses global and historical models as teachers and the local model\nas the student to facilitate comprehensive knowledge distillation. 
The\nhistorical model represents the local model from the last round of client\ntraining, containing historical personalized knowledge, while the global model\nrepresents the aggregated model from the last round of server aggregation,\ncontaining global generalized knowledge. By applying knowledge distillation, we\neffectively transfer global generalized knowledge and historical personalized\nknowledge to the local model, thus mitigating catastrophic forgetting and\nenhancing the general performance of personalized models. Extensive\nexperimental results demonstrate the significant advantages of our method.\n","authors":["Pengju Wang","Bochao Liu","Weijia Guo","Yong Li","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2411.03569v1.pdf","comment":"Accepted by IEEE SMC 2024"},{"id":"http://arxiv.org/abs/2411.03568v1","updated":"2024-11-06T00:16:16Z","published":"2024-11-06T00:16:16Z","title":"The American Sign Language Knowledge Graph: Infusing ASL Models with\n Linguistic Knowledge","summary":" Language models for American Sign Language (ASL) could make language\ntechnologies substantially more accessible to those who sign. To train models\non tasks such as isolated sign recognition (ISR) and ASL-to-English\ntranslation, datasets provide annotated video examples of ASL signs. To\nfacilitate the generalizability and explainability of these models, we\nintroduce the American Sign Language Knowledge Graph (ASLKG), compiled from\ntwelve sources of expert linguistic knowledge. We use the ASLKG to train\nneuro-symbolic models for 3 ASL understanding tasks, achieving accuracies of\n91% on ISR, 14% for predicting the semantic features of unseen signs, and 36%\nfor classifying the topic of Youtube-ASL videos.\n","authors":["Lee Kezar","Nidhi Munikote","Zian Zeng","Zed Sehyr","Naomi Caselli","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2411.03568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21169v3","updated":"2024-11-06T00:11:08Z","published":"2024-10-28T16:11:35Z","title":"Document Parsing Unveiled: Techniques, Challenges, and Prospects for\n Structured Information Extraction","summary":" Document parsing is essential for converting unstructured and semi-structured\ndocuments-such as contracts, academic papers, and invoices-into structured,\nmachine-readable data. Document parsing extract reliable structured data from\nunstructured inputs, providing huge convenience for numerous applications.\nEspecially with recent achievements in Large Language Models, document parsing\nplays an indispensable role in both knowledge base construction and training\ndata generation. This survey presents a comprehensive review of the current\nstate of document parsing, covering key methodologies, from modular pipeline\nsystems to end-to-end models driven by large vision-language models. Core\ncomponents such as layout detection, content extraction (including text,\ntables, and mathematical expressions), and multi-modal data integration are\nexamined in detail. Additionally, this paper discusses the challenges faced by\nmodular document parsing systems and vision-language models in handling complex\nlayouts, integrating multiple modules, and recognizing high-density text. 
It\nemphasizes the importance of developing larger and more diverse datasets and\noutlines future research directions.\n","authors":["Qintong Zhang","Victor Shea-Jay Huang","Bin Wang","Junyuan Zhang","Zhengren Wang","Hao Liang","Shawn Wang","Matthieu Lin","Conghui He","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.21169v3.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000..7f5166c Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 0000000..9ded9d9 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + 
--date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid 
var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..138781a --- /dev/null +++ b/index.html @@ -0,0 +1,45866 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Robotics 41 + +
+
+
+ + ☆ Fed-EC: Bandwidth-Efficient Clustering-Based Federated Learning For + Autonomous Visual Robot Navigation + + +
Centralized learning requires data to be aggregated at a central server,
+which poses significant challenges in terms of data privacy and bandwidth
+consumption. Federated learning presents a compelling alternative; however,
+vanilla federated learning methods deployed in robotics aim to learn a single
+global model across robots that works ideally for all, whereas in practice one
+model may not be well suited for robots deployed in various environments. This
+paper proposes Federated-EmbedCluster (Fed-EC), a clustering-based federated
+learning framework that is deployed with vision-based autonomous robot
+navigation in diverse outdoor environments. The framework addresses the key
+federated learning challenge of the deteriorating performance of a single
+global model due to the presence of non-IID data across real-world robots.
+Extensive real-world experiments validate that Fed-EC reduces the communication
+size by 23x for each robot while matching the performance of centralized
+learning for goal-oriented navigation and outperforming local learning. Fed-EC
+can also transfer previously learnt models to new robots that join the cluster.
+
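As a rough illustration of the clustering-based aggregation idea described above, the sketch below groups clients by an embedding and averages weights only within each cluster. This is a minimal sketch under assumed interfaces, not the Fed-EC implementation; `embeddings`, `client_weights`, and the choice of k-means are all illustrative assumptions.

```python
import numpy as np
from sklearn.cluster import KMeans

def cluster_and_aggregate(embeddings, client_weights, n_clusters=3):
    """Group robots by an embedding of their local data/model, then run a simple
    FedAvg only within each cluster so robots in similar environments share a model.

    embeddings: (n_clients, d) array summarizing each client's data distribution.
    client_weights: list of flat parameter vectors, one per client.
    """
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(embeddings)
    cluster_models = {}
    for c in range(n_clusters):
        members = [w for w, lbl in zip(client_weights, labels) if lbl == c]
        if members:
            cluster_models[c] = np.mean(members, axis=0)  # unweighted FedAvg per cluster
    return labels, cluster_models
```

Each robot would then download only its own cluster's model rather than one monolithic global model, which is one way a clustering-based scheme can reduce per-robot communication.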
+
+
+
+
+ + ☆ Rescheduling after vehicle failures in the multi-depot rural postman + problem with rechargeable and reusable vehicles + + +
+ We present a centralized auction algorithm to solve the Multi-Depot Rural +Postman Problem with Rechargeable and Reusable Vehicles (MD-RPP-RRV), focusing +on rescheduling arc routing after vehicle failures. The problem involves +finding heuristically obtained best feasible routes for multiple rechargeable +and reusable vehicles with capacity constraints capable of performing multiple +trips from multiple depots, with the possibility of vehicle failures. Our +algorithm auctions the failed trips to active (non-failed) vehicles through +local auctioning, modifying initial routes to handle dynamic vehicle failures +efficiently. When a failure occurs, the algorithm searches for the best active +vehicle to perform the failed trip and inserts the trip into that vehicle's +route, which avoids a complete rescheduling and reduces the computational +effort. We compare the algorithm's solutions against offline optimal solutions +obtained from solving a Mixed Integer Linear Programming (MILP) formulation +using the Gurobi solver; this formulation assumes that perfect information +about the vehicle failures and failure times is given. The results demonstrate +that the centralized auction algorithm produces solutions that are, in some +cases, near optimal; moreover, the execution time for the proposed approach is +much more consistent and is, for some instances, orders of magnitude less than +the execution time of the Gurobi solver. The theoretical analysis provides an +upper bound for the competitive ratio and computational complexity of our +algorithm, offering a formal performance guarantee in dynamic failure +scenarios. + +
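The local-auction step can be pictured with a small sketch: each active vehicle bids the marginal cost of inserting the failed trip into its current route, and the cheapest feasible bid wins. This is an illustrative reconstruction of the general idea, not the paper's algorithm; `route_cost` and `is_feasible` are assumed user-supplied callbacks encoding travel cost, recharging, and capacity constraints.

```python
def marginal_insertion_cost(route, trip, route_cost):
    """Return (cost_increase, new_route) for the cheapest insertion of `trip`."""
    base = route_cost(route)
    best_increase, best_route = float("inf"), None
    for i in range(len(route) + 1):
        candidate = route[:i] + [trip] + route[i:]
        increase = route_cost(candidate) - base
        if increase < best_increase:
            best_increase, best_route = increase, candidate
    return best_increase, best_route

def auction_failed_trip(active_routes, failed_trip, route_cost, is_feasible):
    """Each active vehicle bids its marginal insertion cost; the lowest feasible bid wins."""
    bids = {}
    for vehicle_id, route in active_routes.items():
        increase, new_route = marginal_insertion_cost(route, failed_trip, route_cost)
        if is_feasible(vehicle_id, new_route):
            bids[vehicle_id] = (increase, new_route)
    if not bids:
        return None  # no active vehicle can absorb the failed trip
    winner = min(bids, key=lambda v: bids[v][0])
    return winner, bids[winner][1]
```

Because only the failed trip is re-auctioned, the rest of the schedule stays untouched, which is what keeps rescheduling far cheaper than re-solving the full MILP.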
+
+
+
+
+ + ☆ Problem Space Transformations for Generalisation in Behavioural Cloning + + +
+ The combination of behavioural cloning and neural networks has driven +significant progress in robotic manipulation. As these algorithms may require a +large number of demonstrations for each task of interest, they remain +fundamentally inefficient in complex scenarios. This issue is aggravated when +the system is treated as a black-box, ignoring its physical properties. This +work characterises widespread properties of robotic manipulation, such as pose +equivariance and locality. We empirically demonstrate that transformations +arising from each of these properties allow neural policies trained with +behavioural cloning to better generalise to out-of-distribution problem +instances. + +
+
+
+
+
+ + ☆ Memorized action chunking with Transformers: Imitation learning for + vision-based tissue surface scanning + + +
+ Optical sensing technologies are emerging technologies used in cancer +surgeries to ensure the complete removal of cancerous tissue. While point-wise +assessment has many potential applications, incorporating automated large area +scanning would enable holistic tissue sampling. However, such scanning tasks +are challenging due to their long-horizon dependency and the requirement for +fine-grained motion. To address these issues, we introduce Memorized Action +Chunking with Transformers (MACT), an intuitive yet efficient imitation +learning method for tissue surface scanning tasks. It utilizes a sequence of +past images as historical information to predict near-future action sequences. +In addition, hybrid temporal-spatial positional embeddings were employed to +facilitate learning. In various simulation settings, MACT demonstrated +significant improvements in contour scanning and area scanning over the +baseline model. In real-world testing, with only 50 demonstration trajectories, +MACT surpassed the baseline model by achieving a 60-80% success rate on all +scanning tasks. Our findings suggest that MACT is a promising model for +adaptive scanning in surgical settings. + +
+
+
+
+
+ + ☆ Design and control of a robotic payload stabilization mechanism for + rocket flights + + +
+ The use of parallel manipulators in aerospace engineering has gained +significant attention due to their ability to provide improved stability and +precision. This paper presents the design, control, and analysis of 'STEWIE', +which is a three-degree-of-freedom (DoF) parallel manipulator robot developed +by members of the thrustMIT rocketry team, as a payload stabilization mechanism +for their sounding rocket, 'Altair'. The goal of the robot was to demonstrate +the attitude control of the parallel plate against the continuous change in +orientation experienced by the rocket during its flight, stabilizing the +payloads. At the same time, the high gravitational forces (G-forces) and +vibrations experienced by the sounding rocket are counteracted. A novel design +of the mechanism, inspired by a standard Stewart platform, is proposed which +was down-scaled to fit inside a 4U CubeSat within its space constraints. The +robot uses three micro servo motors to actuate the links that control the +alignment of the parallel plate. In addition to the actuation mechanism, a +robust control system for its manipulation was developed for the robot. The +robot represents a significant advancement in the field of space robotics in +the aerospace industry by demonstrating the successful implementation of +complex robotic mechanisms in small, confined spaces such as CubeSats, which +are standard form factors for large payloads in the aerospace industry. + +
+
+ comment: For code and design files, refer to + https://github.com/utkarshanand140/Stewie-Robot +
+
+
+
+
+ + ☆ Select2Plan: Training-Free ICL-Based Planning through VQA and Memory + Retrieval + + +
This study explores the potential of off-the-shelf Vision-Language Models
+(VLMs) for high-level robot planning in the context of autonomous navigation.
+Indeed, while most existing learning-based approaches for path planning
+require extensive task-specific training or fine-tuning, we demonstrate how such
+training can be avoided in most practical cases. To do this, we introduce
+Select2Plan (S2P), a novel training-free framework for high-level robot
+planning which completely eliminates the need for fine-tuning or specialised
+training. By leveraging structured Visual Question-Answering (VQA) and
+In-Context Learning (ICL), our approach drastically reduces the need for data
+collection, requiring a fraction of the task-specific data typically used by
+trained models, or even relying only on online data. Our method facilitates the
+effective use of a generally trained VLM in a flexible and cost-efficient way,
+and does not require additional sensing beyond a simple monocular camera.
+We demonstrate its adaptability across various scene types, context sources,
+and sensing setups. We evaluate our approach in two distinct scenarios:
+traditional First-Person View (FPV) and infrastructure-driven Third-Person View
+(TPV) navigation, demonstrating the flexibility and simplicity of our method.
+Our technique significantly enhances the navigational capabilities of a
+baseline VLM by approximately 50% in the TPV scenario, and is comparable to
+trained models in the FPV one, with as few as 20 demonstrations.
+
+
+
+
+
+ + ☆ Object-Centric Dexterous Manipulation from Human Motion Data + + +
+ Manipulating objects to achieve desired goal states is a basic but important +skill for dexterous manipulation. Human hand motions demonstrate proficient +manipulation capability, providing valuable data for training robots with +multi-finger hands. Despite this potential, substantial challenges arise due to +the embodiment gap between human and robot hands. In this work, we introduce a +hierarchical policy learning framework that uses human hand motion data for +training object-centric dexterous robot manipulation. At the core of our method +is a high-level trajectory generative model, learned with a large-scale human +hand motion capture dataset, to synthesize human-like wrist motions conditioned +on the desired object goal states. Guided by the generated wrist motions, deep +reinforcement learning is further used to train a low-level finger controller +that is grounded in the robot's embodiment to physically interact with the +object to achieve the goal. Through extensive evaluation across 10 household +objects, our approach not only demonstrates superior performance but also +showcases generalization capability to novel object geometries and goal states. +Furthermore, we transfer the learned policies from simulation to a real-world +bimanual dexterous robot system, further demonstrating its applicability in +real-world scenarios. Project website: +https://cypypccpy.github.io/obj-dex.github.io/. + +
+
+ comment: 20 pages, 7 figures +
+
+
+
+
+ + ☆ ET-SEED: Efficient Trajectory-Level SE(3) Equivariant Diffusion Policy + + +
Imitation learning, e.g., diffusion policies, has proven effective in
+various robotic manipulation tasks. However, extensive demonstrations are
+required for policy robustness and generalization. To reduce this reliance on
+demonstrations, we leverage spatial symmetry and propose ET-SEED, an efficient
+trajectory-level SE(3) equivariant diffusion model for generating action
+sequences in complex robot manipulation tasks. Furthermore, previous equivariant
+diffusion models require per-step equivariance in the Markov process,
+making it difficult to learn a policy under such strong constraints. We
+theoretically extend equivariant Markov kernels and simplify the condition for an
+equivariant diffusion process, thereby significantly improving training
+efficiency for trajectory-level SE(3) equivariant diffusion policies in an
+end-to-end manner. We evaluate ET-SEED on representative robotic manipulation
+tasks involving rigid, articulated, and deformable objects. Experiments
+demonstrate the superior data efficiency and manipulation proficiency of our
+proposed method, as well as its ability to generalize to unseen configurations
+with only a few demonstrations. Website: https://et-seed.github.io/
+
+
+ comment: Accept to CoRL 2024 Workshop on X-Embodiment Robot Learning +
+
+
+
+
+ + ☆ Continuous-Time State Estimation Methods in Robotics: A Survey + + +
+ Accurate, efficient, and robust state estimation is more important than ever +in robotics as the variety of platforms and complexity of tasks continue to +grow. Historically, discrete-time filters and smoothers have been the dominant +approach, in which the estimated variables are states at discrete sample times. +The paradigm of continuous-time state estimation proposes an alternative +strategy by estimating variables that express the state as a continuous +function of time, which can be evaluated at any query time. Not only can this +benefit downstream tasks such as planning and control, but it also +significantly increases estimator performance and flexibility, as well as +reduces sensor preprocessing and interfacing complexity. Despite this, +continuous-time methods remain underutilized, potentially because they are less +well-known within robotics. To remedy this, this work presents a unifying +formulation of these methods and the most exhaustive literature review to date, +systematically categorizing prior work by methodology, application, state +variables, historical context, and theoretical contribution to the field. By +surveying splines and Gaussian processes together and contextualizing works +from other research domains, this work identifies and analyzes open problems in +continuous-time state estimation and suggests new research directions. + +
+
+ comment: Submitted to IEEE Transactions on Robotics (T-RO) +
+
+
+
+
+ + ☆ DEIO: Deep Event Inertial Odometry + + +
Event cameras are bio-inspired, motion-activated sensors that demonstrate
+impressive potential in handling challenging situations, such as motion blur
+and high dynamic range. Despite their promise, existing event-based
+simultaneous localization and mapping (SLAM) approaches exhibit limited
+performance in real-world applications. On the other hand, state-of-the-art
+SLAM approaches incorporate deep neural networks for better robustness and
+applicability. However, there is a lack of research on fusing learning-based
+event SLAM methods with an IMU, which could be indispensable for pushing
+event-based SLAM to large-scale, low-texture, or complex scenarios. In this
+paper, we propose DEIO, the first monocular deep event-inertial odometry
+framework that combines a learning-based method with traditional nonlinear
+graph-based optimization. Specifically, we tightly integrate a trainable
+event-based differentiable bundle adjustment (e-DBA) with IMU
+pre-integration in a factor graph that employs keyframe-based sliding window
+optimization. Numerical experiments on nine public challenge datasets show that
+our method can achieve superior performance compared with image-based and
+event-based benchmarks. The source code is available at:
+https://github.com/arclab-hku/DEIO.
+
+
+
+
+
+ + ☆ Biomechanics-Aware Trajectory Optimization for Navigation during Robotic + Physiotherapy + + +
Robotic devices hold promise for aiding patients in orthopedic
+rehabilitation. However, current robotic-assisted physiotherapy methods
+struggle to include biomechanical metrics, which are crucial for safe and
+effective therapy, in their control algorithms. This paper introduces BATON, a
+Biomechanics-Aware Trajectory Optimization approach to robotic Navigation of
+human musculoskeletal loads. The method integrates a high-fidelity
+musculoskeletal model of the human shoulder into real-time control of
+robot-patient interaction during rotator cuff tendon rehabilitation. We extract
+skeletal dynamics and tendon loading information from an OpenSim shoulder model
+to solve an optimal control problem, generating strain-minimizing trajectories.
+Trajectories were realized on a healthy subject by an impedance-controlled
+robot while estimating the state of the subject's shoulder. Target poses were
+prescribed to design personalized rehabilitation across a wide range of
+shoulder motion while avoiding high-strain areas. BATON was designed with
+real-time capabilities, enabling continuous trajectory replanning to address
+unforeseen variations in tendon strain, such as those caused by changing muscle
+activation of the subject.
+
+
+ comment: 13 pages, 9 figures, under review +
+
+
+
+
+ + ☆ Fundamental Three-Dimensional Configuration of Wire-Wound Muscle-Tendon + Complex Drive + + +
+ For robots to become more versatile and expand their areas of application, +their bodies need to be suitable for contact with the environment. When the +human body comes into contact with the environment, it is possible for it to +continue to move even if the positional relationship between muscles or the +shape of the muscles changes. We have already focused on the effect of +geometric deformation of muscles and proposed a drive system called wire-wound +Muscle-Tendon Complex (ww-MTC), an extension of the wire drive system. Our +previous study using a robot with a two-dimensional configuration demonstrated +several advantages: reduced wire loosening, interference, and wear; improved +robustness during environmental contact; and a muscular appearance. However, +this design had some problems, such as excessive muscle expansion that hindered +inter-muscle movement, and confinement to planar motion. In this study, we +develop the ww-MTC into a three-dimensional shape. We present a fundamental +construction method for a muscle exterior that expands gently and can be +contacted over its entire surface. We also apply the three-dimensional ww-MTC +to a 2-axis 3-muscle robot, and confirm that the robot can continue to move +while adapting to its environment. + +
+
+ comment: Accepted at Humanoids2024, website - + https://sites.google.com/view/yoshimoto-ribayashi/projects, YouTube - + https://youtu.be/EDeAqg7aAb4 +
+
+
+
+
+ + ☆ From Novice to Expert: LLM Agent Policy Optimization via Step-wise + Reinforcement Learning + + +
+ The outstanding capabilities of large language models (LLMs) render them a +crucial component in various autonomous agent systems. While traditional +methods depend on the inherent knowledge of LLMs without fine-tuning, more +recent approaches have shifted toward the reinforcement learning strategy to +further enhance agents' ability to solve complex interactive tasks with +environments and tools. However, previous approaches are constrained by the +sparse reward issue, where existing datasets solely provide a final scalar +reward for each multi-step reasoning chain, potentially leading to +ineffectiveness and inefficiency in policy learning. In this paper, we +introduce StepAgent, which utilizes step-wise reward to optimize the agent's +reinforcement learning process. Inheriting the spirit of novice-to-expert +theory, we first compare the actions of the expert and the agent to +automatically generate intermediate rewards for fine-grained optimization. +Additionally, we propose implicit-reward and inverse reinforcement learning +techniques to facilitate agent reflection and policy adjustment. Further +theoretical analysis demonstrates that the action distribution of the agent can +converge toward the expert action distribution over multiple training cycles. +Experimental results across various datasets indicate that StepAgent +outperforms existing baseline methods. + +
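A minimal sketch of the step-wise reward idea: score each intermediate step by how much probability the agent assigns to the expert's action, instead of waiting for a single final scalar reward. The exact reward shape below is an assumption for illustration, not StepAgent's formulation.

```python
import torch
import torch.nn.functional as F

def stepwise_rewards(agent_logits, expert_actions):
    """agent_logits: (T, n_actions) per-step action logits from the agent.
    expert_actions: (T,) long tensor of expert action indices for the same states.
    Returns a (T,) tensor of dense per-step rewards for fine-grained policy optimization."""
    log_probs = F.log_softmax(agent_logits, dim=-1)
    expert_log_prob = log_probs.gather(1, expert_actions.unsqueeze(1)).squeeze(1)
    return expert_log_prob.exp()  # higher when the agent agrees with the expert at that step
```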
+
+
+
+
+ + ☆ How to Drawjectory? -- Trajectory Planning using Programming by + Demonstration + + +
A flight trajectory defines exactly how a quadrocopter moves in
+three-dimensional space from one position to another. Automatic flight
+trajectory planning faces challenges such as high computational effort and a
+lack of precision. Hence, when low computational effort or precise control is
+required, programming the flight trajectory manually might be preferable.
+However, this requires in-depth knowledge of how to accurately plan flight
+trajectories in three-dimensional space. We propose planning quadrocopter
+flight trajectories manually using the Programming by Demonstration (PbD)
+approach -- simply drawing the trajectory in three-dimensional space by
+hand. This simplifies the planning process and reduces the level of in-depth
+knowledge required.
+ We implemented the approach in the context of the Quadcopter Lab at Ulm
+University. To evaluate our approach, we compare the precision, accuracy, and
+required time of trajectories drawn by a user with our approach against those
+manually programmed using a domain-specific language.
+The evaluation shows that the Drawjectory workflow is, on average, 78.7 seconds
+faster without a significant loss of precision, shown by an average deviation
+of 6.67 cm.
+
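One plausible way to compute such an average deviation is a nearest-point distance between the executed and the reference trajectory. The metric below is an assumption for illustration only; the paper may define its deviation differently.

```python
import numpy as np

def average_deviation(flown, reference):
    """flown: (N, 3) and reference: (M, 3) arrays of XYZ positions in metres.
    Mean distance from each flown point to its nearest reference point."""
    dists = np.linalg.norm(flown[:, None, :] - reference[None, :, :], axis=-1)
    return dists.min(axis=1).mean()
```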
+
+
+
+
+ + ☆ Observability-Aware Control for Cooperatively Localizing Quadrotor UAVs + + +
Cooperatively localizing robots should seek optimal control strategies to
+maximize the precision of position estimation and ensure safety in flight.
+Observability-Aware Trajectory Optimization has strong potential to address
+this issue, but no concrete link between observability and precision has been
+proven yet. In this paper, we prove that improvement in positioning precision
+inherently follows from optimizing observability. Based on this finding, we
+develop an Observability-Aware Control principle to generate
+observability-optimal control strategies. We implement this principle in a
+Model Predictive Control framework, and we verify it on a team of quadrotor
+Unmanned Aerial Vehicles comprising a follower vehicle that localizes itself by
+tracking a leader vehicle, in both simulations and real-world flight tests. Our
+results demonstrate that maximizing observability contributes to improving
+global positioning precision for the quadrotor team.
+
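A conceptual sketch of how an observability measure can enter a predictive controller: build an empirical observability Gramian by perturbing the initial state and penalize its log-determinant inside the MPC cost. This is an illustration of the general idea only, not the paper's controller; `simulate_outputs` is an assumed rollout function returning the stacked measurements for a candidate control sequence.

```python
import numpy as np

def empirical_gramian(simulate_outputs, x0, eps=1e-3):
    """simulate_outputs(x0) -> (T, m) array of measurements along the candidate rollout."""
    sensitivities = []
    for i in range(x0.size):
        d = np.zeros_like(x0)
        d[i] = eps
        dy = (simulate_outputs(x0 + d) - simulate_outputs(x0 - d)) / (2.0 * eps)
        sensitivities.append(dy.ravel())
    D = np.stack(sensitivities)   # (n, T*m) output sensitivities w.r.t. the state
    return D @ D.T                # empirical observability Gramian

def observability_cost(W, weight=1.0, jitter=1e-9):
    _, logdet = np.linalg.slogdet(W + jitter * np.eye(W.shape[0]))
    return -weight * logdet       # lower cost for "more observable" trajectories
```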
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ 3DGS-CD: 3D Gaussian Splatting-based Change Detection for Physical + Object Rearrangement + + +
+ We present 3DGS-CD, the first 3D Gaussian Splatting (3DGS)-based method for +detecting physical object rearrangements in 3D scenes. Our approach estimates +3D object-level changes by comparing two sets of unaligned images taken at +different times. Leveraging 3DGS's novel view rendering and EfficientSAM's +zero-shot segmentation capabilities, we detect 2D object-level changes, which +are then associated and fused across views to estimate 3D changes. Our method +can detect changes in cluttered environments using sparse post-change images +within as little as 18s, using as few as a single new image. It does not rely +on depth input, user instructions, object classes, or object models -- An +object is recognized simply if it has been re-arranged. Our approach is +evaluated on both public and self-collected real-world datasets, achieving up +to 14% higher accuracy and three orders of magnitude faster performance +compared to the state-of-the-art radiance-field-based change detection method. +This significant performance boost enables a broad range of downstream +applications, where we highlight three key use cases: object reconstruction, +robot workspace reset, and 3DGS model update. Our code and data will be made +available at https://github.com/520xyxyzq/3DGS-CD. + +
+
+
+
+
+ + ☆ Graph-Based Multi-Modal Sensor Fusion for Autonomous Driving + + +
+ The growing demand for robust scene understanding in mobile robotics and +autonomous driving has highlighted the importance of integrating multiple +sensing modalities. By combining data from diverse sensors like cameras and +LIDARs, fusion techniques can overcome the limitations of individual sensors, +enabling a more complete and accurate perception of the environment. We +introduce a novel approach to multi-modal sensor fusion, focusing on developing +a graph-based state representation that supports critical decision-making +processes in autonomous driving. We present a Sensor-Agnostic Graph-Aware +Kalman Filter [3], the first online state estimation technique designed to fuse +multi-modal graphs derived from noisy multi-sensor data. The estimated +graph-based state representations serve as a foundation for advanced +applications like Multi-Object Tracking (MOT), offering a comprehensive +framework for enhancing the situational awareness and safety of autonomous +systems. We validate the effectiveness of our proposed framework through +extensive experiments conducted on both synthetic and real-world driving +datasets (nuScenes). Our results showcase an improvement in MOTA and a +reduction in estimated position errors (MOTP) and identity switches (IDS) for +tracked objects using the SAGA-KF. Furthermore, we highlight the capability of +such a framework to develop methods that can leverage heterogeneous information +(like semantic objects and geometric structures) from various sensing +modalities, enabling a more holistic approach to scene understanding and +enhancing the safety and effectiveness of autonomous systems. + +
+
+ comment: An extended abstract accepted at Young Researchers' Symposium, ICVGIP + '24. This extended abstract contains the following: 1. Short summary of our + work, SAGA-KF, accepted at ICPR'24. 2. A proposal that was awarded the + Qualcomm Innovation Fellowship'24 +
+
+
+
+
+ + ☆ LEGATO: Cross-Embodiment Imitation Using a Grasping Tool RA-L + + +
+ Cross-embodiment imitation learning enables policies trained on specific +embodiments to transfer across different robots, unlocking the potential for +large-scale imitation learning that is both cost-effective and highly reusable. +This paper presents LEGATO, a cross-embodiment imitation learning framework for +visuomotor skill transfer across varied kinematic morphologies. We introduce a +handheld gripper that unifies action and observation spaces, allowing tasks to +be defined consistently across robots. Using this gripper, we train visuomotor +policies via imitation learning, applying a motion-invariant transformation to +compute the training loss. Gripper motions are then retargeted into +high-degree-of-freedom whole-body motions using inverse kinematics for +deployment across diverse embodiments. Our evaluations in simulation and +real-robot experiments highlight the framework's effectiveness in learning and +transferring visuomotor skills across various robots. More information can be +found at the project page: https://ut-hcrl.github.io/LEGATO. + +
+
+ comment: Submitted to RA-L +
+
+
+
+
+ + ☆ Imagined Potential Games: A Framework for Simulating, Learning and + Evaluating Interactive Behaviors + + +
+ Interacting with human agents in complex scenarios presents a significant +challenge for robotic navigation, particularly in environments that necessitate +both collision avoidance and collaborative interaction, such as indoor spaces. +Unlike static or predictably moving obstacles, human behavior is inherently +complex and unpredictable, stemming from dynamic interactions with other +agents. Existing simulation tools frequently fail to adequately model such +reactive and collaborative behaviors, impeding the development and evaluation +of robust social navigation strategies. This paper introduces a novel framework +utilizing distributed potential games to simulate human-like interactions in +highly interactive scenarios. Within this framework, each agent imagines a +virtual cooperative game with others based on its estimation. We demonstrate +this formulation can facilitate the generation of diverse and realistic +interaction patterns in a configurable manner across various scenarios. +Additionally, we have developed a gym-like environment leveraging our +interactive agent model to facilitate the learning and evaluation of +interactive navigation algorithms. + +
+
+ comment: 13 pages, 10 figures. arXiv admin note: substantial text overlap with + arXiv:2310.01614 +
+
+
+
+
+ + ☆ Development of a Practical Articulated Wheeled In-pipe Robot for Both + 3-4 in Force Main Inspection of Sewer Pipes + + +
This paper reports a practical articulated wheeled in-pipe inspection robot,
+"AIRo-7.1", which is waterproof and dustproof and can adapt to inner diameters
+of 3 to 4 in. The joint torque can be adjusted by PWM open-loop control. The
+middle joint angle can be controlled by a position feedback control system
+while the other two joints are bent by torsional springs. Thanks to this simple
+and high-density design, both downsizing of the robot and a wide range of
+adaptive inner diameters were achieved. However, the relationship between
+the actual middle joint torque and the PWM duty ratio must be known in advance
+because the reducer used in AIRo-7.1 is of our own design. Therefore,
+preliminary experiments were conducted to clarify the relationship between
+them. To examine the adaptive movement, experiments were conducted in both
+3 in and 4 in pipes with vertical, bend, and diameter-change sections. Finally,
+a field experiment was also conducted. The results confirmed high adaptability
+to different pipe inner diameters and to slippery environments, although the
+waterproofing and dustproofing did not work perfectly.
+
+
+ comment: The Twenty-Ninth International Symposium on Artificial Life and + Robotics 2024 (AROB 29th 2024), The Ninth International Symposium on + BioComplexity 2024 (ISBC 9th 2024), The Seventh International Symposium on + Swarm Behavior and Bio-Inspired Robotics 2024 (SWARM 7th 2024) B-Con Plaza, + Beppu, Japan and ONLINE, January 24-26, 2024 +
+
+
+
+
+ + ☆ Real-Time Safe Bipedal Robot Navigation using Linear Discrete Control + Barrier Functions + + +
+ Safe navigation in real-time is an essential task for humanoid robots in +real-world deployment. Since humanoid robots are inherently underactuated +thanks to unilateral ground contacts, a path is considered safe if it is +obstacle-free and respects the robot's physical limitations and underlying +dynamics. Existing approaches often decouple path planning from gait control +due to the significant computational challenge caused by the full-order robot +dynamics. In this work, we develop a unified, safe path and gait planning +framework that can be evaluated online in real-time, allowing the robot to +navigate clustered environments while sustaining stable locomotion. Our +approach uses the popular Linear Inverted Pendulum (LIP) model as a template +model to represent walking dynamics. It incorporates heading angles in the +model to evaluate kinematic constraints essential for physically feasible gaits +properly. In addition, we leverage discrete control barrier functions (DCBF) +for obstacle avoidance, ensuring that the subsequent foot placement provides a +safe navigation path within clustered environments. To guarantee real-time +computation, we use a novel approximation of the DCBF to produce linear DCBF +(LDCBF) constraints. We validate the proposed approach in simulation using a +Digit robot in randomly generated environments. The results demonstrate that +our approach can generate safe gaits for a non-trivial humanoid robot to +navigate environments with randomly generated obstacles in real-time. + +
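For reference, the standard discrete-time control barrier function condition that constructions of this kind build on reads as follows (textbook form; the LDCBF mentioned above additionally approximates h so that the constraint becomes linear in the decision variables):

```latex
h\!\left(x_{k+1}\right) - h\!\left(x_{k}\right) \;\ge\; -\gamma\, h\!\left(x_{k}\right),
\qquad 0 < \gamma \le 1
\quad\Longleftrightarrow\quad
h\!\left(x_{k+1}\right) \;\ge\; (1-\gamma)\, h\!\left(x_{k}\right),
```

so that the safe set \{x : h(x) >= 0\} remains forward invariant along the planned footstep sequence.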
+
+ comment: 7 pages, 10 figures +
+
+
+
+
+ + ☆ Robot Swarming over the internet + + +
This paper considers cooperative control of robots involving two different
+testbed systems in remote locations that communicate over the internet. This
+provides the capability to exchange the robots' status, such as the positions,
+velocities, and directions needed for the swarming algorithm. The results show
+that all robots properly follow a leader defined on one of the testbeds.
+Measurements of data exchange rates show no packet loss, and average transfer
+delays stay within tolerance limits for practical applications. To our
+knowledge, the novelty of this paper lies in performing this kind of control
+over a large network such as the internet.
+
+
+
+
+
+ + ☆ LCP-Fusion: A Neural Implicit SLAM with Enhanced Local Constraints and + Computable Prior IROS 2024 + + +
Recently, dense Simultaneous Localization and Mapping (SLAM) based on
+neural implicit representations has shown impressive progress in hole filling
+and high-fidelity mapping. Nevertheless, existing methods either heavily rely
+on known scene bounds or suffer from inconsistent reconstruction due to drift in
+potential loop-closure regions, or both, which can be attributed to an
+inflexible representation and a lack of local constraints. In this paper, we
+present LCP-Fusion, a neural implicit SLAM system with enhanced local
+constraints and a computable prior, which takes a sparse voxel octree structure
+containing feature grids and SDF priors as a hybrid scene representation,
+enabling scalability and robustness during mapping and tracking. To enhance
+the local constraints, we propose a novel sliding window selection strategy
+based on visual overlap to address loop closure, and a practical warping
+loss to constrain relative poses. Moreover, we estimate SDF priors as a coarse
+initialization for implicit features, which brings additional explicit
+constraints and robustness, especially when a light but efficient adaptive
+early-ending scheme is adopted. Experiments demonstrate that our method achieves
+better localization accuracy and reconstruction consistency than existing RGB-D
+implicit SLAM systems, especially in challenging real scenes (ScanNet) as well as
+self-captured scenes with unknown scene bounds. The code is available at
+https://github.com/laliwang/LCP-Fusion.
+
+
+ comment: Accepted by 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ vMF-Contact: Uncertainty-aware Evidential Learning for Probabilistic + Contact-grasp in Noisy Clutter + + +
+ Grasp learning in noisy environments, such as occlusions, sensor noise, and +out-of-distribution (OOD) objects, poses significant challenges. Recent +learning-based approaches focus primarily on capturing aleatoric uncertainty +from inherent data noise. The epistemic uncertainty, which represents the OOD +recognition, is often addressed by ensembles with multiple forward paths, +limiting real-time application. In this paper, we propose an uncertainty-aware +approach for 6-DoF grasp detection using evidential learning to comprehensively +capture both uncertainties in real-world robotic grasping. As a key +contribution, we introduce vMF-Contact, a novel architecture for learning +hierarchical contact grasp representations with probabilistic modeling of +directional uncertainty as von Mises-Fisher (vMF) distribution. To achieve +this, we derive and analyze the theoretical formulation of the second-order +objective on the posterior parametrization, providing formal guarantees for the +model's ability to quantify uncertainty and improve grasp prediction +performance. Moreover, we enhance feature expressiveness by applying partial +point reconstructions as an auxiliary task, improving the comprehension of +uncertainty quantification as well as the generalization to unseen objects. In +the real-world experiments, our method demonstrates a significant improvement +by 39% in the overall clearance rate compared to the baselines. Video is under +https://www.youtube.com/watch?v=4aQsrDgdV8Y&t=12s + +
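The directional model mentioned above is the von Mises-Fisher distribution on the unit sphere; a minimal negative log-likelihood for the 3-D case is sketched below. This is the generic density only, not the paper's evidential parametrization of its parameters.

```python
import math
import torch

def vmf_nll(x, mu, kappa, eps=1e-8):
    """x, mu: (..., 3) unit vectors; kappa: (...,) positive concentration.
    Negative log-likelihood of the 3-D von Mises-Fisher density
    f(x) = kappa / (4*pi*sinh(kappa)) * exp(kappa * mu.x)."""
    dot = (x * mu).sum(-1)
    # log sinh(kappa) written stably for large kappa:
    log_sinh = kappa + torch.log1p(-torch.exp(-2.0 * kappa)) - math.log(2.0)
    log_norm = torch.log(kappa + eps) - math.log(4.0 * math.pi) - log_sinh
    return -(log_norm + kappa * dot)
```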
+
+
+
+
+ + ☆ Can Robotic Cues Manipulate Human Decisions? Exploring Consensus + Building via Bias-Controlled Non-linear Opinion Dynamics and Robotic Eye Gaze + Mediated Interaction in Human-Robot Teaming + + +
+ Although robots are becoming more advanced with human-like anthropomorphic +features and decision-making abilities to improve collaboration, the active +integration of humans into this process remains under-explored. This article +presents the first experimental study exploring decision-making interactions +between humans and robots with visual cues from robotic eyes, which can +dynamically influence human opinion formation. The cues generated by robotic +eyes gradually guide human decisions towards alignment with the robot's +choices. Both human and robot decision-making processes are modeled as +non-linear opinion dynamics with evolving biases. To examine these opinion +dynamics under varying biases, we conduct numerical parametric and equilibrium +continuation analyses using tuned parameters designed explicitly for the +presented human-robot interaction experiment. Furthermore, to facilitate the +transition from disagreement to agreement, we introduced a human opinion +observation algorithm integrated with the formation of the robot's opinion, +where the robot's behavior is controlled based on its formed opinion. The +algorithms developed aim to enhance human involvement in consensus building, +fostering effective collaboration between humans and robots. Experiments with +51 participants (N = 51) show that human-robot teamwork can be improved by +guiding human decisions using robotic cues. Finally, we provide detailed +insights on the effects of trust, cognitive load, and participant demographics +on decision-making based on user feedback and post-experiment interviews. + +
+
+ comment: 35 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Out-of-Distribution Recovery with Object-Centric Keypoint Inverse Policy + For Visuomotor Imitation Learning + + +
+ We propose an object-centric recovery policy framework to address the +challenges of out-of-distribution (OOD) scenarios in visuomotor policy +learning. Previous behavior cloning (BC) methods rely heavily on a large amount +of labeled data coverage, failing in unfamiliar spatial states. Without relying +on extra data collection, our approach learns a recovery policy constructed by +an inverse policy inferred from object keypoint manifold gradient in the +original training data. The recovery policy serves as a simple add-on to any +base visuomotor BC policy, agnostic to a specific method, guiding the system +back towards the training distribution to ensure task success even in OOD +situations. We demonstrate the effectiveness of our object-centric framework in +both simulation and real robot experiments, achieving an improvement of 77.7% +over the base policy in OOD. Project Website: +https://sites.google.com/view/ocr-penn + +
+
+ comment: Accepted for Spotlight (5 out of 21 papers) at CoRL 2024 Workshop on + Lifelong Learning for Home Robots +
+
+
+
+
+ + ♻ ☆ LDTrack: Dynamic People Tracking by Service Robots using Diffusion + Models + + +
+ Tracking of dynamic people in cluttered and crowded human-centered +environments is a challenging robotics problem due to the presence of +intraclass variations including occlusions, pose deformations, and lighting +variations. This paper introduces a novel deep learning architecture, using +conditional latent diffusion models, the Latent Diffusion Track (LDTrack), for +tracking multiple dynamic people under intraclass variations. By uniquely +utilizing conditional latent diffusion models to capture temporal person +embeddings, our architecture can adapt to appearance changes of people over +time. We incorporated a latent feature encoder network which enables the +diffusion process to operate within a high-dimensional latent space to allow +for the extraction and spatial-temporal refinement of such rich features as +person appearance, motion, location, identity, and contextual information. +Extensive experiments demonstrate the effectiveness of LDTrack over other +state-of-the-art tracking methods in cluttered and crowded human-centered +environments under intraclass variations. Namely, the results show our method +outperforms existing deep learning robotic people tracking methods in both +tracking accuracy and tracking precision with statistical significance. +Additionally, a comprehensive multi-object tracking comparison study was +performed against the state-of-the-art methods in urban environments, +demonstrating the generalizability of LDTrack. An ablation study was performed +to validate the design choices of LDTrack. + +
+
+
+
+
+ + ♻ ☆ A Comparison of Prompt Engineering Techniques for Task Planning and + Execution in Service Robotics + + +
Recent advances in LLMs have been instrumental in autonomous robot control and
+human-robot interaction by leveraging their vast general knowledge and
+capabilities to understand and reason across a wide range of tasks and
+scenarios. Previous works have investigated various prompt engineering
+techniques for improving the performance of LLMs in accomplishing tasks, while
+others have proposed methods that utilize LLMs to plan and execute tasks based
+on the available functionalities of a given robot platform. In this work, we
+consider both lines of research by comparing prompt engineering techniques and
+combinations thereof within the application of high-level task planning and
+execution in service robotics. We define a diverse set of tasks and a simple
+set of functionalities in simulation, and measure task completion accuracy and
+execution time for several state-of-the-art models.
+
+
+ comment: 6 pages, 3 figures, 2 tables, to be published in the 2024 IEEE-RAS + International Conference on Humanoid Robots, We make our code, including all + prompts, available at https://github.com/AIS-Bonn/Prompt_Engineering +
+
+
+
+
+ + ♻ ☆ DexDiffuser: Generating Dexterous Grasps with Diffusion Models + + +
+ We introduce DexDiffuser, a novel dexterous grasping method that generates, +evaluates, and refines grasps on partial object point clouds. DexDiffuser +includes the conditional diffusion-based grasp sampler DexSampler and the +dexterous grasp evaluator DexEvaluator. DexSampler generates high-quality +grasps conditioned on object point clouds by iterative denoising of randomly +sampled grasps. We also introduce two grasp refinement strategies: +Evaluator-Guided Diffusion (EGD) and Evaluator-based Sampling Refinement (ESR). +The experiment results demonstrate that DexDiffuser consistently outperforms +the state-of-the-art multi-finger grasp generation method FFHNet with an, on +average, 9.12% and 19.44% higher grasp success rate in simulation and real +robot experiments, respectively. Supplementary materials are available at +https://yulihn.github.io/DexDiffuser_page/ + +
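Evaluator-guided refinement can be pictured as gradient ascent of a learned success score with respect to the grasp parameters. The sketch below conveys only that general idea; `evaluator` is an assumed differentiable network, and the update rule is not the released DexDiffuser code.

```python
import torch

def refine_grasps(grasps, point_cloud, evaluator, steps=10, lr=1e-2):
    """grasps: (B, D) grasp parameters; evaluator(grasps, point_cloud) -> (B,) success scores."""
    g = grasps.clone().requires_grad_(True)
    for _ in range(steps):
        score = evaluator(g, point_cloud).sum()
        (grad,) = torch.autograd.grad(score, g)
        with torch.no_grad():
            g += lr * grad  # nudge each grasp toward higher predicted success
    return g.detach()
```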
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ UMIRobot: An Open-{Software, Hardware} Low-Cost Robotic Manipulator for + Education IROS 2023 + + +
+ Robot teleoperation has been studied for the past 70 years and is relevant in +many contexts, such as in the handling of hazardous materials and telesurgery. +The COVID19 pandemic has rekindled interest in this topic, but the existing +robotic education kits fall short of being suitable for teleoperated robotic +manipulator learning. In addition, the global restrictions of motion motivated +large investments in online/hybrid education. In this work, a newly developed +robotics education kit and its ecosystem are presented which is used as the +backbone of an online/hybrid course in teleoperated robots. The students are +divided into teams. Each team designs, fabricates (3D printing and assembling), +and implements a control strategy for a master device and gripper. Coupling +those with the UMIRobot, provided as a kit, the students compete in a +teleoperation challenge. The kit is low cost (< 100USD), which allows +higher-learning institutions to provide one kit per student and they can learn +in a risk-free environment. As of now, 73 such kits have been assembled and +sent to course participants in eight countries. As major success stories, we +show an example of gripper and master designed for the proposed course. In +addition, we show a teleoperated task between Japan and Bangladesh executed by +course participants. Design files, videos, source code, and more information +are available at https://mmmarinho.github.io/UMIRobot/ + +
+
+ comment: Accepted on IROS 2023, 8 pages. Fixed a few typos +
+
+
+
+
+ + ♻ ☆ Robust Perception-Informed Navigation using PAC-NMPC with a Learned + Value Function + + +
+ Nonlinear model predictive control (NMPC) is typically restricted to short, +finite horizons to limit the computational burden of online optimization. As a +result, global planning frameworks are frequently necessary to avoid local +minima when using NMPC for navigation in complex environments. By contrast, +reinforcement learning (RL) can generate policies that minimize the expected +cost over an infinite-horizon and can often avoid local minima, even when +operating only on current sensor measurements. However, these learned policies +are usually unable to provide performance guarantees (e.g., on collision +avoidance), especially when outside of the training distribution. In this +paper, we augment Probably Approximately Correct NMPC (PAC-NMPC), a +sampling-based stochastic NMPC algorithm capable of providing statistical +guarantees of performance and safety, with an approximate perception-dependent +value function trained via RL. We demonstrate in simulation that our algorithm +can improve the long-term behavior of PAC-NMPC while outperforming other +approaches with regards to safety for both planar car dynamics and more +complex, high-dimensional fixed-wing aerial vehicle dynamics. We also +demonstrate that, even when our value function is trained in simulation, our +algorithm can successfully achieve statistically safe navigation on hardware +using a 1/10th scale rally car in cluttered real-world environments using only +current sensor information. + +
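Conceptually, the combination amounts to scoring each sampled control sequence by its finite-horizon cost plus a learned terminal value, so the short horizon inherits the long-horizon preferences of the RL policy. The sketch below is an illustration under assumed interfaces (`stage_cost` and `value_fn` are hypothetical callbacks), not the PAC-NMPC algorithm itself.

```python
import numpy as np

def score_rollouts(rollouts, stage_cost, value_fn, gamma=0.99):
    """rollouts: list of (states, controls) pairs over the planning horizon.
    value_fn(terminal_state) -> learned estimate of the remaining cost-to-go."""
    scores = []
    for states, controls in rollouts:
        cost = sum((gamma ** k) * stage_cost(x, u)
                   for k, (x, u) in enumerate(zip(states, controls)))
        cost += (gamma ** len(controls)) * value_fn(states[-1])  # terminal value from RL
        scores.append(cost)
    return np.asarray(scores)
```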
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ Hibikino-Musashi@Home 2024 Team Description Paper + + +
+ This paper provides an overview of the techniques employed by +Hibikino-Musashi@Home, which intends to participate in the domestic standard +platform league. The team has developed a dataset generator for training a +robot vision system and an open-source development environment running on a +Human Support Robot simulator. + The large language model powered task planner selects appropriate primitive +skills to perform the task requested by users. The team aims to design a home +service robot that can assist humans in their homes and continuously attends +competitions to evaluate and improve the developed system. + +
+
+
+
+
+ + ♻ ☆ Open Problem: Active Representation Learning + + +
+ In this work, we introduce the concept of Active Representation Learning, a +novel class of problems that intertwines exploration and representation +learning within partially observable environments. We extend ideas from Active +Simultaneous Localization and Mapping (active SLAM), and translate them to +scientific discovery problems, exemplified by adaptive microscopy. We explore +the need for a framework that derives exploration skills from representations +that are in some sense actionable, aiming to enhance the efficiency and +effectiveness of data collection and model building in the natural sciences. + +
+
+
+
+
+ + ♻ ☆ Skill-aware Mutual Information Optimisation for Generalisation in + Reinforcement Learning NeurIPS + + +
+ Meta-Reinforcement Learning (Meta-RL) agents can struggle to operate across +tasks with varying environmental features that require different optimal skills +(i.e., different modes of behaviour). Using context encoders based on +contrastive learning to enhance the generalisability of Meta-RL agents is now +widely studied but faces challenges such as the requirement for a large sample +size, also referred to as the $\log$-$K$ curse. To improve RL generalisation to +different tasks, we first introduce Skill-aware Mutual Information (SaMI), an +optimisation objective that aids in distinguishing context embeddings according +to skills, thereby equipping RL agents with the ability to identify and execute +different skills across tasks. We then propose Skill-aware Noise Contrastive +Estimation (SaNCE), a $K$-sample estimator used to optimise the SaMI objective. +We provide a framework for equipping an RL agent with SaNCE in practice and +conduct experimental validation on modified MuJoCo and Panda-gym benchmarks. We +empirically find that RL agents that learn by maximising SaMI achieve +substantially improved zero-shot generalisation to unseen tasks. Additionally, +the context encoder trained with SaNCE demonstrates greater robustness to a +reduction in the number of available samples, thus possessing the potential to +overcome the $\log$-$K$ curse. + +
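A generic K-sample contrastive estimator in this spirit can be written as an InfoNCE-style loss over skill-grouped context embeddings: embeddings from the same skill are treated as positives and embeddings from other skills as negatives. This is a minimal sketch, not the paper's SaNCE objective.

```python
import torch
import torch.nn.functional as F

def skill_contrastive_loss(anchor, positive, negatives, temperature=0.1):
    """anchor, positive: (B, d) embeddings from the same skill;
    negatives: (B, K, d) embeddings drawn from other skills."""
    a = F.normalize(anchor, dim=-1)
    p = F.normalize(positive, dim=-1)
    n = F.normalize(negatives, dim=-1)
    pos = (a * p).sum(-1, keepdim=True) / temperature        # (B, 1)
    neg = torch.einsum("bd,bkd->bk", a, n) / temperature     # (B, K)
    logits = torch.cat([pos, neg], dim=1)
    labels = torch.zeros(a.size(0), dtype=torch.long, device=a.device)
    return F.cross_entropy(logits, labels)                   # positive sits at index 0
```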
+
+ comment: The Thirty-eighth Annual Conference on Neural Information Processing + Systems (NeurIPS), 2024 +
+
+
+
+
+ + ♻ ☆ Active Vapor-Based Robotic Wiper + + +
+ This paper presents a method for estimating normals of mirrors and +transparent objects challenging for cameras to recognize. We propose spraying +water vapor onto mirror or transparent surfaces to create a diffuse reflective +surface. Using an ultrasonic humidifier on a robotic arm, we apply water vapor +to the target object's surface, forming a cross-shaped misted area. This +creates partially diffuse reflective surfaces, enabling the camera to detect +the target object's surface. Adjusting the gripper-mounted camera viewpoint +maximizes the extracted misted area's appearance in the image, allowing normal +estimation of the target surface. Experiments show the method's effectiveness, +with RMSEs of azimuth estimation for mirrors and transparent glass at +approximately 4.2 and 5.8 degrees, respectively. Our robot experiments +demonstrated that our robotic wiper can perform contact-force-regulated wiping +motions to clean a transparent window, akin to human performance. + +
+
+ comment: 4 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Safe Reinforcement Learning on the Constraint Manifold: Theory and + Applications + + +
+ Integrating learning-based techniques, especially reinforcement learning, +into robotics is promising for solving complex problems in unstructured +environments. However, most existing approaches are trained in well-tuned +simulators and subsequently deployed on real robots without online fine-tuning. +In this setting, extensive engineering is required to mitigate the sim-to-real +gap, which can be challenging for complex systems. Instead, learning with +real-world interaction data offers a promising alternative: it not only +eliminates the need for a fine-tuned simulator but also applies to a broader +range of tasks where accurate modeling is unfeasible. One major problem for +on-robot reinforcement learning is ensuring safety, as uncontrolled exploration +can cause catastrophic damage to the robot or the environment. Indeed, safety +specifications, often represented as constraints, can be complex and +non-linear, making safety challenging to guarantee in learning systems. In this +paper, we show how we can impose complex safety constraints on learning-based +robotics systems in a principled manner, both from theoretical and practical +points of view. Our approach is based on the concept of the Constraint +Manifold, representing the set of safe robot configurations. Exploiting +differential geometry techniques, i.e., the tangent space, we can construct a +safe action space, allowing learning agents to sample arbitrary actions while +ensuring safety. We demonstrate the method's effectiveness in a real-world +Robot Air Hockey task, showing that our method can handle high-dimensional +tasks with complex constraints. Videos of the real robot experiments are +available on the project website (https://puzeliu.github.io/TRO-ATACOM). + +
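The tangent-space idea can be summarized with a small geometric sketch: project a raw action onto the null space of the constraint Jacobian so that, to first order, the constraints c(q) = 0 are preserved. This shows only the first-order picture, not the paper's full ATACOM controller.

```python
import numpy as np

def project_to_tangent(action, constraint_jacobian):
    """action: (n,) raw action; constraint_jacobian: (m, n) Jacobian of c(q) = 0."""
    J = np.atleast_2d(constraint_jacobian)
    N = np.eye(J.shape[1]) - np.linalg.pinv(J) @ J  # null-space projector of J
    return N @ action                               # direction along the constraint manifold
```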
+
+ comment: 19 pages; submitted to IEEE Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ SceneComplete: Open-World 3D Scene Completion in Complex Real World + Environments for Robot Manipulation + + +
+ Careful robot manipulation in every-day cluttered environments requires an +accurate understanding of the 3D scene, in order to grasp and place objects +stably and reliably and to avoid mistakenly colliding with other objects. In +general, we must construct such a 3D interpretation of a complex scene based on +limited input, such as a single RGB-D image. We describe SceneComplete, a +system for constructing a complete, segmented, 3D model of a scene from a +single view. It provides a novel pipeline for composing general-purpose +pretrained perception modules (vision-language, segmentation, image-inpainting, +image-to-3D, and pose-estimation) to obtain high-accuracy results. We +demonstrate its accuracy and effectiveness with respect to ground-truth models +in a large benchmark dataset and show that its accurate whole-object +reconstruction enables robust grasp proposal generation, including for a +dexterous hand. Project website - https://scenecomplete.github.io/ + +
+
+
+
+
+ + ♻ ☆ CPnP: Consistent Pose Estimator for Perspective-n-Point Problem with + Bias Elimination + + +
+ The Perspective-n-Point (PnP) problem has been widely studied in both +computer vision and photogrammetry societies. With the development of feature +extraction techniques, a large number of feature points might be available in a +single shot. It is promising to devise a consistent estimator, i.e., the +estimate can converge to the true camera pose as the number of points +increases. To this end, we propose a consistent PnP solver, named \emph{CPnP}, +with bias elimination. Specifically, linear equations are constructed from the +original projection model via measurement model modification and variable +elimination, based on which a closed-form least-squares solution is obtained. +We then analyze and subtract the asymptotic bias of this solution, resulting in +a consistent estimate. Additionally, Gauss-Newton (GN) iterations are executed +to refine the consistent solution. Our proposed estimator is efficient in terms +of computations -- it has $O(n)$ computational complexity. Experimental tests +on both synthetic data and real images show that our proposed estimator is +superior to some well-known ones for images with dense visual features, in +terms of estimation precision and computing time. + +
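For context, the projection model that PnP formulations of this kind start from is the standard pinhole camera equation (textbook form, not the paper's specific modified measurement model):

```latex
\lambda_i \begin{bmatrix} u_i \\ v_i \\ 1 \end{bmatrix}
= K \left( R\, p_i + t \right), \qquad i = 1, \dots, n,
```

where K is the known intrinsic matrix, (R, t) the unknown pose, p_i the known 3D points, and \lambda_i the unknown depths; eliminating \lambda_i yields equations that are linear in the entries of R and t, on which a closed-form least-squares estimate and a subsequent bias correction can be built.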
+
+
+
+
+ + ♻ ☆ Adaptive Complexity Model Predictive Control + + +
+ This work introduces a formulation of model predictive control (MPC) which +adaptively reasons about the complexity of the model based on the task while +maintaining feasibility and stability guarantees. Existing MPC implementations +often handle computational complexity by shortening prediction horizons or +simplifying models, both of which can result in instability. Inspired by +related approaches in behavioral economics, motion planning, and biomechanics, +our method solves MPC problems with a simple model for dynamics and constraints +over regions of the horizon where such a model is feasible and a complex model +where it is not. The approach leverages an interleaving of planning and +execution to iteratively identify these regions, which can be safely simplified +if they satisfy an exact template/anchor relationship. We show that this method +does not compromise the stability and feasibility properties of the system, and +measure performance in simulation experiments on a quadrupedal robot executing +agile behaviors over terrains of interest. We find that this adaptive method +enables more agile motion and expands the range of executable tasks compared to +fixed-complexity implementations. + +
+
+ comment: Published in Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ Contraction Theory for Nonlinear Stability Analysis and Learning-based + Control: A Tutorial Overview + + +
+ Contraction theory is an analytical tool to study differential dynamics of a +non-autonomous (i.e., time-varying) nonlinear system under a contraction metric +defined with a uniformly positive definite matrix, the existence of which +results in a necessary and sufficient characterization of incremental +exponential stability of multiple solution trajectories with respect to each +other. By using a squared differential length as a Lyapunov-like function, its +nonlinear stability analysis boils down to finding a suitable contraction +metric that satisfies a stability condition expressed as a linear matrix +inequality, indicating that many parallels can be drawn between well-known +linear systems theory and contraction theory for nonlinear systems. +Furthermore, contraction theory takes advantage of a superior robustness +property of exponential stability used in conjunction with the comparison +lemma. This yields much-needed safety and stability guarantees for neural +network-based control and estimation schemes, without resorting to a more +involved method of using uniform asymptotic stability for input-to-state +stability. Such distinctive features permit systematic construction of a +contraction metric via convex optimization, thereby obtaining an explicit +exponential bound on the distance between a time-varying target trajectory and +solution trajectories perturbed externally due to disturbances and learning +errors. The objective of this paper is therefore to present a tutorial overview +of contraction theory and its advantages in nonlinear stability analysis of +deterministic and stochastic systems, with an emphasis on deriving formal +robustness and stability guarantees for various learning-based and data-driven +automatic control methods. In particular, we provide a detailed review of +techniques for finding contraction metrics and associated control and +estimation laws using deep neural networks. + +
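+ The stability condition summarized above has a compact standard form; the
+following is the usual statement from the contraction-theory literature
+(paraphrased here, not quoted from the paper):
+% For \dot{x} = f(x, t) and a uniformly positive definite metric M(x, t):
+\dot{M} + M \frac{\partial f}{\partial x} + \frac{\partial f}{\partial x}^{\top} M \preceq -2\alpha M, \qquad \alpha > 0.
+% With the squared differential length V = \delta x^{\top} M \, \delta x as a Lyapunov-like
+% function, this linear matrix inequality gives \dot{V} \le -2\alpha V, so the distance between
+% any two solution trajectories shrinks exponentially with rate \alpha.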
+
+ comment: Annual Reviews in Control, Accepted, Oct. 1st +
+
+
+
+
+ + ♻ ☆ Precise and Dexterous Robotic Manipulation via Human-in-the-Loop + Reinforcement Learning + + +
+ Reinforcement learning (RL) holds great promise for enabling autonomous +acquisition of complex robotic manipulation skills, but realizing this +potential in real-world settings has been challenging. We present a +human-in-the-loop vision-based RL system that demonstrates impressive +performance on a diverse set of dexterous manipulation tasks, including dynamic +manipulation, precision assembly, and dual-arm coordination. Our approach +integrates demonstrations and human corrections, efficient RL algorithms, and +other system-level design choices to learn policies that achieve near-perfect +success rates and fast cycle times within just 1 to 2.5 hours of training. We +show that our method significantly outperforms imitation learning baselines and +prior RL approaches, with an average 2x improvement in success rate and 1.8x +faster execution. Through extensive experiments and analysis, we provide +insights into the effectiveness of our approach, demonstrating how it learns +robust, adaptive policies for both reactive and predictive control strategies. +Our results suggest that RL can indeed learn a wide range of complex +vision-based manipulation policies directly in the real world within practical +training times. We hope this work will inspire a new generation of learned +robotic manipulation techniques, benefiting both industrial applications and +research advancements. Videos and code are available at our project website +https://hil-serl.github.io/. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 101 + +
+
+
+ + ☆ Community Forensics: Using Thousands of Generators to Train Fake Image + Detectors + + +
+ One of the key challenges of detecting AI-generated images is spotting images +that have been created by previously unseen generative models. We argue that +the limited diversity of the training data is a major obstacle to addressing +this problem, and we propose a new dataset that is significantly larger and +more diverse than prior work. As part of creating this dataset, we +systematically download thousands of text-to-image latent diffusion models and +sample images from them. We also collect images from dozens of popular open +source and commercial models. The resulting dataset contains 2.7M images that +have been sampled from 4803 different models. These images collectively capture +a wide range of scene content, generator architectures, and image processing +settings. Using this dataset, we study the generalization abilities of fake +image detectors. Our experiments suggest that detection performance improves as +the number of models in the training set increases, even when these models have +similar architectures. We also find that detection performance improves as the +diversity of the models increases, and that our trained detectors generalize +better than those trained on other datasets. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Fed-EC: Bandwidth-Efficient Clustering-Based Federated Learning For + Autonomous Visual Robot Navigation + + +
+ Centralized learning requires data to be aggregated at a central server, +which poses significant challenges in terms of data privacy and bandwidth +consumption. Federated learning presents a compelling alternative, however, +vanilla federated learning methods deployed in robotics aim to learn a single +global model across robots that works ideally for all. But in practice one +model may not be well suited for robots deployed in various environments. This +paper proposes Federated-EmbedCluster (Fed-EC), a clustering-based federated +learning framework that is deployed with vision based autonomous robot +navigation in diverse outdoor environments. The framework addresses the key +federated learning challenge of deteriorating model performance of a single +global model due to the presence of non-IID data across real-world robots. +Extensive real-world experiments validate that Fed-EC reduces the communication +size by 23x for each robot while matching the performance of centralized +learning for goal-oriented navigation and outperforms local learning. Fed-EC +can transfer previously learnt models to new robots that join the cluster. + +
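+ One way to picture the clustering step (the details below are assumptions, not
+Fed-EC's exact procedure): each robot sends a summary embedding of its local
+data, the server clusters robots by embedding similarity, and a separate
+FedAvg-style model is maintained per cluster.
+import numpy as np
+from sklearn.cluster import KMeans
+
+def cluster_and_average(robot_embeddings, robot_weights, n_clusters=3):
+    # robot_embeddings: (n_robots, d) summary embeddings of each robot's local data.
+    # robot_weights: list of per-robot model weights, each a list of layer arrays.
+    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(robot_embeddings)
+    cluster_models = {}
+    for c in range(n_clusters):
+        members = [w for w, l in zip(robot_weights, labels) if l == c]
+        if members:
+            # FedAvg within the cluster: element-wise mean of the members' layer weights.
+            cluster_models[c] = [np.mean(np.stack(layers), axis=0) for layers in zip(*members)]
+    return labels, cluster_models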
+
+
+
+
+ + ☆ RaVL: Discovering and Mitigating Spurious Correlations in Fine-Tuned + Vision-Language Models NeurIPS 2024 + + +
+ Fine-tuned vision-language models (VLMs) often capture spurious correlations +between image features and textual attributes, resulting in degraded zero-shot +performance at test time. Existing approaches for addressing spurious +correlations (i) primarily operate at the global image-level rather than +intervening directly on fine-grained image features and (ii) are predominantly +designed for unimodal settings. In this work, we present RaVL, which takes a +fine-grained perspective on VLM robustness by discovering and mitigating +spurious correlations using local image features rather than operating at the +global image level. Given a fine-tuned VLM, RaVL first discovers spurious +correlations by leveraging a region-level clustering approach to identify +precise image features contributing to zero-shot classification errors. Then, +RaVL mitigates the identified spurious correlation with a novel region-aware +loss function that enables the VLM to focus on relevant regions and ignore +spurious relationships during fine-tuning. We evaluate RaVL on 654 VLMs with +various model architectures, data domains, and learned spurious correlations. +Our results show that RaVL accurately discovers (191% improvement over the +closest baseline) and mitigates (8.2% improvement on worst-group image +classification accuracy) spurious correlations. Qualitative evaluations on +general-domain and medical-domain VLMs confirm our findings. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Textual Decomposition Then Sub-motion-space Scattering for + Open-Vocabulary Motion Generation + + +
+ Text-to-motion generation is a crucial task in computer vision, which +generates the target 3D motion by the given text. The existing annotated +datasets are limited in scale, resulting in most existing methods overfitting +to the small datasets and unable to generalize to the motions of the open +domain. Some methods attempt to solve the open-vocabulary motion generation +problem by aligning to the CLIP space or using the Pretrain-then-Finetuning +paradigm. However, the current annotated dataset's limited scale only allows +them to achieve mapping from sub-text-space to sub-motion-space, instead of +mapping between full-text-space and full-motion-space (full mapping), which is +the key to attaining open-vocabulary motion generation. To this end, this paper +proposes to leverage the atomic motion (simple body part motions over a short +time period) as an intermediate representation, and leverage two orderly +coupled steps, i.e., Textual Decomposition and Sub-motion-space Scattering, to +address the full mapping problem. For Textual Decomposition, we design a +fine-grained description conversion algorithm, and combine it with the +generalization ability of a large language model to convert any given motion +text into atomic texts. Sub-motion-space Scattering learns the compositional +process from atomic motions to the target motions, to make the learned +sub-motion-space scattered to form the full-motion-space. For a given motion of +the open domain, it transforms the extrapolation into interpolation and thereby +significantly improves generalization. Our network, $DSO$-Net, combines textual +$d$ecomposition and sub-motion-space $s$cattering to solve the +$o$pen-vocabulary motion generation. Extensive experiments demonstrate that our +DSO-Net achieves significant improvements over the state-of-the-art methods on +open-vocabulary motion generation. Code is available at +https://vankouf.github.io/DSONet/. + +
+
+ comment: project page: https://vankouf.github.io/DSONet/ +
+
+
+
+
+ + ☆ H-POPE: Hierarchical Polling-based Probing Evaluation of Hallucinations + in Large Vision-Language Models + + +
+ By leveraging both texts and images, large vision language models (LVLMs) +have shown significant progress in various multi-modal tasks. Nevertheless, +these models often suffer from hallucinations, e.g., they exhibit +inconsistencies between the visual input and the textual output. To address +this, we propose H-POPE, a coarse-to-fine-grained benchmark that systematically +assesses hallucination in object existence and attributes. Our evaluation shows +that models are prone to hallucinations on object existence, and even more so +on fine-grained attributes. We further investigate whether these models rely on +visual input to formulate the output texts. + +
+
+ comment: Poster at https://sites.google.com/berkeley.edu/bb-stat/home +
+
+
+
+
+ + ☆ Pseudo-labeling with Keyword Refining for Few-Supervised Video + Captioning + + +
+ Video captioning generates a sentence that describes the video content.
+Existing methods always require a number of captions (\eg, 10 or 20) per video
+to train the model, which is quite costly. In this work, we explore the
+possibility of using only one or very few ground-truth sentences, and introduce
+a new task named few-supervised video captioning. Specifically, we propose a
+few-supervised video captioning framework that consists of a lexically
+constrained pseudo-labeling module and a keyword-refined captioning module.
+Unlike the random sampling in natural language processing that may cause
+invalid modifications (\ie, edit words), the former module guides the model to
+edit words using some actions (\eg, copy, replace, insert, and delete) by a
+pretrained token-level classifier, and then fine-tunes candidate sentences by a
+pretrained language model. This module also employs repetition-penalized
+sampling to encourage the model to yield concise pseudo-labeled sentences with
+less repetition, and selects the most relevant sentences using a pretrained
+video-text model. Moreover, to keep semantic consistency between pseudo-labeled
+sentences and video content, we develop a transformer-based keyword refiner
+with a video-keyword gated fusion strategy to place more emphasis on relevant
+words. Extensive experiments on several benchmarks demonstrate the advantages
+of the proposed approach in both few-supervised and fully-supervised
+scenarios. The code implementation is available at
+https://github.com/mlvccn/PKG_VidCap
+
+ comment: 12 figures, Accepted in Pattern Recognition +
+
+
+
+
+ + ☆ Multi-branch Spatio-Temporal Graph Neural Network For Efficient Ice + Layer Thickness Prediction + + +
+ Understanding spatio-temporal patterns in polar ice layers is essential for
+tracking changes in ice sheet balance and assessing ice dynamics. While
+convolutional neural networks are widely used in learning ice layer patterns
+from raw echogram images captured by airborne snow radar sensors, noise in the
+echogram images prevents researchers from getting high-quality results.
+Instead, we focus on geometric deep learning using graph neural networks,
+aiming to build a spatio-temporal graph neural network that learns from
+thickness information of the top ice layers and predicts the thickness of
+deeper layers. In this paper, we develop a novel multi-branch spatio-temporal
+graph neural network that uses the GraphSAGE framework for spatial feature
+learning and a temporal convolution operation to capture temporal changes,
+enabling different branches of the network to be more specialized and to focus
+on a single learning task. We find that our proposed multi-branch network
+consistently outperforms the current fused spatio-temporal graph neural network
+in both accuracy and efficiency.
+
+
+
+
+ + ☆ Aligning Characteristic Descriptors with Images for Human-Expert-like + Explainability + + +
+ In mission-critical domains such as law enforcement and medical diagnosis, +the ability to explain and interpret the outputs of deep learning models is +crucial for ensuring user trust and supporting informed decision-making. +Despite advancements in explainability, existing methods often fall short in +providing explanations that mirror the depth and clarity of those given by +human experts. Such expert-level explanations are essential for the dependable +application of deep learning models in law enforcement and medical contexts. +Additionally, we recognize that most explanations in real-world scenarios are +communicated primarily through natural language. Addressing these needs, we +propose a novel approach that utilizes characteristic descriptors to explain +model decisions by identifying their presence in images, thereby generating +expert-like explanations. Our method incorporates a concept bottleneck layer +within the model architecture, which calculates the similarity between image +and descriptor encodings to deliver inherent and faithful explanations. Through +experiments in face recognition and chest X-ray diagnosis, we demonstrate that +our approach offers a significant contrast over existing techniques, which are +often limited to the use of saliency maps. We believe our approach represents a +significant step toward making deep learning systems more accountable, +transparent, and trustworthy in the critical domains of face recognition and +medical diagnosis. + +
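+ A minimal sketch of the kind of concept-bottleneck head described above
+(sizes, names, and the cosine-similarity choice are assumptions, not the paper's
+design): the classifier only sees similarities between the image embedding and a
+fixed bank of descriptor embeddings, so each logit can be traced back to the
+descriptors that fired.
+import torch
+import torch.nn as nn
+
+class DescriptorBottleneck(nn.Module):
+    # The only path to the class logits is through per-descriptor similarities,
+    # which serve directly as the explanation vector.
+    def __init__(self, descriptor_embeddings, n_classes):
+        super().__init__()
+        # (n_descriptors, d) text embeddings of phrases such as "bushy eyebrows" (illustrative).
+        self.register_buffer("descriptors", nn.functional.normalize(descriptor_embeddings, dim=-1))
+        self.classifier = nn.Linear(descriptor_embeddings.shape[0], n_classes)
+
+    def forward(self, image_embedding):
+        img = nn.functional.normalize(image_embedding, dim=-1)
+        sims = img @ self.descriptors.T        # cosine similarity per descriptor
+        return self.classifier(sims), sims     # logits + explanation vector
+
+head = DescriptorBottleneck(torch.randn(32, 512), n_classes=10)
+logits, explanation = head(torch.randn(4, 512))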
+
+
+
+
+ + ☆ Synomaly Noise and Multi-Stage Diffusion: A Novel Approach for + Unsupervised Anomaly Detection in Ultrasound Imaging + + +
+ Ultrasound (US) imaging is widely used in routine clinical practice due to +its advantages of being radiation-free, cost-effective, and portable. However, +the low reproducibility and quality of US images, combined with the scarcity of +expert-level annotation, make the training of fully supervised segmentation +models challenging. To address these issues, we propose a novel unsupervised +anomaly detection framework based on a diffusion model that incorporates a +synthetic anomaly (Synomaly) noise function and a multi-stage diffusion +process. Synomaly noise introduces synthetic anomalies into healthy images +during training, allowing the model to effectively learn anomaly removal. The +multi-stage diffusion process is introduced to progressively denoise images, +preserving fine details while improving the quality of anomaly-free +reconstructions. The generated high-fidelity counterfactual healthy images can +further enhance the interpretability of the segmentation models, as well as +provide a reliable baseline for evaluating the extent of anomalies and +supporting clinical decision-making. Notably, the unsupervised anomaly +detection model is trained purely on healthy images, eliminating the need for +anomalous training samples and pixel-level annotations. We validate the +proposed approach on carotid US, brain MRI, and liver CT datasets. The +experimental results demonstrate that the proposed framework outperforms +existing state-of-the-art unsupervised anomaly detection methods, achieving +performance comparable to fully supervised segmentation models in the US +dataset. Additionally, ablation studies underline the importance of +hyperparameter selection for Synomaly noise and the effectiveness of the +multi-stage diffusion process in enhancing model performance. + +
+
+
+
+
+ + ☆ Local vs distributed representations: What is the right basis for + interpretability? + + +
+ Much of the research on the interpretability of deep neural networks has +focused on studying the visual features that maximally activate individual +neurons. However, recent work has cast doubts on the usefulness of such local +representations for understanding the behavior of deep neural networks because +individual neurons tend to respond to multiple unrelated visual patterns, a +phenomenon referred to as "superposition". A promising alternative to +disentangle these complex patterns is learning sparsely distributed vector +representations from entire network layers, as the resulting basis vectors +seemingly encode single identifiable visual patterns consistently. Thus, one +would expect the resulting code to align better with human perceivable visual +patterns, but supporting evidence remains, at best, anecdotal. To fill this +gap, we conducted three large-scale psychophysics experiments collected from a +pool of 560 participants. Our findings provide (i) strong evidence that +features obtained from sparse distributed representations are easier to +interpret by human observers and (ii) that this effect is more pronounced in +the deepest layers of a neural network. Complementary analyses also reveal that +(iii) features derived from sparse distributed representations contribute more +to the model's decision. Overall, our results highlight that distributed +representations constitute a superior basis for interpretability, underscoring +a need for the field to move beyond the interpretation of local neural codes in +favor of sparsely distributed ones. + +
+
+
+
+
+ + ☆ ET-SEED: Efficient Trajectory-Level SE(3) Equivariant Diffusion Policy + + +
+ Imitation learning, e.g., diffusion policy, has been proven effective in +various robotic manipulation tasks. However, extensive demonstrations are +required for policy robustness and generalization. To reduce the demonstration +reliance, we leverage spatial symmetry and propose ET-SEED, an efficient +trajectory-level SE(3) equivariant diffusion model for generating action +sequences in complex robot manipulation tasks. Further, previous equivariant +diffusion models require the per-step equivariance in the Markov process, +making it difficult to learn policy under such strong constraints. We +theoretically extend equivariant Markov kernels and simplify the condition of +equivariant diffusion process, thereby significantly improving training +efficiency for trajectory-level SE(3) equivariant diffusion policy in an +end-to-end manner. We evaluate ET-SEED on representative robotic manipulation +tasks, involving rigid body, articulated and deformable object. Experiments +demonstrate superior data efficiency and manipulation proficiency of our +proposed method, as well as its ability to generalize to unseen configurations +with only a few demonstrations. Website: https://et-seed.github.io/ + +
+
+ comment: Accept to CoRL 2024 Workshop on X-Embodiment Robot Learning +
+
+
+
+
+ + ☆ ReEdit: Multimodal Exemplar-Based Image Editing with Diffusion Models + + +
+ Modern Text-to-Image (T2I) Diffusion models have revolutionized image editing
+by enabling the generation of high-quality photorealistic images. While the de
+facto method for performing edits with T2I models is through text instructions,
+this approach is non-trivial due to the complex many-to-many mapping between
+natural language and images. In this work, we address exemplar-based image
+editing -- the task of transferring an edit from an exemplar pair to one or
+more content images. We propose ReEdit, a modular and efficient end-to-end
+framework that captures edits in both text and image modalities while ensuring
+the fidelity of the edited image. We validate the effectiveness of ReEdit
+through extensive comparisons with state-of-the-art baselines and sensitivity
+analyses of key design choices. Our results demonstrate that ReEdit
+consistently outperforms contemporary approaches both qualitatively and
+quantitatively. Additionally, ReEdit boasts high practical applicability, as it
+does not require any task-specific optimization and is four times faster than
+the next best baseline.
+
+ comment: First three authors contributed equally to this work +
+
+
+
+
+ + ☆ HRDecoder: High-Resolution Decoder Network for Fundus Image Lesion + Segmentation + + +
+ High resolution is crucial for precise segmentation in fundus images, yet +handling high-resolution inputs incurs considerable GPU memory costs, with +diminishing performance gains as overhead increases. To address this issue +while tackling the challenge of segmenting tiny objects, recent studies have +explored local-global fusion methods. These methods preserve fine details using +local regions and capture long-range context information from downscaled global +images. However, the necessity of multiple forward passes inevitably incurs +significant computational overhead, adversely affecting inference speed. In +this paper, we propose HRDecoder, a simple High-Resolution Decoder network for +fundus lesion segmentation. It integrates a high-resolution representation +learning module to capture fine-grained local features and a high-resolution +fusion module to fuse multi-scale predictions. Our method effectively improves +the overall segmentation accuracy of fundus lesions while consuming reasonable +memory and computational overhead, and maintaining satisfying inference speed. +Experimental results on the IDRID and DDR datasets demonstrate the +effectiveness of our method. Code is available at +https://github.com/CVIU-CSU/HRDecoder. + +
+
+ comment: 11 pages, 3 figures, accepted by MICCAI 2024, the revised version +
+
+
+
+
+ + ☆ Face Reconstruction from Face Embeddings using Adapter to a Face + Foundation Model + + +
+ Face recognition systems extract embedding vectors from face images and use +these embeddings to verify or identify individuals. Face reconstruction attack +(also known as template inversion) refers to reconstructing face images from +face embeddings and using the reconstructed face image to enter a face +recognition system. In this paper, we propose to use a face foundation model to +reconstruct face images from the embeddings of a blackbox face recognition +model. The foundation model is trained with 42M images to generate face images +from the facial embeddings of a fixed face recognition model. We propose to use +an adapter to translate target embeddings into the embedding space of the +foundation model. The generated images are evaluated on different face +recognition models and different datasets, demonstrating the effectiveness of +our method to translate embeddings of different face recognition models. We +also evaluate the transferability of reconstructed face images when attacking +different face recognition models. Our experimental results show that our +reconstructed face images outperform previous reconstruction attacks against +face recognition models. + +
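+ A rough sketch of what such an adapter could look like (architecture,
+dimensions, and the cosine alignment loss are guesses, not the paper's design):
+an MLP maps embeddings of the target blackbox recognizer into the embedding
+space expected by the foundation model's generator.
+import torch
+import torch.nn as nn
+
+class EmbeddingAdapter(nn.Module):
+    # Illustrative adapter: target-model embedding -> foundation-model embedding space.
+    def __init__(self, target_dim=512, foundation_dim=512, hidden=1024):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(target_dim, hidden), nn.ReLU(),
+            nn.Linear(hidden, foundation_dim),
+        )
+
+    def forward(self, target_embedding):
+        return nn.functional.normalize(self.net(target_embedding), dim=-1)
+
+# Training sketch: align adapted embeddings with the foundation model's own
+# embeddings of the same identities using a cosine-similarity loss.
+adapter = EmbeddingAdapter()
+opt = torch.optim.Adam(adapter.parameters(), lr=1e-4)
+target_emb, foundation_emb = torch.randn(8, 512), torch.randn(8, 512)  # stand-in batches
+loss = 1 - nn.functional.cosine_similarity(adapter(target_emb), foundation_emb).mean()
+loss.backward()
+opt.step()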
+
+
+
+
+ + ☆ Energy Score-based Pseudo-Label Filtering and Adaptive Loss for + Imbalanced Semi-supervised SAR target recognition + + +
+ Automatic target recognition (ATR) is an important use case for synthetic
+aperture radar (SAR) image interpretation. Recent years have seen significant
+advancements in SAR ATR technology based on semi-supervised learning. However,
+existing semi-supervised SAR ATR algorithms show low recognition accuracy in
+the case of class imbalance. This work offers a class-imbalanced
+semi-supervised SAR target recognition approach using dynamic energy scores and
+adaptive losses. First, an energy score-based method is developed to
+dynamically select unlabeled samples near the training distribution for
+pseudo-labeling during training, ensuring pseudo-label reliability under
+long-tailed distributions. Second, loss functions suited to class imbalance are
+proposed, including an adaptive margin perception loss and an adaptive hard
+triplet loss. The former offsets inter-class confusion in the classifier,
+alleviating the imbalance issue inherent in pseudo-label generation; the latter
+effectively tackles the model's preference for the majority class by focusing
+on complex, difficult samples during training. Experimental results on
+extremely imbalanced SAR datasets demonstrate that the proposed method performs
+well under the dual constraints of scarce labels and data imbalance,
+effectively overcoming the model bias caused by data imbalance and achieving
+high-precision target recognition.
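+ The energy-based selection step can be sketched as follows (the dynamic
+threshold schedule and the paper's exact selection rule are not reproduced; the
+free-energy score itself follows the standard logsumexp definition):
+import torch
+
+def energy_score(logits, T=1.0):
+    # Free-energy score of the classifier output; lower energy generally indicates
+    # samples closer to the training distribution.
+    return -T * torch.logsumexp(logits / T, dim=-1)
+
+def select_pseudo_labels(logits_unlabeled, energy_threshold):
+    # Illustrative filtering: keep only unlabeled samples whose energy falls below
+    # a chosen threshold and pseudo-label them with the argmax class.
+    energies = energy_score(logits_unlabeled)
+    keep = energies < energy_threshold
+    pseudo = logits_unlabeled.argmax(dim=-1)
+    return keep, pseudo[keep]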
+
+
+
+
+ + ☆ Act in Collusion: A Persistent Distributed Multi-Target Backdoor in + Federated Learning + + +
+ Federated learning, a novel paradigm designed to protect data privacy, is +vulnerable to backdoor attacks due to its distributed nature. Current research +often designs attacks based on a single attacker with a single backdoor, +overlooking more realistic and complex threats in federated learning. We +propose a more practical threat model for federated learning: the distributed +multi-target backdoor. In this model, multiple attackers control different +clients, embedding various triggers and targeting different classes, +collaboratively implanting backdoors into the global model via central +aggregation. Empirical validation shows that existing methods struggle to +maintain the effectiveness of multiple backdoors in the global model. Our key +insight is that similar backdoor triggers cause parameter conflicts and +injecting new backdoors disrupts gradient directions, significantly weakening +some backdoors performance. To solve this, we propose a Distributed +Multi-Target Backdoor Attack (DMBA), ensuring efficiency and persistence of +backdoors from different malicious clients. To avoid parameter conflicts, we +design a multi-channel dispersed frequency trigger strategy to maximize trigger +differences. To mitigate gradient interference, we introduce backdoor replay in +local training to neutralize conflicting gradients. Extensive validation shows +that 30 rounds after the attack, Attack Success Rates of three different +backdoors from various clients remain above 93%. The code will be made publicly +available after the review period. + +
+
+
+
+
+ + ☆ Self-supervised Representation Learning for Cell Event Recognition + through Time Arrow Prediction + + +
+ The spatio-temporal nature of live-cell microscopy data poses challenges in
+the analysis of cell states, which is fundamental in bioimaging. Deep-learning
+based segmentation or tracking methods rely on large amounts of high-quality
+annotations to work effectively. In this work, we explore an alternative
+solution: using feature maps obtained from self-supervised representation
+learning (SSRL) on time arrow prediction (TAP) for the downstream supervised
+task of cell event recognition. We demonstrate through extensive experiments
+and analysis that this approach can achieve better performance with limited
+annotation compared to models trained end to end using a fully supervised
+approach. Our analysis also provides insight into applications of SSRL using
+TAP in live-cell microscopy.
+
+
+
+
+ + ☆ ROBIN: Robust and Invisible Watermarks for Diffusion Models with + Adversarial Optimization NeurIPS 2024 + + +
+ Watermarking generative content serves as a vital tool for authentication, +ownership protection, and mitigation of potential misuse. Existing watermarking +methods face the challenge of balancing robustness and concealment. They +empirically inject a watermark that is both invisible and robust and passively +achieve concealment by limiting the strength of the watermark, thus reducing +the robustness. In this paper, we propose to explicitly introduce a watermark +hiding process to actively achieve concealment, thus allowing the embedding of +stronger watermarks. To be specific, we implant a robust watermark in an +intermediate diffusion state and then guide the model to hide the watermark in +the final generated image. We employ an adversarial optimization algorithm to +produce the optimal hiding prompt guiding signal for each watermark. The prompt +embedding is optimized to minimize artifacts in the generated image, while the +watermark is optimized to achieve maximum strength. The watermark can be +verified by reversing the generation process. Experiments on various diffusion +models demonstrate the watermark remains verifiable even under significant +image tampering and shows superior invisibility compared to other +state-of-the-art robust watermarking methods. + +
+
+ comment: Accept to NeurIPS 2024 +
+
+
+
+
+ + ☆ FedRISE: Rating Induced Sign Election of Gradients for Byzantine + Tolerant Federated Aggregation + + +
+ One of the most common defense strategies against model poisoning in +federated learning is to employ a robust aggregator mechanism that makes the +training more resilient. Many of the existing Byzantine robust aggregators +provide theoretical guarantees and are empirically effective against certain +categories of attacks. However, we observe that certain high-strength attacks +can subvert the aggregator and collapse the training. In addition, most +aggregators require identifying tolerant settings to converge. Impact of +attacks becomes more pronounced when the number of Byzantines is near-majority, +and becomes harder to evade if the attacker is omniscient with access to data, +honest updates and aggregation methods. Motivated by these observations, we +develop a robust aggregator called FedRISE for cross-silo FL that is consistent +and less susceptible to poisoning updates by an omniscient attacker. The +proposed method explicitly determines the optimal direction of each gradient +through a sign-voting strategy that uses variance-reduced sparse gradients. We +argue that vote weighting based on the cosine similarity of raw gradients is +misleading, and we introduce a sign-based gradient valuation function that +ignores the gradient magnitude. We compare our method against 8 robust +aggregators under 6 poisoning attacks on 3 datasets and architectures. Our +results show that existing robust aggregators collapse for at least some +attacks under severe settings, while FedRISE demonstrates better robustness +because of a stringent gradient inclusion formulation. + +
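+ A simplified picture of sign-based gradient election (inspired by the
+description above but not identical to FedRISE, which additionally uses
+variance-reduced sparse gradients and a rating-induced valuation function):
+import torch
+
+def sign_election_aggregate(client_grads, client_ratings=None):
+    # client_grads: list of flattened per-client gradient tensors.
+    # The update direction of each coordinate is decided by an (optionally
+    # rating-weighted) majority vote over the signs of the client gradients,
+    # deliberately ignoring gradient magnitudes in the vote.
+    G = torch.stack(client_grads)                       # (n_clients, n_params)
+    if client_ratings is None:
+        client_ratings = torch.ones(G.shape[0])
+    votes = (client_ratings[:, None] * torch.sign(G)).sum(dim=0)
+    elected_sign = torch.sign(votes)                    # winning direction per coordinate
+    # Use a robust magnitude (median of |g| across clients) along the elected direction.
+    magnitude = G.abs().median(dim=0).values
+    return elected_sign * magnitude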
+
+ comment: This is a work under submission/review process +
+
+
+
+
+ + ☆ MambaPEFT: Exploring Parameter-Efficient Fine-Tuning for Mamba + + +
+ An ecosystem of Transformer-based models has been established by building +large models with extensive data. Parameter-efficient fine-tuning (PEFT) is a +crucial technology for deploying these models to downstream tasks with minimal +cost while achieving effective performance. Recently, Mamba, a State Space +Model (SSM)-based model, has attracted attention as a potential alternative to +Transformers. While many large-scale Mamba-based models have been proposed, +efficiently adapting pre-trained Mamba-based models to downstream tasks remains +unexplored. In this paper, we conduct an exploratory analysis of PEFT methods +for Mamba. We investigate the effectiveness of existing PEFT methods for +Transformers when applied to Mamba. We also modify these methods to better +align with the Mamba architecture. Additionally, we propose new Mamba-specific +PEFT methods that leverage the distinctive structure of Mamba. Our experiments +indicate that PEFT performs more effectively for Mamba than Transformers. +Lastly, we demonstrate how to effectively combine multiple PEFT methods and +provide a framework that outperforms previous works. To ensure reproducibility, +we will release the code after publication. + +
+
+
+
+
+ + ☆ An Edge Computing-Based Solution for Real-Time Leaf Disease + Classification using Thermal Imaging + + +
+ Deep learning (DL) technologies can transform agriculture by improving crop +health monitoring and management, thus improving food safety. In this paper, we +explore the potential of edge computing for real-time classification of leaf +diseases using thermal imaging. We present a thermal image dataset for plant +disease classification and evaluate deep learning models, including +InceptionV3, MobileNetV1, MobileNetV2, and VGG-16, on resource-constrained +devices like the Raspberry Pi 4B. Using pruning and quantization-aware +training, these models achieve inference times up to 1.48x faster on Edge TPU +Max for VGG16, and up to 2.13x faster with precision reduction on Intel NCS2 +for MobileNetV1, compared to high-end GPUs like the RTX 3090, while maintaining +state-of-the-art accuracy. + +
+
+
+
+
+ + ☆ An Enhancement of Haar Cascade Algorithm Applied to Face Recognition for + Gate Pass Security + + +
+ This study is focused on enhancing the Haar Cascade Algorithm to decrease the +false positive and false negative rate in face matching and face detection to +increase the accuracy rate even under challenging conditions. The face +recognition library was implemented with Haar Cascade Algorithm in which the +128-dimensional vectors representing the unique features of a face are encoded. +A subprocess was applied where the grayscale image from Haar Cascade was +converted to RGB to improve the face encoding. Logical process and face +filtering are also used to decrease non-face detection. The Enhanced Haar +Cascade Algorithm produced a 98.39% accuracy rate (21.39% increase), 63.59% +precision rate, 98.30% recall rate, and 72.23% in F1 Score. In comparison, the +Haar Cascade Algorithm achieved a 46.70% to 77.00% accuracy rate, 44.15% +precision rate, 98.61% recall rate, and 47.01% in F1 Score. Both algorithms +used the Confusion Matrix Test with 301,950 comparisons using the same dataset +of 550 images. The 98.39% accuracy rate shows a significant decrease in false +positive and false negative rates in facial recognition. Face matching and face +detection are more accurate in images with complex backgrounds, lighting +variations, and occlusions, or even those with similar attributes. + +
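+ The described pipeline roughly corresponds to the following sketch (thresholds
+and the paper's logical filtering steps are omitted; the face_recognition
+library is assumed for the 128-dimensional encodings):
+import cv2
+import face_recognition  # dlib-based library providing 128-d face encodings
+
+def detect_and_encode(image_path,
+                      cascade_path=cv2.data.haarcascades + "haarcascade_frontalface_default.xml"):
+    # Haar Cascade face detection on a grayscale image, conversion back to RGB
+    # for encoding, then 128-dimensional face encodings for matching.
+    bgr = cv2.imread(image_path)
+    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
+    faces = cv2.CascadeClassifier(cascade_path).detectMultiScale(
+        gray, scaleFactor=1.1, minNeighbors=5)
+    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)                # encode on RGB, not grayscale
+    boxes = [(y, x + w, y + h, x) for (x, y, w, h) in faces]  # (top, right, bottom, left)
+    return face_recognition.face_encodings(rgb, known_face_locations=boxes)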
+
+
+
+
+ + ☆ Generalize or Detect? Towards Robust Semantic Segmentation Under + Multiple Distribution Shifts NeurIPS 2024 + + +
+ In open-world scenarios, where both novel classes and domains may exist, an +ideal segmentation model should detect anomaly classes for safety and +generalize to new domains. However, existing methods often struggle to +distinguish between domain-level and semantic-level distribution shifts, +leading to poor out-of-distribution (OOD) detection or domain generalization +performance. In this work, we aim to equip the model to generalize effectively +to covariate-shift regions while precisely identifying semantic-shift regions. +To achieve this, we design a novel generative augmentation method to produce +coherent images that incorporate both anomaly (or novel) objects and various +covariate shifts at both image and object levels. Furthermore, we introduce a +training strategy that recalibrates uncertainty specifically for semantic +shifts and enhances the feature extractor to align features associated with +domain shifts. We validate the effectiveness of our method across benchmarks +featuring both semantic and domain shifts. Our method achieves state-of-the-art +performance across all benchmarks for both OOD detection and domain +generalization. Code is available at +https://github.com/gaozhitong/MultiShiftSeg. + +
+
+ comment: Published in NeurIPS 2024 +
+
+
+
+
+ + ☆ Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM + Data Contamination + + +
+ The rapid progression of multimodal large language models (MLLMs) has +demonstrated superior performance on various multimodal benchmarks. However, +the issue of data contamination during training creates challenges in +performance evaluation and comparison. While numerous methods exist for +detecting dataset contamination in large language models (LLMs), they are less +effective for MLLMs due to their various modalities and multiple training +phases. In this study, we introduce a multimodal data contamination detection +framework, MM-Detect, designed for MLLMs. Our experimental results indicate +that MM-Detect is sensitive to varying degrees of contamination and can +highlight significant performance improvements due to leakage of the training +set of multimodal benchmarks. Furthermore, We also explore the possibility of +contamination originating from the pre-training phase of LLMs used by MLLMs and +the fine-tuning phase of MLLMs, offering new insights into the stages at which +contamination may be introduced. + +
+
+
+
+
+ + ☆ SA3DIP: Segment Any 3D Instance with Potential 3D Priors + + +
+ The proliferation of 2D foundation models has sparked research into adapting +them for open-world 3D instance segmentation. Recent methods introduce a +paradigm that leverages superpoints as geometric primitives and incorporates 2D +multi-view masks from Segment Anything model (SAM) as merging guidance, +achieving outstanding zero-shot instance segmentation results. However, the +limited use of 3D priors restricts the segmentation performance. Previous +methods calculate the 3D superpoints solely based on estimated normal from +spatial coordinates, resulting in under-segmentation for instances with similar +geometry. Besides, the heavy reliance on SAM and hand-crafted algorithms in 2D +space suffers from over-segmentation due to SAM's inherent part-level +segmentation tendency. To address these issues, we propose SA3DIP, a novel +method for Segmenting Any 3D Instances via exploiting potential 3D Priors. +Specifically, on one hand, we generate complementary 3D primitives based on +both geometric and textural priors, which reduces the initial errors that +accumulate in subsequent procedures. On the other hand, we introduce +supplemental constraints from the 3D space by using a 3D detector to guide a +further merging process. Furthermore, we notice a considerable portion of +low-quality ground truth annotations in ScanNetV2 benchmark, which affect the +fair evaluations. Thus, we present ScanNetV2-INS with complete ground truth +labels and supplement additional instances for 3D class-agnostic instance +segmentation. Experimental evaluations on various 2D-3D datasets demonstrate +the effectiveness and robustness of our approach. Our code and proposed +ScanNetV2-INS dataset are available HERE. + +
+
+
+
+
+ + ☆ GS2Pose: Two-stage 6D Object Pose Estimation Guided by Gaussian + Splatting + + +
+ This paper proposes a new method for accurate and robust 6D pose estimation +of novel objects, named GS2Pose. By introducing 3D Gaussian splatting, GS2Pose +can utilize the reconstruction results without requiring a high-quality CAD +model, which means it only requires segmented RGBD images as input. +Specifically, GS2Pose employs a two-stage structure consisting of coarse +estimation followed by refined estimation. In the coarse stage, a lightweight +U-Net network with a polarization attention mechanism, called Pose-Net, is +designed. By using the 3DGS model for supervised training, Pose-Net can +generate NOCS images to compute a coarse pose. In the refinement stage, GS2Pose +formulates a pose regression algorithm following the idea of reprojection or +Bundle Adjustment (BA), referred to as GS-Refiner. By leveraging Lie algebra to +extend 3DGS, GS-Refiner obtains a pose-differentiable rendering pipeline that +refines the coarse pose by comparing the input images with the rendered images. +GS-Refiner also selectively updates parameters in the 3DGS model to achieve +environmental adaptation, thereby enhancing the algorithm's robustness and +flexibility to illuminative variation, occlusion, and other challenging +disruptive factors. GS2Pose was evaluated through experiments conducted on the +LineMod dataset, where it was compared with similar algorithms, yielding highly +competitive results. The code for GS2Pose will soon be released on GitHub. + +
+
+
+
+
+ + ☆ VQA$^2$:Visual Question Answering for Video Quality Assessment + + +
+ The advent and proliferation of large multi-modal models (LMMs) have
+introduced a new paradigm to video-related computer vision fields, including
+training and inference methods based on visual question answering (VQA). These
+methods enable models to handle multiple downstream tasks robustly. Video
+Quality Assessment (VQA), a classic field in low-level visual quality
+evaluation, originally focused on quantitative video quality scoring. However,
+driven by advances in LMMs, it is now evolving towards more comprehensive
+visual quality understanding tasks. Visual question answering has significantly
+improved low-level visual evaluation within the image domain recently. However,
+related work is almost nonexistent in the video domain, leaving substantial
+room for improvement. To address this gap, we introduce the VQA2 Instruction
+Dataset, the first visual question answering instruction dataset that focuses
+entirely on video quality assessment, and, based on it, we propose the VQA2
+series of models. The VQA2 Instruction Dataset consists of three stages and
+covers various video types, containing 157,735 instruction question-answer
+pairs, including both manually annotated and synthetic data. We conduct
+extensive experiments on both video quality scoring and video quality
+understanding tasks. Results demonstrate that the VQA2 series models achieve
+state-of-the-art (SOTA) performance in quality scoring tasks, and their
+performance in visual quality question answering surpasses the renowned GPT-4o.
+Additionally, our final model, the VQA2-Assistant, performs well across both
+scoring and question-answering tasks, validating its versatility.
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Harmformer: Harmonic Networks Meet Transformers for Continuous + Roto-Translation Equivariance NeurIPS 2024 + + +
+ CNNs exhibit inherent equivariance to image translation, leading to efficient
+parameter and data usage, faster learning, and improved robustness. The concept
+of translation equivariant networks has been successfully extended to rotation
+transformation using group convolution for discrete rotation groups and
+harmonic functions for the continuous rotation group encompassing $360^\circ$.
+We explore the compatibility of the self-attention (SA) mechanism with full
+rotation equivariance, in contrast to previous studies that focused on discrete
+rotation. We introduce the Harmformer, a harmonic transformer with a
+convolutional stem that achieves equivariance for both translation and
+continuous rotation. Accompanied by an end-to-end equivariance proof, the
+Harmformer not only outperforms previous equivariant transformers, but also
+demonstrates inherent stability under any continuous rotation, even without
+seeing rotated samples during training.
+
+ comment: Appears in NeurIPS 2024 Workshop on Symmetry and Geometry in Neural + Representations +
+
+
+
+
+ + ☆ Sub-DM:Subspace Diffusion Model with Orthogonal Decomposition for MRI + Reconstruction + + +
+ Diffusion model-based approaches have recently achieved remarkable success in
+MRI reconstruction, but integration into clinical routine remains challenging
+due to their time-consuming convergence. This is particularly notable when the
+conventional diffusion process is applied directly to k-space data without
+considering the inherent properties of k-space sampling, limiting k-space
+learning efficiency and image reconstruction quality. To tackle these
+challenges, we introduce a subspace diffusion model with orthogonal
+decomposition, a method (referred to as Sub-DM) that restricts the diffusion
+process via projections onto a subspace as the k-space data distribution
+evolves toward noise. In particular, the subspace diffusion model circumvents
+the inference challenges posed by the complex and high-dimensional
+characteristics of k-space data; the highly compact subspace ensures that the
+diffusion process requires only a few simple iterations to produce accurate
+prior information. Furthermore, the orthogonal decomposition strategy based on
+the wavelet transform prevents information loss during the migration of the
+vanilla diffusion process to the subspace. Since the strategy is approximately
+reversible, the entire process can be reversed. As a result, it allows the
+diffusion processes in different spaces to refine the models through a mutual
+feedback mechanism, enabling the learning of an accurate prior even when
+dealing with complex k-space data. Comprehensive experiments on different
+datasets clearly demonstrate the superiority of Sub-DM over state-of-the-art
+methods in terms of reconstruction speed and quality.
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ☆ Deferred Poisoning: Making the Model More Vulnerable via Hessian + Singularization + + +
+ Recent studies have shown that deep learning models are very vulnerable to
+poisoning attacks. Many defense methods have been proposed to address this
+issue. However, traditional poisoning attacks are not as threatening as
+commonly believed. This is because they often cause differences in how the
+model performs on the training set compared to the validation set. Such
+inconsistency can alert defenders that their data has been poisoned, allowing
+them to take the necessary defensive actions. In this paper, we introduce a
+more threatening type of poisoning attack called the Deferred Poisoning Attack.
+This new attack allows the model to function normally during the training and
+validation phases but makes it very sensitive to evasion attacks or even
+natural noise. We achieve this by ensuring the poisoned model's loss function
+has a value similar to that of a normally trained model at each input sample,
+but with much larger local curvature. A similar model loss ensures that there
+is no obvious inconsistency between the training and validation accuracy,
+demonstrating high stealthiness. On the other hand, the large curvature implies
+that a small perturbation may cause a significant increase in model loss,
+leading to substantial performance degradation, which reflects weaker
+robustness. We fulfill this purpose by making the model's Hessian singular at
+the optimum via our proposed Singularization Regularization term. We have
+conducted both theoretical and empirical analyses of the proposed method and
+validated its effectiveness through experiments on image classification tasks.
+Furthermore, we have confirmed the hazards of this form of poisoning attack
+under more general scenarios using natural noise, offering a new perspective
+for research in the field of security.
+
+
+
+
+ + ☆ Homotopy Continuation Made Easy: Regression-based Online Simulation of + Starting Problem-Solution Pairs + + +
+ While automatically generated polynomial elimination templates have sparked +great progress in the field of 3D computer vision, there remain many problems +for which the degree of the constraints or the number of unknowns leads to +intractability. In recent years, homotopy continuation has been introduced as a +plausible alternative. However, the method currently depends on expensive +parallel tracking of all possible solutions in the complex domain, or a +classification network for starting problem-solution pairs trained over a +limited set of real-world examples. Our innovation consists of employing a +regression network trained in simulation to directly predict a solution from +input correspondences, followed by an online simulator that invents a +consistent problem-solution pair. Subsequently, homotopy continuation is +applied to track that single solution back to the original problem. We apply +this elegant combination to generalized camera resectioning, and also introduce +a new solution to the challenging generalized relative pose and scale problem. +As demonstrated, the proposed method successfully compensates the raw error +committed by the regressor alone, and leads to state-of-the-art efficiency and +success rates while running on CPU resources, only. + +
+
+
+
+
+ + ☆ NeurIPS 2023 Competition: Privacy Preserving Federated Learning Document + VQA + + +
+ The Privacy Preserving Federated Learning Document VQA (PFL-DocVQA) +competition challenged the community to develop provably private and +communication-efficient solutions in a federated setting for a real-life use +case: invoice processing. The competition introduced a dataset of real invoice +documents, along with associated questions and answers requiring information +extraction and reasoning over the document images. Thereby, it brings together +researchers and expertise from the document analysis, privacy, and federated +learning communities. Participants fine-tuned a pre-trained, state-of-the-art +Document Visual Question Answering model provided by the organizers for this +new domain, mimicking a typical federated invoice processing setup. The base +model is a multi-modal generative language model, and sensitive information +could be exposed through either the visual or textual input modality. +Participants proposed elegant solutions to reduce communication costs while +maintaining a minimum utility threshold in track 1 and to protect all +information from each document provider using differential privacy in track 2. +The competition served as a new testbed for developing and testing private +federated learning methods, simultaneously raising awareness about privacy +within the document image analysis and recognition community. Ultimately, the +competition analysis provides best practices and recommendations for +successfully running privacy-focused federated learning challenges in the +future. + +
+
+ comment: 27 pages, 6 figures +
+
+
+
+
+ + ☆ Relation Learning and Aggregate-attention for Multi-person Motion + Prediction + + +
+ Multi-person motion prediction is an emerging and intricate task with broad
+real-world applications. Unlike single-person motion prediction, it considers
+not just the skeleton structures or human trajectories but also the
+interactions between individuals. Previous methods use various networks to
+achieve impressive predictions but often overlook that the joint relations
+within an individual (intra-relation) and interactions among groups
+(inter-relation) are distinct types of representations. These methods often
+lack explicit representation of inter- and intra-relations, and inevitably
+introduce undesired dependencies. To address this issue, we introduce a new
+collaborative framework for multi-person motion prediction that explicitly
+models these relations: a GCN-based network for intra-relations and a novel
+reasoning network for inter-relations. Moreover, we propose a novel
+plug-and-play aggregation module called the Interaction Aggregation Module
+(IAM), which employs an aggregate-attention mechanism to seamlessly integrate
+these relations. Experiments indicate that the module can also be applied to
+other dual-path models. Extensive experiments on the 3DPW, 3DPW-RC, CMU-Mocap,
+and MuPoTS-3D datasets, as well as the synthesized datasets Mix1 & Mix2 (9 to
+15 persons), demonstrate that our method achieves state-of-the-art performance.
+
+ comment: Submitted to IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ Efficient Fourier Filtering Network with Contrastive Learning for + UAV-based Unaligned Bi-modal Salient Object Detection + + +
+ Unmanned aerial vehicle (UAV)-based bi-modal salient object detection (BSOD) +aims to segment salient objects in a scene utilizing complementary cues in +unaligned RGB and thermal image pairs. However, the high computational expense +of existing UAV-based BSOD models limits their applicability to real-world UAV +devices. To address this problem, we propose an efficient Fourier filter +network with contrastive learning that achieves both real-time and accurate +performance. Specifically, we first design a semantic contrastive alignment +loss to align the two modalities at the semantic level, which facilitates +mutual refinement in a parameter-free way. Second, inspired by the fast Fourier +transform that obtains global relevance in linear complexity, we propose +synchronized alignment fusion, which aligns and fuses bi-modal features in the +channel and spatial dimensions by a hierarchical filtering mechanism. Our +proposed model, AlignSal, reduces the number of parameters by 70.0%, decreases +the floating point operations by 49.4%, and increases the inference speed by +152.5% compared to the cutting-edge BSOD model (i.e., MROS). Extensive +experiments on the UAV RGB-T 2400 and three weakly aligned datasets demonstrate +that AlignSal achieves both real-time inference speed and better performance +and generalizability compared to sixteen state-of-the-art BSOD models across +most evaluation metrics. In addition, our ablation studies further verify +AlignSal's potential in boosting the performance of existing aligned BSOD +models on UAV-based unaligned data. The code is available at: +https://github.com/JoshuaLPF/AlignSal. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ PX2Tooth: Reconstructing the 3D Point Cloud Teeth from a Single + Panoramic X-ray + + +
+ Reconstructing the 3D anatomical structures of the oral cavity, which are
+originally captured with cone-beam CT (CBCT), from a single 2D Panoramic
+X-ray (PX) remains a critical yet challenging task, as it can effectively
+reduce radiation risks and treatment costs during diagnosis in digital
+dentistry. However, current methods are either error-prone or only
+trained/evaluated on small-scale datasets (fewer than 50 cases), resulting in
+compromised trustworthiness. In this paper, we propose PX2Tooth, a novel
+approach to reconstruct 3D teeth using a single PX image with a two-stage
+framework. First, we design the PXSegNet to segment the permanent teeth from
+the PX images, providing clear positional, morphological, and categorical
+information for each tooth. Subsequently, we design a novel tooth generation
+network (TGNet) that learns to transform random point clouds into 3D teeth.
+TGNet integrates the segmented patch information and introduces a Prior Fusion
+Module (PFM) to enhance the generation quality, especially in the root apex
+region. Moreover, we construct a dataset comprising 499 pairs of CBCT and
+Panoramic X-rays. Extensive experiments demonstrate that PX2Tooth can achieve
+an Intersection over Union (IoU) of 0.793, significantly surpassing previous
+methods, underscoring the great potential of artificial intelligence in digital
+dentistry.
+
+ comment: Ma W, Wu H, Xiao Z, et al. PX2Tooth: Reconstructing the 3D Point + Cloud Teeth from a Single Panoramic X-Ray[C]//International Conference on + Medical Image Computing and Computer-Assisted Intervention. Cham: Springer + Nature Switzerland, 2024: 411-421 +
+
+
+
+
+ + ☆ Estimation of Psychosocial Work Environment Exposures Through Video + Object Detection. Proof of Concept Using CCTV Footage + + +
+ This paper examines the use of computer vision algorithms to estimate aspects
+of the psychosocial work environment using CCTV footage. We present a proof of
+concept for a methodology that detects and tracks people in video footage and
+estimates interactions between customers and employees by estimating their
+poses and calculating the duration of their encounters. We propose a pipeline
+that combines existing object detection and tracking algorithms (YOLOv8 and
+DeepSORT) with pose estimation algorithms (BlazePose) to estimate the number of
+customers and employees in the footage as well as the duration of their
+encounters. We use a simple rule-based approach to classify the interactions as
+positive, neutral, or negative based on three different criteria: distance,
+duration, and pose. The proposed methodology is tested on a small dataset of
+CCTV footage. While the data are quite limited, particularly with respect to
+the quality of the footage, we chose this case because it represents a typical
+setting where the method could be applied. The results show that the object
+detection and tracking part of the pipeline performs reasonably on the dataset,
+with a high degree of recall and reasonable accuracy. At this stage, the pose
+estimation is still too limited to fully detect the type of interaction, owing
+to difficulties in tracking employees in the footage. We conclude that the
+method is a promising alternative to self-reported measures of the psychosocial
+work environment and could be used in future studies to obtain external
+observations of the work environment.
+
+
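+ For illustration only, the rule-based classification step described above can
+be sketched as follows; the threshold values, the Encounter fields, and the
+classify_interaction helper are hypothetical stand-ins rather than the authors'
+implementation, which is assumed to aggregate per-frame YOLOv8/DeepSORT tracks
+and BlazePose keypoints into such encounter summaries.
+
+from dataclasses import dataclass
+
+@dataclass
+class Encounter:
+    """One customer-employee encounter aggregated from tracked detections."""
+    min_distance_m: float   # closest approach between the two tracks
+    duration_s: float       # how long the tracks stayed within interaction range
+    facing_score: float     # pose-based proxy in [0, 1]; 1 = facing each other
+
+def classify_interaction(enc: Encounter,
+                         near_m: float = 1.5,
+                         long_s: float = 30.0,
+                         facing_thresh: float = 0.5) -> str:
+    """Toy rule set combining the three criteria (distance, duration, pose)."""
+    if enc.min_distance_m > near_m:
+        return "no interaction"
+    if enc.facing_score < facing_thresh:
+        return "negative"
+    return "positive" if enc.duration_s >= long_s else "neutral"
+
+print(classify_interaction(Encounter(0.8, 45.0, 0.9)))  # -> positive
+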
+ comment: 11 pages, 9 figures, presented at IWOAR 9th International Workshop on + Sensor-Based Activity Recognition and Artificial Intelligence, September + 26-27, Potsdam, Germany +
+
+
+
+
+ + ☆ Zero-shot Dynamic MRI Reconstruction with Global-to-local Diffusion + Model + + +
+ Diffusion models have recently demonstrated considerable advancement in the
+generation and reconstruction of magnetic resonance imaging (MRI) data. These
+models exhibit great potential in handling unsampled data and reducing noise,
+highlighting their promise as generative models. However, their application in
+dynamic MRI remains relatively underexplored. This is primarily due to the
+substantial amount of fully-sampled data typically required for training, which
+is difficult to obtain in dynamic MRI due to its spatio-temporal complexity and
+high acquisition costs. To address this challenge, we propose a dynamic MRI
+reconstruction method based on a time-interleaved acquisition scheme, termed
+the Global-to-local Diffusion Model. Specifically, fully encoded
+full-resolution reference data are constructed by merging under-sampled k-space
+data from adjacent time frames, generating two distinct bulk training datasets
+for global and local models. The global-to-local diffusion framework
+alternately optimizes global information and local image details, enabling
+zero-shot reconstruction. Extensive experiments demonstrate that the proposed
+method performs well in terms of noise reduction and detail preservation,
+achieving reconstruction quality comparable to that of supervised approaches.
+
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ☆ These Maps Are Made by Propagation: Adapting Deep Stereo Networks to + Road Scenarios with Decisive Disparity Diffusion + + +
+ Stereo matching has emerged as a cost-effective solution for road surface 3D
+reconstruction, garnering significant attention towards improving both
+computational efficiency and accuracy. This article introduces decisive
+disparity diffusion (D3Stereo), marking the first exploration of dense deep
+feature matching that adapts pre-trained deep convolutional neural networks
+(DCNNs) to previously unseen road scenarios. A pyramid of cost volumes is
+initially created using various levels of learned representations.
+Subsequently, a novel recursive bilateral filtering algorithm is employed to
+aggregate these costs. A key innovation of D3Stereo lies in its alternating
+decisive disparity diffusion strategy, wherein intra-scale diffusion is
+employed to complete sparse disparity images, while inter-scale inheritance
+provides valuable prior information for higher resolutions. Extensive
+experiments conducted on our created UDTIRI-Stereo and Stereo-Road datasets
+underscore the effectiveness of the D3Stereo strategy in adapting pre-trained
+DCNNs and its superior performance compared to all other explicit
+programming-based algorithms designed specifically for road surface 3D
+reconstruction. Additional experiments conducted on the Middlebury dataset with
+backbone DCNNs pre-trained on the ImageNet database further validate the
+versatility of the D3Stereo strategy in tackling general stereo matching
+problems.
+
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Explaining Human Activity Recognition with SHAP: Validating Insights + with Perturbation and Quantitative Measures + + +
+ In Human Activity Recognition (HAR), understanding the intricacy of body +movements within high-risk applications is essential. This study uses SHapley +Additive exPlanations (SHAP) to explain the decision-making process of Graph +Convolution Networks (GCNs) when classifying activities with skeleton data. We +employ SHAP to explain two real-world datasets: one for cerebral palsy (CP) +classification and the widely used NTU RGB+D 60 action recognition dataset. To +test the explanation, we introduce a novel perturbation approach that modifies +the model's edge importance matrix, allowing us to evaluate the impact of +specific body key points on prediction outcomes. To assess the fidelity of our +explanations, we employ informed perturbation, targeting body key points +identified as important by SHAP and comparing them against random perturbation +as a control condition. This perturbation enables a judgment on whether the +body key points are truly influential or non-influential based on the SHAP +values. Results on both datasets show that body key points identified as +important through SHAP have the largest influence on the accuracy, specificity, +and sensitivity metrics. Our findings highlight that SHAP can provide granular +insights into the input feature contribution to the prediction outcome of GCNs +in HAR tasks. This demonstrates the potential for more interpretable and +trustworthy models in high-stakes applications like healthcare or +rehabilitation. + +
+
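+ As a minimal sketch of the informed-versus-random perturbation comparison
+described above (the edge-importance matrix, joint count, and SHAP values below
+are placeholders, not the paper's data or code):
+
+import numpy as np
+
+def perturb_edge_importance(A: np.ndarray, joints: np.ndarray) -> np.ndarray:
+    """Zero out the edge-importance entries of the selected joints."""
+    A = A.copy()
+    A[joints, :] = 0.0
+    A[:, joints] = 0.0
+    return A
+
+rng = np.random.default_rng(0)
+num_joints, k = 25, 5                        # e.g. a 25-joint skeleton
+A = rng.random((num_joints, num_joints))     # placeholder edge-importance matrix
+shap_importance = rng.random(num_joints)     # placeholder per-joint SHAP values
+
+informed = np.argsort(shap_importance)[-k:]              # top-k SHAP joints
+random_ctrl = rng.choice(num_joints, size=k, replace=False)
+
+A_informed = perturb_edge_importance(A, informed)
+A_random = perturb_edge_importance(A, random_ctrl)
+# Re-evaluating the GCN with A_informed versus A_random (not shown) indicates
+# whether the SHAP-selected joints are truly the influential ones.
+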
+
+
+
+ + ☆ Fine-Tuning Vision-Language Model for Automated Engineering Drawing + Information Extraction + + +
+ Geometric Dimensioning and Tolerancing (GD&T) plays a critical role in +manufacturing by defining acceptable variations in part features to ensure +component quality and functionality. However, extracting GD&T information from +2D engineering drawings is a time-consuming and labor-intensive task, often +relying on manual efforts or semi-automated tools. To address these challenges, +this study proposes an automated and computationally efficient GD&T extraction +method by fine-tuning Florence-2, an open-source vision-language model (VLM). +The model is trained on a dataset of 400 drawings with ground truth annotations +provided by domain experts. For comparison, two state-of-the-art closed-source +VLMs, GPT-4o and Claude-3.5-Sonnet, are evaluated on the same dataset. All +models are assessed using precision, recall, F1-score, and hallucination +metrics. Due to the computational cost and impracticality of fine-tuning large +closed-source VLMs for domain-specific tasks, GPT-4o and Claude-3.5-Sonnet are +evaluated in a zero-shot setting. In contrast, Florence-2, a smaller model with +0.23 billion parameters, is optimized through full-parameter fine-tuning across +three distinct experiments, each utilizing datasets augmented to different +levels. The results show that Florence-2 achieves a 29.95% increase in +precision, a 37.75% increase in recall, a 52.40% improvement in F1-score, and a +43.15% reduction in hallucination rate compared to the best-performing +closed-source model. These findings highlight the effectiveness of fine-tuning +smaller, open-source VLMs like Florence-2, offering a practical and efficient +solution for automated GD&T extraction to support downstream manufacturing +tasks. + +
+
+ comment: Paper has been submitted to the 9th International Conference on + Innovation in Artificial Intelligence (ICIAI 2025) +
+
+
+
+
+ + ☆ 3DGS-CD: 3D Gaussian Splatting-based Change Detection for Physical + Object Rearrangement + + +
+ We present 3DGS-CD, the first 3D Gaussian Splatting (3DGS)-based method for
+detecting physical object rearrangements in 3D scenes. Our approach estimates
+3D object-level changes by comparing two sets of unaligned images taken at
+different times. Leveraging 3DGS's novel view rendering and EfficientSAM's
+zero-shot segmentation capabilities, we detect 2D object-level changes, which
+are then associated and fused across views to estimate 3D changes. Our method
+can detect changes in cluttered environments using sparse post-change images
+within as little as 18s, using as few as a single new image. It does not rely
+on depth input, user instructions, object classes, or object models -- an
+object is recognized simply if it has been rearranged. Our approach is
+evaluated on both public and self-collected real-world datasets, achieving up
+to 14% higher accuracy and three orders of magnitude faster performance
+compared to the state-of-the-art radiance-field-based change detection method.
+This significant performance boost enables a broad range of downstream
+applications, where we highlight three key use cases: object reconstruction,
+robot workspace reset, and 3DGS model update. Our code and data will be made
+available at https://github.com/520xyxyzq/3DGS-CD.
+
+
+
+
+
+ + ☆ Graph-Based Multi-Modal Sensor Fusion for Autonomous Driving + + +
+ The growing demand for robust scene understanding in mobile robotics and +autonomous driving has highlighted the importance of integrating multiple +sensing modalities. By combining data from diverse sensors like cameras and +LIDARs, fusion techniques can overcome the limitations of individual sensors, +enabling a more complete and accurate perception of the environment. We +introduce a novel approach to multi-modal sensor fusion, focusing on developing +a graph-based state representation that supports critical decision-making +processes in autonomous driving. We present a Sensor-Agnostic Graph-Aware +Kalman Filter [3], the first online state estimation technique designed to fuse +multi-modal graphs derived from noisy multi-sensor data. The estimated +graph-based state representations serve as a foundation for advanced +applications like Multi-Object Tracking (MOT), offering a comprehensive +framework for enhancing the situational awareness and safety of autonomous +systems. We validate the effectiveness of our proposed framework through +extensive experiments conducted on both synthetic and real-world driving +datasets (nuScenes). Our results showcase an improvement in MOTA and a +reduction in estimated position errors (MOTP) and identity switches (IDS) for +tracked objects using the SAGA-KF. Furthermore, we highlight the capability of +such a framework to develop methods that can leverage heterogeneous information +(like semantic objects and geometric structures) from various sensing +modalities, enabling a more holistic approach to scene understanding and +enhancing the safety and effectiveness of autonomous systems. + +
+
+ comment: An extended abstract accepted at Young Researchers' Symposium, ICVGIP + '24. This extended abstract contains the following: 1. Short summary of our + work, SAGA-KF, accepted at ICPR'24. 2. A proposal that was awarded the + Qualcomm Innovation Fellowship'24 +
+
+
+
+
+ + ☆ OccLoff: Learning Optimized Feature Fusion for 3D Occupancy Prediction + + +
+ 3D semantic occupancy prediction is crucial for finely representing the
+surrounding environment, which is essential for ensuring safety in autonomous
+driving. Existing fusion-based occupancy methods typically involve performing a
+2D-to-3D view transformation on image features, followed by computationally
+intensive 3D operations to fuse these with LiDAR features, leading to high
+computational costs and reduced accuracy. Moreover, current research on
+occupancy prediction predominantly focuses on designing specific network
+architectures, often tailored to particular models, with limited attention
+given to the more fundamental aspect of semantic feature learning. This gap
+hinders the development of more transferable methods that could enhance the
+performance of various occupancy models. To address these challenges, we
+propose OccLoff, a framework that Learns to Optimize Feature Fusion for 3D
+occupancy prediction. Specifically, we introduce a sparse fusion encoder with
+entropy masks that directly fuses 3D and 2D features, improving model accuracy
+while reducing computational overhead. Additionally, we propose a transferable
+proxy-based loss function and an adaptive hard sample weighting algorithm,
+which enhance the performance of several state-of-the-art methods. Extensive
+evaluations on the nuScenes and SemanticKITTI benchmarks demonstrate the
+superiority of our framework, and ablation studies confirm the effectiveness of
+each proposed module.
+
+
+
+
+
+ + ☆ AMNCutter: Affinity-Attention-Guided Multi-View Normalized Cutter for + Unsupervised Surgical Instrument Segmentation + + +
+ Surgical instrument segmentation (SIS) is pivotal for robotic-assisted +minimally invasive surgery, assisting surgeons by identifying surgical +instruments in endoscopic video frames. Recent unsupervised surgical instrument +segmentation (USIS) methods primarily rely on pseudo-labels derived from +low-level features such as color and optical flow, but these methods show +limited effectiveness and generalizability in complex and unseen endoscopic +scenarios. In this work, we propose a label-free unsupervised model featuring a +novel module named Multi-View Normalized Cutter (m-NCutter). Different from +previous USIS works, our model is trained using a graph-cutting loss function +that leverages patch affinities for supervision, eliminating the need for +pseudo-labels. The framework adaptively determines which affinities from which +levels should be prioritized. Therefore, the low- and high-level features and +their affinities are effectively integrated to train a label-free unsupervised +model, showing superior effectiveness and generalization ability. We conduct +comprehensive experiments across multiple SIS datasets to validate our +approach's state-of-the-art (SOTA) performance, robustness, and exceptional +potential as a pre-trained model. Our code is released at +https://github.com/MingyuShengSMY/AMNCutter. + +
+
+ comment: This paper was accepted by the 2025 IEEE Winter Conference on + Applications of Computer Vision (WACV) +
+
+
+
+
+ + ☆ Where Do We Stand with Implicit Neural Representations? A Technical and + Performance Survey + + +
+ Implicit Neural Representations (INRs) have emerged as a paradigm in +knowledge representation, offering exceptional flexibility and performance +across a diverse range of applications. INRs leverage multilayer perceptrons +(MLPs) to model data as continuous implicit functions, providing critical +advantages such as resolution independence, memory efficiency, and +generalisation beyond discretised data structures. Their ability to solve +complex inverse problems makes them particularly effective for tasks including +audio reconstruction, image representation, 3D object reconstruction, and +high-dimensional data synthesis. This survey provides a comprehensive review of +state-of-the-art INR methods, introducing a clear taxonomy that categorises +them into four key areas: activation functions, position encoding, combined +strategies, and network structure optimisation. We rigorously analyse their +critical properties, such as full differentiability, smoothness, compactness, +and adaptability to varying resolutions while also examining their strengths +and limitations in addressing locality biases and capturing fine details. Our +experimental comparison offers new insights into the trade-offs between +different approaches, showcasing the capabilities and challenges of the latest +INR techniques across various tasks. In addition to identifying areas where +current methods excel, we highlight key limitations and potential avenues for +improvement, such as developing more expressive activation functions, enhancing +positional encoding mechanisms, and improving scalability for complex, +high-dimensional data. This survey serves as a roadmap for researchers, +offering practical guidance for future exploration in the field of INRs. We aim +to foster new methodologies by outlining promising research directions for INRs +and applications. + +
+
+
+
+
+ + ☆ Towards 3D Semantic Scene Completion for Autonomous Driving: A + Meta-Learning Framework Empowered by Deformable Large-Kernel Attention and + Mamba Model + + +
+ Semantic scene completion (SSC) is essential for achieving comprehensive +perception in autonomous driving systems. However, existing SSC methods often +overlook the high deployment costs in real-world applications. Traditional +architectures, such as 3D Convolutional Neural Networks (3D CNNs) and +self-attention mechanisms, face challenges in efficiently capturing long-range +dependencies within 3D voxel grids, limiting their effectiveness. To address +these issues, we introduce MetaSSC, a novel meta-learning-based framework for +SSC that leverages deformable convolution, large-kernel attention, and the +Mamba (D-LKA-M) model. Our approach begins with a voxel-based semantic +segmentation (SS) pretraining task, aimed at exploring the semantics and +geometry of incomplete regions while acquiring transferable meta-knowledge. +Using simulated cooperative perception datasets, we supervise the perception +training of a single vehicle using aggregated sensor data from multiple nearby +connected autonomous vehicles (CAVs), generating richer and more comprehensive +labels. This meta-knowledge is then adapted to the target domain through a +dual-phase training strategy that does not add extra model parameters, enabling +efficient deployment. To further enhance the model's capability in capturing +long-sequence relationships within 3D voxel grids, we integrate Mamba blocks +with deformable convolution and large-kernel attention into the backbone +network. Extensive experiments demonstrate that MetaSSC achieves +state-of-the-art performance, significantly outperforming competing models +while also reducing deployment costs. + +
+
+
+
+
+ + ☆ Touchstone Benchmark: Are We on the Right Way for Evaluating AI + Algorithms for Medical Segmentation? NeurIPS-2024 + + +
+ How can we test AI performance? This question seems trivial, but it isn't. +Standard benchmarks often have problems such as in-distribution and small-size +test sets, oversimplified metrics, unfair comparisons, and short-term outcome +pressure. As a consequence, good performance on standard benchmarks does not +guarantee success in real-world scenarios. To address these problems, we +present Touchstone, a large-scale collaborative segmentation benchmark of 9 +types of abdominal organs. This benchmark is based on 5,195 training CT scans +from 76 hospitals around the world and 5,903 testing CT scans from 11 +additional hospitals. This diverse test set enhances the statistical +significance of benchmark results and rigorously evaluates AI algorithms across +various out-of-distribution scenarios. We invited 14 inventors of 19 AI +algorithms to train their algorithms, while our team, as a third party, +independently evaluated these algorithms on three test sets. In addition, we +also evaluated pre-existing AI frameworks--which, differing from algorithms, +are more flexible and can support different algorithms--including MONAI from +NVIDIA, nnU-Net from DKFZ, and numerous other open-source frameworks. We are +committed to expanding this benchmark to encourage more innovation of AI +algorithms for the medical domain. + +
+
+ comment: Accepted to NeurIPS-2024 +
+
+
+
+
+ + Adaptive Stereo Depth Estimation with Multi-Spectral Images Across All + Lighting Conditions + + +
+ Depth estimation under adverse conditions remains a significant challenge. +Recently, multi-spectral depth estimation, which integrates both visible light +and thermal images, has shown promise in addressing this issue. However, +existing algorithms struggle with precise pixel-level feature matching, +limiting their ability to fully exploit geometric constraints across different +spectra. To address this, we propose a novel framework incorporating stereo +depth estimation to enforce accurate geometric constraints. In particular, we +treat the visible light and thermal images as a stereo pair and utilize a +Cross-modal Feature Matching (CFM) Module to construct a cost volume for +pixel-level matching. To mitigate the effects of poor lighting on stereo +matching, we introduce Degradation Masking, which leverages robust monocular +thermal depth estimation in degraded regions. Our method achieves +state-of-the-art (SOTA) performance on the Multi-Spectral Stereo (MS2) dataset, +with qualitative evaluations demonstrating high-quality depth maps under +varying lighting conditions. + +
+
+
+
+
+ + ☆ Structure Consistent Gaussian Splatting with Matching Prior for Few-shot + Novel View Synthesis NeurIPS 2024 + + +
+ Despite the substantial progress of novel view synthesis, existing methods,
+whether based on Neural Radiance Fields (NeRF) or the more recent 3D Gaussian
+Splatting (3DGS), suffer significant degradation when the input becomes sparse.
+Numerous efforts have been introduced to alleviate this problem, but they still
+struggle to synthesize satisfactory results efficiently, especially in large
+scenes. In this paper, we propose SCGaussian, a Structure Consistent Gaussian
+Splatting method using matching priors to learn 3D consistent scene structure.
+Considering the high interdependence of Gaussian attributes, we optimize the
+scene structure in two respects: the rendering geometry and, more importantly,
+the positions of the Gaussian primitives, which are hard to constrain directly
+in vanilla 3DGS due to their unstructured nature. To achieve this, we present a
+hybrid Gaussian representation. In addition to ordinary unstructured Gaussian
+primitives, our model also includes ray-based Gaussian primitives that are
+bound to matching rays and whose positions are optimized only along those rays.
+Thus, we can utilize the matching correspondence to directly enforce the
+positions of these Gaussian primitives to converge to the surface points where
+rays intersect. Extensive experiments on forward-facing, surrounding, and
+complex large scenes show the effectiveness of our approach with
+state-of-the-art performance and high efficiency. Code is available at
+https://github.com/prstrive/SCGaussian.
+
+
+ comment: NeurIPS 2024 Accepted +
+
+
+
+
+ + ☆ StreamingBench: Assessing the Gap for MLLMs to Achieve Streaming Video + Understanding + + +
+ The rapid development of Multimodal Large Language Models (MLLMs) has +expanded their capabilities from image comprehension to video understanding. +However, most of these MLLMs focus primarily on offline video comprehension, +necessitating extensive processing of all video frames before any queries can +be made. This presents a significant gap compared to the human ability to +watch, listen, think, and respond to streaming inputs in real time, +highlighting the limitations of current MLLMs. In this paper, we introduce +StreamingBench, the first comprehensive benchmark designed to evaluate the +streaming video understanding capabilities of MLLMs. StreamingBench assesses +three core aspects of streaming video understanding: (1) real-time visual +understanding, (2) omni-source understanding, and (3) contextual understanding. +The benchmark consists of 18 tasks, featuring 900 videos and 4,500 +human-curated QA pairs. Each video features five questions presented at +different time points to simulate a continuous streaming scenario. We conduct +experiments on StreamingBench with 13 open-source and proprietary MLLMs and +find that even the most advanced proprietary MLLMs like Gemini 1.5 Pro and +GPT-4o perform significantly below human-level streaming video understanding +capabilities. We hope our work can facilitate further advancements for MLLMs, +empowering them to approach human-level video comprehension and interaction in +more realistic scenarios. + +
+
+
+
+
+ + ☆ Cross Feature Fusion of Fundus Image and Generated Lesion Map for + Referable Diabetic Retinopathy Classification + + +
+ Diabetic Retinopathy (DR) is a primary cause of blindness, necessitating +early detection and diagnosis. This paper focuses on referable DR +classification to enhance the applicability of the proposed method in clinical +practice. We develop an advanced cross-learning DR classification method +leveraging transfer learning and cross-attention mechanisms. The proposed +method employs the Swin U-Net architecture to segment lesion maps from DR +fundus images. The Swin U-Net segmentation model, enriched with DR lesion +insights, is transferred to generate a lesion map. Both the fundus image and +its segmented lesion map are used as complementary inputs for the +classification model. A cross-attention mechanism is deployed to improve the +model's ability to capture fine-grained details from the input pairs. Our +experiments, utilizing two public datasets, FGADR and EyePACS, demonstrate a +superior accuracy of 94.6%, surpassing current state-of-the-art methods by +4.4%. To this end, we aim for the proposed method to be seamlessly integrated +into clinical workflows, enhancing accuracy and efficiency in identifying +referable DR. + +
+
+ comment: ACCV 2024 accepted +
+
+
+
+
+ + ☆ ADMIRE: a locally adaptive single-image, non-uniformity correction and + denoising algorithm: application to uncooled IR camera + + +
+ We propose a new way to correct for non-uniformity (NU) and noise in uncooled
+infrared images. The method works on static images and needs no registration,
+no camera motion, and no model of the non-uniformity. It uses a hybrid scheme
+combining an automatic locally adaptive contrast adjustment with a
+state-of-the-art image denoising method, which allows fully non-linear NU and
+noise to be corrected efficiently using only one image. We compared it with
+total variation on real raw and simulated NU infrared images. The strength of
+this approach lies in its simplicity and low computational cost. It needs no
+test pattern or calibration and produces no "ghost artefacts".
+
+
+
+
+
+ + ☆ LCP-Fusion: A Neural Implicit SLAM with Enhanced Local Constraints and + Computable Prior IROS 2024 + + +
+ Recently, dense Simultaneous Localization and Mapping (SLAM) based on neural
+implicit representations has shown impressive progress in hole filling and
+high-fidelity mapping. Nevertheless, existing methods either heavily rely on
+known scene bounds or suffer inconsistent reconstruction due to drift in
+potential loop-closure regions, or both, which can be attributed to the
+inflexible representation and lack of local constraints. In this paper, we
+present LCP-Fusion, a neural implicit SLAM system with enhanced local
+constraints and computable prior, which takes a sparse voxel octree structure
+containing feature grids and SDF priors as a hybrid scene representation,
+enabling scalability and robustness during mapping and tracking. To enhance the
+local constraints, we propose a novel sliding window selection strategy based
+on visual overlap to address loop closure, and a practical warping loss to
+constrain relative poses. Moreover, we estimate SDF priors as coarse
+initialization for implicit features, which brings additional explicit
+constraints and robustness, especially when a lightweight but efficient
+adaptive early-stopping scheme is adopted. Experiments demonstrate that our
+method achieves better localization accuracy and reconstruction consistency
+than existing RGB-D implicit SLAM systems, especially in challenging real
+scenes (ScanNet) as well as self-captured scenes with unknown scene bounds. The
+code is available at https://github.com/laliwang/LCP-Fusion.
+
+
+ comment: Accepted by 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ Hybrid Attention for Robust RGB-T Pedestrian Detection in Real-World + Conditions + + +
+ Multispectral pedestrian detection has gained significant attention in recent
+years, particularly in autonomous driving applications. To address the
+challenges posed by adverse illumination conditions, the combination of
+thermal and visible images has demonstrated its advantages. However, existing
+fusion methods rely on the critical assumption that the RGB-Thermal (RGB-T)
+image pairs are fully overlapping. This assumption often does not hold in
+real-world applications, where only partial overlap between images can occur
+due to the sensor configuration. Moreover, sensor failure can cause loss of
+information in one modality. In this paper, we propose a novel module called
+the Hybrid Attention (HA) mechanism as our main contribution to mitigate
+performance degradation caused by partial overlap and sensor failure, i.e.,
+when at least part of the scene is acquired by only one sensor. We propose an
+improved RGB-T fusion algorithm, robust against partial overlap and sensor
+failure encountered during inference in real-world applications. We also
+leverage a mobile-friendly backbone to cope with resource constraints in
+embedded systems. We conducted experiments by simulating various partial
+overlap and sensor failure scenarios to evaluate the performance of our
+proposed method. The results demonstrate that our approach outperforms
+state-of-the-art methods, showcasing its superiority in handling real-world
+challenges.
+
+
+ comment: Accepted for publication in IEEE Robotics and Automation Letters, + October 2024 +
+
+
+
+
+ + ☆ Towards Personalized Federated Learning via Comprehensive Knowledge + Distillation + + +
+ Federated learning is a distributed machine learning paradigm designed to +protect data privacy. However, data heterogeneity across various clients +results in catastrophic forgetting, where the model rapidly forgets previous +knowledge while acquiring new knowledge. To address this challenge, +personalized federated learning has emerged to customize a personalized model +for each client. However, the inherent limitation of this mechanism is its +excessive focus on personalization, potentially hindering the generalization of +those models. In this paper, we present a novel personalized federated learning +method that uses global and historical models as teachers and the local model +as the student to facilitate comprehensive knowledge distillation. The +historical model represents the local model from the last round of client +training, containing historical personalized knowledge, while the global model +represents the aggregated model from the last round of server aggregation, +containing global generalized knowledge. By applying knowledge distillation, we +effectively transfer global generalized knowledge and historical personalized +knowledge to the local model, thus mitigating catastrophic forgetting and +enhancing the general performance of personalized models. Extensive +experimental results demonstrate the significant advantages of our method. + +
+
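+ The dual-teacher distillation described above can be illustrated with a short
+sketch; the loss weights, temperature, and function name are illustrative
+assumptions rather than the paper's exact formulation:
+
+import torch
+import torch.nn.functional as F
+
+def comprehensive_kd_loss(student_logits, global_logits, hist_logits, labels,
+                          T=2.0, w_global=0.5, w_hist=0.5):
+    """Cross-entropy on local labels plus KL distillation from two teachers:
+    the aggregated global model and the client's previous-round local model."""
+    ce = F.cross_entropy(student_logits, labels)
+    kd_global = F.kl_div(F.log_softmax(student_logits / T, dim=1),
+                         F.softmax(global_logits / T, dim=1),
+                         reduction="batchmean") * T * T
+    kd_hist = F.kl_div(F.log_softmax(student_logits / T, dim=1),
+                       F.softmax(hist_logits / T, dim=1),
+                       reduction="batchmean") * T * T
+    return ce + w_global * kd_global + w_hist * kd_hist
+
+# toy usage with random logits for an 8-sample, 10-class batch
+logits = torch.randn(8, 10, requires_grad=True)
+loss = comprehensive_kd_loss(logits, torch.randn(8, 10), torch.randn(8, 10),
+                             torch.randint(0, 10, (8,)))
+loss.backward()
+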
+ comment: Accepted by IEEE SMC 2024 +
+
+
+
+
+ + ☆ The American Sign Language Knowledge Graph: Infusing ASL Models with + Linguistic Knowledge + + +
+ Language models for American Sign Language (ASL) could make language +technologies substantially more accessible to those who sign. To train models +on tasks such as isolated sign recognition (ISR) and ASL-to-English +translation, datasets provide annotated video examples of ASL signs. To +facilitate the generalizability and explainability of these models, we +introduce the American Sign Language Knowledge Graph (ASLKG), compiled from +twelve sources of expert linguistic knowledge. We use the ASLKG to train +neuro-symbolic models for 3 ASL understanding tasks, achieving accuracies of +91% on ISR, 14% for predicting the semantic features of unseen signs, and 36% +for classifying the topic of Youtube-ASL videos. + +
+
+
+
+
+ + ♻ ☆ No Train, all Gain: Self-Supervised Gradients Improve Deep Frozen + Representations NeurIPS 2024 + + +
+ This paper introduces FUNGI, Features from UNsupervised GradIents, a method +to enhance the features of transformer encoders by leveraging self-supervised +gradients. Our method is simple: given any pretrained model, we first compute +gradients from various self-supervised objectives for each input. These +gradients are projected to a lower dimension and then concatenated with the +model's output embedding. The resulting features are evaluated on k-nearest +neighbor classification over 11 datasets from vision, 5 from natural language +processing, and 2 from audio. Across backbones spanning various sizes and +pretraining strategies, FUNGI features provide consistent performance +improvements over the embeddings. We also show that using FUNGI features can +benefit linear classification, clustering and image retrieval, and that they +significantly improve the retrieval-based in-context scene understanding +abilities of pretrained models, for example improving upon DINO by +17% for +semantic segmentation - without any training. + +
+
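+ A minimal sketch of the general recipe (gradients of a self-supervised
+objective, projected and concatenated with the embedding); for brevity the
+gradient is taken with respect to the embedding rather than the model weights,
+and the encoder, projection, and toy loss are all placeholder assumptions:
+
+import torch
+
+def fungi_features(encoder, x, proj, ssl_loss_fn):
+    """Concatenate the frozen encoder's embedding with a random projection of
+    the gradient of a self-supervised loss evaluated at that embedding."""
+    emb = encoder(x).detach().requires_grad_(True)
+    loss = ssl_loss_fn(emb)
+    (grad,) = torch.autograd.grad(loss, emb)
+    return torch.cat([emb.detach(), grad @ proj], dim=1)
+
+encoder = torch.nn.Linear(128, 64)              # stand-in for a pretrained backbone
+proj = torch.randn(64, 16) / 16 ** 0.5          # random down-projection
+
+def toy_ssl_loss(emb):
+    a, b = emb[0::2], emb[1::2]                  # pretend pairs of augmented views
+    return -torch.nn.functional.cosine_similarity(a, b).mean()
+
+x = torch.randn(8, 128)
+feats = fungi_features(encoder, x, proj, toy_ssl_loss)
+print(feats.shape)                               # (8, 80): 64 embedding + 16 gradient dims
+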
+ comment: NeurIPS 2024. Code available at + https://github.com/WalterSimoncini/fungivision +
+
+
+
+
+ + ♻ ☆ DeNetDM: Debiasing by Network Depth Modulation NeurIPS 2024 + + +
+ Neural networks trained on biased datasets tend to inadvertently learn
+spurious correlations, hindering generalization. We formally prove that (1)
+samples that exhibit spurious correlations lie on a lower rank manifold
+relative to the ones that do not; and (2) the depth of a network acts as an
+implicit regularizer on the rank of the attribute subspace that is encoded in
+its representations. Leveraging these insights, we present DeNetDM, a novel
+debiasing method that uses network depth modulation as a way of developing
+robustness to spurious correlations. Using a training paradigm derived from
+Product of Experts, we create both biased and debiased branches with deep and
+shallow architectures and then distill knowledge to produce the target debiased
+model. Our method requires no bias annotations or explicit data augmentation
+while performing on par with approaches that require either or both. We
+demonstrate that DeNetDM outperforms existing debiasing techniques on both
+synthetic and real-world datasets by 5%. The project page is available at
+https://vssilpa.github.io/denetdm/.
+
+
+ comment: Camera-ready version : NeurIPS 2024, * indicates these authors + contributed equally +
+
+
+
+
+ + ♻ ☆ Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with + Enhanced Generalization and Personalization Abilities + + +
+ Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant +potential for modeling 3D head avatars, providing greater flexibility than +mesh-based methods and more efficient rendering compared to NeRF-based +approaches. Despite these advancements, the creation of controllable 3DGS-based +head avatars remains time-intensive, often requiring tens of minutes to hours. +To expedite this process, we here introduce the "Gaussian Deja-vu" framework, +which first obtains a generalized model of the head avatar and then +personalizes the result. The generalized model is trained on large 2D +(synthetic and real) image datasets. This model provides a well-initialized 3D +Gaussian head that is further refined using a monocular video to achieve the +personalized head avatar. For personalizing, we propose learnable +expression-aware rectification blendmaps to correct the initial 3D Gaussians, +ensuring rapid convergence without the reliance on neural networks. Experiments +demonstrate that the proposed method meets its objectives. It outperforms +state-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as +well as reduces training time consumption to at least a quarter of the existing +methods, producing the avatar in minutes. + +
+
+ comment: 11 pages, Accepted by WACV 2025 in Round 1 +
+
+
+
+
+ + ♻ ☆ bit2bit: 1-bit quanta video reconstruction via self-supervised photon + prediction NeurIPS 2024 + + +
+ Quanta image sensors, such as SPAD arrays, are an emerging sensor technology, +producing 1-bit arrays representing photon detection events over exposures as +short as a few nanoseconds. In practice, raw data are post-processed using +heavy spatiotemporal binning to create more useful and interpretable images at +the cost of degrading spatiotemporal resolution. In this work, we propose +bit2bit, a new method for reconstructing high-quality image stacks at the +original spatiotemporal resolution from sparse binary quanta image data. +Inspired by recent work on Poisson denoising, we developed an algorithm that +creates a dense image sequence from sparse binary photon data by predicting the +photon arrival location probability distribution. However, due to the binary +nature of the data, we show that the assumption of a Poisson distribution is +inadequate. Instead, we model the process with a Bernoulli lattice process from +the truncated Poisson. This leads to the proposal of a novel self-supervised +solution based on a masked loss function. We evaluate our method using both +simulated and real data. On simulated data from a conventional video, we +achieve 34.35 mean PSNR with extremely photon-sparse binary input (<0.06 +photons per pixel per frame). We also present a novel dataset containing a wide +range of real SPAD high-speed videos under various challenging imaging +conditions. The scenes cover strong/weak ambient light, strong motion, +ultra-fast events, etc., which will be made available to the community, on +which we demonstrate the promise of our approach. Both reconstruction quality +and throughput substantially surpass the state-of-the-art methods (e.g., Quanta +Burst Photography (QBP)). Our approach significantly enhances the visualization +and usability of the data, enabling the application of existing analysis +techniques. + +
+
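+ A rough sketch of a masked Bernoulli loss in the spirit described above: the
+network predicts a per-pixel rate lambda, the detection probability under the
+truncated-Poisson/Bernoulli model is 1 - exp(-lambda), and the loss is
+evaluated only on held-out (masked) pixels. All tensor shapes and names are
+illustrative, not the authors' code:
+
+import torch
+import torch.nn.functional as F
+
+def masked_bernoulli_loss(pred_rate, binary_obs, mask):
+    """Binary cross-entropy between the predicted detection probability and the
+    observed 1-bit photon data, restricted to masked pixels."""
+    p_detect = 1.0 - torch.exp(-pred_rate.clamp(min=1e-6))
+    bce = F.binary_cross_entropy(p_detect, binary_obs, reduction="none")
+    return (bce * mask).sum() / mask.sum().clamp(min=1.0)
+
+# toy usage on an 8x8 frame
+pred = torch.rand(1, 1, 8, 8) * 0.1              # predicted photon rates
+obs = (torch.rand(1, 1, 8, 8) < 0.05).float()    # sparse binary photon detections
+mask = (torch.rand(1, 1, 8, 8) < 0.5).float()    # held-out pixels
+print(masked_bernoulli_loss(pred, obs, mask))
+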
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ LDTrack: Dynamic People Tracking by Service Robots using Diffusion + Models + + +
+ Tracking of dynamic people in cluttered and crowded human-centered +environments is a challenging robotics problem due to the presence of +intraclass variations including occlusions, pose deformations, and lighting +variations. This paper introduces a novel deep learning architecture, using +conditional latent diffusion models, the Latent Diffusion Track (LDTrack), for +tracking multiple dynamic people under intraclass variations. By uniquely +utilizing conditional latent diffusion models to capture temporal person +embeddings, our architecture can adapt to appearance changes of people over +time. We incorporated a latent feature encoder network which enables the +diffusion process to operate within a high-dimensional latent space to allow +for the extraction and spatial-temporal refinement of such rich features as +person appearance, motion, location, identity, and contextual information. +Extensive experiments demonstrate the effectiveness of LDTrack over other +state-of-the-art tracking methods in cluttered and crowded human-centered +environments under intraclass variations. Namely, the results show our method +outperforms existing deep learning robotic people tracking methods in both +tracking accuracy and tracking precision with statistical significance. +Additionally, a comprehensive multi-object tracking comparison study was +performed against the state-of-the-art methods in urban environments, +demonstrating the generalizability of LDTrack. An ablation study was performed +to validate the design choices of LDTrack. + +
+
+
+
+
+ + ♻ ☆ DMPlug: A Plug-in Method for Solving Inverse Problems with Diffusion + Models NeurIPS 2024 + + +
+ Pretrained diffusion models (DMs) have recently been popularly used in +solving inverse problems (IPs). The existing methods mostly interleave +iterative steps in the reverse diffusion process and iterative steps to bring +the iterates closer to satisfying the measurement constraint. However, such +interleaving methods struggle to produce final results that look like natural +objects of interest (i.e., manifold feasibility) and fit the measurement (i.e., +measurement feasibility), especially for nonlinear IPs. Moreover, their +capabilities to deal with noisy IPs with unknown types and levels of +measurement noise are unknown. In this paper, we advocate viewing the reverse +process in DMs as a function and propose a novel plug-in method for solving IPs +using pretrained DMs, dubbed DMPlug. DMPlug addresses the issues of manifold +feasibility and measurement feasibility in a principled manner, and also shows +great potential for being robust to unknown types and levels of noise. Through +extensive experiments across various IP tasks, including two linear and three +nonlinear IPs, we demonstrate that DMPlug consistently outperforms +state-of-the-art methods, often by large margins especially for nonlinear IPs. +The code is available at https://github.com/sun-umn/DMPlug. + +
+
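+ The "reverse process as a function" idea can be sketched as follows: the seed
+noise z is optimized so that the measurement of the generated sample matches y.
+The reverse_fn and forward_op below are toy placeholders, not a real diffusion
+sampler or measurement operator:
+
+import torch
+
+def dmplug_solve(reverse_fn, forward_op, y, z_shape, steps=200, lr=1e-2):
+    """Optimize the initial noise z of a (differentiable) reverse process so the
+    simulated measurement of x = reverse_fn(z) fits the observation y."""
+    z = torch.randn(z_shape, requires_grad=True)
+    opt = torch.optim.Adam([z], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        x = reverse_fn(z)                          # whole reverse chain as one function
+        loss = ((forward_op(x) - y) ** 2).mean()   # data-fidelity term
+        loss.backward()
+        opt.step()
+    return reverse_fn(z).detach()
+
+# toy demo: the "reverse process" is a fixed nonlinear map, the measurement subsamples
+W = torch.randn(32, 16)
+reverse_fn = lambda z: torch.tanh(z @ W.T)
+forward_op = lambda x: x[:, ::4]
+y = forward_op(torch.tanh(torch.randn(1, 16) @ W.T))
+x_hat = dmplug_solve(reverse_fn, forward_op, y, (1, 16))
+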
+ comment: Published in NeurIPS 2024 + (https://openreview.net/forum?id=81IFFsfQUj) +
+
+
+
+
+ + ♻ ☆ Deep neural network-based detection of counterfeit products from + smartphone images + + +
+ Counterfeit products such as drugs and vaccines, as well as luxury items such
+as high-fashion handbags, watches, jewelry, garments, and cosmetics, represent
+significant direct losses of revenue to legitimate manufacturers and vendors,
+as well as indirect costs to societies at large. We present the world's first
+purely computer-vision-based system to combat such counterfeiting, one that
+does not require special security tags or other alterations to the products or
+modifications to supply chain tracking. Our deep neural network system shows
+high accuracy on branded garments from our first manufacturer tested (99.71%
+after 3.06% rejections) using images captured under natural, weakly controlled
+conditions, such as in retail stores, customs checkpoints, warehouses, and
+outdoors. Our system, suitably transfer-trained on a small number of fake and
+genuine articles, should find application in additional product categories as
+well, for example fashion accessories, perfume boxes, medicines, and more.
+
+
+
+
+
+ + ♻ ☆ CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale + + +
+ Measuring biodiversity is crucial for understanding ecosystem health. While +prior works have developed machine learning models for taxonomic classification +of photographic images and DNA separately, in this work, we introduce a +multimodal approach combining both, using CLIP-style contrastive learning to +align images, barcode DNA, and text-based representations of taxonomic labels +in a unified embedding space. This allows for accurate classification of both +known and unknown insect species without task-specific fine-tuning, leveraging +contrastive learning for the first time to fuse DNA and image data. Our method +surpasses previous single-modality approaches in accuracy by over 8% on +zero-shot learning tasks, showcasing its effectiveness in biodiversity studies. + +
+
+ comment: 25 pages with 11 figures +
+
+
+
+
+ + ♻ ☆ BetterDepth: Plug-and-Play Diffusion Refiner for Zero-Shot Monocular + Depth Estimation NeurIPS 2024 + + +
+ By training over large-scale datasets, zero-shot monocular depth estimation +(MDE) methods show robust performance in the wild but often suffer from +insufficient detail. Although recent diffusion-based MDE approaches exhibit a +superior ability to extract details, they struggle in geometrically complex +scenes that challenge their geometry prior, trained on less diverse 3D data. To +leverage the complementary merits of both worlds, we propose BetterDepth to +achieve geometrically correct affine-invariant MDE while capturing fine +details. Specifically, BetterDepth is a conditional diffusion-based refiner +that takes the prediction from pre-trained MDE models as depth conditioning, in +which the global depth layout is well-captured, and iteratively refines details +based on the input image. For the training of such a refiner, we propose global +pre-alignment and local patch masking methods to ensure BetterDepth remains +faithful to the depth conditioning while learning to add fine-grained scene +details. With efficient training on small-scale synthetic datasets, BetterDepth +achieves state-of-the-art zero-shot MDE performance on diverse public datasets +and on in-the-wild scenes. Moreover, BetterDepth can improve the performance of +other MDE models in a plug-and-play manner without further re-training. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Virchow2: Scaling Self-Supervised Mixed Magnification Models in + Pathology + + +
+ Foundation models are rapidly being developed for computational pathology +applications. However, it remains an open question which factors are most +important for downstream performance with data scale and diversity, model size, +and training algorithm all playing a role. In this work, we propose algorithmic +modifications, tailored for pathology, and we present the result of scaling +both data and model size, surpassing previous studies in both dimensions. We +introduce three new models: Virchow2, a 632 million parameter vision +transformer, Virchow2G, a 1.9 billion parameter vision transformer, and +Virchow2G Mini, a 22 million parameter distillation of Virchow2G, each trained +with 3.1 million histopathology whole slide images, with diverse tissues, +originating institutions, and stains. We achieve state of the art performance +on 12 tile-level tasks, as compared to the top performing competing models. Our +results suggest that data diversity and domain-specific methods can outperform +models that only scale in the number of parameters, but, on average, +performance benefits from the combination of domain-specific methods, data +scale, and model scale. + +
+
+
+
+
+ + ♻ ☆ Applying Guidance in a Limited Interval Improves Sample and Distribution + Quality in Diffusion Models NeurIPS 2024 + + +
+ Guidance is a crucial technique for extracting the best performance out of +image-generating diffusion models. Traditionally, a constant guidance weight +has been applied throughout the sampling chain of an image. We show that +guidance is clearly harmful toward the beginning of the chain (high noise +levels), largely unnecessary toward the end (low noise levels), and only +beneficial in the middle. We thus restrict it to a specific range of noise +levels, improving both the inference speed and result quality. This limited +guidance interval improves the record FID in ImageNet-512 significantly, from +1.81 to 1.40. We show that it is quantitatively and qualitatively beneficial +across different sampler parameters, network architectures, and datasets, +including the large-scale setting of Stable Diffusion XL. We thus suggest +exposing the guidance interval as a hyperparameter in all diffusion models that +use guidance. + +
+
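+ The restriction of guidance to a noise-level interval can be sketched in a few
+lines; the interval endpoints, guidance weight, and the stand-in denoiser are
+illustrative assumptions, not the paper's settings:
+
+import torch
+
+def guided_denoise(model, x, sigma, cond, uncond,
+                   w=3.0, sigma_lo=0.3, sigma_hi=5.0):
+    """Apply classifier-free guidance only when sigma_lo <= sigma <= sigma_hi;
+    outside the interval the conditional prediction is used unguided."""
+    d_cond = model(x, sigma, cond)
+    if sigma_lo <= sigma <= sigma_hi:
+        d_uncond = model(x, sigma, uncond)
+        return d_uncond + w * (d_cond - d_uncond)
+    return d_cond
+
+# toy usage with a linear stand-in "denoiser"
+model = lambda x, sigma, c: 0.9 * x + c
+x = torch.randn(1, 4)
+print(guided_denoise(model, x, sigma=1.0,
+                     cond=torch.ones(1, 4), uncond=torch.zeros(1, 4)))
+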
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Self-supervised 3D Point Cloud Completion via Multi-view Adversarial + Learning + + +
+ In real-world scenarios, scanned point clouds are often incomplete due to
+occlusion issues. The task of self-supervised point cloud completion involves
+reconstructing missing regions of these incomplete objects without the
+supervision of complete ground truth. Current self-supervised methods either
+rely on multiple views of partial observations for supervision or overlook the
+intrinsic geometric similarity that can be identified and utilized from the
+given partial point clouds. In this paper, we propose MAL-SPC, a framework that
+effectively leverages both object-level and category-specific geometric
+similarities to complete missing structures. Our MAL-SPC does not require any
+3D complete supervision and only necessitates a single partial point cloud for
+each object. Specifically, we first introduce a Pattern Retrieval Network to
+retrieve similar position and curvature patterns between the partial input and
+the predicted shape, then leverage these similarities to densify and refine the
+reconstructed results. Additionally, we render the reconstructed complete shape
+into multi-view depth maps and design an adversarial learning module to learn
+the geometry of the target shape from category-specific single-view depth
+images. To achieve anisotropic rendering, we design a density-aware radius
+estimation algorithm to improve the quality of the rendered images. Our MAL-SPC
+yields the best results compared to current state-of-the-art methods. We will
+make the source code publicly available at https://github.com/ltwu6/malspc.
+
+
+ comment: 14 pages,10 figures +
+
+
+
+
+ + ♻ ☆ ChartInsights: Evaluating Multimodal Large Language Models for Low-Level + Chart Question Answering + + +
+ Chart question answering (ChartQA) tasks play a critical role in interpreting
+and extracting insights from visualization charts. While recent advancements in
+multimodal large language models (MLLMs) like GPT-4o have shown promise in
+high-level ChartQA tasks, such as chart captioning, their effectiveness in
+low-level ChartQA tasks (e.g., identifying correlations) remains underexplored.
+In this paper, we address this gap by evaluating MLLMs on low-level ChartQA
+using a newly curated dataset, ChartInsights, which consists of 22,347 (chart,
+task, query, answer) tuples covering 10 data analysis tasks across 7 chart
+types. We systematically evaluate 19 advanced MLLMs, including 12 open-source
+and 7 closed-source models. The average accuracy rate across these models is
+39.8%, with GPT-4o achieving the highest accuracy at 69.17%. To further explore
+the limitations of MLLMs in low-level ChartQA, we conduct experiments that
+alter visual elements of charts (e.g., changing color schemes, adding image
+noise) to assess their impact on task effectiveness. Furthermore, we propose a
+new textual prompt strategy, Chain-of-Charts, tailored for low-level ChartQA
+tasks, which boosts performance by 14.41%, achieving an accuracy of 83.58%.
+Finally, incorporating a visual prompt strategy that directs attention to
+relevant visual elements further improves accuracy to 84.32%.
+
+
+ comment: EMNLP 2024 Conference Paper +
+
+
+
+
+ + ♻ ☆ EViT: An Eagle Vision Transformer with Bi-Fovea Self-Attention + + +
+ Owing to advancements in deep learning technology, Vision Transformers (ViTs)
+have demonstrated impressive performance in various computer vision tasks.
+Nonetheless, ViTs still face some challenges, such as high computational
+complexity and the absence of desirable inductive biases. To alleviate these
+issues, the potential advantages of combining eagle vision with ViTs are
+explored. We summarize a Bi-Fovea Visual Interaction (BFVI) structure inspired
+by the unique physiological and visual characteristics of eagle eyes. A novel
+Bi-Fovea Self-Attention (BFSA) mechanism and Bi-Fovea Feedforward Network
+(BFFN) are proposed based on this structural design approach, which can be used
+to mimic the hierarchical and parallel information processing scheme of the
+biological visual cortex, enabling networks to learn feature representations of
+targets in a coarse-to-fine manner. Furthermore, a Bionic Eagle Vision (BEV)
+block is designed as the basic building unit based on the BFSA mechanism and
+BFFN. By stacking BEV blocks, a unified and efficient family of pyramid
+backbone networks called Eagle Vision Transformers (EViTs) is developed.
+Experimental results show that EViTs exhibit highly competitive performance in
+various computer vision tasks, such as image classification, object detection
+and semantic segmentation. Compared with other approaches, EViTs have
+significant advantages, especially in terms of performance and computational
+efficiency. Code is available at https://github.com/nkusyl/EViT
+
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ Advantages of Neural Population Coding for Deep Learning + + +
+ Scalar variables, e.g., the orientation of a shape in an image, are commonly +predicted using a single output neuron in a neural network. In contrast, the +mammalian cortex represents variables with a population of neurons. In this +population code, each neuron is most active at its preferred value and shows +partial activity for other values. Here, we investigate the benefit of using a +population code for the output layer of a neural network. We compare population +codes against single-neuron outputs and one-hot vectors. First, we show +theoretically and in experiments with synthetic data that population codes +improve robustness to input noise in networks of stacked linear layers. Second, +we demonstrate the benefit of using population codes to encode ambiguous +outputs, such as the pose of symmetric objects. Using the T-LESS dataset of +feature-less real-world objects, we show that population codes improve the +accuracy of predicting 3D object orientation from image input. + +
+
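+ A small self-contained example of the output population code discussed above:
+a scalar such as an orientation is encoded by neurons with Gaussian tuning
+curves and decoded with a population vector. The number of neurons and tuning
+width are arbitrary illustrative choices:
+
+import numpy as np
+
+def encode_population(theta, prefs, width=0.3):
+    """Activities of neurons with Gaussian tuning curves centred at prefs."""
+    d = np.angle(np.exp(1j * (theta - prefs)))           # wrapped angular difference
+    return np.exp(-0.5 * (d / width) ** 2)
+
+def decode_population(act, prefs):
+    """Population-vector decoding: activity-weighted circular mean."""
+    return np.angle(np.sum(act * np.exp(1j * prefs)))
+
+prefs = np.linspace(-np.pi, np.pi, 32, endpoint=False)   # preferred orientations
+theta = 1.2
+act = encode_population(theta, prefs)
+print(decode_population(act, prefs))                      # close to 1.2
+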
+
+
+
+ + ♻ ☆ ATM: Improving Model Merging by Alternating Tuning and Merging + + +
+ Model merging has recently emerged as a cost-efficient paradigm for +multi-task learning. Among current approaches, task arithmetic stands out for +its simplicity and effectiveness. In this paper, we motivate the effectiveness +of task vectors by linking them to multi-task gradients. We show that in a +single-epoch scenario, task vectors are mathematically equivalent to the +gradients obtained via gradient descent in a multi-task setting, and still +approximate these gradients in subsequent epochs. Furthermore, we show that +task vectors perform optimally when equality is maintained, and their +effectiveness is largely driven by the first epoch's gradient. Building on this +insight, we propose viewing model merging as a single step in an iterative +process that Alternates between Tuning and Merging (ATM). This method acts as a +bridge between model merging and multi-task gradient descent, achieving +state-of-the-art results with the same data and computational requirements. We +extensively evaluate ATM across diverse settings, achieving up to 20% higher +accuracy in computer vision and NLP tasks, compared to the best baselines. +Finally, we provide both empirical and theoretical support for its +effectiveness, demonstrating increased orthogonality between task vectors and +proving that ATM minimizes an upper bound on the loss obtained by jointly +finetuning all tasks. + +
+
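+ A compact sketch of the alternating tune/merge loop built on task vectors;
+here "models" are plain dictionaries of tensors and the per-task fine-tuning
+step is an abstract callback, so everything below is an illustrative toy rather
+than the paper's training code:
+
+import torch
+
+def merge_task_vectors(base, finetuned, alpha=1.0):
+    """Task arithmetic: add the mean task vector (theta_t - theta_base) to the base."""
+    merged = {}
+    for name, p0 in base.items():
+        tv = torch.stack([ft[name] - p0 for ft in finetuned]).mean(0)
+        merged[name] = p0 + alpha * tv
+    return merged
+
+def atm(theta0, finetune_fn, tasks, rounds=3):
+    """Alternate between briefly fine-tuning the current merged model on each
+    task and merging the resulting task vectors back together."""
+    theta = theta0
+    for _ in range(rounds):
+        finetuned = [finetune_fn(theta, t) for t in tasks]
+        theta = merge_task_vectors(theta, finetuned)
+    return theta
+
+# toy usage: "fine-tuning" nudges the single parameter toward a task-specific target
+theta0 = {"w": torch.zeros(4)}
+finetune_fn = lambda th, t: {"w": th["w"] + 0.5 * (t - th["w"])}
+print(atm(theta0, finetune_fn, tasks=[torch.ones(4), -torch.ones(4)])["w"])
+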
+ comment: Main paper: 10 Pages, 11 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ IFAdapter: Instance Feature Control for Grounded Text-to-Image + Generation + + +
+ While Text-to-Image (T2I) diffusion models excel at generating visually +appealing images of individual instances, they struggle to accurately position +and control the features generation of multiple instances. The Layout-to-Image +(L2I) task was introduced to address the positioning challenges by +incorporating bounding boxes as spatial control signals, but it still falls +short in generating precise instance features. In response, we propose the +Instance Feature Generation (IFG) task, which aims to ensure both positional +accuracy and feature fidelity in generated instances. To address the IFG task, +we introduce the Instance Feature Adapter (IFAdapter). The IFAdapter enhances +feature depiction by incorporating additional appearance tokens and utilizing +an Instance Semantic Map to align instance-level features with spatial +locations. The IFAdapter guides the diffusion process as a plug-and-play +module, making it adaptable to various community models. For evaluation, we +contribute an IFG benchmark and develop a verification pipeline to objectively +compare models' abilities to generate instances with accurate positioning and +features. Experimental results demonstrate that IFAdapter outperforms other +models in both quantitative and qualitative evaluations. + +
+
+
+
+
+ + ♻ ☆ GVKF: Gaussian Voxel Kernel Functions for Highly Efficient Surface + Reconstruction in Open Scenes NeurIPS 2024 + + +
+ In this paper, we present a novel method for efficient and effective 3D
+surface reconstruction in open scenes. Existing Neural Radiance Fields (NeRF)
+based works typically require extensive training and rendering time due to the
+adopted implicit representations. In contrast, 3D Gaussian splatting (3DGS)
+uses an explicit and discrete representation, so the reconstructed surface
+is built from a huge number of Gaussian primitives, which leads to excessive
+memory consumption and rough surface details in sparse Gaussian areas. To
+address these issues, we propose Gaussian Voxel Kernel Functions (GVKF), which
+establish a continuous scene representation based on discrete 3DGS through
+kernel regression. GVKF integrates fast 3DGS rasterization and highly
+effective implicit scene representations, achieving high-fidelity open-scene
+surface reconstruction. Experiments on challenging scene datasets demonstrate
+the efficiency and effectiveness of our proposed GVKF, featuring high
+reconstruction quality, real-time rendering speed, and significant savings in
+storage and training memory consumption.
+
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ BCDNet: A Fast Residual Neural Network For Invasive Ductal Carcinoma + Detection + + +
+ Invasive Ductal Carcinoma (IDC) is the most common subtype of breast cancer,
+and diagnosing it at an early stage is of great significance. Although the
+powerful models in Computer-Aided Diagnosis (CAD) systems provide promising
+results, it is still difficult to integrate them into other medical devices or
+to use them without sufficient computational resources. In this paper, we
+propose BCDNet, which first upsamples the input image with a residual block
+and then uses smaller convolutional blocks and a special MLP to learn
+features. BCDNet is shown to effectively detect IDC in histopathological RGB
+images with an average accuracy of 91.6% and to reduce training cost
+effectively compared to ResNet 50 and ViT-B-16.
+
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Classification Done Right for Vision-Language Pre-Training NeurIPS 2024 + + +
+ We introduce SuperClass, a super simple classification method for
+vision-language pre-training on image-text data. Unlike its contrastive
+counterpart CLIP, which contrasts against a text encoder, SuperClass directly
+utilizes tokenized raw text as supervised classification labels, without the
+need for additional text filtering or selection. Due to the absence of a text
+encoding as the contrastive target, SuperClass does not require a text encoder
+and does not need to maintain a large batch size as CLIP does. SuperClass
+demonstrates superior performance on various downstream tasks, including
+classic computer vision benchmarks and vision-language downstream tasks. We
+further explore the scaling behavior of SuperClass with respect to model size,
+training length, and data size, and report encouraging results and comparisons
+to CLIP.
+https://github.com/x-cls/superclass
+
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Concept-Attention Whitening for Interpretable Skin Lesion Diagnosis + + +
+ The black-box nature of deep learning models has raised concerns about their +interpretability for successful deployment in real-world clinical applications. +To address the concerns, eXplainable Artificial Intelligence (XAI) aims to +provide clear and understandable explanations of the decision-making process. +In the medical domain, concepts such as attributes of lesions or abnormalities +serve as key evidence for deriving diagnostic results. Existing concept-based +models mainly depend on concepts that appear independently and require +fine-grained concept annotations such as bounding boxes. However, a medical +image usually contains multiple concepts, and the fine-grained concept +annotations are difficult to acquire. In this paper, we aim to interpret +representations in deep neural networks by aligning the axes of the latent +space with known concepts of interest. We propose a novel Concept-Attention +Whitening (CAW) framework for interpretable skin lesion diagnosis. CAW is +comprised of a disease diagnosis branch and a concept alignment branch. In the +former branch, we train a convolutional neural network (CNN) with an inserted +CAW layer to perform skin lesion diagnosis. The CAW layer decorrelates features +and aligns image features to conceptual meanings via an orthogonal matrix. In +the latter branch, the orthogonal matrix is calculated under the guidance of +the concept attention mask. We particularly introduce a weakly-supervised +concept mask generator that only leverages coarse concept labels for filtering +local regions that are relevant to certain concepts, improving the optimization +of the orthogonal matrix. Extensive experiments on two public skin lesion +diagnosis datasets demonstrated that CAW not only enhanced interpretability but +also maintained a state-of-the-art diagnostic performance. + +
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Degradation Oriented and Regularized Network for Blind Depth + Super-Resolution + + +
+ Recent RGB-guided depth super-resolution methods have achieved impressive
+performance under the assumption of fixed and known degradation (e.g., bicubic
+downsampling). However, in real-world scenarios, captured depth data often
+suffer from unconventional and unknown degradation due to sensor limitations
+and complex imaging environments (e.g., low reflective surfaces, varying
+illumination). Consequently, the performance of these methods significantly
+declines when real-world degradation deviates from their assumptions. In this
+paper, we propose the Degradation Oriented and Regularized Network (DORNet), a
+novel framework designed to adaptively address unknown degradation in
+real-world scenes through implicit degradation representations. Our approach
+begins with the development of a self-supervised degradation learning strategy,
+which models the degradation representations of low-resolution depth data using
+routing selection-based degradation regularization. To facilitate effective
+RGB-D fusion, we further introduce a degradation-oriented feature
+transformation module that selectively propagates RGB content into the depth
+data based on the learned degradation priors. Extensive experimental results on
+both real and synthetic datasets demonstrate the superiority of our DORNet in
+handling unknown degradation, outperforming existing methods. The code is
+available at https://github.com/yanzq95/DORNet.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Cross-Task Affinity Learning for Multitask Dense Scene Predictions + + +
+ Multitask learning (MTL) has become prominent for its ability to predict +multiple tasks jointly, achieving better per-task performance with fewer +parameters than single-task learning. Recently, decoder-focused architectures +have significantly improved multitask performance by refining task predictions +using features from related tasks. However, most refinement methods struggle to +efficiently capture both local and long-range dependencies between +task-specific representations and cross-task patterns. In this paper, we +introduce the Cross-Task Affinity Learning (CTAL) module, a lightweight +framework that enhances task refinement in multitask networks. CTAL effectively +captures local and long-range cross-task interactions by optimizing task +affinity matrices for parameter-efficient grouped convolutions without concern +for information loss. Our results demonstrate state-of-the-art MTL performance +for both CNN and transformer backbones, using significantly fewer parameters +than single-task learning. Our code is publicly available at +https://github.com/Armanfard-Lab/EMA-Net. + +
+
+ comment: Accepted for publication at the IEEE Winter Conference on + Applications of Computer Vision (WACV) 2025 +
+
+
+
+
+ + ♻ ☆ TFS-NeRF: Template-Free NeRF for Semantic 3D Reconstruction of Dynamic + Scene NeurIPS 2024 + + +
+ Despite advancements in Neural Implicit models for 3D surface reconstruction, +handling dynamic environments with arbitrary rigid, non-rigid, or deformable +entities remains challenging. Many template-based methods are entity-specific, +focusing on humans, while generic reconstruction methods adaptable to such +dynamic scenes often require additional inputs like depth or optical flow or +rely on pre-trained image features for reasonable outcomes. These methods +typically use latent codes to capture frame-by-frame deformations. In contrast, +some template-free methods bypass these requirements and adopt traditional LBS +(Linear Blend Skinning) weights for a detailed representation of deformable +object motions, although they involve complex optimizations leading to lengthy +training times. To this end, as a remedy, this paper introduces TFS-NeRF, a +template-free 3D semantic NeRF for dynamic scenes captured from sparse or +single-view RGB videos, featuring interactions among various entities and more +time-efficient than other LBS-based approaches. Our framework uses an +Invertible Neural Network (INN) for LBS prediction, simplifying the training +process. By disentangling the motions of multiple entities and optimizing +per-entity skinning weights, our method efficiently generates accurate, +semantically separable geometries. Extensive experiments demonstrate that our +approach produces high-quality reconstructions of both deformable and +non-deformable objects in complex interactions, with improved training +efficiency compared to existing methods. + +
+
+ comment: Accepted in NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ DeTikZify: Synthesizing Graphics Programs for Scientific Figures and + Sketches with TikZ NeurIPS 2024 + + +
+ Creating high-quality scientific figures can be time-consuming and +challenging, even though sketching ideas on paper is relatively easy. +Furthermore, recreating existing figures that are not stored in formats +preserving semantic information is equally complex. To tackle this problem, we +introduce DeTikZify, a novel multimodal language model that automatically +synthesizes scientific figures as semantics-preserving TikZ graphics programs +based on sketches and existing figures. To achieve this, we create three new +datasets: DaTikZv2, the largest TikZ dataset to date, containing over 360k +human-created TikZ graphics; SketchFig, a dataset that pairs hand-drawn +sketches with their corresponding scientific figures; and MetaFig, a collection +of diverse scientific figures and associated metadata. We train DeTikZify on +MetaFig and DaTikZv2, along with synthetically generated sketches learned from +SketchFig. We also introduce an MCTS-based inference algorithm that enables +DeTikZify to iteratively refine its outputs without the need for additional +training. Through both automatic and human evaluation, we demonstrate that +DeTikZify outperforms commercial Claude 3 and GPT-4V in synthesizing TikZ +programs, with the MCTS algorithm effectively boosting its performance. We make +our code, models, and datasets publicly available. + +
+
+ comment: Accepted at NeurIPS 2024 (spotlight); Project page: + https://github.com/potamides/DeTikZify +
+
+
+
+
+ + ♻ ☆ OmniGS: Fast Radiance Field Reconstruction using Omnidirectional + Gaussian Splatting + + +
+ Photorealistic reconstruction relying on 3D Gaussian Splatting has shown +promising potential in various domains. However, the current 3D Gaussian +Splatting system only supports radiance field reconstruction using undistorted +perspective images. In this paper, we present OmniGS, a novel omnidirectional +Gaussian splatting system, to take advantage of omnidirectional images for fast +radiance field reconstruction. Specifically, we conduct a theoretical analysis +of spherical camera model derivatives in 3D Gaussian Splatting. According to +the derivatives, we then implement a new GPU-accelerated omnidirectional +rasterizer that directly splats 3D Gaussians onto the equirectangular screen +space for omnidirectional image rendering. We realize differentiable +optimization of the omnidirectional radiance field without the requirement of +cube-map rectification or tangent-plane approximation. Extensive experiments +conducted in egocentric and roaming scenarios demonstrate that our method +achieves state-of-the-art reconstruction quality and high rendering speed using +omnidirectional images. The code will be publicly available. + +
+
+ comment: 8 pages, 6 figures, accepted by WACV 2025, project page: + https://liquorleaf.github.io/research/OmniGS/ +
+
+
+
+
+ + ♻ ☆ Beyond Grid Data: Exploring Graph Neural Networks for Earth Observation + + +
+ Earth Observation (EO) data analysis has been significantly revolutionized by +deep learning (DL), with applications typically limited to grid-like data +structures. Graph Neural Networks (GNNs) emerge as an important innovation, +propelling DL into the non-Euclidean domain. Naturally, GNNs can effectively +tackle the challenges posed by diverse modalities, multiple sensors, and the +heterogeneous nature of EO data. To introduce GNNs in the related domains, our +review begins by offering fundamental knowledge on GNNs. Then, we summarize the +generic problems in EO, to which GNNs can offer potential solutions. Following +this, we explore a broad spectrum of GNNs' applications to scientific problems +in Earth systems, covering areas such as weather and climate analysis, disaster +management, air quality monitoring, agriculture, land cover classification, +hydrological process modeling, and urban modeling. The rationale behind +adopting GNNs in these fields is explained, alongside methodologies for +organizing graphs and designing favorable architectures for various tasks. +Furthermore, we highlight methodological challenges of implementing GNNs in +these domains and possible solutions that could guide future research. While +acknowledging that GNNs are not a universal solution, we conclude the paper by +comparing them with other popular architectures like transformers and analyzing +their potential synergies. + +
+
+ comment: Accepted for publication in Geoscience and Remote Sensing Magazine + (GRSM) +
+
+
+
+
+ + ♻ ☆ In-Context Translation: Towards Unifying Image Recognition, Processing, + and Generation + + +
+ We propose In-Context Translation (ICT), a general learning framework to +unify visual recognition (e.g., semantic segmentation), low-level image +processing (e.g., denoising), and conditional image generation (e.g., +edge-to-image synthesis). Thanks to unification, ICT significantly reduces the +inherent inductive bias that comes with designing models for specific tasks, +and it maximizes mutual enhancement across similar tasks. However, the +unification across a large number of tasks is non-trivial due to various data +formats and training pipelines. To this end, ICT introduces two designs. +Firstly, it standardizes input-output data of different tasks into RGB image +pairs, e.g., semantic segmentation data pairs an RGB image with its +segmentation mask in the same RGB format. This turns different tasks into a +general translation task between two RGB images. Secondly, it standardizes the +training of different tasks into a general in-context learning, where +"in-context" means the input comprises an example input-output pair of the +target task and a query image. The learning objective is to generate the +"missing" data paired with the query. The implicit translation process is thus +between the query and the generated image. In experiments, ICT unifies ten +vision tasks and showcases impressive performance on their respective +benchmarks. Notably, ICT performs well across three major categories of +computer vision tasks, while its two competitors (Painter and PromptDiffusion) +are only effective in at most two of these task categories. In addition, +compared to its competitors, ICT trained on only 4 RTX 3090 GPUs is shown to be +more efficient and less costly in training. + +
+
+
+
+
+ + ♻ ☆ Harnessing Webpage UIs for Text-Rich Visual Understanding + + +
+ Text-rich visual understanding-the ability to process environments where +dense textual content is integrated with visuals-is crucial for multimodal +large language models (MLLMs) to interact effectively with structured +environments. To enhance this capability, we propose synthesizing general +multimodal instructions from webpage UIs using text-based large language models +(LLMs). Despite lacking direct visual input, text-based LLMs are able to +process structured text representations from webpage accessibility trees. These +instructions are then paired with UI screenshots to train multimodal models. We +introduce MultiUI, a dataset containing 7.3 million samples from 1 million +websites, covering diverse multimodal tasks and UI layouts. Models trained on +MultiUI not only excel in web UI tasks-achieving up to a 48% improvement on +VisualWebBench and a 19.1% boost in element accuracy on a web agent dataset +Mind2Web-but also generalize surprisingly well to non-web UI tasks and even to +non-UI domains, such as document understanding, OCR, and chart interpretation. +These results highlight the broad applicability of web UI data for advancing +text-rich visual understanding across various scenarios. + +
+
+
+
+
+ + ♻ ☆ Multi-label Cluster Discrimination for Visual Representation Learning ECCV2024 + + +
+ Contrastive Language Image Pre-training (CLIP) has recently demonstrated +success across various tasks due to superior feature representation empowered +by image-text contrastive learning. However, the instance discrimination method +used by CLIP can hardly encode the semantic structure of training data. To +handle this limitation, cluster discrimination has been proposed through +iterative cluster assignment and classification. Nevertheless, most cluster +discrimination approaches only define a single pseudo-label for each image, +neglecting multi-label signals in the image. In this paper, we propose a novel +Multi-Label Cluster Discrimination method named MLCD to enhance representation +learning. In the clustering step, we first cluster the large-scale LAION-400M +dataset into one million centers based on off-the-shelf embedding features. +Considering that natural images frequently contain multiple visual objects or +attributes, we select the multiple closest centers as auxiliary class labels. +In the discrimination step, we design a novel multi-label classification loss, +which elegantly separates losses from positive classes and negative classes, +and alleviates ambiguity on decision boundary. We validate the proposed +multi-label cluster discrimination method with experiments on different scales +of models and pre-training datasets. Experimental results show that our method +achieves state-of-the-art performance on multiple downstream tasks including +linear probe, zero-shot classification, and image-text retrieval. Code and +models have been released at https://github.com/deepglint/unicom . + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ CPnP: Consistent Pose Estimator for Perspective-n-Point Problem with + Bias Elimination + + +
+ The Perspective-n-Point (PnP) problem has been widely studied in both +computer vision and photogrammetry societies. With the development of feature +extraction techniques, a large number of feature points might be available in a +single shot. It is promising to devise a consistent estimator, i.e., the +estimate can converge to the true camera pose as the number of points +increases. To this end, we propose a consistent PnP solver, named \emph{CPnP}, +with bias elimination. Specifically, linear equations are constructed from the +original projection model via measurement model modification and variable +elimination, based on which a closed-form least-squares solution is obtained. +We then analyze and subtract the asymptotic bias of this solution, resulting in +a consistent estimate. Additionally, Gauss-Newton (GN) iterations are executed +to refine the consistent solution. Our proposed estimator is efficient in terms +of computations -- it has $O(n)$ computational complexity. Experimental tests +on both synthetic data and real images show that our proposed estimator is +superior to some well-known ones for images with dense visual features, in +terms of estimation precision and computing time. + +
+
+
+
+
+ + ♻ ☆ VHM: Versatile and Honest Vision Language Model for Remote Sensing Image + Analysis + + +
+ This paper develops a Versatile and Honest vision language Model (VHM) for
+remote sensing image analysis. VHM is built on a large-scale remote sensing
+image-text dataset with rich-content captions (VersaD), and an honest
+instruction dataset comprising both factual and deceptive questions (HnstD).
+Unlike prevailing remote sensing image-text datasets, in which image captions
+focus on a few prominent objects and their relationships, VersaD captions
+provide detailed information about image properties, object attributes, and the
+overall scene. This comprehensive captioning enables VHM to thoroughly
+understand remote sensing images and perform diverse remote sensing tasks.
+Moreover, different from existing remote sensing instruction datasets that only
+include factual questions, HnstD contains additional deceptive questions
+stemming from the non-existence of objects. This feature prevents VHM from
+producing affirmative answers to nonsense queries, thereby ensuring its
+honesty. In our experiments, VHM significantly outperforms various vision
+language models on common tasks of scene classification, visual question
+answering, and visual grounding. Additionally, VHM achieves competent
+performance on several unexplored tasks, such as building vectorization,
+multi-label classification, and honest question answering. We will release the
+code, data and model weights at https://github.com/opendatalab/VHM.
+
+
+ comment: Equal contribution: Chao Pang, Xingxing Weng, Jiang Wu; Corresponding + author: Gui-Song Xia, Conghui He +
+
+
+
+
+ + ♻ ☆ FedFMS: Exploring Federated Foundation Models for Medical Image + Segmentation + + +
+ Medical image segmentation is crucial for clinical diagnosis. The +Segmentation Anything Model (SAM) serves as a powerful foundation model for +visual segmentation and can be adapted for medical image segmentation. However, +medical imaging data typically contain privacy-sensitive information, making it +challenging to train foundation models with centralized storage and sharing. To +date, there are few foundation models tailored for medical image deployment +within the federated learning framework, and the segmentation performance, as +well as the efficiency of communication and training, remain unexplored. In +response to these issues, we developed Federated Foundation models for Medical +image Segmentation (FedFMS), which includes the Federated SAM (FedSAM) and a +communication and training-efficient Federated SAM with Medical SAM Adapter +(FedMSA). Comprehensive experiments on diverse datasets are conducted to +investigate the performance disparities between centralized training and +federated learning across various configurations of FedFMS. The experiments +revealed that FedFMS could achieve performance comparable to models trained via +centralized training methods while maintaining privacy. Furthermore, FedMSA +demonstrated the potential to enhance communication and training efficiency. +Our model implementation codes are available at +https://github.com/LIU-YUXI/FedFMS. + +
+
+ comment: Accepted by MICCAI'2024 +
+
+
+
+
+ + ♻ ☆ Digi2Real: Bridging the Realism Gap in Synthetic Data Face Recognition + via Foundation Models + + +
+ The accuracy of face recognition systems has improved significantly in the +past few years, thanks to the large amount of data collected and the +advancement in neural network architectures. However, these large-scale +datasets are often collected without explicit consent, raising ethical and +privacy concerns. To address this, there have been proposals to use synthetic +datasets for training face recognition models. Yet, such models still rely on +real data to train the generative models and generally exhibit inferior +performance compared to those trained on real datasets. One of these datasets, +DigiFace, uses a graphics pipeline to generate different identities and +different intra-class variations without using real data in training the +models. However, the performance of this approach is poor on face recognition +benchmarks, possibly due to the lack of realism in the images generated from +the graphics pipeline. In this work, we introduce a novel framework for realism +transfer aimed at enhancing the realism of synthetically generated face images. +Our method leverages the large-scale face foundation model, and we adapt the +pipeline for realism enhancement. By integrating the controllable aspects of +the graphics pipeline with our realism enhancement technique, we generate a +large amount of realistic variations-combining the advantages of both +approaches. Our empirical evaluations demonstrate that models trained using our +enhanced dataset significantly improve the performance of face recognition +systems over the baseline. The source code and datasets will be made available +publicly: https://www.idiap.ch/paper/digi2real + +
+
+ comment: The dataset would be available here: + https://www.idiap.ch/paper/digi2real +
+
+
+
+
+ + ♻ ☆ A Data Perspective on Enhanced Identity Preservation for Diffusion + Personalization + + +
+ Large text-to-image models have revolutionized the ability to generate +imagery using natural language. However, particularly unique or personal visual +concepts, such as pets and furniture, will not be captured by the original +model. This has led to interest in how to personalize a text-to-image model. +Despite significant progress, this task remains a formidable challenge, +particularly in preserving the subject's identity. Most researchers attempt to +address this issue by modifying model architectures. These methods are capable +of keeping the subject structure and color but fail to preserve identity +details. Towards this issue, our approach takes a data-centric perspective. We +introduce a novel regularization dataset generation strategy on both the text +and image level. This strategy enables the model to preserve fine details of +the desired subjects, such as text and logos. Our method is +architecture-agnostic and can be flexibly applied on various text-to-image +models. We show on established benchmarks that our data-centric approach forms +the new state of the art in terms of identity preservation and text alignment. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ Utilizing Large Language Models in an iterative paradigm with Domain + feedback for Zero-shot Molecule optimization + + +
+ Molecule optimization is a critical task in drug discovery to optimize
+desired properties of a given molecule through chemical modification. Although
+Large Language Models (LLMs) hold the potential to efficiently simulate this
+task by using natural language to direct the optimization, straightforwardly
+utilizing them shows limited performance. In this work, we facilitate utilizing
+LLMs in an iterative paradigm by proposing a simple yet highly effective domain
+feedback provider, namely $\text{Re}^3$DF. In detail, $\text{Re}^3$DF harnesses
+an external toolkit, RDKit, to handle molecule hallucination when the
+modified molecule is chemically invalid. Otherwise, its desired properties are
+computed and compared to the original one, establishing reliable domain
+feedback with correct direction and distance towards the objective, followed by
+a retrieved example, to explicitly guide the LLM to refine the modified
+molecule. We conduct experiments across both single- and multi-property
+objectives with 2 thresholds, where $\text{Re}^3$DF shows significant
+improvements. Particularly, for 20 single-property objectives, $\text{Re}^3$DF
+enhances Hit ratio by 16.95% and 20.76% under loose and strict thresholds,
+respectively. For 32 multi-property objectives, $\text{Re}^3$DF enhances Hit
+ratio by 6.04% and 5.25%.
+
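The kind of validity check and property feedback described above can be sketched with RDKit as follows; the choice of logP as the property and the wording of the feedback are illustrative assumptions, not the $\text{Re}^3$DF implementation.

```python
from rdkit import Chem
from rdkit.Chem import Descriptors

def domain_feedback(original_smiles, modified_smiles):
    # Reject hallucinated molecules that are not chemically valid.
    mol = Chem.MolFromSmiles(modified_smiles)
    if mol is None:
        return "invalid: the modified SMILES does not parse into a valid molecule"
    # Otherwise report how an example property (logP here) moved relative to the original.
    ref = Chem.MolFromSmiles(original_smiles)
    delta = Descriptors.MolLogP(mol) - Descriptors.MolLogP(ref)
    return f"valid: logP changed by {delta:+.2f} relative to the original molecule"

print(domain_feedback("CCO", "CC(C)O"))  # e.g., ethanol -> isopropanol
```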
+
+
+
+
+ + ♻ ☆ Generalizing Alignment Paradigm of Text-to-Image Generation with + Preferences through $f$-divergence Minimization + + +
+ Direct Preference Optimization (DPO) has recently expanded its successful +application from aligning large language models (LLMs) to aligning +text-to-image models with human preferences, which has generated considerable +interest within the community. However, we have observed that these approaches +rely solely on minimizing the reverse Kullback-Leibler divergence during +alignment process between the fine-tuned model and the reference model, +neglecting the incorporation of other divergence constraints. In this study, we +focus on extending reverse Kullback-Leibler divergence in the alignment +paradigm of text-to-image models to $f$-divergence, which aims to garner better +alignment performance as well as good generation diversity. We provide the +generalized formula of the alignment paradigm under the $f$-divergence +condition and thoroughly analyze the impact of different divergence constraints +on alignment process from the perspective of gradient fields. We conduct +comprehensive evaluation on image-text alignment performance, human value +alignment performance and generation diversity performance under different +divergence constraints, and the results indicate that alignment based on +Jensen-Shannon divergence achieves the best trade-off among them. The option of +divergence employed for aligning text-to-image models significantly impacts the +trade-off between alignment performance (especially human value alignment) and +generation diversity, which highlights the necessity of selecting an +appropriate divergence for practical applications. + +
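For reference, the general $f$-divergence the abstract appeals to can be written as below (standard notation, not necessarily the paper's); choosing $f(t) = t\log t$ recovers the forward KL divergence, $f(t) = -\log t$ the reverse KL divergence used in standard DPO-style alignment, and an appropriate $f$ yields the Jensen-Shannon divergence.

```latex
D_f(P \,\|\, Q) \;=\; \int q(x)\, f\!\left(\frac{p(x)}{q(x)}\right) \mathrm{d}x,
\qquad f \text{ convex},\; f(1) = 0 .
```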
+
+ comment: 34 pages +
+
+
+
+
+ + ♻ ☆ Hierarchical Temporal Context Learning for Camera-based Semantic Scene + Completion ECCV 2024 + + +
+ Camera-based 3D semantic scene completion (SSC) is pivotal for predicting
+complicated 3D layouts with limited 2D image observations. The existing
+mainstream solutions generally leverage temporal information by roughly
+stacking history frames to supplement the current frame; such straightforward
+temporal modeling inevitably diminishes valid clues and increases learning
+difficulty. To address this problem, we present HTCL, a novel Hierarchical
+Temporal Context Learning paradigm for improving camera-based semantic scene
+completion. The primary innovation of this work involves decomposing temporal
+context learning into two hierarchical steps: (a) cross-frame affinity
+measurement and (b) affinity-based dynamic refinement. Firstly, to separate
+critical relevant context from redundant information, we introduce the pattern
+affinity with scale-aware isolation and multiple independent learners for
+fine-grained contextual correspondence modeling. Subsequently, to dynamically
+compensate for incomplete observations, we adaptively refine the feature
+sampling locations based on initially identified locations with high affinity
+and their neighboring relevant regions. Our method ranks $1^{st}$ on the
+SemanticKITTI benchmark and even surpasses LiDAR-based methods in terms of mIoU
+on the OpenOccupancy benchmark. Our code is available at
+https://github.com/Arlo0o/HTCL.
+
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ TalkMosaic: Interactive PhotoMosaic with Multi-modal LLM Q&A + Interactions + + +
+ We use images of a wide variety of cars to compose an image of an animal,
+such as a bird or a lion, on the theme of environmental protection, maximizing
+the information about cars in a single composed image and raising awareness of
+environmental challenges. We present a novel way of interacting with an
+artistically composed photomosaic image, in which a simple "click and display"
+operation demonstrates the interactive switch between a tile image in the
+photomosaic and the corresponding original car image, which is automatically
+saved to the Desktop. We build a multimodal custom GPT named TalkMosaic by
+incorporating car image information and the related knowledge into ChatGPT. By
+uploading the original car image to TalkMosaic, we can ask questions about the
+given car image and get the corresponding answers efficiently and effectively,
+such as where to buy a tire for the car in the image that satisfies high
+environmental standards. We give an in-depth analysis of how to speed up the
+inference of multimodal LLMs using sparse attention and quantization techniques
+with the presented probabilistic FlashAttention (PrFlashAttention) and
+Staircase Adaptive Quantization (SAQ) methods. The implemented prototype
+demonstrates the feasibility and effectiveness of the presented approach.
+
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Revisiting Surgical Instrument Segmentation Without Human Intervention: + A Graph Partitioning View ACM MM 2024 + + +
+ Surgical instrument segmentation (SIS) on endoscopic images stands as a
+long-standing and essential task in the context of computer-assisted
+interventions for boosting minimally invasive surgery. Given the recent surge
+of deep learning methodologies and their data-hungry nature, training a neural
+predictive model based on massive expert-curated annotations has been the
+dominant, off-the-shelf approach in the field, which could, however, impose a
+prohibitive burden on clinicians for preparing fine-grained pixel-wise labels
+corresponding to the collected surgical video frames. In this work, we propose
+an unsupervised method by reframing the video frame segmentation as a graph
+partitioning problem and regarding image pixels as graph nodes, which is
+significantly different from the previous efforts. A self-supervised
+pre-trained model is first leveraged as a feature extractor to capture
+high-level semantic features. Then, Laplacian matrices are computed from the
+features and eigendecomposed for graph partitioning. On the "deep"
+eigenvectors, a surgical video frame is meaningfully segmented into different
+modules such as tools and tissues, providing distinguishable semantic
+information like locations, classes, and relations. The segmentation problem
+can then be naturally tackled by applying clustering or thresholding to the
+eigenvectors. Extensive experiments are conducted on various datasets (e.g.,
+EndoVis2017, EndoVis2018, UCL, etc.) for different clinical endpoints. Across
+all the challenging scenarios, our method demonstrates outstanding performance
+and robustness, exceeding unsupervised state-of-the-art (SOTA) methods. The
+code is released at https://github.com/MingyuShengSMY/GraphClusteringSIS.git.
+
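The graph-partitioning view described above follows the familiar spectral-clustering recipe. The sketch below (affinity from deep features, normalized Laplacian, eigendecomposition, clustering of the leading eigenvectors) is a generic illustration with a Gaussian affinity and k-means, not the authors' exact pipeline, and assumes a modest number of patch-level features per frame.

```python
import numpy as np
from sklearn.cluster import KMeans

def partition_frame(features, n_segments=4, sigma=1.0):
    # features: (num_nodes, dim) deep features, one node per pixel/patch of a frame.
    d2 = ((features[:, None, :] - features[None, :, :]) ** 2).sum(-1)
    W = np.exp(-d2 / (2.0 * sigma ** 2))              # pairwise affinity (graph edges)
    d = W.sum(axis=1)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(d + 1e-12))
    L = np.eye(len(W)) - D_inv_sqrt @ W @ D_inv_sqrt  # normalized graph Laplacian
    _, eigvecs = np.linalg.eigh(L)                    # spectral ("deep") embedding
    embedding = eigvecs[:, 1:n_segments + 1]          # skip the trivial first eigenvector
    return KMeans(n_clusters=n_segments, n_init=10).fit_predict(embedding)
```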
+
+ comment: This paper is accepted by The 32nd ACM International Conference on + Multimedia (ACM MM 2024) Workshop on Multimedia Computing for Health and + Medicine (MCHM) +
+
+
+
+
+ + ♻ ☆ Identifying and Solving Conditional Image Leakage in Image-to-Video + Diffusion Model NeurIPS 2024 + + +
+ Diffusion models have obtained substantial progress in image-to-video +generation. However, in this paper, we find that these models tend to generate +videos with less motion than expected. We attribute this to the issue called +conditional image leakage, where the image-to-video diffusion models (I2V-DMs) +tend to over-rely on the conditional image at large time steps. We further +address this challenge from both inference and training aspects. First, we +propose to start the generation process from an earlier time step to avoid the +unreliable large-time steps of I2V-DMs, as well as an initial noise +distribution with optimal analytic expressions (Analytic-Init) by minimizing +the KL divergence between it and the actual marginal distribution to bridge the +training-inference gap. Second, we design a time-dependent noise distribution +(TimeNoise) for the conditional image during training, applying higher noise +levels at larger time steps to disrupt it and reduce the model's dependency on +it. We validate these general strategies on various I2V-DMs on our collected +open-domain image benchmark and the UCF101 dataset. Extensive results show that +our methods outperform baselines by producing higher motion scores with lower +errors while maintaining image alignment and temporal consistency, thereby +yielding superior overall performance and enabling more accurate motion +control. The project page: \url{https://cond-image-leak.github.io/}. + +
+
+ comment: NeurIPS 2024. Project page: https://cond-image-leak.github.io/ +
+
+
+
+
+ + ♻ ☆ AIWR: Aerial Image Water Resource Dataset for Segmentation Analysis + + +
+ Effective water resource management is crucial in agricultural regions like
+northeastern Thailand, where limited water retention in sandy soils poses
+significant challenges. In response to this issue, the Aerial Image Water
+Resource (AIWR) dataset was developed, comprising 800 aerial images focused on
+natural and artificial water bodies in this region. The dataset was created
+using Bing Maps and follows the standards of the Fundamental Geographic Data
+Set (FGDS). It includes ground truth annotations validated by experts in remote
+sensing, making it an invaluable resource for researchers in geoinformatics,
+computer vision, and artificial intelligence. The AIWR dataset presents
+considerable segmentation challenges due to variations in the size, color, and
+shape of water bodies, which often resemble other land use categories. The
+objective of the proposed dataset is to explore advanced AI-driven methods for
+water body segmentation, addressing the unique challenges posed by the
+dataset's complexity and limited size. This dataset and related research
+contribute to the development of novel algorithms for water management,
+supporting sustainable agricultural practices in regions facing similar
+challenges.
+
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Advancing Efficient Brain Tumor Multi-Class Classification -- New + Insights from the Vision Mamba Model in Transfer Learning + + +
+ Early and accurate diagnosis of brain tumors is crucial for improving patient +survival rates. However, the detection and classification of brain tumors are +challenging due to their diverse types and complex morphological +characteristics. This study investigates the application of pre-trained models +for brain tumor classification, with a particular focus on deploying the Mamba +model. We fine-tuned several mainstream transfer learning models and applied +them to the multi-class classification of brain tumors. By comparing these +models to those trained from scratch, we demonstrated the significant +advantages of transfer learning, especially in the medical imaging field, where +annotated data is often limited. Notably, we introduced the Vision Mamba (Vim), +a novel network architecture, and applied it for the first time in brain tumor +classification, achieving exceptional classification accuracy. Experimental +results indicate that the Vim model achieved 100% classification accuracy on an +independent test set, emphasizing its potential for tumor classification tasks. +These findings underscore the effectiveness of transfer learning in brain tumor +classification and reveal that, compared to existing state-of-the-art models, +the Vim model is lightweight, efficient, and highly accurate, offering a new +perspective for clinical applications. Furthermore, the framework proposed in +this study for brain tumor classification, based on transfer learning and the +Vision Mamba model, is broadly applicable to other medical imaging +classification problems. + +
+
+
+
+
+ + ♻ ☆ D2SP: Dynamic Dual-Stage Purification Framework for Dual Noise + Mitigation in Vision-based Affective Recognition + + +
+ The contemporary state-of-the-art of Dynamic Facial Expression Recognition
+(DFER) technology facilitates remarkable progress by deriving emotional
+mappings of facial expressions from video content, underpinned by training on
+voluminous datasets. Yet, the DFER datasets encompass a substantial volume of
+noise data. Noise arises from low-quality captures that defy logical labeling,
+and instances that suffer from mislabeling due to annotation bias, engendering
+two principal types of uncertainty: the uncertainty regarding data usability
+and the uncertainty concerning label reliability. Addressing the two types of
+uncertainty, we have meticulously crafted a two-stage framework aiming at
+\textbf{S}eeking \textbf{C}ertain data \textbf{I}n extensive \textbf{U}ncertain
+data (SCIU). This initiative aims to purge the DFER datasets of these
+uncertainties, thereby ensuring that only clean, verified data is employed in
+training processes. To mitigate the issue of low-quality samples, we introduce
+the Coarse-Grained Pruning (CGP) stage, which assesses sample weights and
+prunes those deemed unusable due to their low weight. For samples with
+incorrect annotations, the Fine-Grained Correction (FGC) stage evaluates
+prediction stability to rectify mislabeled data. Moreover, SCIU is conceived as
+a universally compatible, plug-and-play framework, tailored to integrate
+seamlessly with prevailing DFER methodologies. Rigorous experiments across
+prevalent DFER datasets and against numerous benchmark methods substantiate
+SCIU's capacity to markedly elevate performance metrics.
+
+
+
+
+
+ + ♻ ☆ Document Parsing Unveiled: Techniques, Challenges, and Prospects for + Structured Information Extraction + + +
+ Document parsing is essential for converting unstructured and semi-structured
+documents-such as contracts, academic papers, and invoices-into structured,
+machine-readable data. Document parsing extracts reliable structured data from
+unstructured inputs, providing great convenience for numerous applications.
+Especially with recent achievements in Large Language Models, document parsing
+plays an indispensable role in both knowledge base construction and training
+data generation. This survey presents a comprehensive review of the current
+state of document parsing, covering key methodologies, from modular pipeline
+systems to end-to-end models driven by large vision-language models. Core
+components such as layout detection, content extraction (including text,
+tables, and mathematical expressions), and multi-modal data integration are
+examined in detail. Additionally, this paper discusses the challenges faced by
+modular document parsing systems and vision-language models in handling complex
+layouts, integrating multiple modules, and recognizing high-density text. It
+emphasizes the importance of developing larger and more diverse datasets and
+outlines future research directions.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 48 + +
+
+
+ + ☆ Monocular Event-Based Vision for Obstacle Avoidance with a Quadrotor + + +
+ We present the first static-obstacle avoidance method for quadrotors using +just an onboard, monocular event camera. Quadrotors are capable of fast and +agile flight in cluttered environments when piloted manually, but vision-based +autonomous flight in unknown environments is difficult in part due to the +sensor limitations of traditional onboard cameras. Event cameras, however, +promise nearly zero motion blur and high dynamic range, but produce a very +large volume of events under significant ego-motion and further lack a +continuous-time sensor model in simulation, making direct sim-to-real transfer +not possible. By leveraging depth prediction as a pretext task in our learning +framework, we can pre-train a reactive obstacle avoidance events-to-control +policy with approximated, simulated events and then fine-tune the perception +component with limited events-and-depth real-world data to achieve obstacle +avoidance in indoor and outdoor settings. We demonstrate this across two +quadrotor-event camera platforms in multiple settings and find, contrary to +traditional vision-based works, that low speeds (1m/s) make the task harder and +more prone to collisions, while high speeds (5m/s) result in better event-based +depth estimation and avoidance. We also find that success rates in outdoor +scenes can be significantly higher than in certain indoor scenes. + +
+
+ comment: 18 pages with supplementary +
+
+
+
+
+ + ☆ Out-of-Distribution Recovery with Object-Centric Keypoint Inverse Policy + For Visuomotor Imitation Learning + + +
+ We propose an object-centric recovery policy framework to address the +challenges of out-of-distribution (OOD) scenarios in visuomotor policy +learning. Previous behavior cloning (BC) methods rely heavily on a large amount +of labeled data coverage, failing in unfamiliar spatial states. Without relying +on extra data collection, our approach learns a recovery policy constructed by +an inverse policy inferred from object keypoint manifold gradient in the +original training data. The recovery policy serves as a simple add-on to any +base visuomotor BC policy, agnostic to a specific method, guiding the system +back towards the training distribution to ensure task success even in OOD +situations. We demonstrate the effectiveness of our object-centric framework in +both simulation and real robot experiments, achieving an improvement of +$\textbf{77.7\%}$ over the base policy in OOD. Project Website: +https://sites.google.com/view/ocr-penn + +
+
+
+
+
+ + ☆ Data-Driven Sampling Based Stochastic MPC for Skid-Steer Mobile Robot + Navigation ICRA 2025 + + +
+ Traditional approaches to motion modeling for skid-steer robots struggle with
+capturing nonlinear tire-terrain dynamics, especially during high-speed
+maneuvers. In this paper, we tackle such nonlinearities by enhancing a dynamic
+unicycle model with Gaussian Process (GP) regression outputs. This enables us
+to develop an adaptive, uncertainty-informed navigation formulation. We solve
+the resultant stochastic optimal control problem using a chance-constrained
+Model Predictive Path Integral (MPPI) control method. This approach formulates
+both obstacle avoidance and path-following as chance constraints, accounting
+for residual uncertainties from the GP to ensure safety and reliability in
+control. Leveraging GPU acceleration, we efficiently manage the non-convex
+nature of the problem, ensuring real-time performance. Our approach unifies
+path-following and obstacle avoidance across different terrains, unlike prior
+works which typically focus on one or the other. We compare our GP-MPPI method
+against unicycle and data-driven kinematic models within the MPPI framework. In
+simulations, our approach shows superior tracking accuracy and obstacle
+avoidance. We further validate our approach through hardware experiments on a
+skid-steer robot platform, demonstrating its effectiveness in high-speed
+navigation. The GPU implementation of the proposed method and supplementary
+video footage are available at https://stochasticmppi.github.io.
+
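For context, a bare-bones MPPI update (without the GP residual model or the chance constraints described above) looks roughly like the following sketch; the dynamics and cost callables, sample count, and temperature are placeholders.

```python
import numpy as np

def mppi_step(x0, u_nominal, dynamics, cost, n_samples=256, sigma=0.5, lam=1.0):
    # Perturb the nominal control sequence, roll out each sample, and reweight.
    horizon, u_dim = u_nominal.shape
    noise = sigma * np.random.randn(n_samples, horizon, u_dim)
    costs = np.zeros(n_samples)
    for k in range(n_samples):
        x = x0
        for t in range(horizon):
            u = u_nominal[t] + noise[k, t]
            x = dynamics(x, u)       # placeholder motion model (e.g., GP-corrected unicycle)
            costs[k] += cost(x, u)   # placeholder tracking/obstacle cost
    w = np.exp(-(costs - costs.min()) / lam)
    w /= w.sum()
    # Update the nominal controls with the cost-weighted average perturbation.
    return u_nominal + np.tensordot(w, noise, axes=1)
```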
+
+ comment: Currently under review for ICRA 2025 +
+
+
+
+
+ + ☆ The Future of Intelligent Healthcare: A Systematic Analysis and + Discussion on the Integration and Impact of Robots Using Large Language + Models for Healthcare + + +
+ The potential use of large language models (LLMs) in healthcare robotics can
+help address the significant demand put on healthcare systems around the world
+with respect to an aging demographic and a shortage of healthcare
+professionals. Even though LLMs have already been integrated into medicine to
+assist both clinicians and patients, the integration of LLMs within healthcare
+robots has not yet been explored for clinical settings. In this perspective
+paper, we investigate the groundbreaking developments in robotics and LLMs to
+uniquely identify the needed system requirements for designing health-specific,
+LLM-based robots in terms of multi-modal communication through human-robot
+interactions (HRIs), semantic reasoning, and task planning. Furthermore, we
+discuss the ethical issues, open challenges, and potential future research
+directions for this emerging innovative field.
+
+
+
+
+
+ + ☆ What Makes an Educational Robot Game Fun? Framework Analysis of + Children's Design Ideas + + +
+ Fun acts as a catalyst for learning by enhancing motivation, active
+engagement and knowledge retention. As social robots gain traction as
+educational tools, understanding how their unique affordances can be leveraged
+to cultivate fun becomes crucial. This research investigates the concept of fun
+in educational games involving social robots to support the design of REMind: a
+robot-mediated role-play game aimed at encouraging bystander intervention
+against peer bullying among children. To incorporate fun elements into the
+design of REMind, we conducted a user-centered Research through Design (RtD)
+study with focus groups of children to gain a deeper understanding of their
+perceptions of fun. We analyzed children's ideas by using Framework Analysis
+and leveraging LeBlanc's Taxonomy of Game Pleasures and identified 28 elements
+of fun that can be incorporated into robot-mediated games. We present our
+observations, discuss their impact on REMind's design, and offer
+recommendations for designing fun educational games using social robots.
+
+
+ comment: This is a pre-print of a manuscript that was accepted to + International Conference on Social Robotics 2024 (ICSR'24 + AI), 2024, which + was held in Odense, Denmark +
+
+
+
+
+ + ☆ Energy Consumption in Robotics: A Simplified Modeling Approach + + +
+ The energy use of a robot is trajectory-dependent, and thus can be reduced by +optimization of the trajectory. Current methods for robot trajectory +optimization can reduce energy up to 15\% for fixed start and end points, +however their use in industrial robot planning is still restricted due to model +complexity and lack of integration with planning tools which address other +concerns (e.g. collision avoidance). We propose an approach that uses +differentiable inertial and kinematic models from standard open-source tools, +integrating with standard ROS planning methods. An inverse dynamics-based +energy model is optionally extended with a single-parameter electrical model, +simplifying the model identification process. We compare the inertial and +electrical models on a collaborative robot, showing that simplified models +provide competitive accuracy and are easier to deploy in practice. + +
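One common way to write such a model (our assumption about the general form, not necessarily the paper's exact formulation) is mechanical energy from inverse dynamics plus a single resistive-loss parameter $k$:

```latex
E_{\mathrm{mech}} = \int_0^T \dot{q}(t)^{\top}\, \tau(t)\, \mathrm{d}t ,
\qquad
E_{\mathrm{elec}} = \int_0^T \Big( \dot{q}(t)^{\top}\, \tau(t) + k\, \lVert \tau(t) \rVert^2 \Big)\, \mathrm{d}t ,
```

where $\tau(t)$ is the joint torque returned by the inverse dynamics model for the planned trajectory $q(t)$, and $k$ is the single electrical parameter capturing resistive losses.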
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Energy-Aware Predictive Motion Planning for Autonomous Vehicles Using a + Hybrid Zonotope Constraint Representation + + +
+ Uncrewed aerial systems have tightly coupled energy and motion dynamics which +must be accounted for by onboard planning algorithms. This work proposes a +strategy for coupled motion and energy planning using model predictive control +(MPC). A reduced-order linear time-invariant model of coupled energy and motion +dynamics is presented. Constrained zonotopes are used to represent state and +input constraints, and hybrid zonotopes are used to represent non-convex +constraints tied to a map of the environment. The structures of these +constraint representations are exploited within a mixed-integer quadratic +program solver tailored to MPC motion planning problems. Results apply the +proposed methodology to coupled motion and energy utilization planning problems +for 1) a hybrid-electric vehicle that must restrict engine usage when flying +over regions with noise restrictions, and 2) an electric package delivery drone +that must track waysets with both position and battery state of charge +requirements. By leveraging the structure-exploiting solver, the proposed +mixed-integer MPC formulations can be implemented in real time. + +
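For readers unfamiliar with the set representations mentioned above, the commonly used definitions are (in generic notation, which may differ from the paper's): a constrained zonotope, and a hybrid zonotope whose additional binary generators make non-convex, mixed-integer-representable sets possible.

```latex
\mathcal{Z}_c = \left\{ c + G\xi \;:\; \lVert \xi \rVert_\infty \le 1,\; A\xi = b \right\},
\qquad
\mathcal{Z}_h = \left\{ c + G^{c}\xi^{c} + G^{b}\xi^{b} \;:\;
\lVert \xi^{c} \rVert_\infty \le 1,\;
\xi^{b} \in \{-1,1\}^{n_b},\;
A^{c}\xi^{c} + A^{b}\xi^{b} = b \right\}.
```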
+
+
+
+
+ + ☆ Developing Simulation Models for Soft Robotic Grippers in Webots + + +
+ Robotic simulators provide cost-effective and risk-free virtual environments +for studying robotic designs, control algorithms, and sensor integrations. They +typically host extensive libraries of sensors and actuators that facilitate +rapid prototyping and design evaluations in simulation. The use of the most +prominent existing robotic simulators is however limited to simulation of +rigid-link robots. On the other hand, there exist dedicated specialized +environments for simulating soft robots. This separation limits the study of +soft robotic systems, particularly in hybrid scenarios where soft and rigid +sub-systems co-exist. In this work, we develop a lightweight open-source +digital twin of a commercially available soft gripper, directly integrated +within the robotic simulator Webots. We use a Rigid-Link-Discretization (RLD) +model to simulate the soft gripper. Using a Particle Swarm Optimization (PSO) +approach, we identify the parameters of the RLD model based on the kinematics +and dynamics of the physical system and show the efficacy of our modeling +approach in validation experiments. All software and experimental details are +available on github: https://github.com/anonymousgituser1/Robosoft2025 + +
+
+ comment: 7 pages, 9 figures, 1 table +
+
+
+
+
+ + ☆ UNet: A Generic and Reliable Multi-UAV Communication and Networking + Architecture for Heterogeneous Applications + + +
+ The rapid growth of UAV applications necessitates a robust communication and
+networking architecture capable of addressing the diverse requirements of
+various applications concurrently, rather than relying on application-specific
+solutions. This paper proposes a generic and reliable multi-UAV communication
+and networking architecture designed to support the varying demands of
+heterogeneous applications, including short-range and long-range communication,
+star and mesh topologies, different data rates, and multiple wireless
+standards. Our architecture accommodates both ad hoc and infrastructure
+networks, ensuring seamless connectivity throughout the network. Additionally,
+we present the design of a multi-protocol UAV gateway that enables
+interoperability among various communication protocols. Furthermore, we
+introduce a data processing and service layer framework with a graphical user
+interface of a ground control station that facilitates remote control and
+monitoring from any location at any time. We practically implemented the
+proposed architecture and evaluated its performance using different metrics,
+demonstrating its effectiveness.
+
+
+ comment: 11 pages, 20 figures, Journal paper +
+
+
+
+
+ + ☆ Set-Membership Estimation for Fault Diagnosis of Nonlinear Systems + + +
+ This paper introduces a Fault Diagnosis (Detection, Isolation, and +Estimation) method using Set-Membership Estimation (SME) designed for a class +of nonlinear systems that are linear to the fault parameters. The methodology +advances fault diagnosis by continuously evaluating an estimate of the fault +parameter and a feasible parameter set where the true fault parameter belongs. +Unlike previous SME approaches, in this work, we address nonlinear systems +subjected to both input and output uncertainties by utilizing inclusion +functions and interval arithmetic. Additionally, we present an approach to +outer-approximate the polytopic description of the feasible parameter set by +effectively balancing approximation accuracy with computational efficiency +resulting in improved fault detectability. Lastly, we introduce adaptive +regularization of the parameter estimates to enhance the estimation process +when the input-output data are sparse or non-informative, enhancing fault +identifiability. We demonstrate the effectiveness of this method in simulations +involving an Autonomous Surface Vehicle in both a path-following and a +realistic collision avoidance scenario, underscoring its potential to enhance +safety and reliability in critical applications. + +
+
+
+
+
+ + ☆ Autonomous Decision Making for UAV Cooperative Pursuit-Evasion Game with + Reinforcement Learning + + +
+ The application of intelligent decision-making in unmanned aerial vehicles
+(UAVs) is increasing, and with the development of the UAV 1v1 pursuit-evasion
+game, the multi-UAV cooperative game has emerged as a new challenge. This paper
+proposes a deep reinforcement learning-based model for decision-making in the
+multi-role UAV cooperative pursuit-evasion game, to address the challenge of
+enabling UAVs to autonomously make decisions in complex game environments. In
+order to enhance the training efficiency of the reinforcement learning
+algorithm in a UAV pursuit-evasion game environment with a high-dimensional
+state-action space, this paper proposes a multi-environment asynchronous double
+deep Q-network with priority experience replay algorithm to effectively train
+the UAV's game policy. Furthermore, aiming to improve cooperation ability and
+task completion efficiency, as well as minimize the cost of UAVs in the
+pursuit-evasion game, this paper focuses on the allocation of roles and targets
+within the multi-UAV environment. Cooperative game decision models with varying
+numbers of UAVs are obtained by assigning diverse tasks and roles to the UAVs
+in different scenarios. The simulation results demonstrate that the proposed
+method enables autonomous decision-making of the UAVs in pursuit-evasion game
+scenarios and exhibits significant capabilities in cooperation.
+
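The two standard ingredients the training scheme above builds on can be summarized as follows (textbook forms, with the multi-environment asynchronous collection omitted): the double DQN target and prioritized sampling of transitions,

```latex
y_t = r_t + \gamma\, Q_{\theta^{-}}\!\big(s_{t+1},\, \arg\max_{a} Q_{\theta}(s_{t+1}, a)\big),
\qquad
P(i) = \frac{p_i^{\alpha}}{\sum_k p_k^{\alpha}}, \quad p_i = |\delta_i| + \epsilon ,
```

where $\theta^{-}$ are the target-network parameters and $\delta_i$ is the TD error of transition $i$.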
+
+ comment: 11 pages, 12 figures, 31 conference +
+
+
+
+
+ + ☆ Transformer-Based Fault-Tolerant Control for Fixed-Wing UAVs Using + Knowledge Distillation and In-Context Adaptation + + +
+ This study presents a transformer-based approach for fault-tolerant control +in fixed-wing Unmanned Aerial Vehicles (UAVs), designed to adapt in real time +to dynamic changes caused by structural damage or actuator failures. Unlike +traditional Flight Control Systems (FCSs) that rely on classical control theory +and struggle under severe alterations in dynamics, our method directly maps +outer-loop reference values -- altitude, heading, and airspeed -- into control +commands using the in-context learning and attention mechanisms of +transformers, thus bypassing inner-loop controllers and fault-detection layers. +Employing a teacher-student knowledge distillation framework, the proposed +approach trains a student agent with partial observations by transferring +knowledge from a privileged expert agent with full observability, enabling +robust performance across diverse failure scenarios. Experimental results +demonstrate that our transformer-based controller outperforms industry-standard +FCS and state-of-the-art reinforcement learning (RL) methods, maintaining high +tracking accuracy and stability in nominal conditions and extreme failure +cases, highlighting its potential for enhancing UAV operational safety and +reliability. + +
+
+
+
+
+ + ☆ Multi-modal NeRF Self-Supervision for LiDAR Semantic Segmentation IROS + + +
+ LiDAR Semantic Segmentation is a fundamental task in autonomous driving +perception consisting of associating each LiDAR point to a semantic label. +Fully-supervised models have widely tackled this task, but they require labels +for each scan, which either limits their domain or requires impractical amounts +of expensive annotations. Camera images, which are generally recorded alongside +LiDAR pointclouds, can be processed by the widely available 2D foundation +models, which are generic and dataset-agnostic. However, distilling knowledge +from 2D data to improve LiDAR perception raises domain adaptation challenges. +For example, the classical perspective projection suffers from the parallax +effect produced by the position shift between both sensors at their respective +capture times. We propose a Semi-Supervised Learning setup to leverage +unlabeled LiDAR pointclouds alongside distilled knowledge from the camera +images. To self-supervise our model on the unlabeled scans, we add an auxiliary +NeRF head and cast rays from the camera viewpoint over the unlabeled voxel +features. The NeRF head predicts densities and semantic logits at each sampled +ray location which are used for rendering pixel semantics. Concurrently, we +query the Segment-Anything (SAM) foundation model with the camera image to +generate a set of unlabeled generic masks. We fuse the masks with the rendered +pixel semantics from LiDAR to produce pseudo-labels that supervise the pixel +predictions. During inference, we drop the NeRF head and run our model with +only LiDAR. We show the effectiveness of our approach in three public LiDAR +Semantic Segmentation benchmarks: nuScenes, SemanticKITTI and ScribbleKITTI. + +
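The auxiliary NeRF head described above renders pixel semantics by compositing per-sample logits along each camera ray. The snippet below is a simplified, generic volume-rendering step for semantic logits; the sampling scheme, density activation, and tensor shapes are assumptions, not the paper's implementation.

```python
import torch

def render_pixel_semantics(densities, logits, deltas):
    """Composite per-sample semantic logits along each ray.

    densities: (R, S)    non-negative volume densities at S samples per ray
    logits:    (R, S, C) semantic logits predicted at each sample
    deltas:    (R, S)    distances between consecutive samples
    returns:   (R, C)    rendered per-pixel semantic logits
    """
    alpha = 1.0 - torch.exp(-densities * deltas)                 # (R, S)
    # Transmittance: probability the ray reaches sample i unoccluded.
    trans = torch.cumprod(
        torch.cat([torch.ones_like(alpha[:, :1]), 1.0 - alpha + 1e-10], dim=1),
        dim=1)[:, :-1]
    weights = alpha * trans                                      # (R, S)
    return (weights.unsqueeze(-1) * logits).sum(dim=1)           # (R, C)

# Example shapes: 1024 rays, 64 samples per ray, 16 classes.
R, S, C = 1024, 64, 16
sem = render_pixel_semantics(torch.rand(R, S), torch.randn(R, S, C),
                             torch.full((R, S), 0.05))
print(sem.shape)  # torch.Size([1024, 16])
```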
+
+ comment: IEEE/RSJ International Conference on Intelligent Robots and Systems + (IROS) 2024 +
+
+
+
+
+ + ☆ Multi-Modal 3D Scene Graph Updater for Shared and Dynamic Environments + + +
+ The advent of generalist Large Language Models (LLMs) and Large Vision Models
+(VLMs) has streamlined the construction of semantically enriched maps that can
+enable robots to ground high-level reasoning and planning into their
+representations. One of the most widely used semantic map formats is the 3D
+Scene Graph, which captures both metric (low-level) and semantic (high-level)
+information. However, these maps often assume a static world, while real
+environments, like homes and offices, are dynamic. Even small changes in these
+spaces can significantly impact task performance. To integrate robots into
+dynamic environments, they must detect changes and update the scene graph in
+real-time. This update process is inherently multimodal, requiring input from
+various sources, such as human agents, the robot's own perception system, time,
+and its actions. This work proposes a framework that leverages these multimodal
+inputs to maintain the consistency of scene graphs during real-time operation,
+presenting promising initial results and outlining a roadmap for future
+research.
+
+
+ comment: This paper has been accepted at the Workshop on Lifelong Learning for + Home Robots at the 8th Conference on Robot Learning (CoRL 2024), Munich, + Germany +
+
+
+
+
+ + ☆ Nature's All-in-One: Multitasking Robots Inspired by Dung Beetles + + +
+ Dung beetles impressively coordinate their six legs simultaneously to
+effectively roll large dung balls. They are also capable of rolling dung balls
+of varying weight on different terrains. The mechanisms underlying how
+their motor commands are adapted to walk and simultaneously roll balls
+(multitasking behavior) under different conditions remain unknown. Therefore,
+this study unravels the mechanisms of how dung beetles roll dung balls and
+adapt their leg movements to stably roll balls over different terrains for
+multitasking robots. We synthesize a modular neural-based loco-manipulation
+control inspired by and based on ethological observations of the ball-rolling
+behavior of dung beetles. The proposed neural-based control contains various
+neural modules, including a central pattern generator (CPG) module, a pattern
+formation network (PFN) module, and a robot orientation control (ROC) module.
+The integrated neural control mechanisms can successfully control a dung
+beetle-like robot (ALPHA) with biomechanical feet to perform adaptive robust
+(multitasking) loco-manipulation (walking and ball-rolling) on various terrains
+(flat and uneven). It can also deal with different ball weights (2.0 and 4.6
+kg) and ball types (soft and rigid). The control mechanisms can serve as
+guiding principles for solving complex sensory-motor coordination for
+multitasking robots. Furthermore, this study contributes to biological research
+by enhancing our scientific understanding of sensory-motor coordination for
+complex adaptive (multitasking) loco-manipulation behavior in animals.
+
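To give a flavor of the CPG module mentioned above, a widely used building block in neural locomotion control is the two-neuron SO(2)-type oscillator: a scaled rotation matrix passed through tanh that produces two phase-shifted rhythmic signals. The parameters below are illustrative, not the paper's.

```python
import numpy as np

def so2_cpg(steps=200, phi=0.2, alpha=1.01):
    """Two-neuron SO(2)-type oscillator: a 2x2 rotation scaled by alpha > 1,
    passed through tanh, yields a stable quasi-sinusoidal rhythm."""
    W = alpha * np.array([[np.cos(phi), np.sin(phi)],
                          [-np.sin(phi), np.cos(phi)]])
    o = np.array([0.1, 0.0])          # small non-zero initial state
    outputs = []
    for _ in range(steps):
        o = np.tanh(W @ o)
        outputs.append(o.copy())
    return np.array(outputs)          # two phase-shifted rhythmic signals

rhythm = so2_cpg()
print(rhythm[:5])
```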
+
+
+
+
+ + ☆ When to Localize? A Risk-Constrained Reinforcement Learning Approach + + +
+ In a standard navigation pipeline, a robot localizes at every time step to +lower navigational errors. However, in some scenarios, a robot needs to +selectively localize when it is expensive to obtain observations. For example, +an underwater robot surfacing to localize too often hinders it from searching +for critical items underwater, such as black boxes from crashed aircraft. On +the other hand, if the robot never localizes, poor state estimates cause +failure to find the items due to inadvertently leaving the search area or +entering hazardous, restricted areas. Motivated by these scenarios, we +investigate approaches to help a robot determine "when to localize?" We +formulate this as a bi-criteria optimization problem: minimize the number of +localization actions while ensuring the probability of failure (due to +collision or not reaching a desired goal) remains bounded. In recent work, we +showed how to formulate this active localization problem as a constrained +Partially Observable Markov Decision Process (POMDP), which was solved using an +online POMDP solver. However, this approach is too slow and requires full +knowledge of the robot transition and observation models. In this paper, we +present RiskRL, a constrained Reinforcement Learning (RL) framework that +overcomes these limitations. RiskRL uses particle filtering and recurrent Soft +Actor-Critic network to learn a policy that minimizes the number of +localizations while ensuring the probability of failure constraint is met. Our +numerical experiments show that RiskRL learns a robust policy that outperforms +the baseline by at least 13% while also generalizing to unseen environments. + +
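RiskRL is described as combining particle filtering with a recurrent Soft Actor-Critic policy under a failure-probability constraint. The sketch below shows only the generic bootstrap particle filter piece on a 1D toy system, with localization applied sparingly; the motion and measurement models and all parameters are assumptions, not the paper's implementation.

```python
import numpy as np

rng = np.random.default_rng(0)

def predict(particles, control, motion_noise=0.1):
    """Propagate particles through a simple 1D motion model."""
    return particles + control + rng.normal(0.0, motion_noise, size=particles.shape)

def update(particles, measurement, meas_noise=0.2):
    """Reweight and resample particles after a (costly) localization."""
    weights = np.exp(-0.5 * ((particles - measurement) / meas_noise) ** 2)
    weights /= weights.sum()
    idx = rng.choice(len(particles), size=len(particles), p=weights)
    return particles[idx]

particles = rng.normal(0.0, 1.0, size=500)
true_x = 0.0
for t in range(20):
    true_x += 0.5
    particles = predict(particles, control=0.5)
    if t % 5 == 4:                      # localize only occasionally
        z = true_x + rng.normal(0.0, 0.2)
        particles = update(particles, z)
    # The particle spread is one proxy for state uncertainty a policy could use.
    print(t, particles.mean().round(2), particles.std().round(2))
```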
+
+
+
+
+ + ☆ Communication and Energy-Aware Multi-UAV Coverage Path Planning for + Networked Operations + + +
+ This paper presents a communication and energy-aware Multi-UAV Coverage Path
+Planning (mCPP) method for scenarios requiring continuous inter-UAV
+communication, such as cooperative search and rescue and surveillance missions.
+Unlike existing mCPP solutions that focus on energy, time, or coverage
+efficiency, our approach generates coverage paths that require a minimal
+communication range to maintain inter-UAV connectivity while also optimizing
+energy consumption. The mCPP problem is formulated as a multi-objective
+optimization task, aiming to minimize both the communication range requirement
+and energy consumption. Our approach significantly reduces the communication
+range needed for maintaining connectivity while ensuring energy efficiency,
+outperforming state-of-the-art methods. Its effectiveness is validated through
+simulations on complex and arbitrarily shaped regions of interest, including
+scenarios with no-fly zones. Additionally, a real-world experiment demonstrates
+its high accuracy, achieving 99\% consistency between the estimated and actual
+communication range required during a multi-UAV coverage mission involving
+three UAVs.
+
+
+
+
+
+ + ☆ Safety Verification for Evasive Collision Avoidance in Autonomous + Vehicles with Enhanced Resolutions + + +
+ This paper presents a comprehensive hazard analysis, risk assessment, and +loss evaluation for an Evasive Minimum Risk Maneuvering (EMRM) system designed +for autonomous vehicles. The EMRM system is engineered to enhance collision +avoidance and mitigate loss severity by drawing inspiration from professional +drivers who perform aggressive maneuvers while maintaining stability for +effective risk mitigation. Recent advancements in autonomous vehicle technology +demonstrate a growing capability for high-performance maneuvers. This paper +discusses a comprehensive safety verification process and establishes a clear +safety goal to enhance testing validation. The study systematically identifies +potential hazards and assesses their risks to overall safety and the protection +of vulnerable road users. A novel loss evaluation approach is introduced, +focusing on the impact of mitigation maneuvers on loss severity. Additionally, +the proposed mitigation integrity level can be used to verify the minimum-risk +maneuver feature. This paper applies a verification method to evasive +maneuvering, contributing to the development of more reliable active safety +features in autonomous driving systems. + +
+
+
+
+
+ + ☆ RT-Affordance: Affordances are Versatile Intermediate Representations + for Robot Manipulation + + +
+ We explore how intermediate policy representations can facilitate +generalization by providing guidance on how to perform manipulation tasks. +Existing representations such as language, goal images, and trajectory sketches +have been shown to be helpful, but these representations either do not provide +enough context or provide over-specified context that yields less robust +policies. We propose conditioning policies on affordances, which capture the +pose of the robot at key stages of the task. Affordances offer expressive yet +lightweight abstractions, are easy for users to specify, and facilitate +efficient learning by transferring knowledge from large internet datasets. Our +method, RT-Affordance, is a hierarchical model that first proposes an +affordance plan given the task language, and then conditions the policy on this +affordance plan to perform manipulation. Our model can flexibly bridge +heterogeneous sources of supervision including large web datasets and robot +trajectories. We additionally train our model on cheap-to-collect in-domain +affordance images, allowing us to learn new tasks without collecting any +additional costly robot trajectories. We show on a diverse set of novel tasks +how RT-Affordance exceeds the performance of existing methods by over 50%, and +we empirically demonstrate that affordances are robust to novel settings. +Videos available at https://snasiriany.me/rt-affordance + +
+
+
+
+
+ + ☆ LVI-GS: Tightly-coupled LiDAR-Visual-Inertial SLAM using 3D Gaussian + Splatting + + +
+ 3D Gaussian Splatting (3DGS) has shown its ability in rapid rendering and +high-fidelity mapping. In this paper, we introduce LVI-GS, a tightly-coupled +LiDAR-Visual-Inertial mapping framework with 3DGS, which leverages the +complementary characteristics of LiDAR and image sensors to capture both +geometric structures and visual details of 3D scenes. To this end, the 3D +Gaussians are initialized from colourized LiDAR points and optimized using +differentiable rendering. In order to achieve high-fidelity mapping, we +introduce a pyramid-based training approach to effectively learn multi-level +features and incorporate depth loss derived from LiDAR measurements to improve +geometric feature perception. Through well-designed strategies for Gaussian-Map +expansion, keyframe selection, thread management, and custom CUDA acceleration, +our framework achieves real-time photo-realistic mapping. Numerical experiments +are performed to evaluate the superior performance of our method compared to +state-of-the-art 3D reconstruction systems. + +
+
+
+
+
+ + ☆ VQ-ACE: Efficient Policy Search for Dexterous Robotic Manipulation via + Action Chunking Embedding + + +
+ Dexterous robotic manipulation remains a significant challenge due to the +high dimensionality and complexity of hand movements required for tasks like +in-hand manipulation and object grasping. This paper addresses this issue by +introducing Vector Quantized Action Chunking Embedding (VQ-ACE), a novel +framework that compresses human hand motion into a quantized latent space, +significantly reducing the action space's dimensionality while preserving key +motion characteristics. By integrating VQ-ACE with both Model Predictive +Control (MPC) and Reinforcement Learning (RL), we enable more efficient +exploration and policy learning in dexterous manipulation tasks using a +biomimetic robotic hand. Our results show that latent space sampling with MPC +produces more human-like behavior in tasks such as Ball Rolling and Object +Picking, leading to higher task success rates and reduced control costs. For +RL, action chunking accelerates learning and improves exploration, demonstrated +through faster convergence in tasks like cube stacking and in-hand cube +reorientation. These findings suggest that VQ-ACE offers a scalable and +effective solution for robotic manipulation tasks involving complex, +high-dimensional state spaces, contributing to more natural and adaptable +robotic systems. + +
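The quantized latent space at the core of VQ-ACE can be illustrated with a minimal vector-quantization step using a straight-through estimator; codebook size, latent dimensions, and loss weighting below are illustrative assumptions rather than the paper's configuration.

```python
import torch
import torch.nn.functional as F

def vector_quantize(z, codebook):
    """Map continuous latents to their nearest codebook entries.

    z:        (B, D) encoder outputs (e.g., embeddings of action chunks)
    codebook: (K, D) learnable code vectors
    Returns quantized latents (with straight-through gradients), indices,
    and the combined codebook/commitment loss.
    """
    dists = torch.cdist(z, codebook)               # (B, K) pairwise distances
    idx = dists.argmin(dim=1)                      # nearest code per latent
    z_q = codebook[idx]                            # (B, D)
    codebook_loss = F.mse_loss(z_q, z.detach())
    commit_loss = F.mse_loss(z, z_q.detach())
    z_q = z + (z_q - z).detach()                   # straight-through estimator
    return z_q, idx, codebook_loss + 0.25 * commit_loss

codebook = torch.randn(512, 64, requires_grad=True)
z = torch.randn(8, 64)
z_q, idx, vq_loss = vector_quantize(z, codebook)
print(z_q.shape, idx.shape, float(vq_loss))
```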
+
+
+
+
+ + ☆ Object and Contact Point Tracking in Demonstrations Using 3D Gaussian + Splatting + + +
+ This paper introduces a method to enhance Interactive Imitation Learning +(IIL) by extracting touch interaction points and tracking object movement from +video demonstrations. The approach extends current IIL systems by providing +robots with detailed knowledge of both where and how to interact with objects, +particularly complex articulated ones like doors and drawers. By leveraging +cutting-edge techniques such as 3D Gaussian Splatting and FoundationPose for +tracking, this method allows robots to better understand and manipulate objects +in dynamic environments. The research lays the foundation for more effective +task learning and execution in autonomous robotic systems. + +
+
+ comment: CoRL 2024, Workshop on Lifelong Learning for Home Robots, Munich, + Germany +
+
+
+
+
+ + ☆ VLA-3D: A Dataset for 3D Semantic Scene Understanding and Navigation RSS 2024 + + +
+ With the recent rise of Large Language Models (LLMs), Vision-Language Models +(VLMs), and other general foundation models, there is growing potential for +multimodal, multi-task embodied agents that can operate in diverse environments +given only natural language as input. One such application area is indoor +navigation using natural language instructions. However, despite recent +progress, this problem remains challenging due to the spatial reasoning and +semantic understanding required, particularly in arbitrary scenes that may +contain many objects belonging to fine-grained classes. To address this +challenge, we curate the largest real-world dataset for Vision and +Language-guided Action in 3D Scenes (VLA-3D), consisting of over 11.5K scanned +3D indoor rooms from existing datasets, 23.5M heuristically generated semantic +relations between objects, and 9.7M synthetically generated referential +statements. Our dataset consists of processed 3D point clouds, semantic object +and room annotations, scene graphs, navigable free space annotations, and +referential language statements that specifically focus on view-independent +spatial relations for disambiguating objects. The goal of these features is to +aid the downstream task of navigation, especially on real-world systems where +some level of robustness must be guaranteed in an open world of changing scenes +and imperfect language. We benchmark our dataset with current state-of-the-art +models to obtain a performance baseline. All code to generate and visualize the +dataset is publicly released, see https://github.com/HaochenZ11/VLA-3D. With +the release of this dataset, we hope to provide a resource for progress in +semantic 3D scene understanding that is robust to changes and one which will +aid the development of interactive indoor navigation systems. + +
+
+ comment: Accepted and presented at the 1st Workshop on Semantic Reasoning and + Goal Understanding in Robotics (SemRob), Robotics Science and Systems + Conference (RSS 2024) +
+
+
+
+
+ + ☆ A Behavior Architecture for Fast Humanoid Robot Door Traversals + + +
+ Towards the role of humanoid robots as squad mates in urban operations and +other domains, we identified doors as a major area lacking capability +development. In this paper, we focus on the ability of humanoid robots to +navigate and deal with doors. Human-sized doors are ubiquitous in many +environment domains and the humanoid form factor is uniquely suited to operate +and traverse them. We present an architecture which incorporates GPU +accelerated perception and a tree based interactive behavior coordination +system with a whole body motion and walking controller. Our system is capable +of performing door traversals on a variety of door types. It supports rapid +authoring of behaviors for unseen door types and techniques to achieve +re-usability of those authored behaviors. The behaviors are modelled using +trees and feature logical reactivity and action sequences that can be executed +with layered concurrency to increase speed. Primitive actions are built on top +of our existing whole body controller which supports manipulation while +walking. We include a perception system using both neural networks and +classical computer vision for door mechanism detection outside of the lab +environment. We present operator-robot interdependence analysis charts to +explore how human cognition is combined with artificial intelligence to produce +complex robot behavior. Finally, we present and discuss real robot performances +of fast door traversals on our Nadia humanoid robot. Videos online at +https://www.youtube.com/playlist?list=PLXuyT8w3JVgMPaB5nWNRNHtqzRK8i68dy. + +
+
+ comment: 15 pages, 23 figures, for submission to Elsevier RAS +
+
+
+
+
+ + ☆ An Open-source Sim2Real Approach for Sensor-independent Robot Navigation + in a Grid ICRA + + +
+ This paper presents a Sim2Real (Simulation to Reality) approach to bridge the +gap between a trained agent in a simulated environment and its real-world +implementation in navigating a robot in a similar setting. Specifically, we +focus on navigating a quadruped robot in a real-world grid-like environment +inspired by the Gymnasium Frozen Lake -- a highly user-friendly and free +Application Programming Interface (API) to develop and test Reinforcement +Learning (RL) algorithms. We detail the development of a pipeline to transfer +motion policies learned in the Frozen Lake simulation to a physical quadruped +robot, thus enabling autonomous navigation and obstacle avoidance in a grid +without relying on expensive localization and mapping sensors. The work +involves training an RL agent in the Frozen Lake environment and utilizing the +resulting Q-table to control a 12 Degrees-of-Freedom (DOF) quadruped robot. In +addition to detailing the RL implementation, inverse kinematics-based quadruped +gaits, and the transfer policy pipeline, we open-source the project on GitHub +and include a demonstration video of our Sim2Real transfer approach. This work +provides an accessible, straightforward, and low-cost framework for +researchers, students, and hobbyists to explore and implement RL-based robot +navigation in real-world grid environments. + +
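A minimal tabular Q-learning loop on Gymnasium's Frozen Lake, of the kind whose resulting Q-table could then be mapped to gait commands on a physical robot; the hyperparameters and the mapping comment are illustrative assumptions, not the paper's settings.

```python
import numpy as np
import gymnasium as gym

env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)
Q = np.zeros((env.observation_space.n, env.action_space.n))
alpha, gamma, eps = 0.1, 0.95, 0.1

for episode in range(5000):
    s, _ = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action selection over the tabular Q-values.
        a = env.action_space.sample() if np.random.rand() < eps else int(Q[s].argmax())
        s_next, r, terminated, truncated, _ = env.step(a)
        done = terminated or truncated
        Q[s, a] += alpha * (r + gamma * Q[s_next].max() * (not terminated) - Q[s, a])
        s = s_next

# Greedy action per grid cell; on a robot each discrete action could map to a
# gait macro (e.g., turn left/right, walk one cell forward).
print(Q.argmax(axis=1).reshape(4, 4))
```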
+
+ comment: Accepted for publication at the 9th IEEE International Conference on + Robotics and Automation Engineering (IEEE ICRAE 2024), Singapore +
+
+
+
+
+ + Enhancing Exploratory Capability of Visual Navigation Using Uncertainty + of Implicit Scene Representation + + +
+ In the context of visual navigation in unknown scenes, both "exploration" and
+"exploitation" are equally crucial. Robots must first establish environmental
+cognition through exploration and then utilize the cognitive information to
+accomplish target searches. However, most existing methods for image-goal
+navigation prioritize target search over the generation of exploratory
+behavior. To address this, we propose the Navigation with Uncertainty-driven
+Exploration (NUE) pipeline, which uses an implicit and compact scene
+representation, NeRF, as a cognitive structure. We estimate the uncertainty of
+the NeRF and use it to augment the exploratory ability, which in turn
+facilitates the construction of the implicit representation. Simultaneously, we
+extract memory information from NeRF to enhance the robot's reasoning ability
+for determining the location of the target. Ultimately, we seamlessly combine
+the two generated abilities to produce navigational actions. Our pipeline is
+end-to-end, with the environmental cognitive structure being constructed
+online. Extensive experimental results on image-goal navigation demonstrate the
+capability of our pipeline to enhance exploratory behaviors, while also
+enabling a natural transition from the exploration to exploitation phase. This
+enables our model to outperform existing memory-based cognitive navigation
+structures in terms of navigation performance.
+
+
+
+
+
+ + ☆ Augmented-Reality Enabled Crop Monitoring with Robot Assistance + + +
+ The integration of augmented reality (AR), extended reality (XR), and virtual +reality (VR) technologies in agriculture has shown significant promise in +enhancing various agricultural practices. Mobile robots have also been adopted +as assessment tools in precision agriculture, improving economic efficiency and +productivity, and minimizing undesired effects such as weeds and pests. Despite +considerable work on both fronts, the combination of a versatile User Interface +(UI) provided by an AR headset with the integration and direct interaction and +control of a mobile field robot has not yet been fully explored or +standardized. This work aims to address this gap by providing real-time data +input and control output of a mobile robot for precision agriculture through a +virtual environment enabled by an AR headset interface. The system leverages +open-source computational tools and off-the-shelf hardware for effective +integration. Distinctive case studies are presented where growers or +technicians can interact with a legged robot via an AR headset and a UI. Users +can teleoperate the robot to gather information in an area of interest, request +real-time graphed status of an area, or have the robot autonomously navigate to +selected areas for measurement updates. The proposed system utilizes a custom +local navigation method with a fixed holographic coordinate system in +combination with QR codes. This step toward fusing AR and robotics in +agriculture aims to provide practical solutions for real-time data management +and control enabled by human-robot interaction. The implementation can be +extended to various robot applications in agriculture and beyond, promoting a +unified framework for on-demand and autonomous robot operation in the field. + +
+
+
+
+
+ + ☆ Chance-Constrained Convex MPC for Robust Quadruped Locomotion Under + Parametric and Additive Uncertainties + + +
+ Recent advances in quadrupedal locomotion have focused on improving stability +and performance across diverse environments. However, existing methods often +lack adequate safety analysis and struggle to adapt to varying payloads and +complex terrains, typically requiring extensive tuning. To overcome these +challenges, we propose a Chance-Constrained Model Predictive Control (CCMPC) +framework that explicitly models payload and terrain variability as +distributions of parametric and additive disturbances within the single rigid +body dynamics (SRBD) model. Our approach ensures safe and consistent +performance under uncertain dynamics by expressing the model friction cone +constraints, which define the feasible set of ground reaction forces, as chance +constraints. Moreover, we solve the resulting stochastic control problem using +a computationally efficient quadratic programming formulation. Extensive Monte +Carlo simulations of quadrupedal locomotion across varying payloads and complex +terrains demonstrate that CCMPC significantly outperforms two competitive +benchmarks: Linear MPC (LMPC) and MPC with hand-tuned safety margins to +maintain stability, reduce foot slippage, and track the center of mass. +Hardware experiments on the Unitree Go1 robot show successful locomotion across +various indoor and outdoor terrains with unknown loads exceeding 50% of the +robot body weight, despite no additional parameter tuning. A video of the +results and accompanying code can be found at: https://cc-mpc.github.io/. + +
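For a single linear friction-cone constraint $a^\top f \le b$ on the ground reaction force $f$, with the uncertain row $a$ Gaussian with mean $\bar a$ and covariance $\Sigma_a$ and an additive disturbance $w$ of variance $\sigma_w^2$, the standard individual chance-constraint reformulation (a textbook identity given here for orientation, not necessarily the exact form used in the paper) is

$\Pr(a^\top f + w \le b) \ge 1-\epsilon \iff \bar a^\top f + \Phi^{-1}(1-\epsilon)\sqrt{f^\top \Sigma_a f + \sigma_w^2} \le b,$

where $\Phi^{-1}$ is the standard normal quantile. One common way to keep the problem a quadratic program is to evaluate the square-root tightening term at a nominal solution, so the constraint stays linear in $f$.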
+
+ comment: Under review for Robotics and Automation Letters +
+
+
+
+
+ + ☆ Digital Twin for Autonomous Surface Vessels: Enabler for Safe Maritime + Navigation + + +
+ Autonomous surface vessels (ASVs) are becoming increasingly significant in +enhancing the safety and sustainability of maritime operations. To ensure the +reliability of modern control algorithms utilized in these vessels, digital +twins (DTs) provide a robust framework for conducting safe and effective +simulations within a virtual environment. Digital twins are generally +classified on a scale from 0 to 5, with each level representing a progression +in complexity and functionality: Level 0 (Standalone) employs offline modeling +techniques; Level 1 (Descriptive) integrates sensors and online modeling to +enhance situational awareness; Level 2 (Diagnostic) focuses on condition +monitoring and cybersecurity; Level 3 (Predictive) incorporates predictive +analytics; Level 4 (Prescriptive) embeds decision-support systems; and Level 5 +(Autonomous) enables advanced functionalities such as collision avoidance and +path following. These digital representations not only provide insights into +the vessel's current state and operational efficiency but also predict future +scenarios and assess life endurance. By continuously updating with real-time +sensor data, the digital twin effectively corrects modeling errors and enhances +decision-making processes. Since DTs are key enablers for complex autonomous +systems, this paper introduces a comprehensive methodology for establishing a +digital twin framework specifically tailored for ASVs. Through a detailed +literature survey, we explore existing state-of-the-art enablers across the +defined levels, offering valuable recommendations for future research and +development in this rapidly evolving field. + +
+
+
+
+
+ + ☆ Accelerating Gaussian Variational Inference for Motion Planning Under + Uncertainty + + +
+ This work addresses motion planning under uncertainty as a stochastic optimal +control problem. The path distribution induced by the optimal controller +corresponds to a posterior path distribution with a known form. To approximate +this posterior, we frame an optimization problem in the space of Gaussian +distributions, which aligns with the Gaussian Variational Inference Motion +Planning (GVIMP) paradigm introduced in \cite{yu2023gaussian}. In this +framework, the computation bottleneck lies in evaluating the expectation of +collision costs over a dense discretized trajectory and computing the marginal +covariances. This work exploits the sparse motion planning factor graph, which +allows for parallel computing collision costs and Gaussian Belief Propagation +(GBP) marginal covariance computation, to introduce a computationally efficient +approach to solving GVIMP. We term the novel paradigm as the Parallel Gaussian +Variational Inference Motion Planning (P-GVIMP). We validate the proposed +framework on various robotic systems, demonstrating significant speed +acceleration achieved by leveraging Graphics Processing Units (GPUs) for +parallel computation. An open-sourced implementation is presented at +https://github.com/hzyu17/VIMP. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ STEER: Flexible Robotic Manipulation via Dense Language Grounding + + +
+ The complexity of the real world demands robotic systems that can +intelligently adapt to unseen situations. We present STEER, a robot learning +framework that bridges high-level, commonsense reasoning with precise, flexible +low-level control. Our approach translates complex situational awareness into +actionable low-level behavior through training language-grounded policies with +dense annotation. By structuring policy training around fundamental, modular +manipulation skills expressed in natural language, STEER exposes an expressive +interface for humans or Vision-Language Models (VLMs) to intelligently +orchestrate the robot's behavior by reasoning about the task and context. Our +experiments demonstrate the skills learned via STEER can be combined to +synthesize novel behaviors to adapt to new situations or perform completely new +tasks without additional data collection or training. + +
+
+ comment: Project website: https://lauramsmith.github.io/steer/ +
+
+
+
+
+ + ☆ Learning Few-Shot Object Placement with Intra-Category Transfer RA-L + + +
+ Efficient learning from demonstration for long-horizon tasks remains an open
+challenge in robotics. While significant effort has been directed toward
+learning trajectories, a recent resurgence of object-centric approaches has
+demonstrated improved sample efficiency, enabling transferable robotic skills.
+Such approaches model tasks as a sequence of object poses over time. In this
+work, we propose a scheme for transferring observed object arrangements to
+novel object instances by learning these arrangements on canonical class
+frames. We then employ this scheme to enable a simple yet effective approach
+for training models from as few as five demonstrations to predict arrangements
+of a wide range of objects including tableware, cutlery, furniture, and desk
+spaces. We propose a method for optimizing the learned models to enable
+efficient learning of tasks such as setting a table or tidying up an office
+with intra-category transfer, even in the presence of distractors. We present
+extensive experimental results in simulation and on a real robotic system for
+table setting which, based on human evaluations, scored 73.3% compared to a
+human baseline. We make the code and trained models publicly available at
+http://oplict.cs.uni-freiburg.de.
+
+
+ comment: 8 pages, 7 figures, 2 tables, submitted to RA-L +
+
+
+
+
+ + ☆ Exploring the Interplay Between Video Generation and World Models in + Autonomous Driving: A Survey + + +
+ World models and video generation are pivotal technologies in the domain of +autonomous driving, each playing a critical role in enhancing the robustness +and reliability of autonomous systems. World models, which simulate the +dynamics of real-world environments, and video generation models, which produce +realistic video sequences, are increasingly being integrated to improve +situational awareness and decision-making capabilities in autonomous vehicles. +This paper investigates the relationship between these two technologies, +focusing on how their structural parallels, particularly in diffusion-based +models, contribute to more accurate and coherent simulations of driving +scenarios. We examine leading works such as JEPA, Genie, and Sora, which +exemplify different approaches to world model design, thereby highlighting the +lack of a universally accepted definition of world models. These diverse +interpretations underscore the field's evolving understanding of how world +models can be optimized for various autonomous driving tasks. Furthermore, this +paper discusses the key evaluation metrics employed in this domain, such as +Chamfer distance for 3D scene reconstruction and Fr\'echet Inception Distance +(FID) for assessing the quality of generated video content. By analyzing the +interplay between video generation and world models, this survey identifies +critical challenges and future research directions, emphasizing the potential +of these technologies to jointly advance the performance of autonomous driving +systems. The findings presented in this paper aim to provide a comprehensive +understanding of how the integration of video generation and world models can +drive innovation in the development of safer and more reliable autonomous +vehicles. + +
+
+
+
+
+ + ♻ ☆ Cognitive Planning for Object Goal Navigation using Generative AI Models + + +
+ Recent advancements in Generative AI, particularly in Large Language Models +(LLMs) and Large Vision-Language Models (LVLMs), offer new possibilities for +integrating cognitive planning into robotic systems. In this work, we present a +novel framework for solving the object goal navigation problem that generates +efficient exploration strategies. Our approach enables a robot to navigate +unfamiliar environments by leveraging LLMs and LVLMs to understand the semantic +structure of the scene. To address the challenge of representing complex +environments without overwhelming the system, we propose a 3D modular scene +representation, enriched with semantic descriptions. This representation is +dynamically pruned using an LLM-based mechanism, which filters irrelevant +information and focuses on task-specific data. By combining these elements, our +system generates high-level sub-goals that guide the exploration of the robot +toward the target object. We validate our approach in simulated environments, +demonstrating its ability to enhance object search efficiency while maintaining +scalability in complex settings. + +
+
+
+
+
+ + ♻ ☆ POLICEd RL: Learning Closed-Loop Robot Control Policies with Provable + Satisfaction of Hard Constraints RSS + + +
+ In this paper, we seek to learn a robot policy guaranteed to satisfy state +constraints. To encourage constraint satisfaction, existing RL algorithms +typically rely on Constrained Markov Decision Processes and discourage +constraint violations through reward shaping. However, such soft constraints +cannot offer verifiable safety guarantees. To address this gap, we propose +POLICEd RL, a novel RL algorithm explicitly designed to enforce affine hard +constraints in closed-loop with a black-box environment. Our key insight is to +force the learned policy to be affine around the unsafe set and use this affine +region as a repulsive buffer to prevent trajectories from violating the +constraint. We prove that such policies exist and guarantee constraint +satisfaction. Our proposed framework is applicable to both systems with +continuous and discrete state and action spaces and is agnostic to the choice +of the RL training algorithm. Our results demonstrate the capacity of POLICEd +RL to enforce hard constraints in robotic tasks while significantly +outperforming existing methods. + +
+
+ comment: Robotics: Science and Systems (RSS) 2024, + https://www.roboticsproceedings.org/rss20/p104.html +
+
+
+
+
+ + ♻ ☆ Caging in Time: A Framework for Robust Object Manipulation under + Uncertainties and Limited Robot Perception + + +
+ Real-world object manipulation is commonly challenged by physical
+uncertainties and perception limitations. While caging configuration-based
+manipulation frameworks are an effective strategy and have successfully
+provided robust solutions, they are not broadly applicable due to their strict
+requirements on the availability of multiple robots, widely distributed
+contacts, or specific geometries of the robots or the objects. To this end,
+this work proposes a novel concept, termed Caging in Time, to allow caging
+configurations to be formed even if there is just one robot engaged in a task.
+This novel concept can be explained by the insight that even if a caging
+configuration is needed to constrain the motion of an object, only a small
+portion of the cage is actively manipulating the object at any given time. As
+such, we can switch the configuration of the robot strategically so that by
+collapsing its configuration in time, we will see a cage formed and its
+necessary portion active whenever needed. We instantiate our Caging in Time
+theory on challenging quasistatic and dynamic manipulation tasks, showing that
+Caging in Time can be achieved in general state spaces including geometry-based
+and energy-based spaces. With extensive experiments, we show robust and
+accurate manipulation, in an open-loop manner, without requiring detailed
+knowledge of the object geometry or physical properties, nor realtime accurate
+feedback on the manipulation states. In addition to being an effective and
+robust open-loop manipulation solution, the proposed theory can be a
+supplementary strategy to other manipulation systems affected by uncertain or
+limited robot perception.
+
+
+ comment: 24 pages, 25 figures, video available at: + www.youtube.com/watch?v=Ag_jTzazuSM +
+
+
+
+
+ + ♻ ☆ Learning Lyapunov-Stable Polynomial Dynamical Systems through Imitation + + +
+ Imitation learning is a paradigm to address complex motion planning problems +by learning a policy to imitate an expert's behavior. However, relying solely +on the expert's data might lead to unsafe actions when the robot deviates from +the demonstrated trajectories. Stability guarantees have previously been +provided utilizing nonlinear dynamical systems, acting as high-level motion +planners, in conjunction with the Lyapunov stability theorem. Yet, these +methods are prone to inaccurate policies, high computational cost, sample +inefficiency, or quasi stability when replicating complex and highly nonlinear +trajectories. To mitigate this problem, we present an approach for learning a +globally stable nonlinear dynamical system as a motion planning policy. We +model the nonlinear dynamical system as a parametric polynomial and learn the +polynomial's coefficients jointly with a Lyapunov candidate. To showcase its +success, we compare our method against the state of the art in simulation and +conduct real-world experiments with the Kinova Gen3 Lite manipulator arm. Our +experiments demonstrate the sample efficiency and reproduction accuracy of our +method for various expert trajectories, while remaining stable in the face of +perturbations. + +
+
+ comment: In 7th Annual Conference on Robot Learning 2023 Aug 30 +
+
+
+
+
+ + ♻ ☆ OCCAM: Online Continuous Controller Adaptation with Meta-Learned Models + + +
+ Control tuning and adaptation present a significant challenge to the usage of +robots in diverse environments. It is often nontrivial to find a single set of +control parameters by hand that work well across the broad array of +environments and conditions that a robot might encounter. Automated adaptation +approaches must utilize prior knowledge about the system while adapting to +significant domain shifts to find new control parameters quickly. In this work, +we present a general framework for online controller adaptation that deals with +these challenges. We combine meta-learning with Bayesian recursive estimation +to learn prior predictive models of system performance that quickly adapt to +online data, even when there is significant domain shift. These predictive +models can be used as cost functions within efficient sampling-based +optimization routines to find new control parameters online that maximize +system performance. Our framework is powerful and flexible enough to adapt +controllers for four diverse systems: a simulated race car, a simulated +quadrupedal robot, and a simulated and physical quadrotor. The video and code +can be found at https://hersh500.github.io/occam. + +
+
+ comment: 8 pages, 4 figures. Accepted to Conference on Robot Learning (CoRL) + 2024 +
+
+
+
+
+ + ♻ ☆ Online Analytic Exemplar-Free Continual Learning with Large Models for + Imbalanced Autonomous Driving Task + + +
+ In autonomous driving, even a meticulously trained model can encounter +failures when facing unfamiliar scenarios. One of these scenarios can be +formulated as an online continual learning (OCL) problem. That is, data come in +an online fashion, and models are updated according to these streaming data. +Two major OCL challenges are catastrophic forgetting and data imbalance. To +address these challenges, in this paper, we propose an Analytic Exemplar-Free +Online Continual Learning algorithm (AEF-OCL). The AEF-OCL leverages analytic +continual learning principles and employs ridge regression as a classifier for +features extracted by a large backbone network. It solves the OCL problem by +recursively calculating the analytical solution, ensuring an equalization +between the continual learning and its joint-learning counterpart, and works +without the need to save any used samples (i.e., exemplar-free). Additionally, +we introduce a Pseudo-Features Generator (PFG) module that recursively +estimates the mean and the variance of real features for each class. It +over-samples offset pseudo-features from the same normal distribution as the +real features, thereby addressing the data imbalance issue. Experimental +results demonstrate that despite being an exemplar-free strategy, our method +outperforms various methods on the autonomous driving SODA10M dataset. Source +code is available at https://github.com/ZHUANGHP/Analytic-continual-learning. + +
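The analytic, exemplar-free classifier idea can be illustrated with streaming ridge regression over frozen backbone features: accumulating the Gram matrix and cross-correlation batch by batch gives exactly the closed-form solution of joint training, without storing samples. This is a generic sketch of that principle, with shapes and the pseudo-feature step omitted; names are assumptions.

```python
import numpy as np

class AnalyticRidgeClassifier:
    """Exemplar-free streaming ridge regression on frozen backbone features."""

    def __init__(self, feat_dim, num_classes, lam=1e-1):
        self.A = lam * np.eye(feat_dim)       # regularized Gram matrix X^T X
        self.B = np.zeros((feat_dim, num_classes))  # cross-correlation X^T Y

    def partial_fit(self, X, y):
        Y = np.eye(self.B.shape[1])[y]        # one-hot targets
        self.A += X.T @ X
        self.B += X.T @ Y

    def predict(self, X):
        W = np.linalg.solve(self.A, self.B)   # closed-form ridge weights
        return (X @ W).argmax(axis=1)

rng = np.random.default_rng(0)
clf = AnalyticRidgeClassifier(feat_dim=32, num_classes=5)
for _ in range(10):                            # simulated feature stream
    X = rng.normal(size=(64, 32))
    y = rng.integers(0, 5, size=64)
    clf.partial_fit(X, y)
print(clf.predict(rng.normal(size=(4, 32))))
```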
+
+ comment: This paper is to be published in IEEE Transactions on Vehicular + Technology +
+
+
+
+
+ + ♻ ☆ Resilient Movement Planning for Continuum Robots + + +
+ The paper presents an experimental study of resilient path planning for
+continuum robots taking into account the multi-objective optimisation problem.
+To do this, we used two well-known algorithms, namely the Genetic algorithm and
+the A* algorithm, for path planning and the Analytical Hierarchy Process
+algorithm for path evaluation. In our experiment, the Analytical Hierarchy
+Process algorithm considers four different criteria, i.e. distance, motor
+damage, mechanical damage and accuracy, each considered to contribute to the
+resilience of a continuum robot. The use of different criteria is necessary to
+increase the time to maintenance operations of the robot. The experiment shows
+that both algorithms can be used in combination with the Analytical Hierarchy
+Process algorithm for multi-criteria path planning, with the Genetic algorithm
+showing superior performance in the comparison of the two.
+
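The Analytical Hierarchy Process typically derives criterion weights from a pairwise comparison matrix via its principal eigenvector, followed by a consistency check. The comparison values below are purely illustrative, not those used in the experiment.

```python
import numpy as np

# Pairwise comparisons (Saaty scale) for: distance, motor damage,
# mechanical damage, accuracy. Entry (i, j) states how much more important
# criterion i is than criterion j; values here are purely illustrative.
A = np.array([
    [1.0, 3.0, 3.0, 5.0],
    [1/3, 1.0, 1.0, 3.0],
    [1/3, 1.0, 1.0, 3.0],
    [1/5, 1/3, 1/3, 1.0],
])

eigvals, eigvecs = np.linalg.eig(A)
k = np.argmax(eigvals.real)
w = np.abs(eigvecs[:, k].real)
w /= w.sum()                      # criterion weights (priority vector)

# Consistency ratio check (random index RI = 0.90 for a 4x4 matrix).
ci = (eigvals.real[k] - len(A)) / (len(A) - 1)
print("weights:", np.round(w, 3), "CR:", round(ci / 0.90, 3))
```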
+
+
+
+
+ + ♻ ☆ Skill-aware Mutual Information Optimisation for Generalisation in + Reinforcement Learning NeurIPS + + +
+ Meta-Reinforcement Learning (Meta-RL) agents can struggle to operate across +tasks with varying environmental features that require different optimal skills +(i.e., different modes of behaviour). Using context encoders based on +contrastive learning to enhance the generalisability of Meta-RL agents is now +widely studied but faces challenges such as the requirement for a large sample +size, also referred to as the $\log$-$K$ curse. To improve RL generalisation to +different tasks, we first introduce Skill-aware Mutual Information (SaMI), an +optimisation objective that aids in distinguishing context embeddings according +to skills, thereby equipping RL agents with the ability to identify and execute +different skills across tasks. We then propose Skill-aware Noise Contrastive +Estimation (SaNCE), a $K$-sample estimator used to optimise the SaMI objective. +We provide a framework for equipping an RL agent with SaNCE in practice and +conduct experimental validation on modified MuJoCo and Panda-gym benchmarks. We +empirically find that RL agents that learn by maximising SaMI achieve +substantially improved zero-shot generalisation to unseen tasks. Additionally, +the context encoder trained with SaNCE demonstrates greater robustness to a +reduction in the number of available samples, thus possessing the potential to +overcome the $\log$-$K$ curse. + +
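SaNCE is described as a $K$-sample noise-contrastive estimator for the SaMI objective. As a point of reference only, the snippet below shows a generic InfoNCE-style $K$-sample contrastive loss over context embeddings; it is not the paper's exact estimator, and the shapes and temperature are assumptions.

```python
import torch
import torch.nn.functional as F

def info_nce(query, positive, negatives, temperature=0.1):
    """Generic K-sample contrastive (InfoNCE-style) loss.

    query:     (B, D)    context embeddings (e.g., from the context encoder)
    positive:  (B, D)    embedding sharing the same skill as the query
    negatives: (B, K, D) embeddings from other skills/tasks
    """
    query = F.normalize(query, dim=-1)
    positive = F.normalize(positive, dim=-1)
    negatives = F.normalize(negatives, dim=-1)
    l_pos = (query * positive).sum(-1, keepdim=True)              # (B, 1)
    l_neg = torch.einsum("bd,bkd->bk", query, negatives)          # (B, K)
    logits = torch.cat([l_pos, l_neg], dim=1) / temperature
    labels = torch.zeros(query.size(0), dtype=torch.long)         # positive at index 0
    return F.cross_entropy(logits, labels)

loss = info_nce(torch.randn(8, 64), torch.randn(8, 64), torch.randn(8, 16, 64))
print(float(loss))
```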
+
+ comment: The Thirty-eighth Annual Conference on Neural Information Processing + Systems (NeurIPS), 2024 +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning with Lie Group Orientations for Robotics ICRA 2025 + + +
+ Handling orientations of robots and objects is a crucial aspect of many +applications. Yet, ever so often, there is a lack of mathematical correctness +when dealing with orientations, especially in learning pipelines involving, for +example, artificial neural networks. In this paper, we investigate +reinforcement learning with orientations and propose a simple modification of +the network's input and output that adheres to the Lie group structure of +orientations. As a result, we obtain an easy and efficient implementation that +is directly usable with existing learning libraries and achieves significantly +better performance than other common orientation representations. We briefly +introduce Lie theory specifically for orientations in robotics to motivate and +outline our approach. Subsequently, a thorough empirical evaluation of +different combinations of orientation representations for states and actions +demonstrates the superior performance of our proposed approach in different +scenarios, including: direct orientation control, end effector orientation +control, and pick-and-place tasks. + +
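One common way to respect the Lie group structure of orientations is to pass rotations to the network through the $SO(3)$ logarithm map and to apply network outputs back on the group through the exponential map. The helpers below are a standard Rodrigues-formula sketch (not the paper's implementation, and the near-$\pi$ case of the log map is not handled).

```python
import numpy as np

def so3_exp(w):
    """Exponential map: axis-angle vector (3,) -> rotation matrix (3, 3)."""
    theta = np.linalg.norm(w)
    if theta < 1e-8:
        return np.eye(3)
    k = w / theta
    K = np.array([[0, -k[2], k[1]], [k[2], 0, -k[0]], [-k[1], k[0], 0]])
    return np.eye(3) + np.sin(theta) * K + (1 - np.cos(theta)) * (K @ K)

def so3_log(R):
    """Logarithm map: rotation matrix (3, 3) -> axis-angle vector (3,)."""
    cos_theta = np.clip((np.trace(R) - 1.0) / 2.0, -1.0, 1.0)
    theta = np.arccos(cos_theta)
    if theta < 1e-8:
        return np.zeros(3)
    w_hat = (R - R.T) * theta / (2.0 * np.sin(theta))   # not valid near theta = pi
    return np.array([w_hat[2, 1], w_hat[0, 2], w_hat[1, 0]])

# A policy can output a tangent-space increment that is applied on the group:
R_current = so3_exp(np.array([0.1, -0.2, 0.3]))
delta = np.array([0.0, 0.05, 0.0])          # network output in the Lie algebra
R_next = R_current @ so3_exp(delta)         # stays on SO(3) by construction
print(np.round(so3_log(R_next), 3))
```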
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ♻ ☆ AsynEVO: Asynchronous Event-Driven Visual Odometry for Pure Event + Streams + + +
+ Event cameras are bio-inspired vision sensors that asynchronously measure
+per-pixel brightness changes. The high-temporal resolution and asynchronicity
+of event cameras offer great potential for estimating robot motion states.
+Recent works have adopted the continuous-time estimation methods to exploit the
+inherent nature of event cameras. However, existing methods either have poor
+runtime performance or neglect the high-temporal resolution of event cameras.
+To alleviate it, an Asynchronous Event-driven Visual Odometry (AsynEVO) based
+on sparse Gaussian Process (GP) regression is proposed to efficiently infer the
+motion trajectory from pure event streams. Concretely, an asynchronous frontend
+pipeline is designed to adapt event-driven feature tracking and manage feature
+trajectories; a parallel dynamic sliding-window backend is presented within the
+framework of sparse GP regression on $SE(3)$. Notably, a dynamic
+marginalization strategy is employed to ensure the consistency and sparsity of
+this GP regression. Experiments conducted on public datasets and real-world
+scenarios demonstrate that AsynEVO achieves competitive precision and superior
+robustness compared to the state-of-the-art. The experiment in the
+repeated-texture scenario indicates that the high-temporal resolution of
+AsynEVO plays a vital role in the estimation of high-speed movement.
+Furthermore, we show that the computational efficiency of AsynEVO significantly
+outperforms the incremental method.
+
+
+ comment: Submitted to IEEE Transactions on Intelligent Transportation Systems + (2024-07-15) +
+
+
+
+
+ + ♻ ☆ TaCOS: Task-Specific Camera Optimization with Simulation + + +
+ The performance of perception tasks is heavily influenced by imaging systems. +However, designing cameras with high task performance is costly, requiring +extensive camera knowledge and experimentation with physical hardware. +Additionally, cameras and perception tasks are mostly designed in isolation, +whereas recent methods that jointly design cameras and tasks have shown +improved performance. Therefore, we present a novel end-to-end optimization +approach that co-designs cameras with specific vision tasks. This method +combines derivative-free and gradient-based optimizers to support both +continuous and discrete camera parameters within manufacturing constraints. We +leverage recent computer graphics techniques and physical camera +characteristics to simulate the cameras in virtual environments, making the +design process cost-effective. We validate our simulations against physical +cameras and provide a procedurally generated virtual environment. Our +experiments demonstrate that our method designs cameras that outperform common +off-the-shelf options, and more efficiently compared to the state-of-the-art +approach, requiring only 2 minutes to design a camera on an example experiment +compared with 67 minutes for the competing method. Designed to support the +development of cameras under manufacturing constraints, multiple cameras, and +unconventional cameras, we believe this approach can advance the fully +automated design of cameras. + +
+
+
+
+
+ + ♻ ☆ Constrained Human-AI Cooperation: An Inclusive Embodied Social + Intelligence Challenge NeurIPS 2024 + + +
+ We introduce Constrained Human-AI Cooperation (CHAIC), an inclusive embodied +social intelligence challenge designed to test social perception and +cooperation in embodied agents. In CHAIC, the goal is for an embodied agent +equipped with egocentric observations to assist a human who may be operating +under physical constraints -- e.g., unable to reach high places or confined to +a wheelchair -- in performing common household or outdoor tasks as efficiently +as possible. To achieve this, a successful helper must: (1) infer the human's +intents and constraints by following the human and observing their behaviors +(social perception), and (2) make a cooperative plan tailored to the human +partner to solve the task as quickly as possible, working together as a team +(cooperative planning). To benchmark this challenge, we create four new agents +with real physical constraints and eight long-horizon tasks featuring both +indoor and outdoor scenes with various constraints, emergency events, and +potential risks. We benchmark planning- and learning-based baselines on the +challenge and introduce a new method that leverages large language models and +behavior modeling. Empirical evaluations demonstrate the effectiveness of our +benchmark in enabling systematic assessment of key aspects of machine social +intelligence. Our benchmark and code are publicly available at +https://github.com/UMass-Foundation-Model/CHAIC. + +
+
+ comment: NeurIPS 2024 Dataset and Benchmark Track. The first two authors + contributed equally. Project Website at https://vis-www.cs.umass.edu/CHAIC/ +
+
+
+
+
+ + ♻ ☆ Transitional Grid Maps: Joint Modeling of Static and Dynamic Occupancy + + +
+ Autonomous agents rely on sensor data to construct representations of their +environments, essential for predicting future events and planning their +actions. However, sensor measurements suffer from limited range, occlusions, +and sensor noise. These challenges become more evident in highly dynamic +environments. This work proposes a probabilistic framework to jointly infer +which parts of an environment are statically and which parts are dynamically +occupied. We formulate the problem as a Bayesian network and introduce minimal +assumptions that significantly reduce the complexity of the problem. Based on +those, we derive Transitional Grid Maps (TGMs), an efficient analytical +solution. Using real data, we demonstrate how this approach produces better +maps by keeping track of both static and dynamic elements and, as a side +effect, can help improve existing SLAM algorithms. + +
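Joint static/dynamic occupancy inference can be pictured as a per-cell Bayes filter over a small set of hidden states. The toy example below uses three states and illustrative transition and observation probabilities; it is not the TGM model itself, only the kind of recursive update such a framework performs.

```python
import numpy as np

# Per-cell hidden states: 0 = free, 1 = static occupied, 2 = dynamic occupied.
T = np.array([[0.97, 0.01, 0.02],    # transition model (illustrative values)
              [0.01, 0.98, 0.01],
              [0.30, 0.05, 0.65]])   # dynamic occupancy decays quickly

# Observation likelihood P(measurement | state), measurement in {miss, hit}.
L = np.array([[0.9, 0.1],
              [0.2, 0.8],
              [0.3, 0.7]])

def update(belief, hit):
    """One predict-update step of the per-cell Bayes filter."""
    predicted = T.T @ belief
    posterior = predicted * L[:, int(hit)]
    return posterior / posterior.sum()

belief = np.array([1 / 3, 1 / 3, 1 / 3])
for hit in [1, 1, 1, 0, 0]:           # a few hits followed by misses
    belief = update(belief, hit)
    print(np.round(belief, 3))
```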
+
+
+
+
+ + ♻ ☆ Scaling Is All You Need: Autonomous Driving with JAX-Accelerated + Reinforcement Learning + + +
+ Reinforcement learning has been demonstrated to outperform even the best +humans in complex domains like video games. However, running reinforcement +learning experiments on the required scale for autonomous driving is extremely +difficult. Building a large scale reinforcement learning system and +distributing it across many GPUs is challenging. Gathering experience during +training on real world vehicles is prohibitive from a safety and scalability +perspective. Therefore, an efficient and realistic driving simulator is +required that uses a large amount of data from real-world driving. We bring +these capabilities together and conduct large-scale reinforcement learning +experiments for autonomous driving. We demonstrate that our policy performance +improves with increasing scale. Our best performing policy reduces the failure +rate by 64% while improving the rate of driving progress by 25% compared to the +policies produced by state-of-the-art machine learning for autonomous driving. + +
+
+
+
+
+ + ♻ ☆ A Semi-Lagrangian Approach for Time and Energy Path Planning + Optimization in Static Flow Fields + + +
+ Efficient path planning for autonomous mobile robots is a critical problem
+across numerous domains, where optimizing both time and energy consumption is
+paramount. This paper introduces a novel methodology that considers both the
+dynamic influence of an environmental flow field and geometric constraints,
+including obstacles and forbidden zones, which enrich the complexity of the
+planning problem. We formulate it as a multi-objective optimal control problem,
+propose a novel transformation called Harmonic Transformation, and apply a
+semi-Lagrangian scheme to solve it. The set of Pareto efficient solutions is
+obtained considering two distinct approaches: a deterministic method and an
+evolutionary-based one, both of which are designed to make use of the proposed
+Harmonic Transformation. Through an extensive analysis of these approaches, we
+demonstrate their efficacy in finding optimized paths.
+
+
+ comment: 50 pages, reviewed version; Preprint submitted to Journal of the + Franklin Institute (under review) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 168 + +
+
+
+ + ☆ MME-Finance: A Multimodal Finance Benchmark for Expert-level + Understanding and Reasoning + + +
+ In recent years, multimodal benchmarks for general domains have guided the
+rapid development of multimodal models on general tasks. However, the financial
+field has its peculiarities. It features unique graphical images (e.g.,
+candlestick charts, technical indicator charts) and possesses a wealth of
+specialized financial knowledge (e.g., futures, turnover rate). Therefore,
+benchmarks from general fields often fail to measure the performance of
+multimodal models in the financial domain, and thus cannot effectively guide
+the rapid development of large financial models. To promote the development of
+large financial multimodal models, we propose MME-Finance, a bilingual,
+open-ended, and practical usage-oriented Visual Question Answering (VQA)
+benchmark. Our benchmark is characterized by its financial focus and expertise,
+which include constructing charts that reflect the actual usage needs of users
+(e.g., computer screenshots and mobile photography), creating questions
+according to the preferences in financial domain inquiries, and annotating
+questions by experts with 10+ years of experience in the financial industry.
+Additionally, we have developed a custom-designed financial evaluation system
+in which visual information is first introduced in the multi-modal evaluation
+process. Extensive experimental evaluations of 19 mainstream MLLMs are
+conducted to test their perception, reasoning, and cognition capabilities. The
+results indicate that models performing well on general benchmarks cannot do
+well on MME-Finance; for instance, the top-performing open-source and
+closed-source models obtain 65.69 (Qwen2VL-72B) and 63.18 (GPT-4o),
+respectively. Their performance is particularly poor in categories most
+relevant to finance, such as candlestick charts and technical indicator charts.
+In addition, we propose a Chinese version, which helps compare the performance
+of MLLMs in a Chinese context.
+
+
+ comment: Project Page: https://hithink-research.github.io/MME-Finance/ +
+
+
+
+
+ + ☆ Classification Done Right for Vision-Language Pre-Training NeurIPS 2024 + + +
+ We introduce SuperClass, a super simple classification method for vision-language pre-training on image-text data. Unlike its contrastive counterpart CLIP, which contrasts image embeddings against those of a text encoder, SuperClass directly utilizes tokenized raw text as supervised classification labels, without the need for additional text filtering or selection. Because there is no text encoding serving as a contrastive target, SuperClass does not require a text encoder and does not need to maintain a large batch size as CLIP does. SuperClass demonstrates superior performance on various downstream tasks, including classic computer vision benchmarks and vision-language downstream tasks. We further explore the scaling behavior of SuperClass with respect to model size, training length, and data size, and report encouraging results and comparisons to CLIP. https://github.com/x-cls/superclass + +
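A minimal sketch of the general idea described above: treat the caption's token IDs as a multi-hot classification target for the image features. The module names, the vocabulary size, and the binary cross-entropy loss here are illustrative assumptions, not the authors' implementation.

```python
import torch
import torch.nn as nn

class TokenBagHead(nn.Module):
    # Vision backbone features -> logits over the text tokenizer vocabulary.
    def __init__(self, feat_dim: int, vocab_size: int):
        super().__init__()
        self.classifier = nn.Linear(feat_dim, vocab_size)

    def forward(self, image_features):
        return self.classifier(image_features)

def token_bag_targets(token_ids, vocab_size):
    # Multi-hot vector marking which caption tokens appear for each image.
    targets = torch.zeros(token_ids.size(0), vocab_size)
    targets.scatter_(1, token_ids, 1.0)
    return targets

# Toy usage with random "features" and "tokenized captions".
feat_dim, vocab_size, batch = 512, 49408, 4
head = TokenBagHead(feat_dim, vocab_size)
features = torch.randn(batch, feat_dim)            # stand-in for encoder output
token_ids = torch.randint(0, vocab_size, (batch, 16))  # stand-in for tokenizer output
loss = nn.functional.binary_cross_entropy_with_logits(
    head(features), token_bag_targets(token_ids, vocab_size))
loss.backward()
```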
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Inference Optimal VLMs Need Only One Visual Token but Larger Models + + +
+ Vision Language Models (VLMs) have demonstrated strong capabilities across various visual understanding and reasoning tasks. However, their real-world deployment is often constrained by high latency during inference due to the substantial compute required to process the large number of input tokens (predominantly from the image) by the LLM. To reduce inference costs, one can either downsize the LLM or reduce the number of input image tokens, the latter of which has been the focus of many recent works on token compression. However, it is unclear what the optimal trade-off is, as both factors directly affect the VLM performance. We first characterize this optimal trade-off between the number of visual tokens and LLM parameters by establishing scaling laws that capture variations in performance with these two factors. Our results reveal a surprising trend: for visual reasoning tasks, the inference-optimal behavior in VLMs, i.e., minimum downstream error at any given fixed inference compute, is achieved when using the largest LLM that fits within the inference budget while minimizing visual token count - often to a single token. While the token reduction literature has mainly focused on maintaining base model performance by modestly reducing the token count (e.g., 5-10x), our results indicate that the compute-optimal inference regime requires operating under even higher token compression ratios. Based on these insights, we take some initial steps towards building approaches tailored for high token compression settings. Code is available at https://github.com/locuslab/llava-token-compression. + +
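As a toy illustration of fitting a joint scaling law over LLM size and visual token count, the sketch below fits an assumed power-law form error(N, T) to synthetic data with scipy. The functional form, constants, and data are placeholders for illustration, not the laws estimated in the paper.

```python
import numpy as np
from scipy.optimize import curve_fit

def scaling_law(x, a, b, c, e0):
    # Assumed form: error = e0 + a * N^-b * T^-c, with N in billions of LLM
    # parameters and T the number of visual tokens. Illustrative only.
    n, t = x
    return e0 + a * n ** (-b) * t ** (-c)

rng = np.random.default_rng(0)
n = np.tile([0.5, 2.0, 7.0, 13.0], 4)        # LLM sizes in billions
t = np.repeat([1.0, 16.0, 144.0, 576.0], 4)  # visual tokens per image
err = 0.2 + 0.5 * n ** (-0.3) * t ** (-0.05) + rng.normal(scale=0.002, size=n.size)

params, _ = curve_fit(scaling_law, (n, t), err, p0=[1.0, 0.3, 0.1, 0.1], maxfev=20000)
print(dict(zip(["a", "b", "c", "e0"], params.round(3))))
```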
+
+
+
+
+ + ☆ DiT4Edit: Diffusion Transformer for Image Editing + + +
+ Despite recent advances in UNet-based image editing, methods for shape-aware object editing in high-resolution images are still lacking. Compared to UNet, Diffusion Transformers (DiT) demonstrate a superior capability to capture the long-range dependencies among patches, leading to higher-quality image generation. In this paper, we propose DiT4Edit, the first Diffusion Transformer-based image editing framework. Specifically, DiT4Edit uses the DPM-Solver inversion algorithm to obtain the inverted latents, reducing the number of steps compared to the DDIM inversion algorithm commonly used in UNet-based frameworks. Additionally, we design unified attention control and patch merging, tailored for transformer computation streams. This integration allows our framework to generate higher-quality edited images faster. Our design leverages the advantages of DiT, enabling it to surpass UNet structures in image editing, especially for high-resolution and arbitrary-size images. Extensive experiments demonstrate the strong performance of DiT4Edit across various editing scenarios, highlighting the potential of Diffusion Transformers in supporting image editing. + +
+
+
+
+
+ + ☆ ShadowMamba: State-Space Model with Boundary-Region Selective Scan for + Shadow Removal + + +
+ Image shadow removal is a typical low-level vision problem, where the presence of shadows leads to abrupt changes in brightness in certain regions, affecting the accuracy of upstream tasks. Current shadow removal methods still face challenges such as residual boundary artifacts, and capturing feature information at shadow boundaries is crucial for removing shadows and eliminating these artifacts. Recently, Mamba has achieved remarkable success in computer vision by globally modeling long-sequence information with linear complexity. However, when applied to image shadow removal, the original Mamba scanning method overlooks the semantic continuity of shadow boundaries as well as the continuity of semantics within the same region. Based on the unique characteristics of shadow images, this paper proposes a novel selective scanning method called boundary-region selective scanning. This method scans boundary regions, shadow regions, and non-shadow regions independently, bringing pixels of the same region type closer together in the long sequence and focusing especially on the local information at the boundaries, which is crucial for shadow removal. It is combined with global scanning and channel scanning to jointly accomplish shadow removal. We name our model ShadowMamba, the first Mamba-based model for shadow removal. Extensive experimental results show that our method outperforms current state-of-the-art models across most metrics on multiple datasets. The code for ShadowMamba will be released upon acceptance. + +
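A toy sketch of the region-grouped scan idea described above: reorder the flattened pixels so that pixels with the same region label sit contiguously in the 1D sequence handed to a sequence model, then invert the permutation afterwards. The random region map and all details are placeholders, not ShadowMamba's actual scan.

```python
import torch

h, w = 8, 8
image = torch.randn(3, h, w)
# 0 = non-shadow, 1 = boundary, 2 = shadow (placeholder labels; a stable sort
# would additionally preserve raster order within each region).
region = torch.randint(0, 3, (h, w))

order = torch.argsort(region.flatten())      # permutation grouping pixels by region
sequence = image.flatten(1)[:, order]        # (channels, h*w), region-grouped scan order

# ... sequence would be processed by the state-space / Mamba block here ...

restored = torch.empty_like(sequence)
restored[:, order] = sequence                # inverse permutation after scanning
print(torch.allclose(restored, image.flatten(1)))  # True
```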
+
+
+
+
+ + ☆ Decoupling Fine Detail and Global Geometry for Compressed Depth Map + Super-Resolution ECCV 2024 + + +
+ Recovering high-quality depth maps from compressed sources has gained significant attention due to the limitations of consumer-grade depth cameras and the bandwidth restrictions during data transmission. However, current methods still suffer from two challenges. First, bit-depth compression produces a uniform depth representation in regions with subtle variations, hindering the recovery of detailed information. Second, densely distributed random noise reduces the accuracy of estimating the global geometric structure of the scene. To address these challenges, we propose a novel framework, termed geometry-decoupled network (GDNet), for compressed depth map super-resolution that decouples the high-quality depth map reconstruction process by handling global and detailed geometric features separately. To be specific, we propose the fine geometry detail encoder (FGDE), which is designed to aggregate fine geometry details in high-resolution low-level image features while simultaneously enriching them with complementary information from low-resolution context-level image features. In addition, we develop the global geometry encoder (GGE) that aims at suppressing noise and extracting global geometric information effectively via constructing a compact feature representation in a low-rank space. We conduct experiments on multiple benchmark datasets, demonstrating that our GDNet significantly outperforms current methods in terms of geometric consistency and detail recovery. In the ECCV 2024 AIM Compressed Depth Upsampling Challenge, our solution won the 1st place award. Our code will be made available. + +
+
+ comment: The 1st solution for the ECCV 2024 AIM Compressed Depth Upsampling + Challenge +
+
+
+
+
+ + ☆ Topograph: An efficient Graph-Based Framework for Strictly Topology + Preserving Image Segmentation + + +
+ Topological correctness plays a critical role in many image segmentation +tasks, yet most networks are trained using pixel-wise loss functions, such as +Dice, neglecting topological accuracy. Existing topology-aware methods often +lack robust topological guarantees, are limited to specific use cases, or +impose high computational costs. In this work, we propose a novel, graph-based +framework for topologically accurate image segmentation that is both +computationally efficient and generally applicable. Our method constructs a +component graph that fully encodes the topological information of both the +prediction and ground truth, allowing us to efficiently identify topologically +critical regions and aggregate a loss based on local neighborhood information. +Furthermore, we introduce a strict topological metric capturing the homotopy +equivalence between the union and intersection of prediction-label pairs. We +formally prove the topological guarantees of our approach and empirically +validate its effectiveness on binary and multi-class datasets. Our loss +demonstrates state-of-the-art performance with up to fivefold faster loss +computation compared to persistent homology methods. + +
+
+
+
+
+ + ☆ Kernel Orthogonality does not necessarily imply a Decrease in Feature + Map Redundancy in CNNs: Convolutional Similarity Minimization + + +
+ Convolutional Neural Networks (CNNs) have been heavily used in Deep Learning due to their success in various tasks. Nonetheless, it has been observed that CNNs suffer from redundancy in feature maps, leading to inefficient capacity utilization. Efforts to mitigate and solve this problem have led to the emergence of multiple methods, among which is kernel orthogonality, enforced through various means. In this work, we challenge the common belief that kernel orthogonality leads to a decrease in feature map redundancy, which is, supposedly, the ultimate objective behind kernel orthogonality. We prove, theoretically and empirically, that kernel orthogonality has an unpredictable effect on feature map similarity and does not necessarily decrease it. Based on our theoretical result, we propose an effective method to reduce feature map similarity independently of the input of the CNN. This is done by minimizing a novel loss function we call Convolutional Similarity. Empirical results show that minimizing the Convolutional Similarity increases the performance of classification models and can accelerate their convergence. Furthermore, using our proposed method pushes towards a more efficient use of the capacity of models, allowing the use of significantly smaller models to achieve the same levels of performance. + +
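A small, generic sketch of penalizing feature-map redundancy directly, in the spirit of the abstract above but not the paper's exact Convolutional Similarity loss: measure pairwise cosine similarity between channel feature maps and add it as a regularization term.

```python
import torch
import torch.nn.functional as F

def feature_map_similarity_penalty(feats: torch.Tensor) -> torch.Tensor:
    # feats: (batch, channels, H, W). Flatten each channel's map, normalize,
    # and penalize the mean absolute pairwise cosine similarity off the diagonal.
    b, c, h, w = feats.shape
    flat = F.normalize(feats.reshape(b, c, h * w), dim=-1)
    sim = torch.bmm(flat, flat.transpose(1, 2))            # (b, c, c)
    off_diag = sim - torch.eye(c, device=feats.device)     # remove self-similarity
    return off_diag.abs().mean()

feats = torch.randn(2, 16, 8, 8, requires_grad=True)
penalty = feature_map_similarity_penalty(feats)            # add to the task loss
penalty.backward()
```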
+
+
+
+
+ + ☆ Knowledge Graphs of Driving Scenes to Empower the Emerging Capabilities + of Neurosymbolic AI + + +
+ In the era of Generative AI, Neurosymbolic AI is emerging as a powerful +approach for tasks spanning from perception to cognition. The use of +Neurosymbolic AI has been shown to achieve enhanced capabilities, including +improved grounding, alignment, explainability, and reliability. However, due to +its nascent stage, there is a lack of widely available real-world benchmark +datasets tailored to Neurosymbolic AI tasks. To address this gap and support +the evaluation of current and future methods, we introduce DSceneKG -- a suite +of knowledge graphs of driving scenes built from real-world, high-quality +scenes from multiple open autonomous driving datasets. In this article, we +detail the construction process of DSceneKG and highlight its application in +seven different tasks. DSceneKG is publicly accessible at: +https://github.com/ruwantw/DSceneKG + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Beyond Grid Data: Exploring Graph Neural Networks for Earth Observation + + +
+ Earth Observation (EO) data analysis has been significantly revolutionized by +deep learning (DL), with applications typically limited to grid-like data +structures. Graph Neural Networks (GNNs) emerge as an important innovation, +propelling DL into the non-Euclidean domain. Naturally, GNNs can effectively +tackle the challenges posed by diverse modalities, multiple sensors, and the +heterogeneous nature of EO data. To introduce GNNs in the related domains, our +review begins by offering fundamental knowledge on GNNs. Then, we summarize the +generic problems in EO, to which GNNs can offer potential solutions. Following +this, we explore a broad spectrum of GNNs' applications to scientific problems +in Earth systems, covering areas such as weather and climate analysis, disaster +management, air quality monitoring, agriculture, land cover classification, +hydrological process modeling, and urban modeling. The rationale behind +adopting GNNs in these fields is explained, alongside methodologies for +organizing graphs and designing favorable architectures for various tasks. +Furthermore, we highlight methodological challenges of implementing GNNs in +these domains and possible solutions that could guide future research. While +acknowledging that GNNs are not a universal solution, we conclude the paper by +comparing them with other popular architectures like transformers and analyzing +their potential synergies. + +
+
+ comment: Accepted for publication in Geoscience and Remote Sensing Magazine + (GRSM) +
+
+
+
+
+ + ☆ On Improved Conditioning Mechanisms and Pre-training Strategies for + Diffusion Models NeurIPS 2024 + + +
+ Large-scale training of latent diffusion models (LDMs) has enabled unprecedented quality in image generation. However, the key components of the best performing LDM training recipes are oftentimes not available to the research community, preventing apples-to-apples comparisons and hindering the validation of progress in the field. In this work, we perform an in-depth study of LDM training recipes focusing on the performance of models and their training efficiency. To ensure apples-to-apples comparisons, we re-implement five previously published models with their corresponding recipes. Through our study, we explore the effects of (i) the mechanisms used to condition the generative model on semantic information (e.g., text prompt) and control metadata (e.g., crop size, random flip flag, etc.) on the model performance, and (ii) the transfer of the representations learned on smaller and lower-resolution datasets to larger ones on the training efficiency and model performance. We then propose a novel conditioning mechanism that disentangles semantic and control metadata conditionings and sets a new state-of-the-art in class-conditional generation on the ImageNet-1k dataset -- with FID improvements of 7% on 256 and 8% on 512 resolutions -- as well as text-to-image generation on the CC12M dataset -- with FID improvements of 8% on 256 and 23% on 512 resolution. + +
+
+ comment: Accepted as a conference paper (poster) for NeurIPS 2024 +
+
+
+
+
+ + ☆ Pre-trained Visual Dynamics Representations for Efficient Policy + Learning ECCV 2024 + + +
+ Pre-training for Reinforcement Learning (RL) with purely video data is a valuable yet challenging problem. Although in-the-wild videos are readily available and inherently carry a vast amount of prior world knowledge, the absence of action annotations and the common domain gap with downstream tasks hinder utilizing videos for RL pre-training. To address the challenge of pre-training with videos, we propose Pre-trained Visual Dynamics Representations (PVDR) to bridge the domain gap between videos and downstream tasks for efficient policy learning. By adopting video prediction as a pre-training task, we use a Transformer-based Conditional Variational Autoencoder (CVAE) to learn visual dynamics representations. The pre-trained visual dynamics representations capture the visual dynamics prior knowledge in the videos. This abstract prior knowledge can be readily adapted to downstream tasks and aligned with executable actions through online adaptation. We conduct experiments on a series of robotics visual control tasks and verify that PVDR is an effective form of pre-training with videos to promote policy learning. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ MA^2: A Self-Supervised and Motion Augmenting Autoencoder for Gait-Based + Automatic Disease Detection + + +
+ Ground reaction force (GRF) is the force exerted by the ground on a body in contact with it. GRF-based automatic disease detection (ADD) has become an emerging medical diagnosis method, which aims to learn and identify disease patterns corresponding to different gait pressures based on deep learning methods. Although existing ADD methods can save doctors time in making diagnoses, training deep models is still hampered by the cost of labeling large amounts of gait diagnostic data. On the other hand, the accuracy of deep models on the unified benchmark GRF dataset and their generalization ability on scalable gait datasets need to be further improved. To address these issues, we propose MA2, a GRF-based self-supervised and motion augmenting auto-encoder, which models the ADD task as an encoder-decoder paradigm. In the encoder, we introduce an embedding block, including a 3-layer 1D convolution for extracting tokens and a mask generator that randomly masks out the sequence of tokens, to maximize the model's potential to capture high-level, discriminative, intrinsic representations. Thereafter, the decoder utilizes this information to reconstruct the pixel sequence of the original input and calculates the reconstruction loss to optimize the network. Moreover, the backbone of the auto-encoder is multi-head self-attention, which considers the global information of the tokens from the input rather than just a local neighborhood. This allows the model to capture generalized contextual information. Extensive experiments demonstrate that MA2 achieves SOTA performance of 90.91% accuracy on 1% limited pathological GRF samples with labels, and good generalization ability of 78.57% accuracy on a scalable Parkinson's disease dataset. + +
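A minimal, self-contained sketch of a masked-reconstruction setup of the kind described above, with a 1D convolutional embedding, random token masking, a self-attention encoder, and an MSE reconstruction loss; dimensions, masking ratio, and module names are illustrative, not MA2's actual architecture.

```python
import torch
import torch.nn as nn

class ToyMaskedGRFAutoencoder(nn.Module):
    def __init__(self, channels=1, dim=64, mask_ratio=0.5):
        super().__init__()
        self.embed = nn.Conv1d(channels, dim, kernel_size=7, padding=3)  # 1D conv tokenizer
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)        # self-attention backbone
        self.decoder = nn.Linear(dim, channels)                          # reconstruct the signal
        self.mask_ratio = mask_ratio

    def forward(self, x):                        # x: (batch, channels, time)
        tokens = self.embed(x).transpose(1, 2)   # (batch, time, dim)
        mask = torch.rand(tokens.shape[:2], device=x.device) < self.mask_ratio
        tokens = tokens.masked_fill(mask.unsqueeze(-1), 0.0)   # zero out masked tokens
        recon = self.decoder(self.encoder(tokens)).transpose(1, 2)
        return nn.functional.mse_loss(recon, x)  # reconstruction objective

loss = ToyMaskedGRFAutoencoder()(torch.randn(8, 1, 128))  # 8 synthetic GRF sequences
loss.backward()
```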
+
+ comment: 8 pages, 11 figures, article +
+
+
+
+
+ + ☆ Investigating the Applicability of a Snapshot Computed Tomography + Imaging Spectrometer for the Prediction of Brix and pH of Grapes + + +
+ In this paper, a recently developed snapshot hyperspectral imaging (HSI) system based on Computed Tomography Imaging Spectroscopy (CTIS) is utilized to determine Brix and pH values in Sheegene 20 table grapes through Partial Least Squares Regression (PLSR) modeling. The performance of the CTIS system is compared with that of a state-of-the-art line scan HSI system by imaging 100 grapes across both platforms. Reference measurements of Brix and pH values are obtained directly using a refractometer and a pH meter, as these parameters are essential for assessing the quality of table and wine grapes. The findings indicate that the spectra captured by the CTIS camera correlate well with the reference measurements, despite the system's narrower spectral range. The CTIS camera's advantages, including its lower cost, portability, and reduced susceptibility to motion errors, highlight its promise for in-field applications in grape quality assessment. + +
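A compact sketch of a PLSR pipeline of the kind described above, using scikit-learn with synthetic stand-in data in place of the CTIS spectra and refractometer readings; the number of components and all values are placeholders, not the paper's calibration.

```python
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score

# Synthetic stand-in data: 100 grape "spectra" with 50 bands, plus Brix values.
rng = np.random.default_rng(0)
spectra = rng.normal(size=(100, 50))
brix = spectra[:, :5].sum(axis=1) + rng.normal(scale=0.1, size=100)

pls = PLSRegression(n_components=8)                 # latent components are a tuning choice
scores = cross_val_score(pls, spectra, brix, cv=5, scoring="r2")
print("cross-validated R^2:", scores.mean())
```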
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ☆ Local Lesion Generation is Effective for Capsule Endoscopy Image Data + Augmentation in a Limited Data Setting + + +
+ Limited medical imaging datasets challenge deep learning models by increasing +risks of overfitting and reduced generalization, particularly in Generative +Adversarial Networks (GANs), where discriminators may overfit, leading to +training divergence. This constraint also impairs classification models trained +on small datasets. Generative Data Augmentation (GDA) addresses this by +expanding training datasets with synthetic data, although it requires training +a generative model. We propose and evaluate two local lesion generation +approaches to address the challenge of augmenting small medical image datasets. +The first approach employs the Poisson Image Editing algorithm, a classical +image processing technique, to create realistic image composites that +outperform current state-of-the-art methods. The second approach introduces a +novel generative method, leveraging a fine-tuned Image Inpainting GAN to +synthesize realistic lesions within specified regions of real training images. +A comprehensive comparison of the two proposed methods demonstrates that +effective local lesion generation in a data-constrained setting allows for +reaching new state-of-the-art results in capsule endoscopy lesion +classification. Combination of our techniques achieves a macro F1-score of +33.07%, surpassing the previous best result by 7.84 percentage points (p.p.) on +the highly imbalanced Kvasir Capsule Dataset, a benchmark for capsule +endoscopy. To the best of our knowledge, this work is the first to apply a +fine-tuned Image Inpainting GAN for GDA in medical imaging, demonstrating that +an image-conditional GAN can be adapted effectively to limited datasets to +generate high-quality examples, facilitating effective data augmentation. +Additionally, we show that combining this GAN-based approach with classical +image processing techniques further enhances the results. + +
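For the first, classical approach mentioned above, a minimal Poisson-blending composite can be produced with OpenCV's seamlessClone; the file names, full-patch mask, and paste location below are placeholders, and the paper's actual pipeline may differ.

```python
import cv2
import numpy as np

# Composite a small lesion patch into a target endoscopy frame with Poisson
# blending. Input images are assumed to exist at the placeholder paths.
target = cv2.imread("healthy_frame.png")
lesion = cv2.imread("lesion_patch.png")
mask = 255 * np.ones(lesion.shape[:2], dtype=np.uint8)   # blend the whole patch
center = (target.shape[1] // 2, target.shape[0] // 2)    # paste location (x, y), placeholder
composite = cv2.seamlessClone(lesion, target, mask, center, cv2.NORMAL_CLONE)
cv2.imwrite("augmented_frame.png", composite)
```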
+
+ comment: 45 pages, 27 figures +
+
+
+
+
+ + ☆ HFGaussian: Learning Generalizable Gaussian Human with Integrated Human + Features + + +
+ Recent advancements in radiance field rendering show promising results in 3D +scene representation, where Gaussian splatting-based techniques emerge as +state-of-the-art due to their quality and efficiency. Gaussian splatting is +widely used for various applications, including 3D human representation. +However, previous 3D Gaussian splatting methods either use parametric body +models as additional information or fail to provide any underlying structure, +like human biomechanical features, which are essential for different +applications. In this paper, we present a novel approach called HFGaussian that +can estimate novel views and human features, such as the 3D skeleton, 3D key +points, and dense pose, from sparse input images in real time at 25 FPS. The +proposed method leverages generalizable Gaussian splatting technique to +represent the human subject and its associated features, enabling efficient and +generalizable reconstruction. By incorporating a pose regression network and +the feature splatting technique with Gaussian splatting, HFGaussian +demonstrates improved capabilities over existing 3D human methods, showcasing +the potential of 3D human representations with integrated biomechanics. We +thoroughly evaluate our HFGaussian method against the latest state-of-the-art +techniques in human Gaussian splatting and pose estimation, demonstrating its +real-time, state-of-the-art performance. + +
+
+
+
+
+ + ☆ Self-supervised cross-modality learning for uncertainty-aware object + detection and recognition in applications which lack pre-labelled training + data + + +
+ This paper shows how an uncertainty-aware, deep neural network can be trained to detect, recognise and localise objects in 2D RGB images, in applications lacking annotated training datasets. We propose a self-supervising teacher-student pipeline, in which a relatively simple teacher classifier, trained with only a few labelled 2D thumbnails, automatically processes a larger body of unlabelled RGB-D data to teach a student network based on a modified YOLOv3 architecture. Firstly, 3D object detection with back projection is used to automatically extract and teach 2D detection and localisation information to the student network. Secondly, a weakly supervised 2D thumbnail classifier, with minimal training on a small number of hand-labelled images, is used to teach object category recognition. Thirdly, we use a Gaussian Process (GP) to encode and teach a robust uncertainty estimation functionality, so that the student can output confidence scores with each categorization. The resulting student significantly outperforms the same YOLO architecture trained directly on the same amount of labelled data. Our GP-based approach yields robust and meaningful uncertainty estimations for complex industrial object classifications. The end-to-end network is also capable of real-time processing, needed for robotics applications. Our method can be applied to many important industrial tasks, where labelled datasets are typically unavailable. In this paper, we demonstrate an example of detection, localisation, and object category recognition of nuclear mixed-waste materials in highly cluttered and unstructured scenes. This is critical for robotic sorting and handling of legacy nuclear waste, which poses complex environmental remediation challenges in many nuclearised nations. + +
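A small sketch of the general idea of attaching GP-based confidence to category predictions, using scikit-learn's GaussianProcessClassifier on synthetic placeholder features; this is not the paper's GP formulation or data.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Synthetic stand-in for labelled thumbnail embeddings and their categories.
rng = np.random.default_rng(0)
features = rng.normal(size=(60, 8))           # e.g., embeddings of labelled thumbnails
labels = (features[:, 0] > 0).astype(int)     # two placeholder object categories

gp = GaussianProcessClassifier(kernel=1.0 * RBF(1.0)).fit(features, labels)
probs = gp.predict_proba(rng.normal(size=(5, 8)))   # per-class confidence scores
print(probs.round(3))
```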
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Exploiting the Segment Anything Model (SAM) for Lung Segmentation in + Chest X-ray Images + + +
+ Segment Anything Model (SAM), a new AI model from Meta AI released in April +2023, is an ambitious tool designed to identify and separate individual objects +within a given image through semantic interpretation. The advanced capabilities +of SAM are the result of its training with millions of images and masks, and a +few days after its release, several researchers began testing the model on +medical images to evaluate its performance in this domain. With this +perspective in focus -- i.e., optimizing work in the healthcare field -- this +work proposes the use of this new technology to evaluate and study chest X-ray +images. The approach adopted for this work, with the aim of improving the +model's performance for lung segmentation, involved a transfer learning +process, specifically the fine-tuning technique. After applying this +adjustment, a substantial improvement was observed in the evaluation metrics +used to assess SAM's performance compared to the masks provided by the +datasets. The results obtained by the model after the adjustments were +satisfactory and similar to cutting-edge neural networks, such as U-Net. + +
+
+
+
+
+ + ☆ ATM: Improving Model Merging by Alternating Tuning and Merging + + +
+ Model merging has recently emerged as a cost-efficient paradigm for multi-task learning. Among current approaches, task arithmetic stands out for its simplicity and effectiveness. In this paper, we motivate the effectiveness of task vectors by linking them to multi-task gradients. We show that in a single-epoch scenario, task vectors are mathematically equivalent to the gradients obtained via gradient descent in a multi-task setting, and still approximate these gradients in subsequent epochs. Furthermore, we show that task vectors perform optimally when equality is maintained, and their effectiveness is largely driven by the first epoch's gradient. Building on this insight, we propose viewing model merging as a single step in an iterative process that Alternates between Tuning and Merging (ATM). This method acts as a bridge between model merging and multi-task gradient descent, achieving state-of-the-art results with the same data and computational requirements. We extensively evaluate ATM across diverse settings, achieving up to 20% higher accuracy in computer vision and NLP tasks compared to the best baselines. Finally, we provide both empirical and theoretical support for its effectiveness, demonstrating increased orthogonality between task vectors and proving that ATM minimizes an upper bound on the loss obtained by jointly finetuning all tasks. + +
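A toy illustration of the task-vector arithmetic that ATM builds on: compute per-task parameter deltas from a shared base model and add their scaled sum back to the base. The alternation would then repeat short tuning and merging rounds, as the abstract describes; the scaling coefficient and dictionary-of-tensors representation are illustrative assumptions, not the paper's code.

```python
import copy
import torch

def task_vector(finetuned, base):
    # Difference between fine-tuned and base parameters for one task.
    return {k: finetuned[k] - base[k] for k in base}

def merge(base, task_vectors, alpha=0.3):
    # Task-arithmetic merge: base + alpha * sum of task vectors.
    merged = copy.deepcopy(base)
    for k in merged:
        merged[k] = merged[k] + alpha * sum(tv[k] for tv in task_vectors)
    return merged

# An ATM-style loop would tune briefly on each task from the current merged
# weights, recompute task vectors, and merge again.
base = {"w": torch.zeros(3)}
tuned_a = {"w": torch.tensor([1.0, 0.0, 0.0])}
tuned_b = {"w": torch.tensor([0.0, 2.0, 0.0])}
merged = merge(base, [task_vector(tuned_a, base), task_vector(tuned_b, base)])
print(merged["w"])   # tensor([0.3000, 0.6000, 0.0000])
```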
+
+ comment: Main paper: 10 Pages, 11 figures, 2 tables +
+
+
+
+
+ + ☆ Gradient-Guided Conditional Diffusion Models for Private Image + Reconstruction: Analyzing Adversarial Impacts of Differential Privacy and + Denoising + + +
+ We investigate the construction of gradient-guided conditional diffusion +models for reconstructing private images, focusing on the adversarial interplay +between differential privacy noise and the denoising capabilities of diffusion +models. While current gradient-based reconstruction methods struggle with +high-resolution images due to computational complexity and prior knowledge +requirements, we propose two novel methods that require minimal modifications +to the diffusion model's generation process and eliminate the need for prior +knowledge. Our approach leverages the strong image generation capabilities of +diffusion models to reconstruct private images starting from randomly generated +noise, even when a small amount of differentially private noise has been added +to the gradients. We also conduct a comprehensive theoretical analysis of the +impact of differential privacy noise on the quality of reconstructed images, +revealing the relationship among noise magnitude, the architecture of attacked +models, and the attacker's reconstruction capability. Additionally, extensive +experiments validate the effectiveness of our proposed methods and the accuracy +of our theoretical findings, suggesting new directions for privacy risk +auditing using conditional diffusion models. + +
+
+
+
+
+ + ☆ GarVerseLOD: High-Fidelity 3D Garment Reconstruction from a Single + In-the-Wild Image using a Dataset with Levels of Details + + +
+ Neural implicit functions have brought impressive advances to the +state-of-the-art of clothed human digitization from multiple or even single +images. However, despite the progress, current arts still have difficulty +generalizing to unseen images with complex cloth deformation and body poses. In +this work, we present GarVerseLOD, a new dataset and framework that paves the +way to achieving unprecedented robustness in high-fidelity 3D garment +reconstruction from a single unconstrained image. Inspired by the recent +success of large generative models, we believe that one key to addressing the +generalization challenge lies in the quantity and quality of 3D garment data. +Towards this end, GarVerseLOD collects 6,000 high-quality cloth models with +fine-grained geometry details manually created by professional artists. In +addition to the scale of training data, we observe that having disentangled +granularities of geometry can play an important role in boosting the +generalization capability and inference accuracy of the learned model. We hence +craft GarVerseLOD as a hierarchical dataset with levels of details (LOD), +spanning from detail-free stylized shape to pose-blended garment with +pixel-aligned details. This allows us to make this highly under-constrained +problem tractable by factorizing the inference into easier tasks, each narrowed +down with smaller searching space. To ensure GarVerseLOD can generalize well to +in-the-wild images, we propose a novel labeling paradigm based on conditional +diffusion models to generate extensive paired images for each garment model +with high photorealism. We evaluate our method on a massive amount of +in-the-wild images. Experimental results demonstrate that GarVerseLOD can +generate standalone garment pieces with significantly better quality than prior +approaches. Project page: https://garverselod.github.io/ + +
+
+ comment: Project page: https://garverselod.github.io/ +
+
+
+
+
+ + ☆ Evaluation of handwriting kinematics and pressure for differential + diagnosis of Parkinson's disease + + +
+ Objective: We present the PaHaW Parkinson's disease handwriting database, +consisting of handwriting samples from Parkinson's disease (PD) patients and +healthy controls. Our goal is to show that kinematic features and pressure +features in handwriting can be used for the differential diagnosis of PD. +Methods and Material: The database contains records from 37 PD patients and 38 +healthy controls performing eight different handwriting tasks. The tasks +include drawing an Archimedean spiral, repetitively writing orthographically +simple syllables and words, and writing of a sentence. In addition to the +conventional kinematic features related to the dynamics of handwriting, we +investigated new pressure features based on the pressure exerted on the writing +surface. To discriminate between PD patients and healthy subjects, three +different classifiers were compared: K-nearest neighbors (K-NN), ensemble +AdaBoost classifier, and support vector machines (SVM). Results: For predicting +PD based on kinematic and pressure features of handwriting, the best performing +model was SVM with classification accuracy of Pacc = 81.3% (sensitivity Psen = +87.4% and specificity of Pspe = 80.9%). When evaluated separately, pressure +features proved to be relevant for PD diagnosis, yielding Pacc = 82.5% compared +to Pacc = 75.4% using kinematic features. Conclusion: Experimental results +showed that an analysis of kinematic and pressure features during handwriting +can help assess subtle characteristics of handwriting and discriminate between +PD patients and healthy controls. + +
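A minimal scikit-learn sketch of the classification setup described above (an SVM on standardized kinematic and pressure features), using synthetic placeholder features rather than the PaHaW recordings.

```python
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Synthetic stand-in for per-subject handwriting features (kinematic + pressure).
rng = np.random.default_rng(1)
X = rng.normal(size=(75, 20))            # 37 PD patients + 38 controls, 20 features each
y = np.array([1] * 37 + [0] * 38)        # 1 = PD, 0 = healthy control

clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0))
acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy").mean()
print(f"cross-validated accuracy: {acc:.3f}")
```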
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ Judge Like a Real Doctor: Dual Teacher Sample Consistency Framework for + Semi-supervised Medical Image Classification + + +
+ Semi-supervised learning (SSL) is a popular solution to alleviate the high annotation cost in medical image classification. As a main branch of SSL, consistency regularization imposes consensus between the predictions of a single sample from different views, termed Absolute Location consistency (AL-c). However, AL-c alone may be insufficient. Just as when diagnosing a case in practice, besides the case itself, the doctor usually refers to certain related trustworthy cases to make more reliable decisions. Therefore, we argue that solely relying on AL-c may ignore the relative differences across samples, which we interpret as relative locations, and only exploit limited information from one perspective. To address this issue, we propose a Sample Consistency Mean Teacher (SCMT) which not only incorporates AL-c but also additionally enforces consistency between a sample's relative similarities to its related samples, called Relative Location consistency (RL-c). AL-c and RL-c conduct consistency regularization from two different perspectives, jointly extracting more diverse semantic information for classification. On the other hand, due to the highly similar structures in medical images, the sample distribution can be overly dense in feature space, making relative locations susceptible to noise. To tackle this problem, we further develop a Sample Scatter Mean Teacher (SSMT) that utilizes contrastive learning to sparsify the sample distribution and obtain robust and effective relative locations. Extensive experiments on different datasets demonstrate the superiority of our method. + +
+
+ comment: Accepted by IEEE Transactions on Emerging Topics in Computational + Intelligence +
+
+
+
+
+ + ☆ Rethinking Decoders for Transformer-based Semantic Segmentation: + Compression is All You Need NeurIPS2024 + + +
+ State-of-the-art methods for Transformer-based semantic segmentation typically adopt Transformer decoders that are used to extract additional embeddings from image embeddings via cross-attention, refine either or both types of embeddings via self-attention, and project image embeddings onto the additional embeddings via dot-product. Despite their remarkable success, these empirical designs still lack theoretical justifications or interpretations, thus hindering potentially principled improvements. In this paper, we argue that there are fundamental connections between semantic segmentation and compression, especially between the Transformer decoders and Principal Component Analysis (PCA). From such a perspective, we derive a white-box, fully attentional DEcoder for PrIncipled semantiC segmenTation (DEPICT), with the interpretations as follows: 1) the self-attention operator refines image embeddings to construct an ideal principal subspace that aligns with the supervision and retains most information; 2) the cross-attention operator seeks to find a low-rank approximation of the refined image embeddings, which is expected to be a set of orthonormal bases of the principal subspace and corresponds to the predefined classes; 3) the dot-product operation yields a compact representation of image embeddings as segmentation masks. Experiments conducted on the ADE20K dataset find that DEPICT consistently outperforms its black-box counterpart, Segmenter, and that it is lightweight and more robust. + +
+
+ comment: NeurIPS2024. Code:https://github.com/QishuaiWen/DEPICT/ +
+
+
+
+
+ + ☆ FEDLAD: Federated Evaluation of Deep Leakage Attacks and Defenses + + +
+ Federated Learning is a privacy preserving decentralized machine learning paradigm designed to collaboratively train models across multiple clients by exchanging gradients with the server while keeping private data local. Nevertheless, recent research has revealed that the security of Federated Learning is compromised, as private ground truth data can be recovered through a gradient inversion technique known as Deep Leakage. While these attacks are crafted with a focus on applications in Federated Learning, they generally are not evaluated in realistic scenarios. This paper introduces the FEDLAD Framework (Federated Evaluation of Deep Leakage Attacks and Defenses), a comprehensive benchmark for evaluating Deep Leakage attacks and defenses within a realistic Federated context. By implementing a unified benchmark that encompasses multiple state-of-the-art Deep Leakage techniques and various defense strategies, our framework facilitates the evaluation and comparison of the efficacy of these methods across different datasets and training states. This work highlights a crucial trade-off between privacy and model accuracy in Federated Learning and aims to advance the understanding of security challenges in decentralized machine learning systems, stimulate future research, and enhance reproducibility in evaluating Deep Leakage attacks and defenses. + +
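For readers unfamiliar with Deep Leakage, below is a minimal gradient-inversion sketch in the style of the original DLG attack (not FEDLAD's benchmark code): a dummy input and label are optimized so that their gradients match the gradients a client would share.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
model = nn.Sequential(nn.Flatten(), nn.Linear(16, 4))   # tiny placeholder model
loss_fn = nn.CrossEntropyLoss()

# "Client" data and the gradients it would share in federated training.
real_x, real_y = torch.randn(1, 4, 4), torch.tensor([2])
true_grads = torch.autograd.grad(loss_fn(model(real_x), real_y), model.parameters())

# Attacker: optimize dummy data and a soft dummy label to match those gradients.
dummy_x = torch.randn(1, 4, 4, requires_grad=True)
dummy_y = torch.randn(1, 4, requires_grad=True)
opt = torch.optim.LBFGS([dummy_x, dummy_y])

def closure():
    opt.zero_grad()
    pred = model(dummy_x)
    soft_loss = torch.sum(-torch.softmax(dummy_y, -1) * torch.log_softmax(pred, -1))
    grads = torch.autograd.grad(soft_loss, model.parameters(), create_graph=True)
    diff = sum(((g - t) ** 2).sum() for g, t in zip(grads, true_grads))
    diff.backward()
    return diff

for _ in range(30):
    opt.step(closure)
print("reconstruction MSE:", ((dummy_x - real_x) ** 2).mean().item())
```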
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ CRT-Fusion: Camera, Radar, Temporal Fusion Using Motion Information for + 3D Object Detection NeurIPS2024 + + +
+ Accurate and robust 3D object detection is a critical component in autonomous +vehicles and robotics. While recent radar-camera fusion methods have made +significant progress by fusing information in the bird's-eye view (BEV) +representation, they often struggle to effectively capture the motion of +dynamic objects, leading to limited performance in real-world scenarios. In +this paper, we introduce CRT-Fusion, a novel framework that integrates temporal +information into radar-camera fusion to address this challenge. Our approach +comprises three key modules: Multi-View Fusion (MVF), Motion Feature Estimator +(MFE), and Motion Guided Temporal Fusion (MGTF). The MVF module fuses radar and +image features within both the camera view and bird's-eye view, thereby +generating a more precise unified BEV representation. The MFE module conducts +two simultaneous tasks: estimation of pixel-wise velocity information and BEV +segmentation. Based on the velocity and the occupancy score map obtained from +the MFE module, the MGTF module aligns and fuses feature maps across multiple +timestamps in a recurrent manner. By considering the motion of dynamic objects, +CRT-Fusion can produce robust BEV feature maps, thereby improving detection +accuracy and robustness. Extensive evaluations on the challenging nuScenes +dataset demonstrate that CRT-Fusion achieves state-of-the-art performance for +radar-camera-based 3D object detection. Our approach outperforms the previous +best method in terms of NDS by +1.7%, while also surpassing the leading +approach in mAP by +1.4%. These significant improvements in both metrics +showcase the effectiveness of our proposed fusion strategy in enhancing the +reliability and accuracy of 3D object detection. + +
+
+ comment: Accepted at NeurIPS2024 +
+
+
+
+
+ + ☆ Precise Drive with VLM: First Prize Solution for PRCV 2024 Drive LM + challenge + + +
+ This technical report outlines the methodologies we applied for the PRCV Challenge, focusing on cognition and decision-making in driving scenarios. We employed InternVL-2.0, a pioneering open-source multi-modal model, and enhanced it by refining both the model input and the training methodology. For the input data, we strategically concatenated and formatted the multi-view images. It is worth mentioning that we utilized the coordinates of the original images without transformation. In terms of model training, we initially pre-trained the model on publicly available autonomous driving scenario datasets to bolster its alignment with the challenge tasks, followed by fine-tuning on the DriveLM-nuscenes Dataset. During the fine-tuning phase, we innovatively modified the loss function to enhance the model's precision in predicting coordinate values. These approaches ensure that our model possesses advanced cognitive and decision-making capabilities in driving scenarios. Consequently, our model achieved a score of 0.6064, securing first prize in the competition's final results. + +
+
+
+
+
+ + ☆ PV-faultNet: Optimized CNN Architecture to detect defects resulting + efficient PV production + + +
+ The global shift towards renewable energy has made PV cell manufacturing a pivotal point, as these cells are the fundamental building block of green energy. However, the manufacturing process is complex, and defects introduced along the way can undermine overall efficiency. At present, defects are detected through manual inspection, which is prone to bias and is inefficient in both time and cost. Although automated solutions have been proposed, most of them are resource-intensive, proving ineffective in production environments. In that context, this study presents PV-faultNet, a lightweight Convolutional Neural Network (CNN) architecture optimized for efficient and real-time defect detection in photovoltaic (PV) cells, designed to be deployable on resource-limited production devices. Addressing computational challenges in industrial PV manufacturing environments, the model includes only 2.92 million parameters, significantly reducing processing demands without sacrificing accuracy. Comprehensive data augmentation techniques were implemented to tackle data scarcity, thus enhancing model generalization and maintaining a balance between precision and recall. The proposed model achieved high performance with 91% precision, 89% recall, and a 90% F1 score, demonstrating its effectiveness for scalable quality control in PV production. + +
+
+
+
+
+ + ☆ Efficient and Effective Adaptation of Multimodal Foundation Models in + Sequential Recommendation + + +
+ Multimodal foundation models (MFMs) have revolutionized sequential +recommender systems through advanced representation learning. While +Parameter-efficient Fine-tuning (PEFT) is commonly used to adapt these models, +studies often prioritize parameter efficiency, neglecting GPU memory and +training speed. To address this, we introduced the IISAN framework, +significantly enhancing efficiency. However, IISAN was limited to symmetrical +MFMs and identical text and image encoders, preventing the use of +state-of-the-art Large Language Models. To overcome this, we developed +IISAN-Versa, a versatile plug-and-play architecture compatible with both +symmetrical and asymmetrical MFMs. IISAN-Versa employs a Decoupled PEFT +structure and utilizes both intra- and inter-modal adaptation. It effectively +handles asymmetry through a simple yet effective combination of group +layer-dropping and dimension transformation alignment. Our research +demonstrates that IISAN-Versa effectively adapts large text encoders, and we +further identify a scaling effect where larger encoders generally perform +better. IISAN-Versa also demonstrates strong versatility in our defined +multimodal scenarios, which include raw titles and captions generated from +images and videos. Additionally, IISAN-Versa achieved state-of-the-art +performance on the Microlens public benchmark. We will release our code and +datasets to support future research. + +
+
+ comment: The extension of IISAN in SIGIR2024 +
+
+
+
+
+ + ☆ CAD-NeRF: Learning NeRFs from Uncalibrated Few-view Images by CAD Model + Retrieval + + +
+ Reconstructing from multi-view images is a longstanding problem in 3D vision, where neural radiance fields (NeRFs) have shown great potential and produce realistic rendered images of novel views. Currently, most NeRF methods require either accurate camera poses or a large number of input images, or even both. Reconstructing a NeRF from few-view images without poses is challenging and highly ill-posed. To address this problem, we propose CAD-NeRF, a method that reconstructs from fewer than 10 images without any known poses. Specifically, we build a mini library of several CAD models from ShapeNet and render them from many random views. Given sparse-view input images, we retrieve a model and poses from the library, obtaining a model with a similar shape that serves as the density supervision and pose initialization. Here we propose a multi-view pose retrieval method to avoid pose conflicts among views, which is a new and unseen problem in uncalibrated NeRF methods. Then, the geometry of the object is trained under the CAD guidance. The deformation of the density field and the camera poses are optimized jointly. Texture and density are then trained and fine-tuned as well. All training phases are self-supervised. Comprehensive evaluations on synthetic and real images show that CAD-NeRF successfully learns accurate densities with a large deformation from retrieved CAD models, showing its generalization ability. + +
+
+ comment: The article has been accepted by Frontiers of Computer Science (FCS) +
+
+
+
+
+ + ☆ Region-Guided Attack on the Segment Anything Model (SAM) + + +
+ The Segment Anything Model (SAM) is a cornerstone of image segmentation, +demonstrating exceptional performance across various applications, particularly +in autonomous driving and medical imaging, where precise segmentation is +crucial. However, SAM is vulnerable to adversarial attacks that can +significantly impair its functionality through minor input perturbations. +Traditional techniques, such as FGSM and PGD, are often ineffective in +segmentation tasks due to their reliance on global perturbations that overlook +spatial nuances. Recent methods like Attack-SAM-K and UAD have begun to address +these challenges, but they frequently depend on external cues and do not fully +leverage the structural interdependencies within segmentation processes. This +limitation underscores the need for a novel adversarial strategy that exploits +the unique characteristics of segmentation tasks. In response, we introduce the +Region-Guided Attack (RGA), designed specifically for SAM. RGA utilizes a +Region-Guided Map (RGM) to manipulate segmented regions, enabling targeted +perturbations that fragment large segments and expand smaller ones, resulting +in erroneous outputs from SAM. Our experiments demonstrate that RGA achieves +high success rates in both white-box and black-box scenarios, emphasizing the +need for robust defenses against such sophisticated attacks. RGA not only +reveals SAM's vulnerabilities but also lays the groundwork for developing more +resilient defenses against adversarial threats in image segmentation. + +
+
+
+
+
+ + ☆ Exploring Seasonal Variability in the Context of Neural Radiance Fields + for 3D Reconstruction on Satellite Imagery + + +
+ In this work, the seasonal predictive capabilities of Neural Radiance Fields +(NeRF) applied to satellite images are investigated. Focusing on the +utilization of satellite data, the study explores how Sat-NeRF, a novel +approach in computer vision, performs in predicting seasonal variations across +different months. Through comprehensive analysis and visualization, the study +examines the model's ability to capture and predict seasonal changes, +highlighting specific challenges and strengths. Results showcase the impact of +the sun direction on predictions, revealing nuanced details in seasonal +transitions, such as snow cover, color accuracy, and texture representation in +different landscapes. Given these results, we propose Planet-NeRF, an extension +to Sat-NeRF capable of incorporating seasonal variability through a set of +month embedding vectors. Comparative evaluations reveal that Planet-NeRF +outperforms prior models in the case where seasonal changes are present. The +extensive evaluation combined with the proposed method offers promising avenues +for future research in this domain. + +
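A minimal sketch of conditioning a NeRF-style MLP on a per-month embedding vector, in the spirit of the abstract above; the layer sizes, positional-encoding dimension, and module names are placeholders rather than Planet-NeRF's actual design.

```python
import torch
import torch.nn as nn

class MonthConditionedField(nn.Module):
    def __init__(self, pos_dim=63, month_dim=8, hidden=128):
        super().__init__()
        self.month_embed = nn.Embedding(12, month_dim)   # one learned vector per month
        self.mlp = nn.Sequential(
            nn.Linear(pos_dim + month_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 4),                        # RGB + density per sample
        )

    def forward(self, encoded_xyz, month_idx):
        m = self.month_embed(month_idx)                  # (batch, month_dim)
        return self.mlp(torch.cat([encoded_xyz, m], dim=-1))

out = MonthConditionedField()(torch.randn(1024, 63), torch.randint(0, 12, (1024,)))
print(out.shape)   # torch.Size([1024, 4])
```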
+
+
+
+
+ + ☆ Multi-modal NeRF Self-Supervision for LiDAR Semantic Segmentation IROS + + +
+ LiDAR Semantic Segmentation is a fundamental task in autonomous driving +perception consisting of associating each LiDAR point to a semantic label. +Fully-supervised models have widely tackled this task, but they require labels +for each scan, which either limits their domain or requires impractical amounts +of expensive annotations. Camera images, which are generally recorded alongside +LiDAR pointclouds, can be processed by the widely available 2D foundation +models, which are generic and dataset-agnostic. However, distilling knowledge +from 2D data to improve LiDAR perception raises domain adaptation challenges. +For example, the classical perspective projection suffers from the parallax +effect produced by the position shift between both sensors at their respective +capture times. We propose a Semi-Supervised Learning setup to leverage +unlabeled LiDAR pointclouds alongside distilled knowledge from the camera +images. To self-supervise our model on the unlabeled scans, we add an auxiliary +NeRF head and cast rays from the camera viewpoint over the unlabeled voxel +features. The NeRF head predicts densities and semantic logits at each sampled +ray location which are used for rendering pixel semantics. Concurrently, we +query the Segment-Anything (SAM) foundation model with the camera image to +generate a set of unlabeled generic masks. We fuse the masks with the rendered +pixel semantics from LiDAR to produce pseudo-labels that supervise the pixel +predictions. During inference, we drop the NeRF head and run our model with +only LiDAR. We show the effectiveness of our approach in three public LiDAR +Semantic Segmentation benchmarks: nuScenes, SemanticKITTI and ScribbleKITTI. + +
+
+ comment: IEEE/RSJ International Conference on Intelligent Robots and Systems + (IROS) 2024 +
+
+
+
+
+ + ☆ LDPM: Towards undersampled MRI reconstruction with MR-VAE and Latent + Diffusion Prior + + +
+ Diffusion models, as powerful generative models, have found a wide range of applications, including MRI reconstruction. However, most existing diffusion model-based MRI reconstruction methods operate directly in pixel space, which makes their optimization and inference computationally expensive. Latent diffusion models were introduced to address this problem in natural image processing, but directly applying them to MRI reconstruction still faces many challenges, including the lack of control over the generated results, the adaptability of the Variational AutoEncoder (VAE) to MRI, and the exploration of applicable data consistency in latent space. To address these challenges, a Latent Diffusion Prior based undersampled MRI reconstruction (LDPM) method is proposed. A sketcher module is utilized to provide appropriate control and balance the quality and fidelity of the reconstructed MR images. A VAE adapted for MRI tasks (MR-VAE) is explored, which can serve as the backbone for future MR-related tasks. Furthermore, a variation of the DDIM sampler, called the Dual-Stage Sampler, is proposed to achieve high-fidelity reconstruction in the latent space. The proposed method achieves competitive results on fastMRI datasets, and the effectiveness of each module is demonstrated in ablation experiments. + +
+
+
+
+
+ + ☆ Mapping Africa Settlements: High Resolution Urban and Rural Map by Deep + Learning and Satellite Imagery + + +
+ Accurate Land Use and Land Cover (LULC) maps are essential for understanding the drivers of sustainable development and the complex interrelationships between human activities and natural resources. However, existing LULC maps often lack precise urban and rural classifications, particularly in diverse regions like Africa. This study presents a novel construction of a high-resolution rural-urban map using deep learning techniques and satellite imagery. We developed a deep learning model based on the DeepLabV3 architecture, which was trained on satellite imagery from Landsat-8 and the ESRI LULC dataset, augmented with human settlement data from the GHS-SMOD. The model utilizes semantic segmentation to classify land into detailed categories, including urban and rural areas, at a 10-meter resolution. Our findings demonstrate that incorporating LULC along with urban and rural classifications significantly enhances the model's ability to accurately distinguish between urban, rural, and non-human settlement areas. Therefore, our maps can support more informed decision-making for policymakers, researchers, and stakeholders. We release a continent-wide urban-rural map covering the years 2016 and 2022. + +
+
+
+
+
+ + ☆ Domain Expansion and Boundary Growth for Open-Set Single-Source Domain + Generalization + + +
+ Open-set single-source domain generalization aims to use a single source domain to learn a robust model that can be generalized to unknown target domains with both domain shifts and label shifts. The scarcity of the source domain and the unknown data distribution of the target domain pose a great challenge for domain-invariant feature learning and unknown class recognition. In this paper, we propose a novel learning approach based on domain expansion and boundary growth to expand the scarce source samples and enlarge the boundaries across the known classes, which indirectly broadens the boundary between the known and unknown classes. Specifically, we achieve domain expansion by employing both background suppression and style augmentation on the source data to synthesize new samples. We then force the model to distill consistent knowledge from the synthesized samples so that it can learn domain-invariant information. Furthermore, we realize boundary growth across classes by using edge maps as an additional modality of samples when training multi-binary classifiers. In this way, we enlarge the boundary between inliers and outliers and consequently improve unknown class recognition during open-set generalization. Extensive experiments show that our approach achieves significant improvements and reaches state-of-the-art performance on several cross-domain image classification datasets. + +
+
+ comment: TMM 2024 +
+
+
+
+
+ + ☆ Membership Inference Attacks against Large Vision-Language Models NeurIPS 2024 + + +
+ Large vision-language models (VLLMs) exhibit promising capabilities for processing multi-modal tasks across various application scenarios. However, their emergence also raises significant data security concerns, given the potential inclusion of sensitive information, such as private photos and medical records, in their training datasets. Detecting inappropriately used data in VLLMs remains a critical and unresolved issue, mainly due to the lack of standardized datasets and suitable methodologies. In this study, we introduce the first membership inference attack (MIA) benchmark tailored for various VLLMs to facilitate training data detection. Then, we propose a novel MIA pipeline specifically designed for token-level image detection. Lastly, we present a new metric called MaxRényi-K%, which is based on the confidence of the model output and applies to both text and image data. We believe that our work can deepen the understanding and methodology of MIAs in the context of VLLMs. Our code and datasets are available at https://github.com/LIONS-EPFL/VL-MIA. + +
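As a rough, hedged illustration of a confidence-based score built on Rényi entropy (the exact MaxRényi-K% definition and how it is aggregated follow the paper, not this sketch), the snippet below computes per-token Rényi entropy from a model's predictive distributions.

```python
import torch

def renyi_entropy(probs: torch.Tensor, alpha: float = 2.0) -> torch.Tensor:
    # Rényi entropy of order alpha for each row of a probability matrix.
    # alpha -> 1 recovers Shannon entropy in the limit (not handled here).
    return torch.log((probs ** alpha).sum(dim=-1)) / (1.0 - alpha)

# Toy per-token predictive distributions from a model (rows sum to 1).
probs = torch.softmax(torch.randn(5, 32000), dim=-1)
per_token = renyi_entropy(probs, alpha=2.0)

# A MaxRényi-style score could then aggregate statistics of the top-K% tokens
# ranked by Rényi entropy; the precise definition is the paper's, not this one.
k = max(1, int(0.2 * per_token.numel()))
print(per_token.topk(k).values.mean())
```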
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Fried deconvolution + + +
+ In this paper we present a new approach to deblurring the effect of atmospheric turbulence in the case of long-range imaging. Our method is based on an analytical formulation of the atmosphere modulation transfer function (MTF), the Fried kernel, and a framelet-based deconvolution algorithm. An important parameter is the refractive index structure parameter, which normally requires specific measurements to be known. We therefore propose a method which provides a good estimation of this parameter from the input blurred image. The final algorithms are very easy to implement and show very good results on both simulated blur and real images. + +
+
+
+
+
+ + ☆ Turbulence stabilization + + +
+ We recently developed a new approach to obtain a stabilized image from a sequence of frames acquired through atmospheric turbulence. The goal of this algorithm is to remove the geometric distortions caused by atmospheric motion. The method is based on a variational formulation and is efficiently solved with Bregman iterations and an operator splitting method. In this paper we study the influence of the choice of regularizing term in the model and experiment with some of the most widely used regularization constraints available in the literature. + +
+
+
+
+
+ + ☆ A Symmetric Dynamic Learning Framework for Diffeomorphic Medical Image + Registration + + +
+ Diffeomorphic image registration is crucial for various medical imaging +applications because it can preserve the topology of the transformation. This +study introduces DCCNN-LSTM-Reg, a learning framework that evolves dynamically +and learns a symmetrical registration path by satisfying a specified control +increment system. This framework aims to obtain symmetric diffeomorphic +deformations between moving and fixed images. To achieve this, we combine deep +learning networks with diffeomorphic mathematical mechanisms to create a +continuous and dynamic registration architecture, which consists of multiple +Symmetric Registration (SR) modules cascaded on five different scales. +Specifically, our method first uses two U-nets with shared parameters to +extract multiscale feature pyramids from the images. We then develop an +SR-module comprising a sequential CNN-LSTM architecture to progressively +correct the forward and reverse multiscale deformation fields using control +increment learning and the homotopy continuation technique. Through extensive +experiments on three 3D registration tasks, we demonstrate that our method +outperforms existing approaches in both quantitative and qualitative +evaluations. + +
+
+ comment: 12 pages,7 figures +
+
+
+
+
+ + ☆ Enhancing Adversarial Robustness via Uncertainty-Aware Distributional + Adversarial Training + + +
+ Despite remarkable achievements in deep learning across various domains, its +inherent vulnerability to adversarial examples still remains a critical concern +for practical deployment. Adversarial training has emerged as one of the most +effective defensive techniques for improving model robustness against such +malicious inputs. However, existing adversarial training schemes often lead to +limited generalization ability against underlying adversaries with diversity +due to their overreliance on a point-by-point augmentation strategy by mapping +each clean example to its adversarial counterpart during training. In addition, +adversarial examples can induce significant disruptions in the statistical +information w.r.t. the target model, thereby introducing substantial +uncertainty and challenges to modeling the distribution of adversarial +examples. To circumvent these issues, in this paper, we propose a novel +uncertainty-aware distributional adversarial training method, which enforces +adversary modeling by leveraging both the statistical information of +adversarial examples and its corresponding uncertainty estimation, with the +goal of augmenting the diversity of adversaries. Considering the potentially +negative impact induced by aligning adversaries to misclassified clean +examples, we also refine the alignment reference based on the statistical +proximity to clean examples during adversarial training, thereby reframing +adversarial training within a distribution-to-distribution matching framework +interacted between the clean and adversarial domains. Furthermore, we design an +introspective gradient alignment approach via matching input gradients between +these domains without introducing external models. Extensive experiments across +four benchmark datasets and various network architectures demonstrate that our +approach achieves state-of-the-art adversarial robustness and maintains natural +performance. + +
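+ For context, a sketch of the conventional point-by-point PGD inner maximization that the distributional, uncertainty-aware method above generalizes; this is the standard baseline, not the proposed approach, and the hyperparameters are typical illustrative values.
+ <pre>
+ import torch
+ import torch.nn.functional as F
+
+ def pgd_attack(model, x, y, eps=8/255, alpha=2/255, steps=10):
+     """Standard point-by-point PGD inner maximization on an L-inf ball."""
+     x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1).detach()
+     for _ in range(steps):
+         x_adv.requires_grad_(True)
+         loss = F.cross_entropy(model(x_adv), y)
+         grad = torch.autograd.grad(loss, x_adv)[0]
+         x_adv = x_adv.detach() + alpha * grad.sign()
+         x_adv = torch.min(torch.max(x_adv, x - eps), x + eps).clamp(0, 1)
+     return x_adv.detach()
+
+ # adversarial training step: train on the perturbed batch
+ # loss = F.cross_entropy(model(pgd_attack(model, x, y)), y)
+ </pre>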
+
+
+
+
+ + ☆ AtlasSeg: Atlas Prior Guided Dual-U-Net for Cortical Segmentation in + Fetal Brain MRI + + +
+ Accurate tissue segmentation in fetal brain MRI remains challenging due to the dynamically changing anatomy and contrast during fetal development. To enhance segmentation accuracy throughout gestation, we introduce AtlasSeg, a dual-U-shape convolution network incorporating gestational age (GA) specific information as guidance. By providing a publicly available fetal brain atlas with segmentation labels at the corresponding GA, AtlasSeg effectively extracts the contextual features of age-specific patterns in the atlas branch and generates tissue segmentation in the segmentation branch. Multi-scale attentive atlas feature fusions are constructed in all stages during encoding and decoding, giving rise to a dual-U-shape network that assists feature flow and information interactions between the two branches. AtlasSeg outperformed six well-known segmentation networks on both our internal fetal brain MRI dataset and the external FeTA dataset. Ablation experiments demonstrate the effectiveness of the atlas guidance and the attention mechanism. The proposed AtlasSeg achieved superior segmentation accuracy compared with other convolutional networks and may facilitate fetal brain MRI analysis in large-scale fetal brain studies. + +
+
+
+
+
+ + ☆ Centerness-based Instance-aware Knowledge Distillation with Task-wise + Mutual Lifting for Object Detection on Drone Imagery + + +
+ Developing accurate and efficient detectors for drone imagery is challenging due to the inherent complexity of aerial scenes. While some existing methods aim to achieve high accuracy by utilizing larger models, their computational cost is prohibitive for drones. Recently, Knowledge Distillation (KD) has shown promising potential for maintaining satisfactory accuracy while significantly compressing models in general object detection. Considering the advantages of KD, this paper presents the first attempt to adapt it to object detection on drone imagery and addresses two intrinsic issues: (1) the low foreground-background ratio and (2) small instances and complex backgrounds, which lead to inadequate training and, in turn, insufficient distillation. Therefore, we propose a task-wise Lightweight Mutual Lifting (Light-ML) module with a Centerness-based Instance-aware Distillation (CID) strategy. The Light-ML module mutually harmonizes the classification and localization branches by channel shuffling and convolution, integrating teacher supervision across different tasks during back-propagation, thus facilitating training of the student model. The CID strategy extracts valuable regions surrounding instances through the centerness of proposals, enhancing distillation efficacy. Experiments on the VisDrone, UAVDT, and COCO benchmarks demonstrate that the proposed approach improves the accuracy of existing state-of-the-art KD methods at comparable computational cost. Codes will be available upon acceptance. + +
+
+
+
+
+ + ☆ Continual Audio-Visual Sound Separation NeurIPS 2024 + + +
+ In this paper, we introduce a novel continual audio-visual sound separation +task, aiming to continuously separate sound sources for new classes while +preserving performance on previously learned classes, with the aid of visual +guidance. This problem is crucial for practical visually guided auditory +perception as it can significantly enhance the adaptability and robustness of +audio-visual sound separation models, making them more applicable for +real-world scenarios where encountering new sound sources is commonplace. The +task is inherently challenging as our models must not only effectively utilize +information from both modalities in current tasks but also preserve their +cross-modal association in old tasks to mitigate catastrophic forgetting during +audio-visual continual learning. To address these challenges, we propose a +novel approach named ContAV-Sep (\textbf{Cont}inual +\textbf{A}udio-\textbf{V}isual Sound \textbf{Sep}aration). ContAV-Sep presents +a novel Cross-modal Similarity Distillation Constraint (CrossSDC) to uphold the +cross-modal semantic similarity through incremental tasks and retain previously +acquired knowledge of semantic similarity in old models, mitigating the risk of +catastrophic forgetting. The CrossSDC can seamlessly integrate into the +training process of different audio-visual sound separation frameworks. +Experiments demonstrate that ContAV-Sep can effectively mitigate catastrophic +forgetting and achieve significantly better performance compared to other +continual learning baselines for audio-visual sound separation. Code is +available at: \url{https://github.com/weiguoPian/ContAV-Sep_NeurIPS2024}. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ OLAF: A Plug-and-Play Framework for Enhanced Multi-object Multi-part + Scene Parsing ECCV + + +
+ Multi-object multi-part scene segmentation is a challenging task whose +complexity scales exponentially with part granularity and number of scene +objects. To address the task, we propose a plug-and-play approach termed OLAF. +First, we augment the input (RGB) with channels containing object-based +structural cues (fg/bg mask, boundary edge mask). We propose a weight +adaptation technique which enables regular (RGB) pre-trained models to process +the augmented (5-channel) input in a stable manner during optimization. In +addition, we introduce an encoder module termed LDF to provide low-level dense +feature guidance. This assists segmentation, particularly for smaller parts. +OLAF enables significant mIoU gains of $\mathbf{3.3}$ (Pascal-Parts-58), +$\mathbf{3.5}$ (Pascal-Parts-108) over the SOTA model. On the most challenging +variant (Pascal-Parts-201), the gain is $\mathbf{4.0}$. Experimentally, we show +that OLAF's broad applicability enables gains across multiple architectures +(CNN, U-Net, Transformer) and datasets. The code is available at +olafseg.github.io + +
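+ A sketch of one common way to let an RGB-pretrained backbone accept the augmented 5-channel input described above: extend the first convolution's weights and initialize the new channels from the mean of the RGB filters. This is a generic adaptation recipe, not necessarily OLAF's exact weight-adaptation technique.
+ <pre>
+ import torch
+ import torch.nn as nn
+
+ def expand_first_conv(conv: nn.Conv2d, new_in_channels: int = 5) -> nn.Conv2d:
+     """Return a conv accepting extra input channels, reusing the RGB weights;
+     new channels are initialized from the mean of the pretrained RGB filters."""
+     new_conv = nn.Conv2d(new_in_channels, conv.out_channels,
+                          kernel_size=conv.kernel_size, stride=conv.stride,
+                          padding=conv.padding, bias=conv.bias is not None)
+     with torch.no_grad():
+         w = conv.weight                               # (out, 3, k, k)
+         extra = w.mean(dim=1, keepdim=True)           # (out, 1, k, k)
+         new_conv.weight.copy_(
+             torch.cat([w] + [extra] * (new_in_channels - w.shape[1]), dim=1))
+         if conv.bias is not None:
+             new_conv.bias.copy_(conv.bias)
+     return new_conv
+
+ # e.g. backbone.conv1 = expand_first_conv(backbone.conv1, new_in_channels=5)
+ </pre>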
+
+ comment: Accepted in The European Conference on Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ☆ Analyzing Poverty through Intra-Annual Time-Series: A Wavelet Transform + Approach + + +
+ Reducing global poverty is a key objective of the Sustainable Development Goals (SDGs). Achieving this requires high-frequency, granular data to capture neighborhood-level changes, particularly in data-scarce regions such as low- and middle-income countries. To fill these data gaps, recent computer vision methods combine machine learning (ML) with earth observation (EO) data to improve poverty estimation. However, while much progress has been made, these methods often omit intra-annual variations, which are crucial for estimating poverty in agriculturally dependent countries. We explored the impact of integrating intra-annual NDVI information with annual multi-spectral data on model accuracy. We created a simulated dataset using Landsat imagery and nighttime light data to evaluate EO-ML methods that use intra-annual EO data. Additionally, we evaluated our method against the Demographic and Health Survey (DHS) dataset across Africa. Our results indicate that integrating specific NDVI-derived features with multi-spectral data provides valuable insights for poverty analysis, emphasizing the importance of retaining intra-annual information. + +
+
+
+
+
+ + ☆ Correlation of Object Detection Performance with Visual Saliency and + Depth Estimation + + +
+ As object detection techniques continue to evolve, understanding their +relationships with complementary visual tasks becomes crucial for optimising +model architectures and computational resources. This paper investigates the +correlations between object detection accuracy and two fundamental visual +tasks: depth prediction and visual saliency prediction. Through comprehensive +experiments using state-of-the-art models (DeepGaze IIE, Depth Anything, +DPT-Large, and Itti's model) on COCO and Pascal VOC datasets, we find that +visual saliency shows consistently stronger correlations with object detection +accuracy (mA$\rho$ up to 0.459 on Pascal VOC) compared to depth prediction +(mA$\rho$ up to 0.283). Our analysis reveals significant variations in these +correlations across object categories, with larger objects showing correlation +values up to three times higher than smaller objects. These findings suggest +incorporating visual saliency features into object detection architectures +could be more beneficial than depth information, particularly for specific +object categories. The observed category-specific variations also provide +insights for targeted feature engineering and dataset design improvements, +potentially leading to more efficient and accurate object detection systems. + +
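+ A minimal sketch of the kind of rank-correlation analysis described above, assuming per-category detection accuracy is correlated against mean saliency and depth scores with Spearman's rho; the numbers below are hypothetical placeholders, not results from the paper.
+ <pre>
+ import numpy as np
+ from scipy.stats import spearmanr
+
+ # hypothetical per-category values: detection AP, mean saliency, mean depth score
+ ap       = np.array([0.62, 0.48, 0.71, 0.55, 0.40])
+ saliency = np.array([0.31, 0.22, 0.37, 0.29, 0.18])
+ depth    = np.array([0.52, 0.49, 0.45, 0.57, 0.50])
+
+ rho_sal, p_sal = spearmanr(ap, saliency)
+ rho_dep, p_dep = spearmanr(ap, depth)
+ print(f"saliency rho={rho_sal:.3f} (p={p_sal:.3f}), depth rho={rho_dep:.3f} (p={p_dep:.3f})")
+ </pre>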
+
+ comment: Code Available at: + https://github.com/mbar0075/Object-Detection-Correlation-Saliency-vs-Depth +
+
+
+
+
+ + ☆ Advances in Photoacoustic Imaging Reconstruction and Quantitative + Analysis for Biomedical Applications + + +
+ Photoacoustic imaging (PAI) represents an innovative biomedical imaging +modality that harnesses the advantages of optical resolution and acoustic +penetration depth while ensuring enhanced safety. Despite its promising +potential across a diverse array of preclinical and clinical applications, the +clinical implementation of PAI faces significant challenges, including the +trade-off between penetration depth and spatial resolution, as well as the +demand for faster imaging speeds. This paper explores the fundamental +principles underlying PAI, with a particular emphasis on three primary +implementations: photoacoustic computed tomography (PACT), photoacoustic +microscopy (PAM), and photoacoustic endoscopy (PAE). We undertake a critical +assessment of their respective strengths and practical limitations. +Furthermore, recent developments in utilizing conventional or deep learning +(DL) methodologies for image reconstruction and artefact mitigation across +PACT, PAM, and PAE are outlined, demonstrating considerable potential to +enhance image quality and accelerate imaging processes. Furthermore, this paper +examines the recent developments in quantitative analysis within PAI, including +the quantification of haemoglobin concentration, oxygen saturation, and other +physiological parameters within tissues. Finally, our discussion encompasses +current trends and future directions in PAI research while emphasizing the +transformative impact of deep learning on advancing PAI. + +
+
+
+
+
+ + ☆ Test-Time Dynamic Image Fusion NeurIPS 2024 + + +
+ The inherent challenge of image fusion lies in capturing the correlation of +multi-source images and comprehensively integrating effective information from +different sources. Most existing techniques fail to perform dynamic image +fusion while notably lacking theoretical guarantees, leading to potential +deployment risks in this field. Is it possible to conduct dynamic image fusion +with a clear theoretical justification? In this paper, we give our solution +from a generalization perspective. We proceed to reveal the generalized form of +image fusion and derive a new test-time dynamic image fusion paradigm. It +provably reduces the upper bound of generalization error. Specifically, we +decompose the fused image into multiple components corresponding to its source +data. The decomposed components represent the effective information from the +source data, thus the gap between them reflects the Relative Dominability (RD) +of the uni-source data in constructing the fusion image. Theoretically, we +prove that the key to reducing generalization error hinges on the negative +correlation between the RD-based fusion weight and the uni-source +reconstruction loss. Intuitively, RD dynamically highlights the dominant +regions of each source and can be naturally converted to the corresponding +fusion weight, achieving robust results. Extensive experiments and discussions +with in-depth analysis on multiple benchmarks confirm our findings and +superiority. Our code is available at https://github.com/Yinan-Xia/TTD. + +
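+ The key property above is that fusion weights should be negatively correlated with each source's reconstruction loss. A minimal way to realize this at test time is a softmax over negative per-source losses, as sketched below; the paper's Relative Dominability is defined differently (and pixel-wise), so this is only an illustrative stand-in.
+ <pre>
+ import numpy as np
+
+ def dynamic_fusion_weights(recon_losses, temperature=1.0):
+     """Fusion weights that decrease as a source's reconstruction loss grows."""
+     z = -np.asarray(recon_losses, dtype=float) / temperature
+     z -= z.max()                       # numerical stability
+     w = np.exp(z)
+     return w / w.sum()
+
+ def fuse(components, recon_losses):
+     """Weighted sum of per-source fused components (arrays of equal shape)."""
+     w = dynamic_fusion_weights(recon_losses)
+     return sum(wi * ci for wi, ci in zip(w, components))
+ </pre>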
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Lost in Context: The Influence of Context on Feature Attribution Methods + for Object Recognition + + +
+ Contextual information plays a critical role in object recognition models +within computer vision, where changes in context can significantly affect +accuracy, underscoring models' dependence on contextual cues. This study +investigates how context manipulation influences both model accuracy and +feature attribution, providing insights into the reliance of object recognition +models on contextual information as understood through the lens of feature +attribution methods. + We employ a range of feature attribution techniques to decipher the reliance +of deep neural networks on context in object recognition tasks. Using the +ImageNet-9 and our curated ImageNet-CS datasets, we conduct experiments to +evaluate the impact of contextual variations, analyzed through feature +attribution methods. Our findings reveal several key insights: (a) Correctly +classified images predominantly emphasize object volume attribution over +context volume attribution. (b) The dependence on context remains relatively +stable across different context modifications, irrespective of classification +accuracy. (c) Context change exerts a more pronounced effect on model +performance than Context perturbations. (d) Surprisingly, context attribution +in `no-information' scenarios is non-trivial. Our research moves beyond +traditional methods by assessing the implications of broad-level modifications +on object recognition, either in the object or its context. + +
+
+ comment: Published in ICVGIP 2024 +
+
+
+
+
+ + ☆ LiVOS: Light Video Object Segmentation with Gated Linear Matching + + +
+ Semi-supervised video object segmentation (VOS) has been largely driven by +space-time memory (STM) networks, which store past frame features in a +spatiotemporal memory to segment the current frame via softmax attention. +However, STM networks face memory limitations due to the quadratic complexity +of softmax matching, restricting their applicability as video length and +resolution increase. To address this, we propose LiVOS, a lightweight memory +network that employs linear matching via linear attention, reformulating memory +matching into a recurrent process that reduces the quadratic attention matrix +to a constant-size, spatiotemporal-agnostic 2D state. To enhance selectivity, +we introduce gated linear matching, where a data-dependent gate matrix is +multiplied with the state matrix to control what information to retain or +discard. Experiments on diverse benchmarks demonstrated the effectiveness of +our method. It achieved 64.8 J&F on MOSE and 85.1 J&F on DAVIS, surpassing all +non-STM methods and narrowing the gap with STM-based approaches. For longer and +higher-resolution videos, it matched STM-based methods with 53% less GPU memory +and supports 4096p inference on a 32G consumer-grade GPU--a previously +cost-prohibitive capability--opening the door for long and high-resolution +video foundation models. + +
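+ A sketch of gated linear matching as described above: memory matching becomes a recurrent update of a constant-size state, with a data-dependent gate controlling what is retained. The shapes, the ReLU feature map, and the sigmoid gate parameterization are illustrative assumptions, not LiVOS's exact design.
+ <pre>
+ import numpy as np
+
+ def gated_linear_matching(queries, keys, values, W_gate, eps=1e-6):
+     """Recurrent linear attention with a constant-size gated state.
+
+     queries/keys: (T, N, d), values: (T, N, dv) -- T frames of N tokens each.
+     The state S is (d, dv), independent of video length and resolution."""
+     d, dv = keys.shape[-1], values.shape[-1]
+     S = np.zeros((d, dv))
+     z = np.zeros(d)
+     outputs = []
+     for q_t, k_t, v_t in zip(queries, keys, values):
+         phi_k = np.maximum(k_t, 0.0)                            # positive feature map
+         g = 1.0 / (1.0 + np.exp(-(k_t.mean(axis=0) @ W_gate)))  # (d,) data-dependent gate
+         S = g[:, None] * S + phi_k.T @ v_t                      # gated state update
+         z = g * z + phi_k.sum(axis=0)                           # gated normalizer
+         phi_q = np.maximum(q_t, 0.0)
+         out = (phi_q @ S) / ((phi_q @ z)[:, None] + eps)        # (N, dv) read-out
+         outputs.append(out)
+     return np.stack(outputs)
+
+ rng = np.random.default_rng(0)
+ T, N, d, dv = 4, 16, 32, 8
+ out = gated_linear_matching(rng.normal(size=(T, N, d)), rng.normal(size=(T, N, d)),
+                             rng.normal(size=(T, N, dv)), 0.1 * rng.normal(size=(d, d)))
+ </pre>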
+
+ comment: Code&models: https://github.com/uncbiag/LiVOS +
+
+
+
+
+ + ☆ Conditional Vendi Score: An Information-Theoretic Approach to Diversity + Evaluation of Prompt-based Generative Models + + +
+ Text-conditioned generation models are commonly evaluated based on the +quality of the generated data and its alignment with the input text prompt. On +the other hand, several applications of prompt-based generative models require +sufficient diversity in the generated data to ensure the models' capability of +generating image and video samples possessing a variety of features. However, +most existing diversity metrics are designed for unconditional generative +models, and thus cannot distinguish the diversity arising from variations in +text prompts and that contributed by the generative model itself. In this work, +our goal is to quantify the prompt-induced and model-induced diversity in +samples generated by prompt-based models. We propose an information-theoretic +approach for internal diversity quantification, where we decompose the +kernel-based entropy $H(X)$ of the generated data $X$ into the sum of the +conditional entropy $H(X|T)$, given text variable $T$, and the mutual +information $I(X; T)$ between the text and data variables. We introduce the +\emph{Conditional-Vendi} score based on $H(X|T)$ to quantify the internal +diversity of the model and the \emph{Information-Vendi} score based on $I(X; +T)$ to measure the statistical relevance between the generated data and text +prompts. We provide theoretical results to statistically interpret these scores +and relate them to the unconditional Vendi score. We conduct several numerical +experiments to show the correlation between the Conditional-Vendi score and the +internal diversity of text-conditioned generative models. The codebase is +available at +\href{https://github.com/mjalali/conditional-vendi}{https://github.com/mjalali/conditional-vendi}. + +
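+ The unconditional Vendi score is the exponential of the von Neumann entropy of the normalized kernel matrix, and the decomposition above splits $H(X)$ into $H(X|T) + I(X;T)$. The sketch below computes the Vendi score and a simple per-prompt conditional variant; averaging over prompts is an illustrative assumption, and the paper gives the exact Conditional-Vendi and Information-Vendi estimators.
+ <pre>
+ import numpy as np
+
+ def vendi_score(X, kernel):
+     """exp(von Neumann entropy) of K/n for samples X under a normalized kernel."""
+     K = np.array([[kernel(a, b) for b in X] for a in X])
+     n = len(X)
+     lam = np.linalg.eigvalsh(K / n)
+     lam = lam[lam > 1e-12]
+     return float(np.exp(-(lam * np.log(lam)).sum()))
+
+ def conditional_vendi(samples_by_prompt, kernel):
+     """Average per-prompt Vendi score, an illustrative estimate of exp(H(X|T)).
+     Comparing it with the Vendi score of all samples pooled together gives an
+     Information-Vendi-style quantity, since I(X;T) = H(X) - H(X|T)."""
+     return float(np.mean([vendi_score(xs, kernel) for xs in samples_by_prompt]))
+
+ # example normalized kernel with k(x, x) = 1
+ rbf = lambda a, b: np.exp(-np.sum((np.asarray(a) - np.asarray(b)) ** 2) / 2.0)
+ </pre>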
+
+
+
+
+ + ☆ ChatGPT in Research and Education: Exploring Benefits and Threats + + +
+ In recent years, advanced artificial intelligence technologies, such as +ChatGPT, have significantly impacted various fields, including education and +research. Developed by OpenAI, ChatGPT is a powerful language model that +presents numerous opportunities for students and educators. It offers +personalized feedback, enhances accessibility, enables interactive +conversations, assists with lesson preparation and evaluation, and introduces +new methods for teaching complex subjects. However, ChatGPT also poses +challenges to traditional education and research systems. These challenges +include the risk of cheating on online exams, the generation of human-like text +that may compromise academic integrity, a potential decline in critical +thinking skills, and difficulties in assessing the reliability of information +generated by AI. This study examines both the opportunities and challenges +ChatGPT brings to education from the perspectives of students and educators. +Specifically, it explores the role of ChatGPT in helping students develop their +subjective skills. To demonstrate its effectiveness, we conducted several +subjective experiments using ChatGPT, such as generating solutions from +subjective problem descriptions. Additionally, surveys were conducted with +students and teachers to gather insights into how ChatGPT supports subjective +learning and teaching. The results and analysis of these surveys are presented +to highlight the impact of ChatGPT in this context. + +
+
+
+
+
+ + ☆ Artificial Intelligence-Enhanced Couinaud Segmentation for Precision + Liver Cancer Therapy + + +
+ Precision therapy for liver cancer necessitates accurately delineating liver sub-regions to protect healthy tissue while targeting tumors, which is essential for reducing recurrence and improving survival rates. However, the segmentation of hepatic segments, known as Couinaud segmentation, is challenging due to indistinct sub-region boundaries and the need for extensive annotated datasets. This study introduces LiverFormer, a novel Couinaud segmentation model that effectively integrates global context with low-level local features based on a 3D hybrid CNN-Transformer architecture. Additionally, a registration-based data augmentation strategy is employed to enhance segmentation performance with limited labeled data. Evaluated on CT images from 123 patients, LiverFormer demonstrated high accuracy and strong concordance with expert annotations across various metrics, allowing for enhanced treatment planning for surgery and radiation therapy. It has great potential to reduce complications and minimize damage to surrounding tissue, leading to improved outcomes for patients undergoing complex liver cancer treatments. + +
+
+
+
+
+ + ☆ NEOviz: Uncertainty-Driven Visual Analysis of Asteroid Trajectories + + +
+ We introduce NEOviz, an interactive visualization system designed to assist +planetary defense experts in the visual analysis of the movements of near-Earth +objects in the Solar System that might prove hazardous to Earth. Asteroids are +often discovered using optical telescopes and their trajectories are calculated +from images, resulting in an inherent asymmetric uncertainty in their position +and velocity. Consequently, we typically cannot determine the exact trajectory +of an asteroid, and an ensemble of trajectories must be generated to estimate +an asteroid's movement over time. When propagating these ensembles over +decades, it is challenging to visualize the varying paths and determine their +potential impact on Earth, which could cause catastrophic damage. NEOviz equips +experts with the necessary tools to effectively analyze the existing catalog of +asteroid observations. In particular, we present a novel approach for +visualizing the 3D uncertainty region through which an asteroid travels, while +providing accurate spatial context in relation to system-critical +infrastructure such as Earth, the Moon, and artificial satellites. Furthermore, +we use NEOviz to visualize the divergence of asteroid trajectories, capturing +high-variance events in an asteroid's orbital properties. For potential +impactors, we combine the 3D visualization with an uncertainty-aware impact map +to illustrate the potential risks to human populations. NEOviz was developed +with continuous input from members of the planetary defense community through a +participatory design process. It is exemplified in three real-world use cases +and evaluated via expert feedback interviews. + +
+
+
+
+
+ + ☆ ERUP-YOLO: Enhancing Object Detection Robustness for Adverse Weather + Condition by Unified Image-Adaptive Processing + + +
+ We propose an image-adaptive object detection method for adverse weather +conditions such as fog and low-light. Our framework employs differentiable +preprocessing filters to perform image enhancement suitable for later-stage +object detections. Our framework introduces two differentiable filters: a +B\'ezier curve-based pixel-wise (BPW) filter and a kernel-based local (KBL) +filter. These filters unify the functions of classical image processing filters +and improve performance of object detection. We also propose a domain-agnostic +data augmentation strategy using the BPW filter. Our method does not require +data-specific customization of the filter combinations, parameter ranges, and +data augmentation. We evaluate our proposed approach, called Enhanced +Robustness by Unified Image Processing (ERUP)-YOLO, by applying it to the +YOLOv3 detector. Experiments on adverse weather datasets demonstrate that our +proposed filters match or exceed the expressiveness of conventional methods and +our ERUP-YOLO achieved superior performance in a wide range of adverse weather +conditions, including fog and low-light conditions. + +
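+ A minimal sketch of a B\'ezier curve-based pixel-wise tone mapping, the idea behind the BPW filter described above; the curve degree, control points, and numpy implementation are illustrative (the paper's filter is differentiable and learned end-to-end).
+ <pre>
+ import numpy as np
+ from math import comb
+
+ def bezier_pixelwise(img, control_points):
+     """Map normalized intensities in [0, 1] through a Bezier tone curve:
+     B(t) = sum_i C(n, i) * t**i * (1 - t)**(n - i) * P_i, with t = pixel value."""
+     p = np.asarray(control_points, dtype=float)
+     n = len(p) - 1
+     t = np.clip(img, 0.0, 1.0)
+     out = np.zeros_like(t)
+     for i, p_i in enumerate(p):
+         out += comb(n, i) * t ** i * (1 - t) ** (n - i) * p_i
+     return np.clip(out, 0.0, 1.0)
+
+ # e.g. a gentle brightening curve for low-light inputs (hypothetical control points)
+ img = np.random.rand(64, 64)
+ enhanced = bezier_pixelwise(img, control_points=[0.0, 0.4, 0.8, 1.0])
+ </pre>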
+
+
+
+
+ + ☆ Specialized Foundation Models Struggle to Beat Supervised Baselines + + +
+ Following its success for vision and text, the "foundation model" (FM) paradigm -- pretraining large models on massive data, then fine-tuning on target tasks -- has rapidly expanded to domains in the sciences, engineering, healthcare, and beyond. Has this achieved what the original FMs accomplished, i.e. the supplanting of traditional supervised learning in their domains? To answer this, we look at three modalities -- genomics, satellite imaging, and time series -- with multiple recent FMs and compare them to a standard supervised learning workflow: model development, hyperparameter tuning, and training, all using only data from the target task. Across these three specialized domains, we find that it is consistently possible to train simple supervised models -- no more complicated than a lightly modified wide ResNet or UNet -- that match or even outperform the latest foundation models. Our work demonstrates that the benefits of large-scale pretraining have yet to be realized in many specialized areas, reinforces the need to compare new FMs to strong, well-tuned baselines, and introduces two new, easy-to-use, open-source, and automated workflows for doing so. + +
+
+ comment: The first two authors contributed equally. The order was determined + by coin flip +
+
+
+
+
+ + ☆ Real-Time Text Detection with Similar Mask in Traffic, Industrial, and + Natural Scenes + + +
+ Text in intelligent transportation scenes carries a wealth of information. Fully harnessing this information is one of the critical drivers for advancing intelligent transportation. Unlike general scenes, text detection in transportation imposes additional demands, such as fast inference speed, on top of high accuracy. Most existing real-time text detection methods are based on the shrink mask, which loses some geometric semantic information and needs complex post-processing. In addition, previous methods usually focus only on the correctness of the final output, ignoring feature correction and lacking guidance during the intermediate process. To this end, we propose an efficient multi-scene text detector that contains an effective text representation, the similar mask (SM), and a feature correction module (FCM). Unlike previous methods, the former aims to preserve the geometric information of the instances as much as possible. Its post-processing saves 50$\%$ of the time while accurately and efficiently reconstructing text contours. The latter encourages false positive features to move away from the positive feature center, optimizing the predictions at the feature level. Ablation studies demonstrate the efficiency of the SM and the effectiveness of the FCM. Moreover, the deficiency of existing traffic datasets (such as low-quality annotations or the unavailability of closed-source data) motivated us to collect and annotate a traffic text dataset, which includes motion blur. In addition, to validate the scene robustness of the SM-Net, we conduct experiments on traffic, industrial, and natural scene datasets. Extensive experiments verify that it achieves state-of-the-art (SOTA) performance on several benchmarks. The code and dataset are available at: \url{https://github.com/fengmulin/SMNet}. + +
+
+
+
+
+ + ☆ Toward Robust Incomplete Multimodal Sentiment Analysis via Hierarchical + Representation Learning NeurIPS 2024 + + +
+ Multimodal Sentiment Analysis (MSA) is an important research area that aims +to understand and recognize human sentiment through multiple modalities. The +complementary information provided by multimodal fusion promotes better +sentiment analysis compared to utilizing only a single modality. Nevertheless, +in real-world applications, many unavoidable factors may lead to situations of +uncertain modality missing, thus hindering the effectiveness of multimodal +modeling and degrading the model's performance. To this end, we propose a +Hierarchical Representation Learning Framework (HRLF) for the MSA task under +uncertain missing modalities. Specifically, we propose a fine-grained +representation factorization module that sufficiently extracts valuable +sentiment information by factorizing modality into sentiment-relevant and +modality-specific representations through crossmodal translation and sentiment +semantic reconstruction. Moreover, a hierarchical mutual information +maximization mechanism is introduced to incrementally maximize the mutual +information between multi-scale representations to align and reconstruct the +high-level semantics in the representations. Ultimately, we propose a +hierarchical adversarial learning mechanism that further aligns and adapts the +latent distribution of sentiment-relevant representations to produce robust +joint multimodal representations. Comprehensive experiments on three datasets +demonstrate that HRLF significantly improves MSA performance under uncertain +modality missing cases. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Advancing Recycling Efficiency: A Comparative Analysis of Deep Learning + Models in Waste Classification + + +
+ With the ongoing increase in the worldwide population and escalating consumption habits, there's a surge in the amount of waste produced. The situation poses considerable challenges for waste management and the optimization of recycling operations. The research tackles the pressing issue of waste classification for recycling by analyzing various deep learning models, including Convolutional Neural Network (CNN), AlexNet, ResNet, ResNet50 plus Support Vector Machine (SVM), and transformers, across a wide array of waste categories. The research meticulously compares these models on several targets like parameter settings, category accuracy, total accuracy and model parameters to establish a uniform evaluation criterion. This research presents a novel method that incorporates SVM with deep learning frameworks, particularly ResNet50. The results indicate the method significantly boosts accuracy in complex waste categories. Moreover, the transformer model outshines others in average accuracy, showcasing its aptitude for intricate classification tasks. To improve performance in poorly performing categories, the research advocates for enlarging the dataset, employing data augmentation, and leveraging sophisticated models such as transformers, along with refining training methodologies. The research paves the way for future advancements in multi-category waste recycling and underscores the pivotal role of deep learning in promoting environmental sustainability. + +
+
+ comment: Accepted by the 6th International Conference on Computing and Data + Science (CONF-CDS 2024), 12 pages, 8 figures, references added +
+
+
+
+
+ + ☆ FedBlock: A Blockchain Approach to Federated Learning against Backdoor + Attacks + + +
+ Federated Learning (FL) is a machine learning method for training with +private data locally stored in distributed machines without gathering them into +one place for central learning. Despite its promises, FL is prone to critical +security risks. First, because FL depends on a central server to aggregate +local training models, this is a single point of failure. The server might +function maliciously. Second, due to its distributed nature, FL might encounter +backdoor attacks by participating clients. They can poison the local model +before submitting to the server. Either type of attack, on the server or the +client side, would severely degrade learning accuracy. We propose FedBlock, a +novel blockchain-based FL framework that addresses both of these security +risks. FedBlock is uniquely desirable in that it involves only smart contract +programming, thus deployable atop any blockchain network. Our framework is +substantiated with a comprehensive evaluation study using real-world datasets. +Its robustness against backdoor attacks is competitive with the literature of +FL backdoor defense. The latter, however, does not address the server risk as +we do. + +
+
+ comment: This paper has been accepted as a full paper for the IEEE Special + Session Federated Learning on Big Data 2024 (IEEE BigData 2024) +
+
+
+
+
+ + ☆ One-Stage-TFS: Thai One-Stage Fingerspelling Dataset for Fingerspelling + Recognition Frameworks + + +
+ The Thai One-Stage Fingerspelling (One-Stage-TFS) dataset is a comprehensive +resource designed to advance research in hand gesture recognition, explicitly +focusing on the recognition of Thai sign language. This dataset comprises 7,200 +images capturing 15 one-stage consonant gestures performed by undergraduate +students from Rajabhat Maha Sarakham University, Thailand. The contributors +include both expert students from the Special Education Department with +proficiency in Thai sign language and students from other departments without +prior sign language experience. Images were collected between July and December +2021 using a DSLR camera, with contributors demonstrating hand gestures against +both simple and complex backgrounds. The One-Stage-TFS dataset presents +challenges in detecting and recognizing hand gestures, offering opportunities +to develop novel end-to-end recognition frameworks. Researchers can utilize +this dataset to explore deep learning methods, such as YOLO, EfficientDet, +RetinaNet, and Detectron, for hand detection, followed by feature extraction +and recognition using techniques like convolutional neural networks, +transformers, and adaptive feature fusion networks. The dataset is accessible +via the Mendeley Data repository and supports a wide range of applications in +computer science, including deep learning, computer vision, and pattern +recognition, thereby encouraging further innovation and exploration in these +fields. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ EcoCropsAID: Economic Crops Aerial Image Dataset for Land Use + Classification + + +
+ The EcoCropsAID dataset is a comprehensive collection of 5,400 aerial images +captured between 2014 and 2018 using the Google Earth application. This dataset +focuses on five key economic crops in Thailand: rice, sugarcane, cassava, +rubber, and longan. The images were collected at various crop growth stages: +early cultivation, growth, and harvest, resulting in significant variability +within each category and similarities across different categories. These +variations, coupled with differences in resolution, color, and contrast +introduced by multiple remote imaging sensors, present substantial challenges +for land use classification. The dataset is an interdisciplinary resource that +spans multiple research domains, including remote sensing, geoinformatics, +artificial intelligence, and computer vision. The unique features of the +EcoCropsAID dataset offer opportunities for researchers to explore novel +approaches, such as extracting spatial and temporal features, developing deep +learning architectures, and implementing transformer-based models. The +EcoCropsAID dataset provides a valuable platform for advancing research in land +use classification, with implications for optimizing agricultural practices and +enhancing sustainable development. This study explicitly investigates the use +of deep learning algorithms to classify economic crop areas in northeastern +Thailand, utilizing satellite imagery to address the challenges posed by +diverse patterns and similarities across categories. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Label Critic: Design Data Before Models + + +
+ As medical datasets rapidly expand, creating detailed annotations of different body structures becomes increasingly expensive and time-consuming. We consider that requesting radiologists to create detailed annotations is unnecessarily burdensome and that pre-existing AI models can largely automate this process. Following the spirit of "don't use a sledgehammer to crack a nut," we find that, rather than creating annotations from scratch, radiologists only have to review and edit errors if the Best-AI Labels have mistakes. To obtain the Best-AI Labels among multiple AI Labels, we developed an automatic tool, called Label Critic, that can assess label quality through tireless pairwise comparisons. Extensive experiments demonstrate that, when incorporated with our developed Image-Prompt pairs, pre-existing Large Vision-Language Models (LVLM), trained on natural images and texts, achieve 96.5% accuracy when choosing the best label in a pairwise comparison, without extra fine-tuning. By transforming the manual annotation task (30-60 min/scan) into an automatic comparison task (15 sec/scan), we effectively reduce the manual effort required from radiologists by an order of magnitude. When the Best-AI Labels are sufficiently accurate (81% depending on body structures), they will be directly adopted as the gold-standard annotations for the dataset, with lower-quality AI Labels automatically discarded. Label Critic can also check the label quality of a single AI Label with 71.8% accuracy when no alternatives are available for comparison, prompting radiologists to review and edit if the estimated quality is low (19% depending on body structures). + +
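+ A sketch of reducing many candidate AI labels to a single Best-AI Label through pairwise comparisons, assuming a hypothetical compare(label_a, label_b, image) callback backed by an LVLM and image-prompt pair; the sequential-winner tournament is a generic choice, not necessarily Label Critic's exact protocol.
+ <pre>
+ def best_label(labels, image, compare):
+     """Keep the winner of successive pairwise comparisons.
+
+     labels: candidate AI labels for one scan; compare(a, b, image) returns
+     whichever of a, b is judged better (e.g. by querying an LVLM with an
+     image-prompt pair showing both candidates)."""
+     winner = labels[0]
+     for challenger in labels[1:]:
+         winner = compare(winner, challenger, image)
+     return winner
+ </pre>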
+
+
+
+
+ + ☆ Efficient Feature Aggregation and Scale-Aware Regression for Monocular + 3D Object Detection + + +
+ Monocular 3D object detection has attracted great attention due to its simplicity and low cost. Existing methods typically follow conventional 2D detection paradigms, first locating object centers and then predicting 3D attributes via neighboring features. However, these methods predominantly rely on progressive cross-scale feature aggregation and focus solely on local information, which may result in a lack of global awareness and the omission of small-scale objects. In addition, due to large variation in object scales across different scenes and depths, inaccurate receptive fields often lead to background noise and degraded feature representation. To address these issues, we introduce MonoASRH, a novel monocular 3D detection framework composed of an Efficient Hybrid Feature Aggregation Module (EH-FAM) and an Adaptive Scale-Aware 3D Regression Head (ASRH). Specifically, EH-FAM employs multi-head attention with a global receptive field to extract semantic features for small-scale objects and leverages lightweight convolutional modules to efficiently aggregate visual features across different scales. The ASRH encodes 2D bounding box dimensions and then fuses scale features with the semantic features aggregated by EH-FAM through a scale-semantic feature fusion module. The scale-semantic feature fusion module guides ASRH in learning dynamic receptive field offsets, incorporating scale priors into 3D position prediction for better scale-awareness. Extensive experiments on the KITTI and Waymo datasets demonstrate that MonoASRH achieves state-of-the-art performance. + +
+
+
+
+
+ + ☆ Foundation AI Model for Medical Image Segmentation + + +
+ Foundation models refer to artificial intelligence (AI) models that are +trained on massive amounts of data and demonstrate broad generalizability +across various tasks with high accuracy. These models offer versatile, +one-for-many or one-for-all solutions, eliminating the need for developing +task-specific AI models. Examples of such foundation models include the Chat +Generative Pre-trained Transformer (ChatGPT) and the Segment Anything Model +(SAM). These models have been trained on millions to billions of samples and +have shown wide-ranging and accurate applications in numerous tasks such as +text processing (using ChatGPT) and natural image segmentation (using SAM). In +medical image segmentation - finding target regions in medical images - there +is a growing need for these one-for-many or one-for-all foundation models. Such +models could obviate the need to develop thousands of task-specific AI models, +which is currently standard practice in the field. They can also be adapted to +tasks with datasets too small for effective training. We discuss two paths to +achieve foundation models for medical image segmentation and comment on +progress, challenges, and opportunities. One path is to adapt or fine-tune +existing models, originally developed for natural images, for use with medical +images. The second path entails building models from scratch, exclusively +training on medical images. + +
+
+
+
+
+ + ☆ DDFAV: Remote Sensing Large Vision Language Models Dataset and + Evaluation Benchmark + + +
+ With the rapid development of large vision language models (LVLMs), these +models have shown excellent results in various multimodal tasks. Since LVLMs +are prone to hallucinations and there are currently few datasets and evaluation +methods specifically designed for remote sensing, their performance is +typically poor when applied to remote sensing tasks. To address these issues, +this paper introduces a high quality remote sensing LVLMs dataset, DDFAV, +created using data augmentation and data mixing strategies. Next, a training +instruction set is produced based on some high-quality remote sensing images +selected from the proposed dataset. Finally, we develop a remote sensing LVLMs +hallucination evaluation method RSPOPE based on the proposed dataset and +evaluate the zero-shot capabilities of different LVLMs. Our proposed dataset, +instruction set, and evaluation method files are available at +https://github.com/HaodongLi2024/rspope. + +
+
+
+
+
+ + ☆ TransUNext: towards a more advanced U-shaped framework for automatic + vessel segmentation in the fundus image + + +
+ Purpose: Automatic and accurate segmentation of fundus vessel images has become an essential prerequisite for computer-aided diagnosis of ophthalmic diseases such as diabetes mellitus. The task of high-precision retinal vessel segmentation still faces difficulties due to the low contrast between the branch ends of retinal vessels and the background, the long and thin vessel span, and the variable morphology of the optic disc and optic cup in fundus vessel images. Methods: We propose a more advanced U-shaped architecture for a hybrid Transformer and CNN: TransUNext, which integrates an Efficient Self-attention Mechanism into the encoder and decoder of U-Net to capture both local features and global dependencies with minimal computational overhead. Meanwhile, the Global Multi-Scale Fusion (GMSF) module is further introduced to upgrade skip-connections, fuse high-level semantic and low-level detailed information, and eliminate high- and low-level semantic differences. Inspired by ConvNeXt, the TransNeXt Block is designed to optimize the computational complexity of each base block in U-Net and avoid the information loss caused by the compressed dimension when information is converted between feature spaces of different dimensions. Results: We evaluated the proposed method on four public datasets: DRIVE, STARE, CHASE-DB1, and HRF. In the experimental results, the AUC (area under the ROC curve) values were 0.9867, 0.9869, 0.9910, and 0.9887, exceeding other state-of-the-art methods. + +
+
+
+
+
+ + ☆ CIT: Rethinking Class-incremental Semantic Segmentation with a Class + Independent Transformation + + +
+ Class-incremental semantic segmentation (CSS) requires that a model learn to +segment new classes without forgetting how to segment previous ones: this is +typically achieved by distilling the current knowledge and incorporating the +latest data. However, bypassing iterative distillation by directly transferring +outputs of initial classes to the current learning task is not supported in +existing class-specific CSS methods. Via Softmax, they enforce dependency +between classes and adjust the output distribution at each learning step, +resulting in a large probability distribution gap between initial and current +tasks. We introduce a simple, yet effective Class Independent Transformation +(CIT) that converts the outputs of existing semantic segmentation models into +class-independent forms with negligible cost or performance loss. By utilizing +class-independent predictions facilitated by CIT, we establish an accumulative +distillation framework, ensuring equitable incorporation of all class +information. We conduct extensive experiments on various segmentation +architectures, including DeepLabV3, Mask2Former, and SegViTv2. Results from +these experiments show minimal task forgetting across different datasets, with +less than 5% for ADE20K in the most challenging 11 task configurations and less +than 1% across all configurations for the PASCAL VOC 2012 dataset. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ V-DPO: Mitigating Hallucination in Large Vision Language Models via + Vision-Guided Direct Preference Optimization + + +
+ Large vision-language models (LVLMs) suffer from hallucination, resulting in +misalignment between the output textual response and the input visual content. +Recent research indicates that the over-reliance on the Large Language Model +(LLM) backbone, as one cause of the LVLM hallucination, inherently introduces +bias from language priors, leading to insufficient context attention to the +visual inputs. + We tackle this issue of hallucination by mitigating such over-reliance +through preference learning. We propose Vision-guided Direct Preference +Optimization (V-DPO) to enhance visual context learning at training time. To +interpret the effectiveness and generalizability of V-DPO on different types of +training data, we construct a synthetic dataset containing both response- and +image-contrast preference pairs, compared against existing human-annotated +hallucination samples. Our approach achieves significant improvements compared +with baseline methods across various hallucination benchmarks. Our analysis +indicates that V-DPO excels in learning from image-contrast preference data, +demonstrating its superior ability to elicit and understand nuances of visual +context. Our code is publicly available at https://github.com/YuxiXie/V-DPO. + +
+
+ comment: EMNLP 2024 Findings; 9 pages, 6 figures, 5 tables (16 pages, 8 + figures, 8 tables including references and appendices) +
+
+
+
+
+ + ☆ Full Field Digital Mammography Dataset from a Population Screening + Program + + +
+ Breast cancer presents the second-largest cancer risk in the world to women. Early detection of cancer has been shown to be effective in reducing mortality. Population screening programs schedule regular mammography imaging for participants, promoting early detection. Currently, such screening programs require manual reading. False-positive errors in the reading process unnecessarily lead to costly follow-up and patient anxiety. Automated methods promise to provide more efficient, consistent and effective reading. To facilitate their development, a number of datasets have been created. With the aim of specifically targeting population screening programs, we introduce NL-Breast-Screening, a dataset from a Canadian provincial screening program. The dataset consists of 5997 mammography exams, each of which has four standard views and is biopsy-confirmed. Cases where the radiologist reading was a false-positive are identified. NL-Breast is made publicly available as a new resource to promote advances in automation for population screening programs. + +
+
+
+
+
+ + ☆ RT-Affordance: Affordances are Versatile Intermediate Representations + for Robot Manipulation + + +
+ We explore how intermediate policy representations can facilitate +generalization by providing guidance on how to perform manipulation tasks. +Existing representations such as language, goal images, and trajectory sketches +have been shown to be helpful, but these representations either do not provide +enough context or provide over-specified context that yields less robust +policies. We propose conditioning policies on affordances, which capture the +pose of the robot at key stages of the task. Affordances offer expressive yet +lightweight abstractions, are easy for users to specify, and facilitate +efficient learning by transferring knowledge from large internet datasets. Our +method, RT-Affordance, is a hierarchical model that first proposes an +affordance plan given the task language, and then conditions the policy on this +affordance plan to perform manipulation. Our model can flexibly bridge +heterogeneous sources of supervision including large web datasets and robot +trajectories. We additionally train our model on cheap-to-collect in-domain +affordance images, allowing us to learn new tasks without collecting any +additional costly robot trajectories. We show on a diverse set of novel tasks +how RT-Affordance exceeds the performance of existing methods by over 50%, and +we empirically demonstrate that affordances are robust to novel settings. +Videos available at https://snasiriany.me/rt-affordance + +
+
+
+
+
+ + ☆ Transferable polychromatic optical encoder for neural networks + + +
+ Artificial neural networks (ANNs) have fundamentally transformed the field of computer vision, providing unprecedented performance. However, these ANNs for image processing demand substantial computational resources, often hindering real-time operation. In this paper, we demonstrate an optical encoder that can perform convolution simultaneously in three color channels during image capture, effectively implementing several initial convolutional layers of an ANN. Such optical encoding results in a ~24,000-fold reduction in computational operations, with state-of-the-art classification accuracy (~73.2%) in a free-space optical system. In addition, our analog optical encoder, trained on CIFAR-10 data, can be transferred to the ImageNet subset, High-10, without any modifications, and still exhibits moderate accuracy. Our results demonstrate the potential of a hybrid optical/digital computer vision system in which the optical front end can pre-process an ambient scene to reduce the energy and latency of the whole computer vision system. + +
+
+ comment: 21 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Estimating Ego-Body Pose from Doubly Sparse Egocentric Video Data NeurIPS 2024 + + +
+ We study the problem of estimating the body movements of a camera wearer from egocentric videos. Current methods for ego-body pose estimation rely on temporally dense sensor data, such as IMU measurements from spatially sparse body parts like the head and hands. However, we propose that even temporally sparse observations, such as hand poses captured intermittently from egocentric videos during natural or periodic hand movements, can effectively constrain overall body motion. Naively applying diffusion models to generate full-body pose from head pose and sparse hand pose leads to suboptimal results. To overcome this, we develop a two-stage approach that decomposes the problem into temporal completion and spatial completion. First, our method employs masked autoencoders to impute hand trajectories by leveraging the spatiotemporal correlations between the head pose sequence and intermittent hand poses, providing uncertainty estimates. Subsequently, we employ conditional diffusion models to generate plausible full-body motions based on these temporally dense trajectories of the head and hands, guided by the uncertainty estimates from the imputation. The effectiveness of our method was rigorously tested and validated through comprehensive experiments conducted on various HMD setups with the AMASS and Ego-Exo4D datasets. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ Object and Contact Point Tracking in Demonstrations Using 3D Gaussian + Splatting + + +
+ This paper introduces a method to enhance Interactive Imitation Learning +(IIL) by extracting touch interaction points and tracking object movement from +video demonstrations. The approach extends current IIL systems by providing +robots with detailed knowledge of both where and how to interact with objects, +particularly complex articulated ones like doors and drawers. By leveraging +cutting-edge techniques such as 3D Gaussian Splatting and FoundationPose for +tracking, this method allows robots to better understand and manipulate objects +in dynamic environments. The research lays the foundation for more effective +task learning and execution in autonomous robotic systems. + +
+
+ comment: CoRL 2024, Workshop on Lifelong Learning for Home Robots, Munich, + Germany +
+
+
+
+
+ + ☆ Benchmarking Vision Language Model Unlearning via Fictitious Facial + Identity Dataset + + +
+ Machine unlearning has emerged as an effective strategy for forgetting specific information in the training data. However, with the increasing integration of visual data, privacy concerns in Vision Language Models (VLMs) remain underexplored. To address this, we introduce the Facial Identity Unlearning Benchmark (FIUBench), a novel VLM unlearning benchmark designed to robustly evaluate the effectiveness of unlearning algorithms under the Right to be Forgotten setting. Specifically, we formulate the VLM unlearning task by constructing the Fictitious Facial Identity VQA dataset and apply a two-stage evaluation pipeline that is designed to precisely control the sources of information and their exposure levels. In terms of evaluation, since VLMs support various ways of asking questions with the same semantic meaning, we also provide robust evaluation metrics, including membership inference attacks and carefully designed adversarial privacy attacks, to evaluate the performance of algorithms. Through the evaluation of four baseline VLM unlearning algorithms within FIUBench, we find that all methods remain limited in their unlearning performance, with significant trade-offs between model utility and forget quality. Furthermore, our findings also highlight the importance of privacy attacks for robust evaluations. We hope FIUBench will drive progress in developing more effective VLM unlearning algorithms. + +
+
+
+
+
+ + ☆ Enhancing Weakly Supervised Semantic Segmentation for Fibrosis via + Controllable Image Generation + + +
+ Fibrotic Lung Disease (FLD) is a severe condition marked by lung stiffening +and scarring, leading to respiratory decline. High-resolution computed +tomography (HRCT) is critical for diagnosing and monitoring FLD; however, +fibrosis appears as irregular, diffuse patterns with unclear boundaries, +leading to high inter-observer variability and time-intensive manual +annotation. To tackle this challenge, we propose DiffSeg, a novel weakly +supervised semantic segmentation (WSSS) method that uses image-level +annotations to generate pixel-level fibrosis segmentation, reducing the need +for fine-grained manual labeling. Additionally, our DiffSeg incorporates a +diffusion-based generative model to synthesize HRCT images with different +levels of fibrosis from healthy slices, enabling the generation of the +fibrosis-injected slices and their paired fibrosis location. Experiments +indicate that our method significantly improves the accuracy of pseudo masks +generated by existing WSSS methods, greatly reducing the complexity of manual +labeling and enhancing the consistency of the generated masks. + +
+
+
+
+
+ + ☆ Personalized Video Summarization by Multimodal Video Understanding + + +
+ Video summarization techniques have been proven to improve the overall user experience when it comes to accessing and comprehending video content. If the user's preference is known, video summarization can identify significant information or relevant content from an input video, aiding them in obtaining the necessary information or determining their interest in watching the original video. Adapting video summarization to various types of video and user preferences requires significant training data and expensive human labeling. To facilitate such research, we propose a new benchmark for video summarization that captures various user preferences. We also present a pipeline called Video Summarization with Language (VSL) for user-preferred video summarization that is based on pre-trained visual language models (VLMs), avoiding the need to train a video summarization system on a large training dataset. The pipeline takes both video and closed captioning as input and performs semantic analysis at the scene level by converting video frames into text. Subsequently, the user's genre preference is used as the basis for selecting the pertinent textual scenes. The experimental results demonstrate that our proposed pipeline outperforms current state-of-the-art unsupervised video summarization models. We show that our method is more adaptable across different datasets compared to supervised query-based video summarization models. Finally, the runtime analysis demonstrates that our pipeline is more suitable for practical use when scaling up the number of user preferences and videos.
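As a rough illustration of the scene-selection step, the sketch below scores scene-level text descriptions against a genre preference and keeps the top scenes in temporal order. Keyword overlap stands in for the VLM/LLM-based relevance judgment used in the actual pipeline; `summarize_by_preference` is a hypothetical name.

```python
def summarize_by_preference(scene_texts, preference, budget=3):
    """Score each scene-level text description by keyword overlap with the
    user's genre preference and keep the top-`budget` scenes in temporal order."""
    pref_tokens = set(preference.lower().split())
    scored = []
    for idx, text in enumerate(scene_texts):
        tokens = set(text.lower().split())
        scored.append((len(tokens & pref_tokens), idx))
    keep = sorted(sorted(scored, reverse=True)[:budget], key=lambda s: s[1])
    return [idx for _, idx in keep]

scenes = [
    "a detective inspects a crime scene at night",
    "two friends cook pasta and joke in the kitchen",
    "a car chase through narrow city streets",
    "the detective interrogates a nervous suspect",
]
print(summarize_by_preference(scenes, "crime detective thriller", budget=2))  # [0, 3]
```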
+
+ comment: In Proceedings of CIKM 2024 Applied Research Track +
+
+
+
+
+ + ☆ Beyond Complete Shapes: A quantitative Evaluation of 3D Shape Matching + Algorithms + + +
+ Finding correspondences between 3D shapes is an important and long-standing +problem in computer vision, graphics and beyond. While approaches based on +machine learning dominate modern 3D shape matching, almost all existing +(learning-based) methods require that at least one of the involved shapes is +complete. In contrast, the most challenging and arguably most practically +relevant setting of matching partially observed shapes, is currently +underexplored. One important factor is that existing datasets contain only a +small number of shapes (typically below 100), which are unable to serve +data-hungry machine learning approaches, particularly in the unsupervised +regime. In addition, the type of partiality present in existing datasets is +often artificial and far from realistic. To address these limitations and to +encourage research on these relevant settings, we provide a generic and +flexible framework for the procedural generation of challenging partial shape +matching scenarios. Our framework allows for a virtually infinite generation of +partial shape matching instances from a finite set of shapes with complete +geometry. Further, we manually create cross-dataset correspondences between +seven existing (complete geometry) shape matching datasets, leading to a total +of 2543 shapes. Based on this, we propose several challenging partial benchmark +settings, for which we evaluate respective state-of-the-art methods as +baselines. + +
+
+
+
+
+ + ☆ SynthSet: Generative Diffusion Model for Semantic Segmentation in + Precision Agriculture + + +
+ This paper introduces a methodology for generating synthetic annotated data +to address data scarcity in semantic segmentation tasks within the precision +agriculture domain. Utilizing Denoising Diffusion Probabilistic Models (DDPMs) +and Generative Adversarial Networks (GANs), we propose a dual diffusion model +architecture for synthesizing realistic annotated agricultural data, without +any human intervention. We employ super-resolution to enhance the phenotypic +characteristics of the synthesized images and their coherence with the +corresponding generated masks. We showcase the utility of the proposed method +for wheat head segmentation. The high quality of synthesized data underscores +the effectiveness of the proposed methodology in generating image-mask pairs. +Furthermore, models trained on our generated data exhibit promising performance +when tested on an external, diverse dataset of real wheat fields. The results +show the efficacy of the proposed methodology for addressing data scarcity for +semantic segmentation tasks. Moreover, the proposed approach can be readily +adapted for various segmentation tasks in precision agriculture and beyond. + +
+
+
+
+
+ + ☆ An Application-Agnostic Automatic Target Recognition System Using Vision + Language Models + + +
+ We present a novel Automatic Target Recognition (ATR) system using open-vocabulary object detection and classification models. A primary advantage of this approach is that target classes can be defined just before runtime by a non-technical end user, using either a few natural language text descriptions of the target, or a few image exemplars, or both. Nuances in the desired targets can be expressed in natural language, which is useful for unique targets with little or no training data. We also implemented a novel combination of several techniques to improve performance, such as leveraging the additional information in the sequence of overlapping frames to perform tubelet identification (i.e., sequential bounding box matching), bounding box re-scoring, and tubelet linking. Additionally, we developed a technique to visualize the aggregate output of many overlapping frames as a mosaic of the area scanned during the aerial surveillance or reconnaissance, and a kernel density estimate (or heatmap) of the detected targets. We initially applied this ATR system to the use case of detecting and clearing unexploded ordnance on airfield runways, and we are currently extending our research to other real-world applications.
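The tubelet-identification idea (sequential bounding-box matching across overlapping frames) can be sketched with a simple greedy IoU matcher; this is an illustrative baseline, not the system's actual tracker.

```python
def iou(a, b):
    """IoU of two boxes given as (x1, y1, x2, y2)."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter + 1e-9)

def link_tubelets(frames, iou_thr=0.5):
    """Greedy sequential bounding-box matching: a detection extends the tubelet
    whose last box (from the previous frame) overlaps it most, above `iou_thr`;
    otherwise it starts a new tubelet. `frames` is a list of per-frame box lists."""
    tubelets = []                              # each tubelet: list of (frame_idx, box)
    for t, boxes in enumerate(frames):
        for box in boxes:
            best, best_iou = None, iou_thr
            for tube in tubelets:
                last_t, last_box = tube[-1]
                if last_t == t - 1 and iou(last_box, box) > best_iou:
                    best, best_iou = tube, iou(last_box, box)
            if best is not None:
                best.append((t, box))
            else:
                tubelets.append([(t, box)])
    return tubelets

frames = [[(10, 10, 50, 50)], [(12, 11, 52, 49), (200, 200, 240, 240)], [(14, 12, 54, 48)]]
print(len(link_tubelets(frames)))   # 2 tubelets: one tracked object, one isolated detection
```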
+
+ comment: Accepted to the Thirty-Seventh Annual Conference on Innovative + Applications of Artificial Intelligence (IAAI-25) +
+
+
+
+
+ + ☆ Rainfall regression from C-band Synthetic Aperture Radar using + Multi-Task Generative Adversarial Networks + + +
+ This paper introduces a data-driven approach to estimate precipitation rates +from Synthetic Aperture Radar (SAR) at a spatial resolution of 200 meters per +pixel. It addresses previous challenges related to the collocation of SAR and +weather radar data, specifically the misalignment in collocations and the +scarcity of rainfall examples under strong wind. To tackle these challenges, +the paper proposes a multi-objective formulation, introducing patch-level +components and an adversarial component. It exploits the full NEXRAD archive to +look for potential co-locations with Sentinel-1 data. With additional +enhancements to the training procedure and the incorporation of additional +inputs, the resulting model demonstrates improved accuracy in rainfall +estimates and the ability to extend its performance to scenarios up to 15 m/s. + +
+
+ comment: 36 pages, 13 figures +
+
+
+
+
+ + ☆ Self Supervised Networks for Learning Latent Space Representations of + Human Body Scans and Motions + + +
+ This paper introduces self-supervised neural network models to tackle several +fundamental problems in the field of 3D human body analysis and processing. +First, we propose VariShaPE (Varifold Shape Parameter Estimator), a novel +architecture for the retrieval of latent space representations of body shapes +and poses. This network offers a fast and robust method to estimate the +embedding of arbitrary unregistered meshes into the latent space. Second, we +complement the estimation of latent codes with MoGeN (Motion Geometry Network) +a framework that learns the geometry on the latent space itself. This is +achieved by lifting the body pose parameter space into a higher dimensional +Euclidean space in which body motion mini-sequences from a training set of 4D +data can be approximated by simple linear interpolation. Using the SMPL latent +space representation we illustrate how the combination of these network models, +once trained, can be used to perform a variety of tasks with very limited +computational cost. This includes operations such as motion interpolation, +extrapolation and transfer as well as random shape and pose generation. + +
+
+ comment: 23 pages, 11 figures, 6 tables +
+
+
+
+
+ + ☆ TopoTxR: A topology-guided deep convolutional network for breast + parenchyma learning on DCE-MRIs + + +
+ Characterization of breast parenchyma in dynamic contrast-enhanced magnetic +resonance imaging (DCE-MRI) is a challenging task owing to the complexity of +underlying tissue structures. Existing quantitative approaches, like radiomics +and deep learning models, lack explicit quantification of intricate and subtle +parenchymal structures, including fibroglandular tissue. To address this, we +propose a novel topological approach that explicitly extracts multi-scale +topological structures to better approximate breast parenchymal structures, and +then incorporates these structures into a deep-learning-based prediction model +via an attention mechanism. Our topology-informed deep learning model, +\emph{TopoTxR}, leverages topology to provide enhanced insights into tissues +critical for disease pathophysiology and treatment response. We empirically +validate \emph{TopoTxR} using the VICTRE phantom breast dataset, showing that +the topological structures extracted by our model effectively approximate the +breast parenchymal structures. We further demonstrate \emph{TopoTxR}'s efficacy +in predicting response to neoadjuvant chemotherapy. Our qualitative and +quantitative analyses suggest differential topological behavior of breast +tissue in treatment-na\"ive imaging, in patients who respond favorably to +therapy as achieving pathological complete response (pCR) versus those who do +not. In a comparative analysis with several baselines on the publicly available +I-SPY 1 dataset (N=161, including 47 patients with pCR and 114 without) and the +Rutgers proprietary dataset (N=120, with 69 patients achieving pCR and 51 not), +\emph{TopoTxR} demonstrates a notable improvement, achieving a 2.6\% increase +in accuracy and a 4.6\% enhancement in AUC compared to the state-of-the-art +method. + +
+
+ comment: 22 pages, 8 figures, 8 tables, accepted by Medical Image Analysis ( + https://www.sciencedirect.com/science/article/abs/pii/S1361841524002986 ) +
+
+
+
+
+ + ☆ BOston Neonatal Brain Injury Data for Hypoxic Ischemic Encephalopathy + (BONBID-HIE): II. 2-year Neurocognitive Outcome and NICU Outcome + + +
+ Hypoxic Ischemic Encephalopathy (HIE) affects approximately 1-5/1000 newborns +globally and leads to adverse neurocognitive outcomes in 30% to 50% of cases by +two years of age. Despite therapeutic advances with Therapeutic Hypothermia +(TH), prognosis remains challenging, highlighting the need for improved +biomarkers. This paper introduces the second release of the Boston Neonatal +Brain Injury Dataset for Hypoxic-Ischemic Encephalopathy (BONBID-HIE), an +open-source, comprehensive MRI and clinical dataset featuring 237 patients, +including NICU outcomes and 2-year neurocognitive outcomes from Massachusetts +General Hospital and Boston Children's Hospital. + +
+
+ comment: Data description for BONBID-HIE 2024 Challenge on MICCAI 2024 +
+
+
+
+
+ + ☆ Solving Trojan Detection Competitions with Linear Weight Classification + + +
+ Neural networks can conceal malicious Trojan backdoors that allow a trigger +to covertly change the model behavior. Detecting signs of these backdoors, +particularly without access to any triggered data, is the subject of ongoing +research and open challenges. In one common formulation of the problem, we are +given a set of clean and poisoned models and need to predict whether a given +test model is clean or poisoned. In this paper, we introduce a detector that +works remarkably well across many of the existing datasets and domains. It is +obtained by training a binary classifier on a large number of models' weights +after performing a few different pre-processing steps including feature +selection and standardization, reference model weights subtraction, and model +alignment prior to detection. We evaluate this algorithm on a diverse set of +Trojan detection benchmarks and domains and examine the cases where the +approach is most and least effective. + +
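A toy version of the weight-classification recipe might look like the sketch below: subtract a reference model's weights, keep a subset of coordinates, standardize, and fit a linear classifier. The data here is synthetic and the pre-processing is a simplification of the steps listed in the abstract.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def featurize(weight_vectors, reference):
    """Reference subtraction, crude variance-based feature selection, then
    per-coordinate standardization (a simplified pre-processing pipeline)."""
    x = np.asarray(weight_vectors) - reference
    top = np.argsort(x.var(axis=0))[-256:]           # keep highest-variance coords
    x = x[:, top]
    return (x - x.mean(axis=0)) / (x.std(axis=0) + 1e-8)

# Synthetic toy data: 200 "models", each flattened to 4096 weights; poisoned
# models receive a consistent shift on a random subset of coordinates.
rng = np.random.default_rng(0)
n, d = 200, 4096
weights = rng.normal(0.0, 1.0, (n, d))
labels = rng.integers(0, 2, n)
trigger_dims = rng.choice(d, 64, replace=False)
weights[np.ix_(np.flatnonzero(labels == 1), trigger_dims)] += 1.0

reference = weights[labels == 0].mean(axis=0)        # stand-in for a clean reference model
features = featurize(weights, reference)
clf = LogisticRegression(max_iter=1000)
print(cross_val_score(clf, features, labels, cv=5).mean())   # well above chance on this toy
```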
+
+ comment: 9 pages, 4 Figures +
+
+
+
+
+ + ☆ Fine-Grained Spatial and Verbal Losses for 3D Visual Grounding + + +
+ 3D visual grounding consists of identifying the instance in a 3D scene which +is referred by an accompanying language description. While several +architectures have been proposed within the commonly employed +grounding-by-selection framework, the utilized losses are comparatively +under-explored. In particular, most methods rely on a basic supervised +cross-entropy loss on the predicted distribution over candidate instances, +which fails to model both spatial relations between instances and the internal +fine-grained word-level structure of the verbal referral. Sparse attempts to +additionally supervise verbal embeddings globally by learning the class of the +referred instance from the description or employing verbo-visual contrast to +better separate instance embeddings do not fundamentally lift the +aforementioned limitations. Responding to these shortcomings, we introduce two +novel losses for 3D visual grounding: a visual-level offset loss on regressed +vector offsets from each instance to the ground-truth referred instance and a +language-related span loss on predictions for the word-level span of the +referred instance in the description. In addition, we equip the verbo-visual +fusion module of our new 3D visual grounding architecture AsphaltNet with a +top-down bidirectional attentive fusion block, which enables the supervisory +signals from our two losses to propagate to the respective converse branches of +the network and thus aid the latter to learn context-aware instance embeddings +and grounding-aware verbal embeddings. AsphaltNet proposes novel auxiliary +losses to aid 3D visual grounding with competitive results compared to the +state-of-the-art on the ReferIt3D benchmark. + +
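A hedged sketch of what the two auxiliary losses could look like in PyTorch is shown below; `offset_loss` and `span_loss` are illustrative names, and the exact formulations in AsphaltNet may differ.

```python
import torch
import torch.nn.functional as F

def offset_loss(pred_offsets, inst_centers, ref_center):
    """Visual-level offset loss: each instance regresses the vector from its
    own center to the center of the ground-truth referred instance."""
    target = ref_center.unsqueeze(0) - inst_centers          # (N, 3)
    return F.smooth_l1_loss(pred_offsets, target)

def span_loss(start_logits, end_logits, span_start, span_end):
    """Language-related span loss: cross-entropy on the predicted start/end
    word indices of the referred instance's mention in the description."""
    return (F.cross_entropy(start_logits.unsqueeze(0), span_start) +
            F.cross_entropy(end_logits.unsqueeze(0), span_end)) / 2

N, L = 8, 20                                  # instances in the scene, words in the description
pred_offsets = torch.randn(N, 3, requires_grad=True)
inst_centers = torch.randn(N, 3)
ref_center = inst_centers[2]                  # instance 2 is the referred one
start_logits, end_logits = torch.randn(L), torch.randn(L)
loss = offset_loss(pred_offsets, inst_centers, ref_center) + \
       span_loss(start_logits, end_logits, torch.tensor([4]), torch.tensor([6]))
loss.backward()
print(float(loss))
```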
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ☆ Enhancing Maritime Situational Awareness through End-to-End Onboard Raw + Data Analysis + + +
+ Satellite-based onboard data processing is crucial for time-sensitive +applications requiring timely and efficient rapid response. Advances in edge +artificial intelligence are shifting computational power from ground-based +centers to on-orbit platforms, transforming the +"sensing-communication-decision-feedback" cycle and reducing latency from +acquisition to delivery. The current research presents a framework addressing +the strict bandwidth, energy, and latency constraints of small satellites, +focusing on maritime monitoring. The study contributes three main innovations. +Firstly, it investigates the application of deep learning techniques for direct +ship detection and classification from raw satellite imagery. By simplifying +the onboard processing chain, our approach facilitates direct analyses without +requiring computationally intensive steps such as calibration and +ortho-rectification. Secondly, to address the scarcity of raw satellite data, +we introduce two novel datasets, VDS2Raw and VDV2Raw, which are derived from +raw data from Sentinel-2 and Vegetation and Environment Monitoring New Micro +Satellite (VENuS) missions, respectively, and enriched with Automatic +Identification System (AIS) records. Thirdly, we characterize the tasks' +optimal single and multiple spectral band combinations through statistical and +feature-based analyses validated on both datasets. In sum, we demonstrate the +feasibility of the proposed method through a proof-of-concept on CubeSat-like +hardware, confirming the models' potential for operational satellite-based +maritime monitoring. + +
+
+ comment: 38 pages +
+
+
+
+
+ + ♻ ☆ DAAL: Density-Aware Adaptive Line Margin Loss for Multi-Modal Deep + Metric Learning + + +
+ Multi-modal deep metric learning is crucial for effectively capturing diverse +representations in tasks such as face verification, fine-grained object +recognition, and product search. Traditional approaches to metric learning, +whether based on distance or margin metrics, primarily emphasize class +separation, often overlooking the intra-class distribution essential for +multi-modal feature learning. In this context, we propose a novel loss function +called Density-Aware Adaptive Margin Loss(DAAL), which preserves the density +distribution of embeddings while encouraging the formation of adaptive +sub-clusters within each class. By employing an adaptive line strategy, DAAL +not only enhances intra-class variance but also ensures robust inter-class +separation, facilitating effective multi-modal representation. Comprehensive +experiments on benchmark fine-grained datasets demonstrate the superior +performance of DAAL, underscoring its potential in advancing retrieval +applications and multi-modal deep metric learning. + +
+
+ comment: 13 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Cognitive Planning for Object Goal Navigation using Generative AI Models + + +
+ Recent advancements in Generative AI, particularly in Large Language Models +(LLMs) and Large Vision-Language Models (LVLMs), offer new possibilities for +integrating cognitive planning into robotic systems. In this work, we present a +novel framework for solving the object goal navigation problem that generates +efficient exploration strategies. Our approach enables a robot to navigate +unfamiliar environments by leveraging LLMs and LVLMs to understand the semantic +structure of the scene. To address the challenge of representing complex +environments without overwhelming the system, we propose a 3D modular scene +representation, enriched with semantic descriptions. This representation is +dynamically pruned using an LLM-based mechanism, which filters irrelevant +information and focuses on task-specific data. By combining these elements, our +system generates high-level sub-goals that guide the exploration of the robot +toward the target object. We validate our approach in simulated environments, +demonstrating its ability to enhance object search efficiency while maintaining +scalability in complex settings. + +
+
+
+
+
+ + ♻ ☆ Digi2Real: Bridging the Realism Gap in Synthetic Data Face Recognition + via Foundation Models + + +
+ The accuracy of face recognition systems has improved significantly in the +past few years, thanks to the large amount of data collected and the +advancement in neural network architectures. However, these large-scale +datasets are often collected without explicit consent, raising ethical and +privacy concerns. To address this, there have been proposals to use synthetic +datasets for training face recognition models. Yet, such models still rely on +real data to train the generative models and generally exhibit inferior +performance compared to those trained on real datasets. One of these datasets, +DigiFace, uses a graphics pipeline to generate different identities and +different intra-class variations without using real data in training the +models. However, the performance of this approach is poor on face recognition +benchmarks, possibly due to the lack of realism in the images generated from +the graphics pipeline. In this work, we introduce a novel framework for realism +transfer aimed at enhancing the realism of synthetically generated face images. +Our method leverages the large-scale face foundation model, and we adapt the +pipeline for realism enhancement. By integrating the controllable aspects of +the graphics pipeline with our realism enhancement technique, we generate a +large amount of realistic variations-combining the advantages of both +approaches. Our empirical evaluations demonstrate that models trained using our +enhanced dataset significantly improve the performance of face recognition +systems over the baseline. The source code and datasets will be made available +publicly. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Decoupled Pseudo-labeling for Semi-Supervised Monocular 3D Object + Detection CVPR2024 + + +
+ We delve into pseudo-labeling for semi-supervised monocular 3D object +detection (SSM3OD) and discover two primary issues: a misalignment between the +prediction quality of 3D and 2D attributes and the tendency of depth +supervision derived from pseudo-labels to be noisy, leading to significant +optimization conflicts with other reliable forms of supervision. We introduce a +novel decoupled pseudo-labeling (DPL) approach for SSM3OD. Our approach +features a Decoupled Pseudo-label Generation (DPG) module, designed to +efficiently generate pseudo-labels by separately processing 2D and 3D +attributes. This module incorporates a unique homography-based method for +identifying dependable pseudo-labels in BEV space, specifically for 3D +attributes. Additionally, we present a DepthGradient Projection (DGP) module to +mitigate optimization conflicts caused by noisy depth supervision of +pseudo-labels, effectively decoupling the depth gradient and removing +conflicting gradients. This dual decoupling strategy-at both the pseudo-label +generation and gradient levels-significantly improves the utilization of +pseudo-labels in SSM3OD. Our comprehensive experiments on the KITTI benchmark +demonstrate the superiority of our method over existing approaches. + +
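The gradient-decoupling step can be illustrated with a standard conflicting-gradient projection (in the spirit of methods such as PCGrad); the paper's DepthGradient Projection module may differ in detail.

```python
import torch

def project_out_conflict(g_noisy, g_reliable, eps=1e-12):
    """If the noisy (depth) gradient points against the reliable gradient
    (negative inner product), remove its conflicting component so the update
    never undoes progress on the reliable supervision."""
    dot = torch.dot(g_noisy.flatten(), g_reliable.flatten())
    if dot < 0:
        g_noisy = g_noisy - dot / (g_reliable.norm() ** 2 + eps) * g_reliable
    return g_noisy

g_depth = torch.tensor([1.0, -2.0])     # gradient from noisy pseudo-label depth loss
g_clean = torch.tensor([1.0, 1.0])      # gradient from reliable supervision
g_fixed = project_out_conflict(g_depth, g_clean)
print(g_fixed, torch.dot(g_fixed, g_clean))   # conflicting part removed, dot product now >= 0
```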
+
+ comment: accepted to CVPR2024 +
+
+
+
+
+ + ♻ ☆ Contextual Knowledge Pursuit for Faithful Visual Synthesis ECCV 2024 + + +
+ Modern text-to-vision generative models often hallucinate when the prompt +describing the scene to be generated is underspecified. In large language +models (LLMs), a prevalent strategy to reduce hallucinations is to retrieve +factual knowledge from an external database. While such retrieval augmentation +strategies have great potential to enhance text-to-vision generators, existing +static top-K retrieval methods explore the knowledge pool once, missing the +broader context necessary for high-quality generation. Furthermore, LLMs +internally possess rich world knowledge learned during large-scale training +(parametric knowledge) that could mitigate the need for external data +retrieval. This paper proposes Contextual Knowledge Pursuit (CKPT), a framework +that leverages the complementary strengths of external and parametric knowledge +to help generators produce reliable visual content. Instead of the one-time +retrieval of facts from an external database to improve a given prompt, CKPT +uses (1) an LLM to decide whether to seek external knowledge or to self-elicit +descriptions from LLM parametric knowledge, (2) a knowledge pursuit process to +contextually seek and sequentially gather most relevant facts, (3) a knowledge +aggregator for prompt enhancement with the gathered fact context, and (4) a +filtered fine-tuning objective to improve visual synthesis with richer prompts. +We evaluate CKPT across multiple text-driven generative tasks (image, 3D +rendering, and video) on datasets of rare objects and daily scenarios. Our +results show that CKPT is capable of generating faithful and semantically rich +content across diverse visual domains, offering a promising data source for +zero-shot synthesis and filtered fine-tuning of text-to-vision generative +models. + +
+
+ comment: Accepted in ECCV 2024 SDCV Workshop. GitHub repository at + https://github.com/peterljq/Contextual-Knowledge-Pursuit +
+
+
+
+
+ + ♻ ☆ DeBaRA: Denoising-Based 3D Room Arrangement Generation NeurIPS 2024 + + +
+ Generating realistic and diverse layouts of furnished indoor 3D scenes +unlocks multiple interactive applications impacting a wide range of industries. +The inherent complexity of object interactions, the limited amount of available +data and the requirement to fulfill spatial constraints all make generative +modeling for 3D scene synthesis and arrangement challenging. Current methods +address these challenges autoregressively or by using off-the-shelf diffusion +objectives by simultaneously predicting all attributes without 3D reasoning +considerations. In this paper, we introduce DeBaRA, a score-based model +specifically tailored for precise, controllable and flexible arrangement +generation in a bounded environment. We argue that the most critical component +of a scene synthesis system is to accurately establish the size and position of +various objects within a restricted area. Based on this insight, we propose a +lightweight conditional score-based model designed with 3D spatial awareness at +its core. We demonstrate that by focusing on spatial attributes of objects, a +single trained DeBaRA model can be leveraged at test time to perform several +downstream applications such as scene synthesis, completion and re-arrangement. +Further, we introduce a novel Self Score Evaluation procedure so it can be +optimally employed alongside external LLM models. We evaluate our approach +through extensive experiments and demonstrate significant improvement upon +state-of-the-art approaches in a range of scenarios. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Seeing Eye to AI: Comparing Human Gaze and Model Attention in Video + Memorability + + +
+ Understanding what makes a video memorable has important applications in +advertising or education technology. Towards this goal, we investigate +spatio-temporal attention mechanisms underlying video memorability. Different +from previous works that fuse multiple features, we adopt a simple +CNN+Transformer architecture that enables analysis of spatio-temporal attention +while matching state-of-the-art (SoTA) performance on video memorability +prediction. We compare model attention against human gaze fixations collected +through a small-scale eye-tracking study where humans perform the video memory +task. We uncover the following insights: (i) Quantitative saliency metrics show +that our model, trained only to predict a memorability score, exhibits similar +spatial attention patterns to human gaze, especially for more memorable videos. +(ii) The model assigns greater importance to initial frames in a video, +mimicking human attention patterns. (iii) Panoptic segmentation reveals that +both (model and humans) assign a greater share of attention to things and less +attention to stuff as compared to their occurrence probability. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Re-assembling the past: The RePAIR dataset and benchmark for real world + 2D and 3D puzzle solving NeurIPS 2024 + + +
+ This paper proposes the RePAIR dataset that represents a challenging +benchmark to test modern computational and data driven methods for +puzzle-solving and reassembly tasks. Our dataset has unique properties that are +uncommon to current benchmarks for 2D and 3D puzzle solving. The fragments and +fractures are realistic, caused by a collapse of a fresco during a World War II +bombing at the Pompeii archaeological park. The fragments are also eroded and +have missing pieces with irregular shapes and different dimensions, challenging +further the reassembly algorithms. The dataset is multi-modal providing high +resolution images with characteristic pictorial elements, detailed 3D scans of +the fragments and meta-data annotated by the archaeologists. Ground truth has +been generated through several years of unceasing fieldwork, including the +excavation and cleaning of each fragment, followed by manual puzzle solving by +archaeologists of a subset of approx. 1000 pieces among the 16000 available. +After digitizing all the fragments in 3D, a benchmark was prepared to challenge +current reassembly and puzzle-solving methods that often solve more simplistic +synthetic scenarios. The tested baselines show that there clearly exists a gap +to fill in solving this computationally complex problem. + +
+
+ comment: NeurIPS 2024, Track Datasets and Benchmarks, 10 pages +
+
+
+
+
+ + ♻ ☆ Attention-based Class-Conditioned Alignment for Multi-Source Domain + Adaptation of Object Detectors + + +
+ Domain adaptation methods for object detection (OD) strive to mitigate the +impact of distribution shifts by promoting feature alignment across source and +target domains. Multi-source domain adaptation (MSDA) allows leveraging +multiple annotated source datasets and unlabeled target data to improve the +accuracy and robustness of the detection model. Most state-of-the-art MSDA +methods for OD perform feature alignment in a class-agnostic manner. This is +challenging since the objects have unique modal information due to variations +in object appearance across domains. A recent prototype-based approach proposed +a class-wise alignment, yet it suffers from error accumulation due to noisy +pseudo-labels that can negatively affect adaptation with imbalanced data. To +overcome these limitations, we propose an attention-based class-conditioned +alignment method for MSDA that aligns instances of each object category across +domains. In particular, an attention module coupled with an adversarial domain +classifier allows learning domain-invariant and class-specific instance +representations. Experimental results on multiple benchmarking MSDA datasets +indicate that our method outperforms the state-of-the-art methods and is robust +to class imbalance using a conceptually simple class-conditioning method. Our +code is available at https://github.com/imatif17/ACIA. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2309.14950 +
+
+
+
+
+ + ♻ ☆ GlobalDoc: A Cross-Modal Vision-Language Framework for Real-World + Document Image Retrieval and Classification + + +
+ Visual document understanding (VDU) has rapidly advanced with the development +of powerful multi-modal language models. However, these models typically +require extensive document pre-training data to learn intermediate +representations and often suffer a significant performance drop in real-world +online industrial settings. A primary issue is their heavy reliance on OCR +engines to extract local positional information within document pages, which +limits the models' ability to capture global information and hinders their +generalizability, flexibility, and robustness. In this paper, we introduce +GlobalDoc, a cross-modal transformer-based architecture pre-trained in a +self-supervised manner using three novel pretext objective tasks. GlobalDoc +improves the learning of richer semantic concepts by unifying language and +visual representations, resulting in more transferable models. For proper +evaluation, we also propose two novel document-level downstream VDU tasks, +Few-Shot Document Image Classification (DIC) and Content-based Document Image +Retrieval (DIR), designed to simulate industrial scenarios more closely. +Extensive experimentation has been conducted to demonstrate GlobalDoc's +effectiveness in practical settings. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ SynCo: Synthetic Hard Negatives in Contrastive Learning for Better + Unsupervised Visual Representations + + +
+ Contrastive learning has become a dominant approach in self-supervised visual +representation learning. Hard negatives - samples closely resembling the anchor +- are key to enhancing learned representations' discriminative power. However, +efficiently leveraging hard negatives remains challenging. We introduce SynCo +(Synthetic Negatives in Contrastive learning), a novel approach that improves +model performance by generating synthetic hard negatives on the representation +space. Building on the MoCo framework, SynCo introduces six strategies for +creating diverse synthetic hard negatives on-the-fly with minimal computational +overhead. SynCo achieves faster training and better representation learning, +reaching 67.9% top-1 accuracy on ImageNet ILSVRC-2012 linear evaluation after +200 pretraining epochs, surpassing MoCo's 67.5% using the same ResNet-50 +encoder. It also transfers more effectively to detection tasks: on PASCAL VOC, +it outperforms both the supervised baseline and MoCo with 82.5% AP; on COCO, it +sets new benchmarks with 40.9% AP for bounding box detection and 35.5% AP for +instance segmentation. Our synthetic hard negative generation approach +significantly enhances visual representations learned through self-supervised +contrastive learning. Code is available at +https://github.com/giakoumoglou/synco. + +
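One way to generate synthetic hard negatives on the representation space is sketched below: take the queue negatives most similar to the anchor, mix random pairs of them, and re-normalize onto the unit hypersphere. This is a single illustrative strategy, whereas SynCo combines six.

```python
import torch
import torch.nn.functional as F

def synthetic_hard_negatives(anchor, queue, n_hard=16, n_synth=8):
    """Pick the `n_hard` queue negatives most similar to the anchor, then build
    synthetic negatives by mixing random pairs of them and re-normalizing."""
    sims = queue @ anchor                         # (K,) cosine sims; inputs are L2-normalized
    hard = queue[sims.topk(n_hard).indices]       # (n_hard, D)
    i = torch.randint(0, n_hard, (n_synth,))
    j = torch.randint(0, n_hard, (n_synth,))
    alpha = torch.rand(n_synth, 1)
    mixed = alpha * hard[i] + (1 - alpha) * hard[j]
    return F.normalize(mixed, dim=1)              # back on the unit hypersphere

D, K = 128, 4096
anchor = F.normalize(torch.randn(D), dim=0)
queue = F.normalize(torch.randn(K, D), dim=1)
negs = synthetic_hard_negatives(anchor, queue)
print(negs.shape)                                 # torch.Size([8, 128])
```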
+
+ comment: 10 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Tencent Hunyuan3D-1.0: A Unified Framework for Text-to-3D and + Image-to-3D Generation + + +
+ While 3D generative models have greatly improved artists' workflows, the existing diffusion models for 3D generation suffer from slow generation and poor generalization. To address this issue, we propose a two-stage approach named Hunyuan3D-1.0, including a lite version and a standard version, both of which support text- and image-conditioned generation. In the first stage, we employ a multi-view diffusion model that efficiently generates multi-view RGB in approximately 4 seconds. These multi-view images capture rich details of the 3D asset from different viewpoints, relaxing the task from single-view to multi-view reconstruction. In the second stage, we introduce a feed-forward reconstruction model that rapidly and faithfully reconstructs the 3D asset given the generated multi-view images in approximately 7 seconds. The reconstruction network learns to handle the noise and inconsistency introduced by the multi-view diffusion and leverages the available information from the condition image to efficiently recover the 3D structure. Our framework involves the text-to-image model, i.e., Hunyuan-DiT, making it a unified framework that supports both text- and image-conditioned 3D generation. Our standard version has 3x more parameters than our lite version and other existing models. Our Hunyuan3D-1.0 achieves an impressive balance between speed and quality, significantly reducing generation time while maintaining the quality and diversity of the produced assets.
+
+ comment: Technical Report; 3D Generation +
+
+
+
+
+ + ♻ ☆ A Framework for Real-Time Volcano-Seismic Event Recognition Based on + Multi-Station Seismograms and Semantic Segmentation Models + + +
+ In volcano monitoring, effective recognition of seismic events is essential for understanding volcanic activity and raising timely warning alerts. Traditional methods rely on manual analysis, which can be subjective and labor-intensive. Furthermore, current automatic approaches often tackle detection and classification separately, mostly rely on single-station information, and generally require tailored preprocessing and representations to perform predictions. These limitations often hinder their application to real-time monitoring and utilization across different volcano conditions. This study introduces a novel approach that utilizes semantic segmentation models to automate seismic event recognition by applying a straightforward transformation of multi-channel 1D signals into 2D representations, enabling their use as images. Our framework employs a data-driven, end-to-end design that integrates multi-station seismic data with minimal preprocessing, performing both detection and classification simultaneously for five seismic event classes. We evaluated four state-of-the-art segmentation models (UNet, UNet++, DeepLabV3+ and SwinUNet) on approximately 25,000 seismic events recorded at four different Chilean volcanoes: Nevados del Chill\'an Volcanic Complex, Laguna del Maule, Villarrica and Puyehue-Cord\'on Caulle. Among these models, the UNet architecture was identified as the most effective, achieving mean F1 and Intersection over Union (IoU) scores of up to 0.91 and 0.88, respectively, and demonstrating superior noise robustness and flexibility on unseen volcano datasets.
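The 1D-to-2D transformation could be as simple as stacking per-station, per-component traces into rows of an image, as in the sketch below; the exact representation used in the paper may differ, and `signals_to_image` is a hypothetical helper.

```python
import numpy as np

def signals_to_image(signals, out_height=224):
    """Stack multi-station, multi-channel 1D seismograms into a 2D array so a
    standard image segmentation network (UNet, DeepLabV3+, ...) can label each
    time step. Rows are channels, columns are time; each row is min-max scaled
    and the rows are tiled to reach the desired image height."""
    x = np.asarray(signals, dtype=float)                      # (n_channels, n_samples)
    mn, mx = x.min(axis=1, keepdims=True), x.max(axis=1, keepdims=True)
    x = (x - mn) / (mx - mn + 1e-9)
    reps = int(np.ceil(out_height / x.shape[0]))
    return np.tile(x, (reps, 1))[:out_height]                 # (out_height, n_samples)

# 4 stations x 3 components, a 10-second window at 100 Hz (illustrative numbers).
signals = np.random.randn(12, 1000)
image = signals_to_image(signals)
print(image.shape)          # (224, 1000) -> treat as a 1-channel "image"
```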
+
+ comment: 10 pages, 9 figures. This is a preprint; it is currently under review for publication +
+
+
+
+
+ + ♻ ☆ Better, Not Just More: Data-Centric Machine Learning for Earth + Observation + + +
+ Recent developments and research in modern machine learning have led to +substantial improvements in the geospatial field. Although numerous deep +learning architectures and models have been proposed, the majority of them have +been solely developed on benchmark datasets that lack strong real-world +relevance. Furthermore, the performance of many methods has already saturated +on these datasets. We argue that a shift from a model-centric view to a +complementary data-centric perspective is necessary for further improvements in +accuracy, generalization ability, and real impact on end-user applications. +Furthermore, considering the entire machine learning cycle-from problem +definition to model deployment with feedback-is crucial for enhancing machine +learning models that can be reliable in unforeseen situations. This work +presents a definition as well as a precise categorization and overview of +automated data-centric learning approaches for geospatial data. It highlights +the complementary role of data-centric learning with respect to model-centric +in the larger machine learning deployment cycle. We review papers across the +entire geospatial field and categorize them into different groups. A set of +representative experiments shows concrete implementation examples. These +examples provide concrete steps to act on geospatial data with data-centric +machine learning approaches. + +
+
+ comment: Accepted to Geoscience and Remote Sensing Magazine +
+
+
+
+
+ + ♻ ☆ Robustly overfitting latents for flexible neural image compression NeurIPS + + +
+ Neural image compression has made a great deal of progress. State-of-the-art models are based on variational autoencoders and are outperforming classical models. Neural compression models learn to encode an image into a quantized latent representation that can be efficiently sent to the decoder, which decodes the quantized latent into a reconstructed image. While these models have proven successful in practice, they lead to sub-optimal results due to imperfect optimization and limitations in the encoder and decoder capacity. Recent work shows how to use stochastic Gumbel annealing (SGA) to refine the latents of pre-trained neural image compression models. We extend this idea by introducing SGA+, which contains three different methods that build upon SGA. We show how our method improves the overall compression performance in terms of the R-D trade-off compared to its predecessors. Additionally, we show how refinement of the latents with our best-performing method improves the compression performance on both the Tecnick and CLIC datasets. Our method is deployed for a pre-trained hyperprior and for a more flexible model. Further, we give a detailed analysis of our proposed methods and show that they are less sensitive to hyperparameter choices. Finally, we show how each method can be extended to three- instead of two-class rounding.
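A minimal sketch of test-time latent refinement under a rate-distortion objective is given below. It uses straight-through rounding and a toy decoder and rate proxy purely for illustration; the paper's SGA/SGA+ methods use stochastic Gumbel annealing rather than this straight-through relaxation, and the real decoder and entropy model come from the pre-trained compression network.

```python
import torch
import torch.nn as nn

# Toy stand-ins for a pre-trained decoder and entropy model.
decoder = nn.Sequential(nn.ConvTranspose2d(8, 3, 4, stride=2, padding=1))
def rate_proxy(y):                             # crude differentiable bit-cost surrogate
    return torch.log1p(y.abs()).sum()

def refine_latents(y_init, target, lam=0.01, steps=100, lr=1e-2):
    """Test-time refinement: treat the latents as free parameters and minimize
    distortion + lambda * rate, with straight-through quantization."""
    y = y_init.clone().requires_grad_(True)
    opt = torch.optim.Adam([y], lr=lr)
    for _ in range(steps):
        y_hat = y + (torch.round(y) - y).detach()      # straight-through rounding
        x_hat = decoder(y_hat)
        loss = ((x_hat - target) ** 2).mean() + lam * rate_proxy(y_hat)
        opt.zero_grad(); loss.backward(); opt.step()
    return torch.round(y.detach())

target = torch.rand(1, 3, 32, 32)                      # image to (re-)compress
y0 = torch.randn(1, 8, 16, 16)                         # latents from the encoder
y_refined = refine_latents(y0, target)
print(y_refined.shape)
```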
+
+ comment: Accepted at Neural Information Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ♻ ☆ Deep Priors for Video Quality Prediction + + +
+ In this work, we designed a completely blind video quality assessment +algorithm using the deep video prior. This work mainly explores the utility of +deep video prior in estimating the visual quality of the video. In our work, we +have used a single distorted video and a reference video pair to learn the deep +video prior. At inference time, the learned deep prior is used to restore the +original videos from the distorted videos. The ability of learned deep video +prior to restore the original video from the distorted video is measured to +quantify distortion in the video. Our hypothesis is that the learned deep video +prior fails in restoring the highly distorted videos. The restoring ability of +deep video prior is proportional to the distortion present in the video. +Therefore, we propose to use the distance between the distorted video and the +restored video as the perceptual quality of the video. Our algorithm is trained +using a single video pair and it does not need any labelled data. We show that +our proposed algorithm outperforms the existing unsupervised video quality +assessment algorithms in terms of LCC and SROCC on a synthetically distorted +video quality assessment dataset. + +
+
+ comment: Indian Conference on Computer Vision, Graphics and Image Processing (ICVGIP) 2024 conference tiny paper +
+
+
+
+
+ + ♻ ☆ Asynchronous Perception Machine For Efficient Test-Time-Training NeurIPS 2024 + + +
+ In this work, we propose the Asynchronous Perception Machine (APM), a computationally efficient architecture for test-time training (TTT). APM can process patches of an image one at a time in any order asymmetrically and still encode semantic awareness in the net. We demonstrate APM's ability to recognize out-of-distribution images without dataset-specific pre-training, augmentation or any pretext task. APM offers competitive performance over existing TTT approaches. To perform TTT, APM distills a test sample's representation only once. APM possesses a unique property: it can learn using just this single representation and starts predicting semantically aware features. APM demonstrates potential applications beyond test-time training: it can scale up to a dataset of 2D images and yield semantic clusterings in a single forward pass. APM also provides the first empirical evidence towards validating GLOM's insight, i.e., that the input percept is a field. Therefore, APM helps us converge towards an implementation which can do both interpolation and perception on shared connectionist hardware. Our code is publicly available at this link: https://rajatmodi62.github.io/apm_project_page/.
+
+ comment: Accepted to NeurIPS 2024 Main Track. APM is a step to getting + Geoffrey Hinton's GLOM working +
+
+
+
+
+ + ♻ ☆ Dynamic Typography: Bringing Text to Life via Video Diffusion Prior + + +
+ Text animation serves as an expressive medium, transforming static +communication into dynamic experiences by infusing words with motion to evoke +emotions, emphasize meanings, and construct compelling narratives. Crafting +animations that are semantically aware poses significant challenges, demanding +expertise in graphic design and animation. We present an automated text +animation scheme, termed "Dynamic Typography", which combines two challenging +tasks. It deforms letters to convey semantic meaning and infuses them with +vibrant movements based on user prompts. Our technique harnesses vector +graphics representations and an end-to-end optimization-based framework. This +framework employs neural displacement fields to convert letters into base +shapes and applies per-frame motion, encouraging coherence with the intended +textual concept. Shape preservation techniques and perceptual loss +regularization are employed to maintain legibility and structural integrity +throughout the animation process. We demonstrate the generalizability of our +approach across various text-to-video models and highlight the superiority of +our end-to-end methodology over baseline methods, which might comprise separate +tasks. Through quantitative and qualitative evaluations, we demonstrate the +effectiveness of our framework in generating coherent text animations that +faithfully interpret user prompts while maintaining readability. Our code is +available at: https://animate-your-word.github.io/demo/. + +
+
+ comment: Our demo and code is available at: + https://animate-your-word.github.io/demo/ +
+
+
+
+
+ + ♻ ☆ FakeShield: Explainable Image Forgery Detection and Localization via + Multi-modal Large Language Models + + +
+ The rapid development of generative AI is a double-edged sword, which not +only facilitates content creation but also makes image manipulation easier and +more difficult to detect. Although current image forgery detection and +localization (IFDL) methods are generally effective, they tend to face two +challenges: \textbf{1)} black-box nature with unknown detection principle, +\textbf{2)} limited generalization across diverse tampering methods (e.g., +Photoshop, DeepFake, AIGC-Editing). To address these issues, we propose the +explainable IFDL task and design FakeShield, a multi-modal framework capable of +evaluating image authenticity, generating tampered region masks, and providing +a judgment basis based on pixel-level and image-level tampering clues. +Additionally, we leverage GPT-4o to enhance existing IFDL datasets, creating +the Multi-Modal Tamper Description dataSet (MMTD-Set) for training FakeShield's +tampering analysis capabilities. Meanwhile, we incorporate a Domain Tag-guided +Explainable Forgery Detection Module (DTE-FDM) and a Multi-modal Forgery +Localization Module (MFLM) to address various types of tamper detection +interpretation and achieve forgery localization guided by detailed textual +descriptions. Extensive experiments demonstrate that FakeShield effectively +detects and localizes various tampering techniques, offering an explainable and +superior solution compared to previous IFDL methods. + +
+
+
+
+
+ + ♻ ☆ ACE: All-round Creator and Editor Following Instructions via Diffusion + Transformer + + +
+ Diffusion models have emerged as a powerful generative technology and have +been found to be applicable in various scenarios. Most existing foundational +diffusion models are primarily designed for text-guided visual generation and +do not support multi-modal conditions, which are essential for many visual +editing tasks. This limitation prevents these foundational diffusion models +from serving as a unified model in the field of visual generation, like GPT-4 +in the natural language processing field. In this work, we propose ACE, an +All-round Creator and Editor, which achieves comparable performance compared to +those expert models in a wide range of visual generation tasks. To achieve this +goal, we first introduce a unified condition format termed Long-context +Condition Unit (LCU), and propose a novel Transformer-based diffusion model +that uses LCU as input, aiming for joint training across various generation and +editing tasks. Furthermore, we propose an efficient data collection approach to +address the issue of the absence of available training data. It involves +acquiring pairwise images with synthesis-based or clustering-based pipelines +and supplying these pairs with accurate textual instructions by leveraging a +fine-tuned multi-modal large language model. To comprehensively evaluate the +performance of our model, we establish a benchmark of manually annotated pairs +data across a variety of visual generation tasks. The extensive experimental +results demonstrate the superiority of our model in visual generation fields. +Thanks to the all-in-one capabilities of our model, we can easily build a +multi-modal chat system that responds to any interactive request for image +creation using a single model to serve as the backend, avoiding the cumbersome +pipeline typically employed in visual agents. Code and models will be available +on the project page: https://ali-vilab.github.io/ace-page/. + +
+
+
+
+
+ + ♻ ☆ Blind Image Restoration via Fast Diffusion Inversion + + +
+ Image Restoration (IR) methods based on a pre-trained diffusion model have +demonstrated state-of-the-art performance. However, they have two fundamental +limitations: 1) they often assume that the degradation operator is completely +known and 2) they alter the diffusion sampling process, which may result in +restored images that do not lie onto the data manifold. To address these +issues, we propose Blind Image Restoration via fast Diffusion inversion (BIRD) +a blind IR method that jointly optimizes for the degradation model parameters +and the restored image. To ensure that the restored images lie onto the data +manifold, we propose a novel sampling technique on a pre-trained diffusion +model. A key idea in our method is not to modify the reverse sampling, i.e, not +to alter all the intermediate latents, once an initial noise is sampled. This +is ultimately equivalent to casting the IR task as an optimization problem in +the space of the input noise. Moreover, to mitigate the computational cost +associated with inverting a fully unrolled diffusion model, we leverage the +inherent capability of these models to skip ahead in the forward diffusion +process using large time steps. We experimentally validate BIRD on several +image restoration tasks and show that it achieves state of the art performance +on all of them. Our code is available at +https://github.com/hamadichihaoui/BIRD. + +
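The "optimize the input noise, keep the sampler fixed" idea can be sketched schematically as below. A tiny frozen network stands in for the frozen reverse-diffusion pass and average pooling for the degradation operator; in BIRD the degradation parameters are estimated jointly rather than assumed known, so this is only an illustration of the optimization-in-noise-space formulation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy stand-ins: a frozen "generator" replaces the frozen reverse-diffusion pass,
# and average-pooling blur replaces the degradation operator.
torch.manual_seed(0)
generator = nn.Sequential(nn.Conv2d(4, 16, 3, padding=1), nn.ReLU(),
                          nn.Conv2d(16, 3, 3, padding=1))
for p in generator.parameters():
    p.requires_grad_(False)

def degrade(x):                                    # known degradation for this sketch
    return F.avg_pool2d(x, 4)

y = degrade(torch.rand(1, 3, 64, 64))              # observed degraded image

# Restoration as optimization over the input noise only: the mapping from noise
# to image stays fixed, so the result remains on the generator's output manifold.
z = torch.randn(1, 4, 64, 64, requires_grad=True)
opt = torch.optim.Adam([z], lr=1e-2)
for _ in range(200):
    x_hat = generator(z)
    loss = F.mse_loss(degrade(x_hat), y)
    opt.zero_grad(); loss.backward(); opt.step()
print(F.mse_loss(degrade(generator(z)), y).item())  # data-fidelity after optimization
```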
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Sampling Strategies in Bayesian Inversion: A Study of RTO and Langevin + Methods + + +
+ This paper studies two classes of sampling methods for the solution of +inverse problems, namely Randomize-Then-Optimize (RTO), which is rooted in +sensitivity analysis, and Langevin methods, which are rooted in the Bayesian +framework. The two classes of methods correspond to different assumptions and +yield samples from different target distributions. We highlight the main +conceptual and theoretical differences between the two approaches and compare +them from a practical point of view by tackling two classical inverse problems +in imaging: deblurring and inpainting. We show that the choice of the sampling +method has a significant impact on the quality of the reconstruction and that +the RTO method is more robust to the choice of the parameters. + +
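For reference, a minimal unadjusted Langevin sampler on a toy linear-Gaussian inverse problem is sketched below; the paper studies more elaborate Langevin and RTO variants for imaging problems such as deblurring and inpainting.

```python
import numpy as np

def ula_sample(grad_log_post, x0, step=1e-3, n_steps=5000, rng=None):
    """Unadjusted Langevin algorithm:
    x_{k+1} = x_k + step * grad_log_pi(x_k) + sqrt(2 * step) * xi,  xi ~ N(0, I)."""
    rng = rng or np.random.default_rng(0)
    x = np.array(x0, dtype=float)
    samples = []
    for _ in range(n_steps):
        x = x + step * grad_log_post(x) + np.sqrt(2 * step) * rng.standard_normal(x.shape)
        samples.append(x.copy())
    return np.array(samples)

# Toy linear-Gaussian inverse problem y = A x + noise: the posterior is Gaussian,
# so the Langevin chain should roughly recover its mean and covariance.
A = np.array([[1.0, 0.5], [0.0, 1.0]])
y = np.array([1.0, 2.0])
sigma2, tau2 = 0.1, 1.0                      # noise and prior variances
def grad_log_post(x):
    return -A.T @ (A @ x - y) / sigma2 - x / tau2

samples = ula_sample(grad_log_post, x0=np.zeros(2))
print(samples[1000:].mean(axis=0))           # close to the analytic posterior mean
```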
+
+
+
+
+ + ♻ ☆ 3DGS.zip: A survey on 3D Gaussian Splatting Compression Methods + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a cutting-edge technique for +real-time radiance field rendering, offering state-of-the-art performance in +terms of both quality and speed. 3DGS models a scene as a collection of +three-dimensional Gaussians, or splats, with additional attributes optimized to +conform to the scene's geometric and visual properties. Despite its advantages +in rendering speed and image fidelity, 3DGS is limited by its significant +storage and memory demands. These high demands make 3DGS impractical for mobile +devices or headsets, reducing its applicability in important areas of computer +graphics. To address these challenges and advance the practicality of 3DGS, +this survey provides a comprehensive and detailed examination of compression +and compaction techniques developed to make 3DGS more efficient. We categorize +current approaches into compression techniques, which aim at achieving the +highest quality at minimal data size, and compaction techniques, which aim for +optimal quality with the fewest Gaussians. We introduce the basic mathematical +concepts underlying the analyzed methods, as well as key implementation details +and design choices. Our report thoroughly discusses similarities and +differences among the methods, as well as their respective advantages and +disadvantages. We establish a consistent standard for comparing these methods +based on key performance metrics and datasets. Specifically, since these +methods have been developed in parallel and over a short period of time, +currently, no comprehensive comparison exists. This survey, for the first time, +presents a unified standard to evaluate 3DGS compression techniques. To +facilitate the continuous monitoring of emerging methodologies, we maintain a +dedicated website that will be regularly updated with new techniques and +revisions of existing findings https://w-m.github.io/3dgs-compression-survey/ . + +
+
+ comment: 3D Gaussian Splatting compression survey; 3DGS compression; new + approaches added +
+
+
+
+
+ + ♻ ☆ Target Detection of Safety Protective Gear Using the Improved YOLOv5 + + +
+ In high-risk railway construction, personal protective equipment monitoring +is critical but challenging due to small and frequently obstructed targets. We +propose YOLO-EA, an innovative model that enhances safety measure detection by +integrating ECA into its backbone's convolutional layers, improving discernment +of minuscule objects like hardhats. YOLO-EA further refines target recognition +under occlusion by replacing GIoU with EIoU loss. YOLO-EA's effectiveness was +empirically substantiated using a dataset derived from real-world railway +construction site surveillance footage. It outperforms YOLOv5, achieving 98.9% +precision and 94.7% recall, up 2.5% and 0.5% respectively, while maintaining +real-time performance at 70.774 fps. This highly efficient and precise YOLO-EA +holds great promise for practical application in intricate construction +scenarios, enforcing stringent safety compliance during complex railway +construction projects. + +
+
+
+
+
+ + ♻ ☆ Ocean-omni: To Understand the World with Omni-modality + + +
+ The salient multimodal capabilities and interactive experience of GPT-4o +highlight its critical role in practical applications, yet it lacks a +high-performing open-source counterpart. In this paper, we introduce +Ocean-omni, the first open-source 7B Multimodal Large Language Model (MLLM) +adept at concurrently processing and analyzing modalities of image, video, +audio, and text, while delivering an advanced multimodal interactive experience +and strong performance. We propose an effective multimodal training schema +starting with 7B model and proceeding through two stages of multimodal +alignment and multitask fine-tuning across audio, image, video, and text modal. +This approach equips the language model with the ability to handle visual and +audio data effectively. Demonstrating strong performance across various +omni-modal and multimodal benchmarks, we aim for this contribution to serve as +a competitive baseline for the open-source community in advancing multimodal +understanding and real-time interaction. + +
+
+
+
+
+ + ♻ ☆ FASTER: A Font-Agnostic Scene Text Editing and Rendering Framework + + +
+ Scene Text Editing (STE) is a challenging research problem, that primarily +aims towards modifying existing texts in an image while preserving the +background and the font style of the original text. Despite its utility in +numerous real-world applications, existing style-transfer-based approaches have +shown sub-par editing performance due to (1) complex image backgrounds, (2) +diverse font attributes, and (3) varying word lengths within the text. To +address such limitations, in this paper, we propose a novel font-agnostic scene +text editing and rendering framework, named FASTER, for simultaneously +generating text in arbitrary styles and locations while preserving a natural +and realistic appearance and structure. A combined fusion of target mask +generation and style transfer units, with a cascaded self-attention mechanism +has been proposed to focus on multi-level text region edits to handle varying +word lengths. Extensive evaluation on a real-world database with further +subjective human evaluation study indicates the superiority of FASTER in both +scene text editing and rendering tasks, in terms of model performance and +efficiency. Our code will be released upon acceptance. + +
+
+ comment: Accepted in WACV 2025 +
+
+
+
+
+ + ♻ ☆ Diversifying Deep Ensembles: A Saliency Map Approach for Enhanced OOD + Detection, Calibration, and Accuracy + + +
+ Deep ensembles are capable of achieving state-of-the-art results in +classification and out-of-distribution (OOD) detection. However, their +effectiveness is limited due to the homogeneity of learned patterns within +ensembles. To overcome this issue, our study introduces Saliency Diversified +Deep Ensemble (SDDE), a novel approach that promotes diversity among ensemble +members by leveraging saliency maps. Through incorporating saliency map +diversification, our method outperforms conventional ensemble techniques and +improves calibration in multiple classification and OOD detection tasks. In +particular, the proposed method achieves state-of-the-art OOD detection +quality, calibration, and accuracy on multiple benchmarks, including +CIFAR10/100 and large-scale ImageNet datasets. + +
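One way to picture saliency-map diversification is as a penalty on the pairwise similarity of the ensemble members' input saliencies. The PyTorch sketch below only illustrates that mechanism; the toy models, saliency definition, and loss weighting are assumptions, not the SDDE recipe.

```python
import torch
import torch.nn.functional as F

def saliency_diversity_penalty(models, x, y):
    """Compute each member's input saliency (gradient of its target logit
    w.r.t. the input) and penalize pairwise cosine similarity between the
    flattened saliency maps."""
    saliencies = []
    for m in models:
        x_req = x.clone().requires_grad_(True)
        logit = m(x_req).gather(1, y.unsqueeze(1)).sum()
        grad = torch.autograd.grad(logit, x_req, create_graph=True)[0]
        saliencies.append(grad.flatten(1))
    penalty, pairs = 0.0, 0
    for i in range(len(saliencies)):
        for j in range(i + 1, len(saliencies)):
            penalty = penalty + F.cosine_similarity(saliencies[i], saliencies[j], dim=1).mean()
            pairs += 1
    return penalty / max(pairs, 1)

# Toy ensemble of linear classifiers on 3x8x8 inputs (placeholders for real CNNs).
models = [torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 8 * 8, 10)) for _ in range(3)]
x, y = torch.randn(4, 3, 8, 8), torch.randint(0, 10, (4,))
print(saliency_diversity_penalty(models, x, y))
```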
+
+
+
+
+ + ♻ ☆ Identity Curvature Laplace Approximation for Improved + Out-of-Distribution Detection + + +
+ Uncertainty estimation is crucial in safety-critical applications, where +robust out-of-distribution (OOD) detection is essential. Traditional Bayesian +methods, though effective, are often hindered by high computational demands. As +an alternative, Laplace approximation offers a more practical and efficient +approach to uncertainty estimation. In this paper, we introduce the Identity +Curvature Laplace Approximation (ICLA), a novel method that challenges the +conventional posterior covariance formulation by using identity curvature and +optimizing prior precision. This innovative design significantly enhances OOD +detection performance on well-known datasets such as CIFAR-10, CIFAR-100, and +ImageNet, while maintaining calibration scores. We attribute this improvement +to the alignment issues between typical feature embeddings and curvature as +measured by the Fisher information matrix. Our findings are further supported +by demonstrating that incorporating Fisher penalty or sharpness-aware +minimization techniques can greatly enhance the uncertainty estimation +capabilities of standard Laplace approximation. + +
+
+
+
+
+ + ♻ ☆ Not Just Object, But State: Compositional Incremental Learning without + Forgetting NeurIPS 2024 + + +
+ Most incremental learners excessively prioritize coarse classes of objects +while neglecting various kinds of states (e.g. color and material) attached to +the objects. As a result, they are limited in the ability to reason about the +fine-grained compositionality of state-object pairs. To remedy this limitation, +we propose a novel task called Compositional Incremental Learning +(composition-IL), enabling the model to recognize state-object compositions as +a whole in an incremental learning fashion. Given the lack of suitable +benchmarks, we re-organize two existing datasets and tailor them for +composition-IL. Then, we propose a prompt-based Composition Incremental Learner +(CompILer), to overcome the ambiguous composition boundary problem, which poses +a major challenge to composition-IL. Specifically, we exploit multi-pool prompt +learning, which is regularized by inter-pool prompt discrepancy and intra-pool +prompt diversity. Besides, we devise object-injected state prompting by using +object prompts to guide the selection of state prompts. Furthermore, we fuse +the selected prompts by a generalized-mean strategy, to eliminate irrelevant +information learned in the prompts. Extensive experiments on two datasets +exhibit state-of-the-art performance achieved by CompILer. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Extrapolating Prospective Glaucoma Fundus Images through Diffusion Model + in Irregular Longitudinal Sequences + + +
+ The utilization of longitudinal datasets for glaucoma progression prediction +offers a compelling approach to support early therapeutic interventions. +Predominant methodologies in this domain have primarily focused on the direct +prediction of glaucoma stage labels from longitudinal datasets. However, such +methods may not adequately encapsulate the nuanced developmental trajectory of +the disease. To enhance the diagnostic acumen of medical practitioners, we +propose a novel diffusion-based model to predict prospective images by +extrapolating from existing longitudinal fundus images of patients. The +methodology delineated in this study distinctively leverages sequences of +images as inputs. Subsequently, a time-aligned mask is employed to select a +specific year for image generation. During the training phase, the time-aligned +mask resolves the issue of irregular temporal intervals in longitudinal image +sequence sampling. Additionally, we utilize a strategy of randomly masking a +frame in the sequence to establish the ground truth. This methodology aids the +network in continuously acquiring knowledge regarding the internal +relationships among the sequences throughout the learning phase. Moreover, the +introduction of textual labels is instrumental in categorizing images generated +within the sequence. The empirical findings from the conducted experiments +indicate that our proposed model not only effectively generates longitudinal +data but also significantly improves the precision of downstream classification +tasks. + +
+
+ comment: Accepted at BIBM 2024 +
+
+
+
+
+ + ♻ ☆ Shape2.5D: A Dataset of Texture-less Surfaces for Depth and Normals + Estimation + + +
+ Reconstructing texture-less surfaces poses unique challenges in computer +vision, primarily due to the lack of specialized datasets that cater to the +nuanced needs of depth and normals estimation in the absence of textural +information. We introduce "Shape2.5D," a novel, large-scale dataset designed to +address this gap. Comprising 1.17 million frames spanning over 39,772 3D models +and 48 unique objects, our dataset provides depth and surface normal maps for +texture-less object reconstruction. The proposed dataset includes synthetic +images rendered with 3D modeling software to simulate various lighting +conditions and viewing angles. It also includes a real-world subset comprising +4,672 frames captured with a depth camera. Our comprehensive benchmarks +demonstrate the dataset's ability to support the development of algorithms that +robustly estimate depth and normals from RGB images and perform voxel +reconstruction. Our open-source data generation pipeline allows the dataset to +be extended and adapted for future research. The dataset is publicly available +at https://github.com/saifkhichi96/Shape25D. + +
+
+ comment: Accepted for publication in IEEE Access +
+
+
+
+
+ + ♻ ☆ UniVST: A Unified Framework for Training-free Localized Video Style + Transfer + + +
+ This paper presents UniVST, a unified framework for localized video style +transfer. It operates without the need for training, offering a distinct +advantage over existing methods that transfer style across entire videos. The +endeavors of this paper comprise: (1) A point-matching mask propagation +strategy that leverages feature maps from the DDIM inversion. This streamlines +the model's architecture by obviating the need for tracking models. (2) An +AdaIN-guided style transfer mechanism that operates at both the latent and +attention levels. This balances content fidelity and style richness, mitigating +the loss of localized details commonly associated with direct video +stylization. (3) A sliding window smoothing strategy that harnesses optical +flow within the pixel representation and refines predicted noise to update the +latent space. This significantly enhances temporal consistency and diminishes +artifacts in video outputs. Our proposed UniVST has been validated to be +superior to existing methods in quantitative and qualitative metrics. It +adeptly addresses the challenges of preserving the primary object's style while +ensuring temporal consistency and detail preservation. + +
+
+ comment: 10 pages, not including references +
+
+
+
+
+ + ♻ ☆ Stable-Pose: Leveraging Transformers for Pose-Guided Text-to-Image + Generation NeurIPS 2024 + + +
+ Controllable text-to-image (T2I) diffusion models have shown impressive +performance in generating high-quality visual content through the incorporation +of various conditions. Current methods, however, exhibit limited performance +when guided by skeleton human poses, especially in complex pose conditions such +as side or rear perspectives of human figures. To address this issue, we +present Stable-Pose, a novel adapter model that introduces a coarse-to-fine +attention masking strategy into a vision Transformer (ViT) to gain accurate +pose guidance for T2I models. Stable-Pose is designed to adeptly handle pose +conditions within pre-trained Stable Diffusion, providing a refined and +efficient way of aligning pose representation during image synthesis. We +leverage the query-key self-attention mechanism of ViTs to explore the +interconnections among different anatomical parts in human pose skeletons. +Masked pose images are used to smoothly refine the attention maps based on +target pose-related features in a hierarchical manner, transitioning from +coarse to fine levels. Additionally, our loss function is formulated to +allocate increased emphasis to the pose region, thereby augmenting the model's +precision in capturing intricate pose details. We assessed the performance of +Stable-Pose across five public datasets under a wide range of indoor and +outdoor human pose scenarios. Stable-Pose achieved an AP score of 57.1 in the +LAION-Human dataset, marking around 13% improvement over the established +technique ControlNet. The project link and code is available at +https://github.com/ai-med/StablePose. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ PPLLaVA: Varied Video Sequence Understanding With Prompt Guidance + + +
+ The past year has witnessed the significant advancement of video-based large +language models. However, the challenge of developing a unified model for both +short and long video understanding remains unresolved. Most existing video LLMs +cannot handle hour-long videos, while methods tailored for long videos tend to +be ineffective for shorter videos and images. In this paper, we identify the +key issue as the redundant content in videos. To address this, we propose a +novel pooling strategy that simultaneously achieves token compression and +instruction-aware visual feature aggregation. Our model is termed Prompt-guided +Pooling LLaVA, or PPLLaVA for short. Specifically, PPLLaVA consists of three +core components: the CLIP-based visual-prompt alignment that extracts visual +information relevant to the user's instructions, the prompt-guided pooling that +compresses the visual sequence to arbitrary scales using convolution-style +pooling, and the clip context extension designed for the lengthy prompts common +in visual dialogue. Moreover, our codebase also integrates the most advanced +video Direct Preference Optimization (DPO) and visual interleave training. +Extensive experiments have validated the performance of our model. With +superior throughput and only 1024 visual context tokens, PPLLaVA achieves +better results on image benchmarks as a video LLM, while achieving +state-of-the-art performance across various video benchmarks, excelling in +tasks ranging from caption generation to multiple-choice questions, and +handling video lengths from seconds to hours. Code is available at +https://github.com/farewellthree/PPLLaVA. + +
+
+
+
+
+ + ♻ ☆ DenoiseRep: Denoising Model for Representation Learning NeurIPS 2024 + + +
+ The denoising model has been proven to be a powerful generative model but has +seen little exploration in discriminative tasks. Representation learning is +important in discriminative tasks, which is defined as "learning +representations (or features) of the data that make it easier to extract useful +information when building classifiers or other predictors". In this paper, we +propose a novel Denoising Model for Representation Learning (DenoiseRep) to +improve feature discrimination with joint feature extraction and denoising. +DenoiseRep views each embedding layer in a backbone as a denoising layer, +processing the cascaded embedding layers as if recursively denoising features +step-by-step. This unifies the frameworks of feature extraction and denoising, +where the former progressively embeds features from low-level to high-level, +and the latter recursively denoises features step-by-step. After that, +DenoiseRep fuses the parameters of the feature extraction and denoising layers, +and theoretically demonstrates its equivalence before and after the fusion, +thus making feature denoising computation-free. DenoiseRep is a label-free +algorithm that incrementally improves features and is also complementary to +labels when available. Experimental results on various discriminative vision +tasks, including re-identification (Market-1501, DukeMTMC-reID, MSMT17, +CUHK-03, vehicleID), image classification (ImageNet, UB200, Oxford-Pet, +Flowers), object detection (COCO), and image segmentation (ADE20K) show stable +and impressive improvements. We also validate its effectiveness on CNN (ResNet) +and Transformer (ViT, Swin, Vmamda) architectures. + +
+
+ comment: Accepted by NeurIPS 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Leveraging generative models to characterize the failure conditions of + image classifiers + + +
+ We address in this work the question of identifying the failure conditions of +a given image classifier. To do so, we exploit the capacity of producing +controllable distributions of high-quality image data made available by recent +Generative Adversarial Networks (StyleGAN2): the failure conditions are +expressed as directions of strong performance degradation in the generative +model latent space. This strategy of analysis is used to discover corner cases +that combine multiple sources of corruption, and to compare in more detail the +behavior of different classifiers. The directions of degradation can also be +rendered visually by generating data for better interpretability. Some +degradations such as image quality can affect all classes, whereas others, such +as shape, are more class-specific. The approach is demonstrated on the MNIST +dataset, which has been augmented with two sources of corruption: noise and +blur, and shows a promising way to better understand and control the risks of +exploiting Artificial Intelligence components for safety-critical applications. + +
+
+
+
+
+ + ♻ ☆ Latent Representation Matters: Human-like Sketches in One-shot Drawing + Tasks + + +
+ Humans can effortlessly draw new categories from a single exemplar, a feat +that has long posed a challenge for generative models. However, this gap has +started to close with recent advances in diffusion models. This one-shot +drawing task requires powerful inductive biases that have not been +systematically investigated. Here, we study how different inductive biases +shape the latent space of Latent Diffusion Models (LDMs). Along with standard +LDM regularizers (KL and vector quantization), we explore supervised +regularizations (including classification and prototype-based representation) +and contrastive inductive biases (using SimCLR and redundancy reduction +objectives). We demonstrate that LDMs with redundancy reduction and +prototype-based regularizations produce near-human-like drawings (regarding +both samples' recognizability and originality) -- better mimicking human +perception (as evaluated psychophysically). Overall, our results suggest that +the gap between humans and machines in one-shot drawings is almost closed. + +
+
+
+
+
+ + ♻ ☆ Explaining an image classifier with a generative model conditioned by + uncertainty + + +
+ We propose to condition a generative model by a given image classifier +uncertainty in order to analyze and explain its behavior. Preliminary +experiments on synthetic data and a corrupted version of MNIST dataset +illustrate the idea. + +
+
+
+
+
+ + ♻ ☆ Advancements and limitations of LLMs in replicating human color-word + associations + + +
+ Color-word associations play a fundamental role in human cognition and design +applications. Large Language Models (LLMs) have become widely available and +demonstrated intelligent behaviors in various benchmarks with natural +conversation skills. However, their ability to replicate human color-word +associations remains understudied. We compared multiple generations of LLMs +(from GPT-3 to GPT-4o) against human color-word associations using data +collected from over 10,000 Japanese participants, involving 17 colors and words +from eight categories in Japanese. Our findings reveal a clear progression in +LLM performance across generations, with GPT-4o achieving the highest accuracy +in predicting the best voted word for each color and category. However, the +highest median performance was approximately 50% even for GPT-4o with visual +inputs (chance level is 10%), and the performance levels varied significantly +across word categories and colors, indicating a failure to fully replicate +human color-word associations. On the other hand, color discrimination ability +estimated from our color-word association data showed that LLMs demonstrated +high correlation with human color discrimination patterns, similarly to +previous studies. Our study highlights both the advancements in LLM +capabilities and their persistent limitations, suggesting differences in +semantic memory structures between humans and LLMs in representing color-word +associations. + +
+
+ comment: 20 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Predictive Dynamic Fusion ICML 2024 + + +
+ Multimodal fusion is crucial in joint decision-making systems for rendering +holistic judgments. Since multimodal data changes in open environments, dynamic +fusion has emerged and achieved remarkable progress in numerous applications. +However, most existing dynamic multimodal fusion methods lack theoretical +guarantees and easily fall into suboptimal problems, yielding unreliability and +instability. To address this issue, we propose a Predictive Dynamic Fusion +(PDF) framework for multimodal learning. We proceed to reveal the multimodal +fusion from a generalization perspective and theoretically derive the +predictable Collaborative Belief (Co-Belief) with Mono- and Holo-Confidence, +which provably reduces the upper bound of generalization error. Accordingly, we +further propose a relative calibration strategy to calibrate the predicted +Co-Belief for potential uncertainty. Extensive experiments on multiple +benchmarks confirm our superiority. Our code is available at +https://github.com/Yinan-Xia/PDF. + +
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ♻ ☆ EchoSight: Advancing Visual-Language Models with Wiki Knowledge + + +
+ Knowledge-based Visual Question Answering (KVQA) tasks require answering +questions about images using extensive background knowledge. Despite +significant advancements, generative models often struggle with these tasks due +to the limited integration of external knowledge. In this paper, we introduce +EchoSight, a novel multimodal Retrieval-Augmented Generation (RAG) framework +that enables large language models (LLMs) to answer visual questions requiring +fine-grained encyclopedic knowledge. To achieve high-performing retrieval, +EchoSight first searches wiki articles using visual-only information; +subsequently, these candidate articles are reranked according to their +relevance to the combined text-image query. This approach significantly +improves the integration of multimodal knowledge, leading to enhanced retrieval +outcomes and more accurate VQA responses. Our experimental results on the +Encyclopedic VQA and InfoSeek datasets demonstrate that EchoSight establishes +new state-of-the-art results in knowledge-based VQA, achieving an accuracy of +41.8% on Encyclopedic VQA and 31.3% on InfoSeek. + +
+
+ comment: Technical Report; Project Page: https://go2heart.github.io/echosight +
+
+
+
+
+ + ♻ ☆ MSTA3D: Multi-scale Twin-attention for 3D Instance Segmentation + + +
+ Recently, transformer-based techniques incorporating superpoints have become +prevalent in 3D instance segmentation. However, they often encounter an +over-segmentation problem, especially noticeable with large objects. +Additionally, unreliable mask predictions stemming from superpoint mask +prediction further compound this issue. To address these challenges, we propose +a novel framework called MSTA3D. It leverages multi-scale feature +representations and introduces a twin-attention mechanism to effectively +capture them. Furthermore, MSTA3D integrates a box query with a box +regularizer, offering a complementary spatial constraint alongside semantic +queries. Experimental evaluations on the ScanNetV2, ScanNet200 and S3DIS +datasets demonstrate that our approach surpasses state-of-the-art 3D instance +segmentation methods. + +
+
+ comment: 14 pages, 9 figures, 7 tables, conference +
+
+
+
+
+ + ♻ ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic datasets that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. Our code is available at +https://github.com/AngusDujw/Diversity-Driven-Synthesis. + +
+
+
+
+
+ + ♻ ☆ In-Context LoRA for Diffusion Transformers + + +
+ Recent research arXiv:2410.15027 has explored the use of diffusion +transformers (DiTs) for task-agnostic image generation by simply concatenating +attention tokens across images. However, despite substantial computational +resources, the fidelity of the generated images remains suboptimal. In this +study, we reevaluate and streamline this framework by hypothesizing that +text-to-image DiTs inherently possess in-context generation capabilities, +requiring only minimal tuning to activate them. Through diverse task +experiments, we qualitatively demonstrate that existing text-to-image DiTs can +effectively perform in-context generation without any tuning. Building on this +insight, we propose a remarkably simple pipeline to leverage the in-context +abilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint +captioning of multiple images, and (3) apply task-specific LoRA tuning using +small datasets (e.g., 20~100 samples) instead of full-parameter tuning with +large datasets. We name our models In-Context LoRA (IC-LoRA). This approach +requires no modifications to the original DiT models, only changes to the +training data. Remarkably, our pipeline generates high-fidelity image sets that +better adhere to prompts. While task-specific in terms of tuning data, our +framework remains task-agnostic in architecture and pipeline, offering a +powerful tool for the community and providing valuable insights for further +research on product-level task-agnostic generation systems. We release our +code, data, and models at https://github.com/ali-vilab/In-Context-LoRA + +
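Step (1) of the pipeline, concatenating images instead of tokens, can be illustrated with a few lines of PIL. The panel layout, placeholder images, and caption template below are assumptions for illustration, not the released IC-LoRA data format.

```python
from PIL import Image

def concat_images_horizontally(images, height=512):
    """Resize each image to a common height and paste them side by side:
    image-level concatenation in place of token-level concatenation."""
    resized = [im.resize((max(1, int(im.width * height / im.height)), height)) for im in images]
    canvas = Image.new("RGB", (sum(im.width for im in resized), height))
    x = 0
    for im in resized:
        canvas.paste(im, (x, 0))
        x += im.width
    return canvas

# Placeholder images stand in for a small set (e.g., 20-100 samples) of related photos;
# a single joint caption would then describe all panels of the concatenated image.
images = [Image.new("RGB", (640, 480), c) for c in ("red", "green", "blue")]
panel = concat_images_horizontally(images)
panel.save("panel.jpg")
joint_caption = "A three-panel image set: [PANEL-1] ..., [PANEL-2] ..., [PANEL-3] ..."
```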
+
+ comment: Tech report. Project page: + https://ali-vilab.github.io/In-Context-LoRA-Page/ +
+
+
+
+
+ + ♻ ☆ Edge AI-Enabled Chicken Health Detection Based on Enhanced FCOS-Lite and + Knowledge Distillation + + +
+ The utilization of AIoT technology has become a crucial trend in modern +poultry management, offering the potential to optimize farming operations and +reduce human workloads. This paper presents a real-time and compact edge-AI +enabled detector designed to identify chickens and their health status using +frames captured by a lightweight and intelligent camera equipped with an +edge-AI enabled CMOS sensor. To ensure efficient deployment of the proposed +compact detector within the memory-constrained edge-AI enabled CMOS sensor, we +employ a FCOS-Lite detector leveraging MobileNet as the backbone. To mitigate +the issue of reduced accuracy in compact edge-AI detectors without incurring +additional inference costs, we propose a gradient weighting loss function as +the classification loss and introduce the CIOU loss function as the +localization loss. Additionally, we propose a knowledge distillation scheme to +transfer valuable information from a large teacher detector to the proposed +FCOS-Lite detector, thereby enhancing its performance while preserving a +compact model size. Experimental results demonstrate that the proposed edge-AI +enabled detector achieves commendable performance metrics, including a mean +average precision (mAP) of 95.1$\%$ and an F1-score of 94.2$\%$. Notably, the +proposed detector can be efficiently deployed and operates at a speed exceeding +20 FPS on the edge-AI enabled CMOS sensor, achieved through int8 quantization. +This meets the practical demands for automated poultry health monitoring using +lightweight intelligent cameras with low power consumption and minimal +bandwidth costs. + +
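The distillation scheme itself is not detailed in the abstract; as a point of reference, a generic Hinton-style soft-label distillation term looks like the following sketch (the temperature and two-class head are assumptions, not the paper's setup).

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, T=2.0):
    """Soft-label knowledge distillation: KL divergence between
    temperature-softened teacher and student class distributions."""
    p_teacher = F.softmax(teacher_logits / T, dim=-1)
    log_p_student = F.log_softmax(student_logits / T, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (T * T)

# Hypothetical logits, e.g., healthy vs. unhealthy chicken, batch of 8.
student_logits = torch.randn(8, 2)
teacher_logits = torch.randn(8, 2)
print(distillation_loss(student_logits, teacher_logits))
```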
+
+
+
+
+ + ♻ ☆ SeTAR: Out-of-Distribution Detection with Selective Low-Rank + Approximation NeurIPS 2024 + + +
+ Out-of-distribution (OOD) detection is crucial for the safe deployment of +neural networks. Existing CLIP-based approaches perform OOD detection by +devising novel scoring functions or sophisticated fine-tuning methods. In this +work, we propose SeTAR, a novel, training-free OOD detection method that +leverages selective low-rank approximation of weight matrices in +vision-language and vision-only models. SeTAR enhances OOD detection via +post-hoc modification of the model's weight matrices using a simple greedy +search algorithm. Based on SeTAR, we further propose SeTAR+FT, a fine-tuning +extension optimizing model performance for OOD detection tasks. Extensive +evaluations on the ImageNet1K and Pascal-VOC benchmarks show SeTAR's superior +performance, reducing the relative false positive rate by up to 18.95% and +36.80% compared to zero-shot and fine-tuning baselines. Ablation studies +further validate SeTAR's effectiveness, robustness, and generalizability across +different model backbones. Our work offers a scalable, efficient solution for +OOD detection, setting a new state-of-the-art in this area. + +
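The core operation, a low-rank approximation of a weight matrix, can be sketched with a truncated SVD. Which matrices to modify and which rank to keep are what SeTAR's greedy search decides, so both are placeholders here.

```python
import numpy as np

def low_rank_approx(W, rank):
    """Truncated-SVD low-rank approximation of a weight matrix."""
    U, S, Vt = np.linalg.svd(W, full_matrices=False)
    return (U[:, :rank] * S[:rank]) @ Vt[:rank, :]

# Placeholder weight matrix and rank (a real method would pick both per layer).
W = np.random.randn(768, 768)
W_hat = low_rank_approx(W, rank=64)
print(np.linalg.norm(W - W_hat) / np.linalg.norm(W))  # relative approximation error
```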
+
+ comment: Accepted by NeurIPS 2024. Project page is live at + https://SeTAR-OOD.github.io. Code are available at + https://github.com/X1AOX1A/SeTAR +
+
+
+
+
+ + ♻ ☆ TaCOS: Task-Specific Camera Optimization with Simulation + + +
+ The performance of perception tasks is heavily influenced by imaging systems. +However, designing cameras with high task performance is costly, requiring +extensive camera knowledge and experimentation with physical hardware. +Additionally, cameras and perception tasks are mostly designed in isolation, +whereas recent methods that jointly design cameras and tasks have shown +improved performance. Therefore, we present a novel end-to-end optimization +approach that co-designs cameras with specific vision tasks. This method +combines derivative-free and gradient-based optimizers to support both +continuous and discrete camera parameters within manufacturing constraints. We +leverage recent computer graphics techniques and physical camera +characteristics to simulate the cameras in virtual environments, making the +design process cost-effective. We validate our simulations against physical +cameras and provide a procedurally generated virtual environment. Our +experiments demonstrate that our method designs cameras that outperform common +off-the-shelf options, and more efficiently compared to the state-of-the-art +approach, requiring only 2 minutes to design a camera on an example experiment +compared with 67 minutes for the competing method. Designed to support the +development of cameras under manufacturing constraints, multiple cameras, and +unconventional cameras, we believe this approach can advance the fully +automated design of cameras. + +
+
+
+
+
+ + ♻ ☆ Passive Non-Line-of-Sight Imaging with Light Transport Modulation + + +
+ Passive non-line-of-sight (NLOS) imaging has witnessed rapid development in +recent years, due to its ability to image objects that are out of sight. The +light transport condition plays an important role in this task since changing +the conditions will lead to different imaging models. Existing learning-based +NLOS methods usually train independent models for different light transport +conditions, which is computationally inefficient and impairs the practicality +of the models. In this work, we propose NLOS-LTM, a novel passive NLOS imaging +method that effectively handles multiple light transport conditions with a +single network. We achieve this by inferring a latent light transport +representation from the projection image and using this representation to +modulate the network that reconstructs the hidden image from the projection +image. We train a light transport encoder together with a vector quantizer to +obtain the light transport representation. To further regulate this +representation, we jointly learn both the reconstruction network and the +reprojection network during training. A set of light transport modulation +blocks is used to modulate the two jointly trained networks in a multi-scale +way. Extensive experiments on a large-scale passive NLOS dataset demonstrate +the superiority of the proposed method. The code is available at +https://github.com/JerryOctopus/NLOS-LTM. + +
+
+
+
+
+ + ♻ ☆ GenXD: Generating Any 3D and 4D Scenes + + +
+ Recent developments in 2D visual generation have been remarkably successful. +However, 3D and 4D generation remain challenging in real-world applications due +to the lack of large-scale 4D data and effective model design. In this paper, +we propose to jointly investigate general 3D and 4D generation by leveraging +camera and object movements commonly observed in daily life. Due to the lack of +real-world 4D data in the community, we first propose a data curation pipeline +to obtain camera poses and object motion strength from videos. Based on this +pipeline, we introduce a large-scale real-world 4D scene dataset: CamVid-30K. +By leveraging all the 3D and 4D data, we develop our framework, GenXD, which +allows us to produce any 3D or 4D scene. We propose multiview-temporal modules, +which disentangle camera and object movements, to seamlessly learn from both 3D +and 4D data. Additionally, GenXD employs masked latent conditions to support a +variety of conditioning views. GenXD can generate videos that follow the camera +trajectory as well as consistent 3D views that can be lifted into 3D +representations. We perform extensive evaluations across various real-world and +synthetic datasets, demonstrating GenXD's effectiveness and versatility +compared to previous methods in 3D and 4D generation. + +
+
+
+
+
+ + ♻ ☆ Self-supervised Auxiliary Learning for Texture and Model-based Hybrid + Robust and Fair Featuring in Face Analysis + + +
+ In this work, we explore Self-supervised Learning (SSL) as an auxiliary task +to blend texture-based local descriptors into feature modelling for efficient +face analysis. Combining a primary task and a self-supervised auxiliary task is +beneficial for robust representation. Therefore, we used the SSL task of a +masked auto-encoder (MAE) as an auxiliary task to reconstruct texture features +such as local patterns, alongside the primary task, for robust and unbiased +face analysis. We tested our hypothesis on three major paradigms of face +analysis: face attribute analysis, face-based emotion analysis, and deepfake +detection. Our experimental results show that a better feature representation +can be gleaned from our proposed model for fair and unbiased face analysis. + +
+
+
+
+
+ + ♻ ☆ RelationBooth: Towards Relation-Aware Customized Object Generation + + +
+ Customized image generation is crucial for delivering personalized content +based on user-provided image prompts, aligning large-scale text-to-image +diffusion models with individual needs. However, existing models often overlook +the relationships between customized objects in generated images. Instead, this +work addresses that gap by focusing on relation-aware customized image +generation, which aims to preserve the identities from image prompts while +maintaining the predicate relations described in text prompts. Specifically, we +introduce RelationBooth, a framework that disentangles identity and relation +learning through a well-curated dataset. Our training data consists of +relation-specific images, independent object images containing identity +information, and text prompts to guide relation generation. Then, we propose +two key modules to tackle the two main challenges: generating accurate and +natural relations, especially when significant pose adjustments are required, +and avoiding object confusion in cases of overlap. First, we introduce a +keypoint matching loss that effectively guides the model in adjusting object +poses closely tied to their relationships. Second, we incorporate local +features from the image prompts to better distinguish between objects, +preventing confusion in overlapping cases. Extensive results on three +benchmarks demonstrate the superiority of RelationBooth in generating precise +relations while preserving object identities across a diverse set of objects +and relations. The source code and trained models will be made available to the +public. + +
+
+
+
+
+ + ♻ ☆ Advantages of Neural Population Coding for Deep Learning + + +
+ Scalar variables, e.g., the orientation of a shape in an image, are commonly +predicted using a single output neuron in a neural network. In contrast, the +mammalian cortex represents variables with a population of neurons. In this +population code, each neuron is most active at its preferred value and shows +partial activity for other values. Here, we investigate the benefit of using a +population code for the output layer of a neural network. We compare population +codes against single-neuron outputs and one-hot vectors. First, we show +theoretically and in experiments with synthetic data that population codes +improve robustness to input noise in networks of stacked linear layers. Second, +we demonstrate the benefit of using population codes to encode ambiguous +outputs, such as the pose of symmetric objects. Using the T-LESS dataset of +feature-less real-world objects, we show that population codes improve the +accuracy of predicting 3D object orientation from image input. + +
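A toy illustration of a population-coded output layer: a scalar is encoded as graded activity over neurons with Gaussian tuning curves and decoded as an activity-weighted average. The tuning width, neuron count, and linear decoder are assumptions; circular variables such as orientation would need a circular decoder.

```python
import numpy as np

def encode_population(value, preferred, sigma=0.1):
    """Encode a scalar as graded activity over neurons with Gaussian tuning curves."""
    return np.exp(-0.5 * ((value - preferred) / sigma) ** 2)

def decode_population(activity, preferred):
    """Decode via the activity-weighted average of preferred values (population vector)."""
    return np.sum(activity * preferred) / (np.sum(activity) + 1e-12)

preferred = np.linspace(0.0, 1.0, 32)       # 32 output neurons, preferred values in [0, 1]
code = encode_population(0.42, preferred)
print(decode_population(code, preferred))   # ~0.42
```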
+
+
+
+
+ + ♻ ☆ VHM: Versatile and Honest Vision Language Model for Remote Sensing Image + Analysis + + +
+ This paper develops a Versatile and Honest vision language Model (VHM) for +remote sensing image analysis. VHM is built on a large-scale remote sensing +image-text dataset with rich-content captions (VersaD), and an honest +instruction dataset comprising both factual and deceptive questions (HnstD). +Unlike prevailing remote sensing image-text datasets, in which image captions +focus on a few prominent objects and their relationships, VersaD captions +provide detailed information about image properties, object attributes, and the +overall scene. This comprehensive captioning enables VHM to thoroughly +understand remote sensing images and perform diverse remote sensing tasks. +Moreover, different from existing remote sensing instruction datasets that only +include factual questions, HnstD contains additional deceptive questions +stemming from the non-existence of objects. This feature prevents VHM from +producing affirmative answers to nonsense queries, thereby ensuring its +honesty. In our experiments, VHM significantly outperforms various vision +language models on common tasks of scene classification, visual question +answering, and visual grounding. Additionally, VHM achieves competent +performance on several unexplored tasks, such as building vectorizing, +multi-label classification and honest question answering. + +
+
+ comment: Equal contribution: Chao Pang, Xingxing Weng, Jiang Wu; Corresponding + author: Gui-Song Xia, Conghui He +
+
+
+
+
+ + ♻ ☆ Multi-modal Preference Alignment Remedies Degradation of Visual + Instruction Tuning on Language Models + + +
+ Multi-modal large language models (MLLMs) are expected to support multi-turn +queries of interchanging image and text modalities in production. However, the +current MLLMs trained with visual-question-answering (VQA) datasets could +suffer from degradation, as VQA datasets lack the diversity and complexity of +the original text instruction datasets with which the underlying language model +was trained. To address this degradation, we first collect a lightweight, +5k-sample VQA preference dataset where answers were annotated by Gemini for +five quality metrics in a granular fashion and investigate standard Supervised +Fine-tuning, rejection sampling, Direct Preference Optimization (DPO) and +SteerLM algorithms. Our findings indicate that with DPO, we can surpass the +instruction-following capabilities of the language model, achieving a 6.73 +score on MT-Bench, compared to Vicuna's 6.57 and LLaVA's 5.99. This enhancement +in textual instruction-following capability correlates with boosted visual +instruction performance (+4.9\% on MM-Vet, +6\% on LLaVA-Bench), with minimal +alignment tax on visual knowledge benchmarks compared to the previous RLHF +approach. In conclusion, we propose a distillation-based multi-modal alignment +model with fine-grained annotations on a small dataset that restores and boosts +MLLM's language capability after visual instruction tuning. + +
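For reference, the standard DPO objective the abstract investigates can be written in a few lines given sequence log-probabilities from the policy and a frozen reference model. This is the generic formula, not the authors' training code, and the beta value and example numbers are placeholders.

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Direct Preference Optimization on preferred (chosen) vs. dispreferred
    (rejected) responses, using log-probability ratios against a frozen
    reference model."""
    chosen_ratio = logp_chosen - ref_logp_chosen
    rejected_ratio = logp_rejected - ref_logp_rejected
    return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()

# Hypothetical per-example sequence log-probabilities.
loss = dpo_loss(torch.tensor([-12.3]), torch.tensor([-15.1]),
                torch.tensor([-13.0]), torch.tensor([-14.8]))
print(loss)
```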
+
+ comment: Project code, model and data: https://github.com/findalexli/mllm-dpo +
+
+
+
+
+ + ♻ ☆ Dynamic Multimodal Evaluation with Flexible Complexity by + Vision-Language Bootstrapping + + +
+ Large Vision-Language Models (LVLMs) have demonstrated remarkable +capabilities across multimodal tasks such as visual perception and reasoning, +leading to good performance on various multimodal evaluation benchmarks. +However, these benchmarks keep a static nature and overlap with the +pre-training data, resulting in fixed complexity constraints and data +contamination issues. This raises the concern regarding the validity of the +evaluation. To address these two challenges, we introduce a dynamic multimodal +evaluation protocol called Vision-Language Bootstrapping (VLB). VLB provides a +robust and comprehensive assessment for LVLMs with reduced data contamination +and flexible complexity. To this end, VLB dynamically generates new visual +question-answering samples through a multimodal bootstrapping module that +modifies both images and language, while ensuring that newly generated samples +remain consistent with the original ones by a judge module. By composing +various bootstrapping strategies, VLB offers dynamic variants of existing +benchmarks with diverse complexities, enabling the evaluation to co-evolve with +the ever-evolving capabilities of LVLMs. Extensive experimental results across +multiple benchmarks, including SEEDBench, MMBench, and MME, show that VLB +significantly reduces data contamination and exposes performance limitations of +LVLMs. + +
+
+
+
+
+ + ♻ ☆ Sliding Gaussian ball adaptive growth (SlingBAG): point cloud-based + iterative algorithm for large-scale 3D photoacoustic imaging + + +
+ Large-scale 3D photoacoustic (PA) imaging has become increasingly important +for both clinical and pre-clinical applications. Limited by cost and system +complexity, only systems with sparsely-distributed sensors can be widely +implemented, which desires advanced reconstruction algorithms to reduce +artifacts. However, high computing memory and time consumption of traditional +iterative reconstruction (IR) algorithms is practically unacceptable for +large-scale 3D PA imaging. Here, we propose a point cloud-based IR algorithm +that reduces memory consumption by several orders, wherein the 3D PA scene is +modeled as a series of Gaussian-distributed spherical sources stored in form of +point cloud. During the IR process, not only are properties of each Gaussian +source, including its peak intensity (initial pressure value), standard +deviation (size) and mean (position) continuously optimized, but also each +Gaussian source itself adaptively undergoes destroying, splitting, and +duplication along the gradient direction. This method, named the sliding +Gaussian ball adaptive growth (SlingBAG) algorithm, enables high-quality +large-scale 3D PA reconstruction with fast iteration and extremely low memory +usage. We validated SlingBAG algorithm in both simulation study and in vivo +animal experiments. The source code and data for SlingBAG, along with +supplementary materials and demonstration videos, are now available in the +following GitHub repository: https://github.com/JaegerCQ/SlingBAG. + +
+
+ comment: Added SlingBAG reconstruction of rat kidney and rat liver results; + updated methods; added references +
+
+
+
+
+ + ♻ Improving Domain Generalization in Self-supervised Monocular Depth + Estimation via Stabilized Adversarial Training ECCV 2024 + + +
+ Learning a self-supervised Monocular Depth Estimation (MDE) model with great +generalization remains significantly challenging. Despite the success of +adversarial augmentation in the supervised learning generalization, naively +incorporating it into self-supervised MDE models potentially causes +over-regularization, suffering from severe performance degradation. In this +paper, we conduct qualitative analysis and illuminate the main causes: (i) +inherent sensitivity in the UNet-alike depth network and (ii) dual optimization +conflict caused by over-regularization. To tackle these issues, we propose a +general adversarial training framework, named Stabilized Conflict-optimization +Adversarial Training (SCAT), integrating adversarial data augmentation into +self-supervised MDE methods to achieve a balance between stability and +generalization. Specifically, we devise an effective scaling depth network that +tunes the coefficients of long skip connection and effectively stabilizes the +training process. Then, we propose a conflict gradient surgery strategy, which +progressively integrates the adversarial gradient and optimizes the model +toward a conflict-free direction. Extensive experiments on five benchmarks +demonstrate that SCAT can achieve state-of-the-art performance and +significantly improve the generalization capability of existing self-supervised +MDE methods. + +
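The "conflict gradient surgery" idea can be pictured with a PCGrad-style projection: when the adversarial gradient conflicts with the task gradient, remove the conflicting component before combining them. The sketch below only illustrates that general mechanism, not SCAT's exact strategy.

```python
import torch

def resolve_conflict(task_grad, adv_grad):
    """If the adversarial gradient conflicts with the task gradient (negative
    dot product), project it onto the normal plane of the task gradient, then
    combine. Both inputs are flattened gradient vectors."""
    dot = torch.dot(adv_grad, task_grad)
    if dot < 0:
        adv_grad = adv_grad - dot / (task_grad.norm() ** 2 + 1e-12) * task_grad
    return task_grad + adv_grad

g_task = torch.randn(1000)
g_adv = torch.randn(1000)
g_combined = resolve_conflict(g_task, g_adv)
# The adjusted adversarial component no longer opposes the task gradient.
print(torch.dot(g_combined - g_task, g_task) >= -1e-4)
```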
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Mirror-Yolo: A Novel Attention Focus, Instance Segmentation and Mirror + Detection Model + + +
+ Mirrors can degrade the performance of computer vision models, but research +into detecting them is in the preliminary phase. YOLOv4 achieves phenomenal +results in terms of object detection accuracy and speed, but it still fails in +detecting mirrors. Thus, we propose Mirror-YOLO, which targets mirror +detection, containing a novel attention focus mechanism for feature +acquisition, a hypercolumn-stairstep approach to better fuse the feature maps, +and mirror bounding polygons for instance segmentation. Compared to the +existing mirror detection networks and the YOLO series, our proposed network +achieves superior average accuracy on our proposed mirror dataset and another +state-of-the-art mirror dataset, which demonstrates the validity and +effectiveness of Mirror-YOLO. + +
+
+
+
+
+ + ♻ ☆ CollaMamba: Efficient Collaborative Perception with Cross-Agent + Spatial-Temporal State Space Model + + +
+ By sharing complementary perceptual information, multi-agent collaborative +perception fosters a deeper understanding of the environment. Recent studies on +collaborative perception mostly utilize CNNs or Transformers to learn feature +representation and fusion in the spatial dimension, which struggle to handle +long-range spatial-temporal features under limited computing and communication +resources. Holistically modeling the dependencies over extensive spatial areas +and extended temporal frames is crucial to enhancing feature quality. To this +end, we propose a resource efficient cross-agent spatial-temporal collaborative +state space model (SSM), named CollaMamba. Initially, we construct a +foundational backbone network based on spatial SSM. This backbone adeptly +captures positional causal dependencies from both single-agent and cross-agent +views, yielding compact and comprehensive intermediate features while +maintaining linear complexity. Furthermore, we devise a history-aware feature +boosting module based on temporal SSM, extracting contextual cues from extended +historical frames to refine vague features while preserving low overhead. +Extensive experiments across several datasets demonstrate that CollaMamba +outperforms state-of-the-art methods, achieving higher model accuracy while +reducing computational and communication overhead by up to 71.9% and 1/64, +respectively. This work pioneers the exploration of the Mamba's potential in +collaborative perception. The source code will be made available. + +
+
+
+
+
+ + ♻ ☆ Establishing Causal Relationship Between Whole Slide Image Predictions + and Diagnostic Evidence Subregions in Deep Learning + + +
+ Due to the lack of fine-grained annotation guidance, current Multiple +Instance Learning (MIL) struggles to establish a causal relationship between +Whole Slide Image (WSI) diagnosis and evidence sub-images that is as robust as +in fully supervised learning. As a result, the many noisy images can undermine +the network's predictions. The proposed Causal Inference Multiple Instance +Learning (CI-MIL) uses out-of-distribution generalization to reduce the MIL +network's confusion in recognizing sub-images, without requiring pixel-wise +annotations. Specifically, feature distillation is introduced to roughly +identify the feature representation of lesion patches. Then, in the random +Fourier feature space, these features are re-weighted to minimize their +cross-correlation, effectively correcting the feature distribution deviation. +These processes reduce the uncertainty when tracing the prediction results back +to patches. Predicted diagnoses are more direct and reliable because the causal +relationship between them and the diagnostic evidence images is more clearly +recognized by the network. Experimental results demonstrate that CI-MIL +outperforms state-of-the-art methods, achieving 92.25% accuracy and 95.28% AUC +on the Camelyon16 dataset (breast cancer), and 94.29% accuracy and 98.07% AUC +on the TCGA-NSCLC dataset (non-small cell lung cancer). Additionally, CI-MIL +exhibits superior interpretability, as its selected regions demonstrate high +consistency with ground truth annotations, promising more reliable diagnostic +assistance for pathologists. + +
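The random Fourier feature space mentioned above is a standard construction approximating an RBF kernel; a minimal NumPy version is shown below. The feature dimension, kernel bandwidth, and placeholder patch embeddings are assumptions.

```python
import numpy as np

def random_fourier_features(X, dim=256, gamma=1.0, seed=0):
    """Map rows of X into a random Fourier feature space whose inner products
    approximate the RBF kernel exp(-gamma * ||x - y||^2)."""
    rng = np.random.default_rng(seed)
    W = rng.normal(scale=np.sqrt(2 * gamma), size=(X.shape[1], dim))
    b = rng.uniform(0, 2 * np.pi, size=dim)
    return np.sqrt(2.0 / dim) * np.cos(X @ W + b)

patch_features = np.random.randn(100, 512)   # hypothetical MIL patch embeddings
Z = random_fourier_features(patch_features)
print(Z.shape)  # (100, 256); re-weighting would then act on these features
```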
+
+
+
+
+ + ♻ ☆ CoVR-2: Automatic Data Construction for Composed Video Retrieval TPAMI 2024 + + +
+ Composed Image Retrieval (CoIR) has recently gained popularity as a task that +considers both text and image queries together, to search for relevant images +in a database. Most CoIR approaches require manually annotated datasets, +comprising image-text-image triplets, where the text describes a modification +from the query image to the target image. However, manual curation of CoIR +triplets is expensive and prevents scalability. In this work, we instead +propose a scalable automatic dataset creation methodology that generates +triplets given video-caption pairs, while also expanding the scope of the task +to include composed video retrieval (CoVR). To this end, we mine paired videos +with a similar caption from a large database, and leverage a large language +model to generate the corresponding modification text. Applying this +methodology to the extensive WebVid2M collection, we automatically construct +our WebVid-CoVR dataset, resulting in 1.6 million triplets. Moreover, we +introduce a new benchmark for CoVR with a manually annotated evaluation set, +along with baseline results. We further validate that our methodology is +equally applicable to image-caption pairs, by generating 3.3 million CoIR +training triplets using the Conceptual Captions dataset. Our model builds on +BLIP-2 pretraining, adapting it to composed video (or image) retrieval, and +incorporates an additional caption retrieval loss to exploit extra supervision +beyond the triplet. We provide extensive ablations to analyze the design +choices on our new CoVR benchmark. Our experiments also demonstrate that +training a CoVR model on our datasets effectively transfers to CoIR, leading to +improved state-of-the-art performance in the zero-shot setup on the CIRR, +FashionIQ, and CIRCO benchmarks. Our code, datasets, and models are publicly +available at https://imagine.enpc.fr/~ventural/covr/. + +
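The caption-mining step can be illustrated with a toy similarity search over captions; the TF-IDF similarity and thresholds below are stand-ins for the paper's actual retrieval setup.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy captions standing in for a large video-caption database.
captions = [
    "a man slices a tomato on a wooden board",
    "a man slices an onion on a wooden board",
    "a dog runs across a snowy field",
]

tfidf = TfidfVectorizer().fit_transform(captions)
sim = cosine_similarity(tfidf)

# Mine caption pairs that are similar but not identical; an LLM would then be
# prompted to describe the modification (e.g., "replace the tomato with an onion").
pairs = [(i, j) for i in range(len(captions)) for j in range(i + 1, len(captions))
         if 0.5 < sim[i, j] < 0.99]
print(pairs)  # expected: [(0, 1)]
```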
+
+ comment: Appears in TPAMI 2024 (DOI: 10.1109/TPAMI.2024.3463799). Journal + extension of the AAAI 2024 conference paper arXiv:2308.14746v3. Project page: + https://imagine.enpc.fr/~ventural/covr/ +
+
+
+
+
+ + ♻ ☆ POINTS: Improving Your Vision-language Model with Affordable Strategies + + +
+ In recent years, vision-language models have made significant strides, +excelling in tasks like optical character recognition and geometric +problem-solving. However, several critical issues remain: 1) Proprietary models +often lack transparency about their architectures, while open-source models +need more detailed ablations of their training strategies. 2) Pre-training data +in open-source works is under-explored, with datasets added empirically, making +the process cumbersome. 3) Fine-tuning often focuses on adding datasets, +leading to diminishing returns. To address these issues, we propose the +following contributions: 1) We trained a robust baseline model using the latest +advancements in vision-language models, introducing effective improvements and +conducting comprehensive ablation and validation for each technique. 2) +Inspired by recent work on large language models, we filtered pre-training data +using perplexity, selecting the lowest perplexity data for training. This +approach allowed us to train on a curated 1M dataset, achieving competitive +performance. 3) During visual instruction tuning, we used model soup on +different datasets when adding more datasets yielded marginal improvements. +These innovations resulted in a 9B parameter model that performs competitively +with state-of-the-art models. Our strategies are efficient and lightweight, +making them easily adoptable by the community. + +
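"Model soup" here refers to weight averaging across fine-tuned checkpoints; a uniform-soup sketch is below. Checkpoint selection and weighting are assumptions, not the paper's exact procedure.

```python
import torch

def uniform_model_soup(state_dicts):
    """Average the weights of several checkpoints with identical architectures."""
    soup = {}
    for key in state_dicts[0]:
        soup[key] = torch.stack([sd[key].float() for sd in state_dicts]).mean(dim=0)
    return soup

# Toy demo with two tiny state dicts; real usage would load checkpoints
# fine-tuned on different instruction datasets and call model.load_state_dict(soup).
sd_a = {"w": torch.tensor([1.0, 2.0])}
sd_b = {"w": torch.tensor([3.0, 4.0])}
print(uniform_model_soup([sd_a, sd_b]))  # {'w': tensor([2., 3.])}
```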
+
+ comment: v2 +
+
+
+
+
+ + ♻ ☆ Mini-Omni2: Towards Open-source GPT-4o with Vision, Speech and Duplex + Capabilities + + +
+ GPT-4o, an all-encompassing model, represents a milestone in the development +of large multi-modal language models. It can understand visual, auditory, and +textual modalities, directly output audio, and support flexible duplex +interaction. Models from the open-source community often achieve some +functionalities of GPT-4o, such as visual understanding and voice chat. +Nevertheless, training a unified model that incorporates all modalities is +challenging due to the complexities of multi-modal data, intricate model +architectures, and training processes. In this paper, we introduce Mini-Omni2, +a visual-audio assistant capable of providing real-time, end-to-end voice +responses to vision and audio queries. By integrating pretrained visual and +auditory encoders, Mini-Omni2 maintains performance in individual modalities. +We propose a three-stage training process to align modalities, allowing the +language model to handle multi-modal inputs and outputs after training on a +limited dataset. For interaction, we introduce a command-based interruption +mechanism, enabling more flexible interaction with users. To the best of our +knowledge, Mini-Omni2 is one of the closest reproductions of GPT-4o with a +similar form of functionality, and we hope it can offer valuable insights for +subsequent research. + +
+
+ comment: Technical report, work in progress. Demo and code: + https://github.com/gpt-omni/mini-omni2 +
+
+
+
+
+ + ♻ ☆ Rethinking Misalignment in Vision-Language Model Adaptation from a + Causal Perspective NeurIPS 2024 + + +
+ Foundational Vision-Language models such as CLIP have exhibited impressive +generalization in downstream tasks. However, CLIP suffers from a two-level +misalignment issue, i.e., task misalignment and data misalignment, when +adapting to specific tasks. Soft prompt tuning has mitigated the task +misalignment, yet the data misalignment remains a challenge. To analyze the +impacts of the data misalignment, we revisit the pre-training and adaptation +processes of CLIP and develop a structural causal model. We discover that while +we expect to capture task-relevant information for downstream tasks accurately, +the task-irrelevant knowledge impacts the prediction results and hampers the +modeling of the true relationships between the images and the predicted +classes. As task-irrelevant knowledge is unobservable, we leverage the +front-door adjustment and propose Causality-Guided Semantic Decoupling and +Classification (CDC) to mitigate the interference of task-irrelevant knowledge. +Specifically, we decouple semantics contained in the data of downstream tasks +and perform classification based on each semantic. Furthermore, we employ the +Dempster-Shafer evidence theory to evaluate the uncertainty of each prediction +generated by diverse semantics. Experiments conducted in multiple different +settings have consistently demonstrated the effectiveness of CDC. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Seeing the Image: Prioritizing Visual Correlation by Contrastive + Alignment + + +
+ Existing image-text modality alignment in Vision Language Models (VLMs) +treats each text token equally in an autoregressive manner. Despite being +simple and effective, this method results in sub-optimal cross-modal alignment +by over-emphasizing the text tokens that are less correlated with, or even +contradictory to, the input images. In this paper, we advocate for assigning +distinct contributions to each text token based on its visual correlation. +Specifically, we show that, by contrasting image inputs, the difference in +prediction logits on each text token provides strong guidance on visual +correlation. We therefore introduce Contrastive ALignment (CAL), a simple yet +effective re-weighting strategy that prioritizes training visually correlated +tokens. Our experimental results demonstrate that CAL consistently improves +different types of VLMs across different resolutions and model sizes on various +benchmark datasets. Importantly, our method incurs minimal additional +computational overhead, rendering it highly efficient compared to alternative +data scaling strategies. Code is available at +https://github.com/foundation-multimodal-models/CAL. + +
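A rough sketch of the token re-weighting idea: compare target-token logits with and without the image and up-weight tokens whose logits benefit from the image. The clamping and normalization below are assumptions for illustration, not CAL's published formula.

```python
import torch
import torch.nn.functional as F

def visually_weighted_lm_loss(logits_with_img, logits_without_img, targets):
    """Weight each token's cross-entropy by how much the image raises the
    logit of its target token (a proxy for visual correlation)."""
    tgt = targets.unsqueeze(-1)
    logit_w = logits_with_img.gather(-1, tgt).squeeze(-1)
    logit_wo = logits_without_img.gather(-1, tgt).squeeze(-1)
    weights = torch.clamp(logit_w - logit_wo, min=0.0)            # visual correlation proxy
    weights = weights / (weights.sum() + 1e-8) * targets.numel()  # keep average weight ~1
    token_ce = F.cross_entropy(logits_with_img.view(-1, logits_with_img.size(-1)),
                               targets.view(-1), reduction="none")
    return (weights.view(-1) * token_ce).mean()

# Hypothetical logits of shape (batch, sequence, vocab) with and without the image.
logits_img = torch.randn(1, 6, 100)
logits_noimg = torch.randn(1, 6, 100)
targets = torch.randint(0, 100, (1, 6))
print(visually_weighted_lm_loss(logits_img, logits_noimg, targets))
```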
+
+ comment: NeurlPS 2024, Camera ready +
+
+
+
+
+ + ♻ ☆ Unified Human-Scene Interaction via Prompted Chain-of-Contacts + + +
+ Human-Scene Interaction (HSI) is a vital component of fields like embodied AI +and virtual reality. Despite advancements in motion quality and physical +plausibility, two pivotal factors, versatile interaction control and the +development of a user-friendly interface, require further exploration before +the practical application of HSI. This paper presents a unified HSI framework, +UniHSI, which supports unified control of diverse interactions through language +commands. This framework is built upon the definition of interaction as Chain +of Contacts (CoC): steps of human joint-object part pairs, which is inspired by +the strong correlation between interaction types and human-object contact +regions. Based on the definition, UniHSI constitutes a Large Language Model +(LLM) Planner to translate language prompts into task plans in the form of CoC, +and a Unified Controller that turns CoC into uniform task execution. To +facilitate training and evaluation, we collect a new dataset named ScenePlan +that encompasses thousands of task plans generated by LLMs based on diverse +scenarios. Comprehensive experiments demonstrate the effectiveness of our +framework in versatile task execution and generalizability to real scanned +scenes. The project page is at https://github.com/OpenRobotLab/UniHSI . + +
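A Chain of Contacts can be represented as an ordered list of steps, each pairing human joints with object parts; the field names and joint/part labels below are hypothetical, not the paper's schema.

```python
# Toy CoC for "sit down on the chair": each step lists (human joint, object part) contacts.
chain_of_contacts = [
    {"step": 1, "contacts": [("right_hand", "chair_backrest")]},
    {"step": 2, "contacts": [("pelvis", "chair_seat"), ("back", "chair_backrest")]},
]

for step in chain_of_contacts:
    pairs = ", ".join(f"{joint} -> {part}" for joint, part in step["contacts"])
    print(f"step {step['step']}: {pairs}")
```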
+
+ comment: A unified Human-Scene Interaction framework that supports versatile
+ interactions through language commands. Project URL:
+ https://xizaoqu.github.io/unihsi/. Code:
+ https://github.com/OpenRobotLab/UniHSI
+
+
+
+
+
+ + ♻ ☆ Exploring PCA-based feature representations of image pixels via CNN to + enhance food image segmentation + + +
+ For open vocabulary recognition of ingredients in food images, segmenting the +ingredients is a crucial step. This paper proposes a novel approach that +explores PCA-based feature representations of image pixels using a +convolutional neural network (CNN) to enhance segmentation. An internal +clustering metric based on the silhouette score is defined to evaluate the +clustering quality of various pixel-level feature representations generated by +different feature maps derived from various CNN backbones. Using this metric, +the paper explores optimal feature representation selection and suitable +clustering methods for ingredient segmentation. Additionally, it is found that +principal component (PC) maps derived from concatenations of backbone feature +maps improve the clustering quality of pixel-level feature representations, +resulting in stable segmentation outcomes. Notably, the number of selected +eigenvalues can be used as the number of clusters to achieve good segmentation +results. The proposed method performs well on the ingredient-labeled dataset +FoodSeg103, achieving a mean Intersection over Union (mIoU) score of 0.5423. +Importantly, the proposed method is unsupervised, and pixel-level feature +representations from backbones are not fine-tuned on specific datasets. This +demonstrates the flexibility, generalizability, and interpretability of the +proposed method, while reducing the need for extensive labeled datasets. + +
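+ The clustering pipeline described above can be sketched in a few lines; the
+variance threshold and the use of KMeans below are illustrative assumptions
+rather than the paper's exact configuration:
+
+    import numpy as np
+    from sklearn.decomposition import PCA
+    from sklearn.cluster import KMeans
+    from sklearn.metrics import silhouette_score
+
+    def segment_pixels(feature_maps, variance_to_keep=0.95):
+        # feature_maps: list of (H, W, C_i) backbone feature maps resized to a
+        # common spatial resolution, concatenated along the channel axis.
+        feats = np.concatenate(feature_maps, axis=-1)
+        H, W, C = feats.shape
+        X = feats.reshape(-1, C)                    # one row per pixel
+        Z = PCA(n_components=variance_to_keep, svd_solver="full").fit_transform(X)
+        k = Z.shape[1]                              # clusters = retained eigenvalues
+        labels = KMeans(n_clusters=k, n_init=10).fit_predict(Z)
+        quality = silhouette_score(Z, labels, sample_size=5000)
+        return labels.reshape(H, W), quality
+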
+
+
+
+
+ + ♻ ☆ Optimizing Negative Prompts for Enhanced Aesthetics and Fidelity in + Text-To-Image Generation + + +
+ In text-to-image generation, using negative prompts, which describe +undesirable image characteristics, can significantly boost image quality. +However, producing good negative prompts is manual and tedious. To address +this, we propose NegOpt, a novel method for optimizing negative prompt +generation toward enhanced image generation, using supervised fine-tuning and +reinforcement learning. Our combined approach results in a substantial increase +of 25% in Inception Score compared to other approaches and surpasses +ground-truth negative prompts from the test set. Furthermore, with NegOpt we +can preferentially optimize the metrics most important to us. Finally, we +construct Negative Prompts DB +(https://huggingface.co/datasets/mikeogezi/negopt_full), a publicly available +dataset of negative prompts. + +
+
+
+
+
+ + ♻ ☆ MS-DETR: Multispectral Pedestrian Detection Transformer with Loosely + Coupled Fusion and Modality-Balanced Optimization + + +
+ Multispectral pedestrian detection is an important task for many
+around-the-clock applications, since the visible and thermal modalities can
+provide complementary information, especially under low-light conditions. Due
+to the presence of two modalities, misalignment and modality imbalance are the
+most significant issues in multispectral pedestrian detection. In this paper,
+we propose the MultiSpectral pedestrian DEtection TRansformer (MS-DETR) to
+address these issues. MS-DETR consists of two modality-specific backbones and
+Transformer encoders, followed by a multi-modal Transformer decoder in which
+the visible and thermal features are fused. To resist the misalignment between
+multi-modal images, we design a loosely coupled fusion strategy that sparsely
+samples keypoints from multi-modal features independently and fuses them with
+adaptively learned attention weights. Moreover, based on the insight that not
+only different modalities but also different pedestrian instances tend to
+contribute different confidence scores to the final detection, we further
+propose an instance-aware modality-balanced optimization strategy, which
+preserves visible and thermal decoder branches and aligns their predicted
+slots through an instance-wise dynamic loss. Our end-to-end MS-DETR shows
+superior performance on the challenging KAIST, CVC-14 and LLVIP benchmark
+datasets. The source code is available at
+https://github.com/YinghuiXing/MS-DETR.
+
+
+
+ comment: The paper has been accepted by IEEE Transactions on Intelligent + Transportation Systems +
+
+
+
+
+ + ♻ ☆ RIAV-MVS: Recurrent-Indexing an Asymmetric Volume for Multi-View Stereo CVPR 2023 + + +
+ This paper presents a learning-based method for multi-view depth estimation +from posed images. Our core idea is a "learning-to-optimize" paradigm that +iteratively indexes a plane-sweeping cost volume and regresses the depth map +via a convolutional Gated Recurrent Unit (GRU). Since the cost volume plays a +paramount role in encoding the multi-view geometry, we aim to improve its +construction both at pixel- and frame- levels. At the pixel level, we propose +to break the symmetry of the Siamese network (which is typically used in MVS to +extract image features) by introducing a transformer block to the reference +image (but not to the source images). Such an asymmetric volume allows the +network to extract global features from the reference image to predict its +depth map. Given potential inaccuracies in the poses between reference and +source images, we propose to incorporate a residual pose network to correct the +relative poses. This essentially rectifies the cost volume at the frame level. +We conduct extensive experiments on real-world MVS datasets and show that our +method achieves state-of-the-art performance in terms of both within-dataset +evaluation and cross-dataset generalization. Code available: +https://github.com/oppo-us-research/riav-mvs. + +
+
+ comment: CVPR 2023. Code link added +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors.
+Existing methods discover error slices by clustering error-prone samples with
+similar patterns or by assigning discrete attributes to each sample for
+post-hoc analysis. While these methods aim for interpretability and easier
+mitigation through reweighting or rebalancing, they may not capture the full
+complexity of error patterns due to incomplete or missing attributes. Contrary
+to existing approaches, this paper utilizes the reasoning capabilities of a
+Large Language Model (LLM) to analyze complex error patterns and generate
+testable hypotheses. This paper proposes LADDER: Language Driven slice
+Discovery and Error Rectification. It first projects the model's representation
+into a language-aligned feature space (e.g., CLIP) to preserve the semantics of
+the original model feature space. This ensures the accurate retrieval of
+sentences that highlight the model's errors. Next, the LLM utilizes the
+sentences and generates hypotheses to discover error slices. Finally, we
+mitigate the errors by fine-tuning the classification head on a group-balanced
+dataset created using the hypotheses. Our entire method does not require any
+attribute annotation, either explicitly or through external tagging models. We
+validate our method on five image classification datasets.
+
+
+
+
+
+
+ + ♻ ☆ LifelongMemory: Leveraging LLMs for Answering Queries in Long-form + Egocentric Videos + + +
+ In this paper we introduce LifelongMemory, a new framework for accessing +long-form egocentric videographic memory through natural language question +answering and retrieval. LifelongMemory generates concise video activity +descriptions of the camera wearer and leverages the zero-shot capabilities of +pretrained large language models to perform reasoning over long-form video +context. Furthermore, LifelongMemory uses a confidence and explanation module +to produce confident, high-quality, and interpretable answers. Our approach +achieves state-of-the-art performance on the EgoSchema benchmark for question +answering and is highly competitive on the natural language query (NLQ) +challenge of Ego4D. Code is available at +https://github.com/agentic-learning-ai-lab/lifelong-memory. + +
+
+
+
+
+ + ♻ ☆ D3: Data Diversity Design for Systematic Generalization in Visual + Question Answering + + +
+ Systematic generalization is a crucial aspect of intelligence, which refers +to the ability to generalize to novel tasks by combining known subtasks and +concepts. One critical factor that has been shown to influence systematic +generalization is the diversity of training data. However, diversity can be +defined in various ways, as data have many factors of variation. A more +granular understanding of how different aspects of data diversity affect +systematic generalization is lacking. We present new evidence in the problem of +Visual Question Answering (VQA) that reveals that the diversity of simple tasks +(i.e. tasks formed by a few subtasks and concepts) plays a key role in +achieving systematic generalization. This implies that it may not be essential +to gather a large and varied number of complex tasks, which could be costly to +obtain. We demonstrate that this result is independent of the similarity +between the training and testing data and applies to well-known families of +neural network architectures for VQA (i.e. monolithic architectures and neural +module networks). Additionally, we observe that neural module networks leverage +all forms of data diversity we evaluated, while monolithic architectures +require more extensive amounts of data to do so. These findings provide a first +step towards understanding the interactions between data diversity design, +neural network architectures, and systematic generalization capabilities. + +
+
+ comment: TMLR (https://openreview.net/forum?id=ZAin13msOp) +
+
+
+
+
+ + ♻ ☆ Interpretable Lightweight Transformer via Unrolling of Learned Graph + Smoothness Priors + + +
+ We build interpretable and lightweight transformer-like neural networks by +unrolling iterative optimization algorithms that minimize graph smoothness +priors -- the quadratic graph Laplacian regularizer (GLR) and the $\ell_1$-norm +graph total variation (GTV) -- subject to an interpolation constraint. The +crucial insight is that a normalized signal-dependent graph learning module +amounts to a variant of the basic self-attention mechanism in conventional +transformers. Unlike "black-box" transformers that require learning of large +key, query and value matrices to compute scaled dot products as affinities and +subsequent output embeddings, resulting in huge parameter sets, our unrolled +networks employ shallow CNNs to learn low-dimensional features per node to +establish pairwise Mahalanobis distances and construct sparse similarity +graphs. At each layer, given a learned graph, the target interpolated signal is +simply a low-pass filtered output derived from the minimization of an assumed +graph smoothness prior, leading to a dramatic reduction in parameter count. +Experiments for two image interpolation applications verify the restoration +performance, parameter efficiency and robustness to covariate shift of our +graph-based unrolled networks compared to conventional transformers. + +
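+ For reference, the quadratic GLR interpolation underlying each unrolled layer
+can be written in a generic form (a sketch of the standard penalized version;
+the paper uses an interpolation constraint, so its exact formulation may
+differ):
+
+    % y: observed samples, H: sampling matrix, L: graph Laplacian, mu > 0
+    \min_{\mathbf{x}} \; \|\mathbf{y}-\mathbf{H}\mathbf{x}\|_2^2
+        + \mu\,\mathbf{x}^{\top}\mathbf{L}\mathbf{x}
+    \;\;\Longrightarrow\;\;
+    (\mathbf{H}^{\top}\mathbf{H}+\mu\mathbf{L})\,\mathbf{x}^{\star}
+        = \mathbf{H}^{\top}\mathbf{y},
+
+ i.e. the interpolated signal is a low-pass graph filter applied to the
+observations, and unrolling an iterative solver of this system, with the graph
+learned per layer, yields the transformer-like network described above.
+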
+
+
+
+
+ + ♻ ☆ GD doesn't make the cut: Three ways that non-differentiability affects + neural network training + + +
+ This paper investigates the distinctions between gradient methods applied to
+non-differentiable functions (NGDMs) and classical gradient descents (GDs)
+designed for differentiable functions. First, we demonstrate significant
+differences in the convergence properties of NGDMs compared to GDs, challenging
+the applicability of the extensive neural network convergence literature based
+on $L$-smoothness to non-smooth neural networks. Next, we demonstrate the
+paradoxical nature of NGDM solutions for $L_{1}$-regularized problems, showing
+that increasing the regularization penalty leads to an increase in the $L_{1}$
+norm of optimal solutions in NGDMs. Consequently, we show that widely adopted
+$L_{1}$ penalization-based techniques for network pruning do not yield expected
+results. Additionally, we dispel the common belief that optimization algorithms
+like Adam and RMSProp perform similarly in non-differentiable contexts.
+Finally, we explore the Edge of Stability phenomenon, indicating its
+inapplicability even to Lipschitz continuous convex differentiable functions,
+leaving its relevance to non-convex non-differentiable neural networks
+inconclusive. Our analysis exposes misguided interpretations of NGDMs in widely
+referenced papers and texts due to an overreliance on strong smoothness
+assumptions, emphasizing the necessity for a nuanced understanding of
+foundational assumptions in the analysis of these systems.
+
+
+
+
+
+
+ + ♻ ☆ Anatomical Foundation Models for Brain MRIs + + +
+ Deep Learning (DL) in neuroimaging has become increasingly relevant for +detecting neurological conditions and neurodegenerative disorders. One of the +most predominant biomarkers in neuroimaging is represented by brain age, which +has been shown to be a good indicator for different conditions, such as +Alzheimer's Disease. Using brain age for pretraining DL models in transfer +learning settings has also recently shown promising results, especially when +dealing with data scarcity of different conditions. On the other hand, +anatomical information of brain MRIs (e.g. cortical thickness) can provide +important information for learning good representations that can be transferred +to many downstream tasks. In this work, we propose AnatCL, an anatomical +foundation model for brain MRIs that i.) leverages anatomical information with +a weakly contrastive learning approach and ii.) achieves state-of-the-art +performances in many different downstream tasks. To validate our approach we +consider 12 different downstream tasks for diagnosis classification, and +prediction of 10 different clinical assessment scores. Pretrained models can be +found at https://github.com/EIDOSLAB/AnatCL. + +
+
+ comment: 12 pages; added source url +
+
+
+
+
+ + ♻ ☆ FewViewGS: Gaussian Splatting with Few View Matching and Multi-stage + Training NeurIPS2024 + + +
+ The field of novel view synthesis from images has seen rapid advancements +with the introduction of Neural Radiance Fields (NeRF) and more recently with +3D Gaussian Splatting. Gaussian Splatting became widely adopted due to its +efficiency and ability to render novel views accurately. While Gaussian +Splatting performs well when a sufficient amount of training images are +available, its unstructured explicit representation tends to overfit in +scenarios with sparse input images, resulting in poor rendering performance. To +address this, we present a 3D Gaussian-based novel view synthesis method using +sparse input images that can accurately render the scene from the viewpoints +not covered by the training images. We propose a multi-stage training scheme +with matching-based consistency constraints imposed on the novel views without +relying on pre-trained depth estimation or diffusion models. This is achieved +by using the matches of the available training images to supervise the +generation of the novel views sampled between the training frames with color, +geometry, and semantic losses. In addition, we introduce a locality preserving +regularization for 3D Gaussians which removes rendering artifacts by preserving +the local color structure of the scene. Evaluation on synthetic and real-world +datasets demonstrates competitive or superior performance of our method in +few-shot novel view synthesis compared to existing state-of-the-art methods. + +
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ SLVideo: A Sign Language Video Moment Retrieval Framework + + +
+ SLVideo is a video moment retrieval system for Sign Language videos that
+incorporates facial expressions, addressing a gap in existing technology. The
+system extracts embedding representations for the hand and face signs from
+video frames to capture the signs in their entirety, enabling users to search
+for a specific sign language video segment with text queries. A collection of
+eight hours of annotated Portuguese Sign Language videos is used as the
+dataset, and a CLIP model is used to generate the embeddings. The initial
+results are promising in a zero-shot setting. In addition, SLVideo incorporates
+a thesaurus that enables users to search for signs similar to those retrieved,
+using the video segment embeddings, and it also supports the editing and
+creation of video sign language annotations. Project web page:
+https://novasearch.github.io/SLVideo/
+
+
+
+ comment: 4 pages, 1 figure, 1 table +
+
+
+
+
+ + ♻ ☆ DC-Gaussian: Improving 3D Gaussian Splatting for Reflective Dash Cam + Videos NeurIPS 2024 + + +
+ We present DC-Gaussian, a new method for generating novel views from
+in-vehicle dash cam videos. While neural rendering techniques have made
+significant strides in driving scenarios, existing methods are primarily
+designed for videos collected by autonomous vehicles. However, these videos are
+limited in both quantity and diversity compared to dash cam videos, which are
+more widely used across various types of vehicles and capture a broader range
+of scenarios. Dash cam videos often suffer from severe obstructions such as
+reflections and occlusions on the windshields, which significantly impede the
+application of neural rendering techniques. To address this challenge, we
+develop DC-Gaussian based on the recent real-time neural rendering technique 3D
+Gaussian Splatting (3DGS). Our approach includes an adaptive image
+decomposition module to model reflections and occlusions in a unified manner.
+Additionally, we introduce illumination-aware obstruction modeling to manage
+reflections and occlusions under varying lighting conditions. Lastly, we employ
+a geometry-guided Gaussian enhancement strategy to improve rendering details by
+incorporating additional geometry priors. Experiments on self-captured and
+public dash cam videos show that our method not only achieves state-of-the-art
+performance in novel view synthesis, but also accurately reconstructs the
+captured scenes free of obstructions. See the project page for code and data:
+https://linhanwang.github.io/dcgaussian/.
+
+
+
+ comment: 10 pages,7 figures;project page: + https://linhanwang.github.io/dcgaussian/; Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ODGEN: Domain-specific Object Detection Data Generation with Diffusion + Models NeurIPS2024 + + +
+ Modern diffusion-based image generative models have made significant progress
+and become a promising way to enrich training data for the object detection
+task. However, the generation quality and the controllability for complex
+scenes containing multi-class objects and dense objects with occlusions remain
+limited. This paper presents ODGEN, a novel method to generate high-quality
+images conditioned on bounding boxes, thereby facilitating data synthesis for
+object detection. Given a domain-specific object detection dataset, we first
+fine-tune a pre-trained diffusion model on both cropped foreground objects and
+entire images to fit target distributions. Then we propose to control the
+diffusion model using synthesized visual prompts with spatial constraints and
+object-wise textual descriptions. ODGEN exhibits robustness in handling complex
+scenes and specific domains. Further, we design a dataset synthesis pipeline to
+evaluate ODGEN on 7 domain-specific benchmarks to demonstrate its
+effectiveness. Adding training data generated by ODGEN improves mAP@.50:.95 by
+up to 25.3% with object detectors like YOLOv5 and YOLOv7, outperforming prior
+controllable generative methods. In addition, we design an evaluation protocol
+based on COCO-2014 to validate ODGEN in general domains and observe an
+advantage of up to 5.6% in mAP@.50:.95 over existing methods.
+
+
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 73 + +
+
+
+ + ☆ Adaptive Length Image Tokenization via Recurrent Allocation + + +
+ Current vision systems typically assign fixed-length representations to +images, regardless of the information content. This contrasts with human +intelligence - and even large language models - which allocate varying +representational capacities based on entropy, context and familiarity. Inspired +by this, we propose an approach to learn variable-length token representations +for 2D images. Our encoder-decoder architecture recursively processes 2D image +tokens, distilling them into 1D latent tokens over multiple iterations of +recurrent rollouts. Each iteration refines the 2D tokens, updates the existing +1D latent tokens, and adaptively increases representational capacity by adding +new tokens. This enables compression of images into a variable number of +tokens, ranging from 32 to 256. We validate our tokenizer using reconstruction +loss and FID metrics, demonstrating that token count aligns with image entropy, +familiarity and downstream task requirements. Recurrent token processing with +increasing representational capacity in each iteration shows signs of token +specialization, revealing potential for object / part discovery. + +
+
+ comment: Code at: https://github.com/ShivamDuggal4/adaptive-length-tokenizer +
+
+
+
+
+ + ☆ DeeR-VLA: Dynamic Inference of Multimodal Large Language Models for + Efficient Robot Execution NeurIPS 2024 + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated remarkable
+comprehension and reasoning capabilities with complex language and visual data.
+These advances have spurred the vision of establishing a generalist robotic
+MLLM proficient in understanding complex human instructions and accomplishing
+various embodied tasks. However, developing MLLMs for real-world robots is
+challenging due to the typically limited computation and memory capacities
+available on robotic platforms. In contrast, the inference of MLLMs involves
+storing billions of parameters and performing tremendous computation, imposing
+significant hardware demands. In our paper, we propose a Dynamic Early-Exit
+Framework for Robotic Vision-Language-Action Model (DeeR-VLA, or simply DeeR)
+that automatically adjusts the size of the activated MLLM based on each
+situation at hand. The approach leverages a multi-exit architecture in MLLMs,
+which allows the model to terminate processing once a proper size of the model
+has been activated for a specific situation, thus avoiding further redundant
+computation. Additionally, we develop novel algorithms that establish
+early-termination criteria for DeeR, conditioned on predefined demands such as
+average computational cost (i.e., power consumption), as well as peak
+computational consumption (i.e., latency) and GPU memory usage. These
+enhancements ensure that DeeR operates efficiently under varying resource
+constraints while maintaining competitive performance. On the CALVIN robot
+manipulation benchmark, DeeR demonstrates significant reductions in the LLM's
+computational cost (by 5.2-6.5x) and GPU memory usage (by 2-6x) without
+compromising performance. Code and checkpoints are available at
+https://github.com/yueyang130/DeeR-VLA.
+
+
+
+ comment: 25 pages, 6 figures, NeurIPS 2024 +
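+ A hedged sketch of the multi-exit inference loop described above; the block
+and exit-head names and the feature-change termination criterion are
+illustrative assumptions, not DeeR's exact demand-conditioned criteria:
+
+    import torch
+
+    @torch.no_grad()
+    def early_exit_forward(blocks, exit_heads, x, feature_tol=0.05):
+        prev = None
+        for block, head in zip(blocks, exit_heads):
+            x = block(x)
+            # Stop once intermediate features stabilize, i.e. activating a
+            # larger model would add little for the current situation.
+            if prev is not None:
+                rel_change = (x - prev).norm() / (prev.norm() + 1e-8)
+                if rel_change < feature_tol:
+                    return head(x)
+            prev = x
+        return exit_heads[-1](x)
+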
+
+
+
+
+ + ☆ Simulation of Nanorobots with Artificial Intelligence and Reinforcement + Learning for Advanced Cancer Cell Detection and Tracking + + +
+ Nanorobots are a promising development in targeted drug delivery and the +treatment of neurological disorders, with potential for crossing the +blood-brain barrier (BBB). These small devices leverage advancements in +nanotechnology and bioengineering for precise navigation and targeted payload +delivery, particularly for conditions like brain tumors, Alzheimer's disease, +and Parkinson's disease. Recent progress in artificial intelligence (AI) and +machine learning (ML) has improved the navigation and effectiveness of +nanorobots, allowing them to detect and interact with cancer cells through +biomarker analysis. This study presents a new reinforcement learning (RL) +framework for optimizing nanorobot navigation in complex biological +environments, focusing on cancer cell detection by analyzing the concentration +gradients of surrounding biomarkers. We utilize a computer simulation model to +explore the behavior of nanorobots in a three-dimensional space with cancer +cells and biological barriers. The proposed method uses Q-learning to refine +movement strategies based on real-time biomarker concentration data, enabling +nanorobots to autonomously navigate to cancerous tissues for targeted drug +delivery. This research lays the groundwork for future laboratory experiments +and clinical applications, with implications for personalized medicine and less +invasive cancer treatments. The integration of intelligent nanorobots could +revolutionize therapeutic strategies, reducing side effects and enhancing +treatment effectiveness for cancer patients. Further research will investigate +the practical deployment of these technologies in medical settings, aiming to +unlock the full potential of nanorobotics in healthcare. + +
+
+ comment: The source code for this simulation is available on GitHub: + https://github.com/SHAHAB-K93/cancer-and-smart-nanorobot +
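+ The Q-learning component mentioned above reduces to the standard tabular
+update; the reward definition (change in local biomarker concentration) and
+the state/action discretization are assumptions for illustration:
+
+    import numpy as np
+
+    def q_update(Q, state, action, next_state, reward, alpha=0.1, gamma=0.95):
+        # reward: e.g. increase in sensed biomarker concentration after moving,
+        # so gradient-ascending moves toward cancerous tissue are reinforced.
+        td_target = reward + gamma * np.max(Q[next_state])
+        Q[state, action] += alpha * (td_target - Q[state, action])
+        return Q
+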
+
+
+
+
+ + ☆ Kilovolt Pyroelectric Voltage Generation and Electrostatic Actuation + With Fluidic Heating + + +
+ Integrated micro power generators are crucial components for micro robotic
+platforms to demonstrate untethered operation and to achieve autonomy. Current
+micro robotic electrostatic actuators typically require hundreds to thousands
+of volts to output sufficient work. Pyroelectricity is one such source of high
+voltages that can be scaled to small form factors. This paper demonstrates a
+distributed pyroelectric high voltage generation mechanism to power kV
+actuators using alternating exposure of crystals to hot and cold water (30°C to
+90°C water temperature). Using this fluidic temperature control, a
+pyroelectrically generated voltage of 2470 V was delivered to a 2 pF storage
+capacitor, yielding 6.10 μJ of stored energy. A maximum energy of 17.46 μJ was
+delivered to a 47 pF capacitor at 861 V. The recirculating water can be used to
+heat a distributed array of converters to generate electricity in distant
+robotic actuator sections. The development of this distributed system would
+enable untethered micro-robots to operate with flexible bodies and without
+battery recharging, advancing their applications in the real world.
+
+
+
+ comment: Accepted and published at Hilton Head Workshop 2022: A Solid-State + Sensors, Actuators and Microsystems Workshop +
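+ The reported stored energies are consistent with the standard capacitor
+energy relation, which serves as a quick sanity check of the figures above:
+
+    E = \tfrac{1}{2} C V^{2}, \qquad
+    \tfrac{1}{2}\,(2\,\mathrm{pF})(2470\,\mathrm{V})^{2} \approx 6.1\,\mu\mathrm{J}, \qquad
+    \tfrac{1}{2}\,(47\,\mathrm{pF})(861\,\mathrm{V})^{2} \approx 17.4\,\mu\mathrm{J}.
+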
+
+
+
+
+ + ☆ Energy-Aware Coverage Planning for Heterogeneous Multi-Robot System + + +
+ We propose a distributed control law for a heterogeneous multi-robot coverage +problem, where the robots could have different energy characteristics, such as +capacity and depletion rates, due to their varying sizes, speeds, capabilities, +and payloads. Existing energy-aware coverage control laws consider capacity +differences but assume the battery depletion rate to be the same for all +robots. In realistic scenarios, however, some robots can consume energy much +faster than other robots; for instance, UAVs hover at different altitudes, and +these changes could be dynamically updated based on their assigned tasks. +Robots' energy capacities and depletion rates need to be considered to maximize +the performance of a multi-robot system. To this end, we propose a new +energy-aware controller based on Lloyd's algorithm to adapt the weights of the +robots based on their energy dynamics and divide the area of interest among the +robots accordingly. The controller is theoretically analyzed and extensively +evaluated through simulations and real-world demonstrations in multiple +realistic scenarios and compared with three baseline control laws to validate +its performance and efficacy. + +
+
+ comment: Presented at DARS 2024 +
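+ A rough sketch of one Lloyd-style iteration with energy-dependent weights (a
+power-diagram assignment over a discretized area of interest); the specific
+weight rule below is an illustrative assumption, not the paper's control law:
+
+    import numpy as np
+
+    def lloyd_step(positions, energies, depletion_rates, grid_pts, gain=0.5):
+        # More remaining energy per unit depletion rate -> larger weight
+        # -> a larger share of the area of interest.
+        weights = energies / (depletion_rates + 1e-9)
+        d2 = ((grid_pts[:, None, :] - positions[None, :, :]) ** 2).sum(-1)
+        assign = np.argmin(d2 - weights[None, :], axis=1)
+        new_positions = positions.copy()
+        for i in range(len(positions)):
+            cell = grid_pts[assign == i]
+            if len(cell):  # move each robot toward its weighted-cell centroid
+                new_positions[i] += gain * (cell.mean(axis=0) - positions[i])
+        return new_positions
+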
+
+
+
+
+ + ☆ DexHub and DART: Towards Internet Scale Robot Data Collection + + +
+ The quest to build a generalist robotic system is impeded by the scarcity of
+diverse and high-quality data. While real-world data collection efforts exist,
+requirements for robot hardware, physical environment setups, and frequent
+resets significantly impede the scalability needed for modern learning
+frameworks. We introduce DART, a teleoperation platform designed for
+crowdsourcing that reimagines robotic data collection by leveraging cloud-based
+simulation and augmented reality (AR) to address many limitations of prior data
+collection efforts. Our user studies highlight that DART enables higher data
+collection throughput and lower physical fatigue compared to real-world
+teleoperation. We also demonstrate that policies trained using DART-collected
+datasets successfully transfer to reality and are robust to unseen visual
+disturbances. All data collected through DART is automatically stored in our
+cloud-hosted database, DexHub, which will be made publicly available upon
+curation, paving the way for DexHub to become an ever-growing data hub for
+robot learning. Videos are available at: https://dexhub.ai/project
+
+
+
+ comment: Visit https://dexhub.ai/project for more details +
+
+
+
+
+ + ☆ DiffSim2Real: Deploying Quadrupedal Locomotion Policies Purely Trained + in Differentiable Simulation + + +
+ Differentiable simulators provide analytic gradients, enabling more +sample-efficient learning algorithms and paving the way for data intensive +learning tasks such as learning from images. In this work, we demonstrate that +locomotion policies trained with analytic gradients from a differentiable +simulator can be successfully transferred to the real world. Typically, +simulators that offer informative gradients lack the physical accuracy needed +for sim-to-real transfer, and vice-versa. A key factor in our success is a +smooth contact model that combines informative gradients with physical +accuracy, ensuring effective transfer of learned behaviors. To the best of our +knowledge, this is the first time a real quadrupedal robot is able to locomote +after training exclusively in a differentiable simulation. + +
+
+ comment: Presented at the CoRL 2024 Workshop 'Differentiable Optimization + Everywhere' +
+
+
+
+
+ + ☆ Touch-to-Touch Translation -- Learning the Mapping Between Heterogeneous + Tactile Sensing Technologies IROS + + +
+ The use of data-driven techniques for tactile data processing and
+classification has recently increased. However, collecting tactile data is a
+time-consuming and sensor-specific procedure. Indeed, due to the lack of
+hardware standards in tactile sensing, data must be collected for each
+different sensor. This paper considers the problem of learning the mapping
+between two tactile sensor outputs with respect to the same physical stimulus
+-- we refer to this problem as touch-to-touch translation. In this respect, we
+proposed two data-driven approaches to address this task and compared their
+performance. The first one exploits a generative model developed for
+image-to-image translation and adapted for this context. The second one uses a
+ResNet model trained to perform a regression task. We validated both methods
+using two completely different tactile sensors -- a camera-based one (Digit)
+and a capacitance-based one (CySkin). In particular, we used Digit images to
+generate the corresponding CySkin data. We trained the models on a set of
+tactile features that can be found in common larger objects and performed the
+testing on a previously unseen set of data. Experimental results show the
+possibility of translating Digit images into the CySkin output by preserving
+the contact shape and with an error of 15.18% in the magnitude of the sensor
+responses.
+
+
+
+ comment: This paper was initially submitted at the International Conference on + Intelligent Robots and Systems (IROS) 2023 +
+
+
+
+
+ + ☆ Limiting Kinetic Energy through Control Barrier Functions: Analysis and + Experimental Validation + + +
+ In the context of safety-critical control, we propose and analyse the use of +Control Barrier Functions (CBFs) to limit the kinetic energy of +torque-controlled robots. The proposed scheme is able to modify a nominal +control action in a minimally invasive manner to achieve the desired kinetic +energy limit. We show how this safety condition is achieved by appropriately +injecting damping in the underlying robot dynamics independently of the nominal +controller structure. We present an extensive experimental validation of the +approach on a 7-Degree of Freedom (DoF) Franka Emika Panda robot. The results +demonstrate that this approach provides an effective, minimally invasive safety +layer that is straightforward to implement and is robust in real experiments. + +
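+ One common way to encode such an energy limit as a CBF (a sketch under
+standard manipulator-dynamics assumptions, not necessarily the paper's exact
+formulation):
+
+    % kinetic energy limit as a barrier on the robot state
+    h(\mathbf{q},\dot{\mathbf{q}}) \;=\; E_{\max}
+        - \tfrac{1}{2}\dot{\mathbf{q}}^{\top} M(\mathbf{q})\,\dot{\mathbf{q}} \;\ge\; 0,
+    \qquad
+    \dot{h} + \alpha\, h \;\ge\; 0,
+
+ with the nominal torque modified minimally, e.g. by solving
+\min_{\tau}\|\tau-\tau_{\mathrm{nom}}\|^2 subject to the constraint above,
+which is affine in \tau and, for this particular h, acts by injecting extra
+damping as the kinetic energy approaches the limit.
+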
+
+
+
+
+ + ☆ Diffusion-based Virtual Fixtures ICRA + + +
+ Virtual fixtures assist human operators in teleoperation settings by
+constraining their actions. This extended abstract introduces a novel virtual
+fixture formulation on surfaces for tactile robotics tasks. Unlike existing
+methods, our approach constrains the behavior based on the position on the
+surface and generalizes it over the surface by considering the distance
+(metric) on the surface. Our method works directly on possibly noisy and
+partial point clouds collected via a camera. Given a set of regions on the
+surface together with their desired behaviors, our method diffuses the
+behaviors across the entire surface by taking into account the surface
+geometry. We demonstrate our method's ability in two simulated experiments: (i)
+to regulate contact force magnitude or tangential speed based on surface
+position and (ii) to guide the robot to targets while avoiding restricted
+regions defined on the surface. All source codes, experimental data, and videos
+are available as open access at
+https://sites.google.com/view/diffusion-virtual-fixtures
+
+
+
+ comment: Presented at ICRA@40 +
+
+
+
+
+ + ☆ Learning Multiple Initial Solutions to Optimization Problems + + +
+ Sequentially solving similar optimization problems under strict runtime
+constraints is essential for many applications, such as robot control,
+autonomous driving, and portfolio management. The performance of local
+optimization methods in these settings is sensitive to the initial solution:
+poor initialization can lead to slow convergence or suboptimal solutions. To
+address this challenge, we propose learning to predict multiple diverse
+initial solutions given parameters that define the problem instance. We
+introduce two strategies for utilizing multiple initial solutions: (i) a
+single-optimizer approach, where the most promising initial solution is chosen
+using a selection function, and (ii) a multiple-optimizers approach, where
+several optimizers, potentially run in parallel, are each initialized with a
+different solution, with the best solution chosen afterward. We validate our
+method on three optimal control benchmark tasks: cart-pole, reacher, and
+autonomous driving, using different optimizers: DDP, MPPI, and iLQR. We find
+significant and consistent improvement with our method across all evaluation
+settings and demonstrate that it efficiently scales with the number of initial
+solutions required. The code is available at
+https://github.com/EladSharony/miso.
+
+
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Toward Realistic Cinema: The State of the Art in Mechatronics for Modern + Animatronic + + +
+ The pursuit of realism in cinema has driven significant advancements in
+animatronics, where the integration of mechatronics, a multidisciplinary field
+that combines mechanical engineering, electronics, and computer science, plays
+a pivotal role in enhancing the functionality and realism of animatronics. This
+interdisciplinary approach facilitates smoother character movements and
+enhances the sophistication of behaviors in animatronic creatures, thereby
+increasing their realism. This article examines the most recent developments in
+mechatronic technology and their significant impact on the art and engineering
+of animatronics in filmmaking. It explores the sophisticated integration of
+system components and analyzes how these enhancements foster complexity and
+integration, crucial for achieving unprecedented levels of realism in modern
+cinema. Further, the article delves into in-depth case studies of well-known
+movie characters, demonstrating the practical applicability of these
+state-of-the-art mechatronic solutions in creating compelling, lifelike
+cinematic experiences. This paper aims to bridge the gap between the technical
+aspects of mechatronics and the creative demands of the film industry,
+ultimately contributing to the ongoing evolution of cinematic realism.
+
+
+
+
+
+
+ + ☆ Heterogeneous Multi-robot Task Allocation for Long-Endurance Missions in + Dynamic Scenarios + + +
+ We present a framework for Multi-Robot Task Allocation (MRTA) in +heterogeneous teams performing long-endurance missions in dynamic scenarios. +Given the limited battery of robots, especially in the case of aerial vehicles, +we allow for robot recharges and the possibility of fragmenting and/or relaying +certain tasks. We also address tasks that must be performed by a coalition of +robots in a coordinated manner. Given these features, we introduce a new class +of heterogeneous MRTA problems which we analyze theoretically and optimally +formulate as a Mixed-Integer Linear Program. We then contribute a heuristic +algorithm to compute approximate solutions and integrate it into a mission +planning and execution architecture capable of reacting to unexpected events by +repairing or recomputing plans online. Our experimental results show the +relevance of our newly formulated problem in a realistic use case for +inspection with aerial robots. We assess the performance of our heuristic +solver in comparison with other variants and with exact optimal solutions in +small-scale scenarios. In addition, we evaluate the ability of our replanning +framework to repair plans online. + +
+
+ comment: 20 pages, 10 figures +
+
+
+
+
+ + ☆ An Immediate Update Strategy of Multi-State Constraint Kalman Filter + + +
+ The lightweight Multi-state Constraint Kalman Filter (MSCKF) is well known
+for its high efficiency, and a delayed update has usually been adopted since
+its proposal. This work investigates the immediate update strategy of MSCKF
+based on timely reconstructed 3D feature points and measurement constraints.
+The differences between the delayed update and the immediate update are
+theoretically analyzed in detail. It is found that the immediate update helps
+construct more observation constraints and employ more filtering updates than
+the delayed update, which improves the linearization point of the measurement
+model and therefore enhances the estimation accuracy. Numerical simulations and
+experiments show that the immediate update strategy significantly enhances
+MSCKF even with a small number of feature observations.
+
+
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Reshaping UAV-Enabled Communications with Omnidirectional Multi-Rotor + Aerial Vehicles + + +
+ A new class of Multi-Rotor Aerial Vehicles (MRAVs), known as omnidirectional
+MRAVs (o-MRAVs), has attracted significant interest in the robotics community.
+These MRAVs have the unique capability of independently controlling their 3D
+position and 3D orientation. In the context of aerial communication networks,
+this translates into the ability to control the position and orientation of the
+antenna mounted on the MRAV without any additional devices tasked with antenna
+orientation. These additional Degrees of Freedom (DoF) add a new dimension to
+aerial communication systems, creating various research opportunities in
+communications-aware trajectory planning and positioning. This paper presents
+this new class of MRAVs and discusses use cases in areas such as physical layer
+security and optical communications. Furthermore, the benefits of these MRAVs
+are illustrated with realistic simulation scenarios. Finally, new research
+problems and opportunities introduced by this advanced robotics technology are
+discussed.
+
+
+
+ comment: Accepted for IEEE Communications Magazine. ©2024 IEEE.
+ Personal use of this material is permitted. Permission from IEEE must be
+ obtained for all other uses, in any current or future media
+
+
+
+
+
+ + ☆ V-CAS: A Realtime Vehicle Anti Collision System Using Vision Transformer + on Multi-Camera Streams ICML + + +
+ This paper introduces a real-time Vehicle Collision Avoidance System (V-CAS) +designed to enhance vehicle safety through adaptive braking based on +environmental perception. V-CAS leverages the advanced vision-based transformer +model RT-DETR, DeepSORT tracking, speed estimation, brake light detection, and +an adaptive braking mechanism. It computes a composite collision risk score +based on vehicles' relative accelerations, distances, and detected braking +actions, using brake light signals and trajectory data from multiple camera +streams to improve scene perception. Implemented on the Jetson Orin Nano, V-CAS +enables real-time collision risk assessment and proactive mitigation through +adaptive braking. A comprehensive training process was conducted on various +datasets for comparative analysis, followed by fine-tuning the selected object +detection model using transfer learning. The system's effectiveness was +rigorously evaluated on the Car Crash Dataset (CCD) from YouTube and through +real-time experiments, achieving over 98% accuracy with an average proactive +alert time of 1.13 seconds. Results indicate significant improvements in object +detection and tracking, enhancing collision avoidance compared to traditional +single-camera methods. This research demonstrates the potential of low-cost, +multi-camera embedded vision transformer systems to advance automotive safety +through enhanced environmental perception and proactive collision avoidance +mechanisms. + +
+
+ comment: Accepted at ICMLA 2024 +
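+ An illustrative form of a composite collision risk score combining the cues
+listed above; the weights, normalizations, and thresholds are assumptions, not
+V-CAS's calibrated values:
+
+    def collision_risk(rel_distance_m, rel_accel_mps2, lead_braking,
+                       w_dist=0.5, w_accel=0.3, w_brake=0.2):
+        dist_term = max(0.0, 1.0 - rel_distance_m / 50.0)       # closer -> riskier
+        accel_term = min(1.0, max(0.0, -rel_accel_mps2 / 8.0))  # closing faster -> riskier
+        brake_term = 1.0 if lead_braking else 0.0               # brake lights detected
+        return w_dist * dist_term + w_accel * accel_term + w_brake * brake_term
+
+    # e.g. trigger adaptive braking when the score exceeds a tuned threshold.
+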
+
+
+
+
+ + ☆ Brainbots as smart autonomous active particles with programmable motion + + +
+ We present an innovative robotic device designed to provide controlled motion +for studying active matter. Motion is driven by an internal vibrator powered by +a small rechargeable battery. The system integrates acoustic and magnetic +sensors along with a programmable microcontroller. Unlike conventional +vibrobots, the motor induces horizontal vibrations, resulting in cycloidal +trajectories that have been characterized and optimized. Portions of these +orbits can be utilized to create specific motion patterns. As a proof of +concept, we demonstrate how this versatile system can be exploited to develop +active particles with varying dynamics, ranging from ballistic motion to +run-and-tumble diffusive behavior. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Real-Time Polygonal Semantic Mapping for Humanoid Robot Stair Climbing + + +
+ We present a novel algorithm for real-time planar semantic mapping tailored +for humanoid robots navigating complex terrains such as staircases. Our method +is adaptable to any odometry input and leverages GPU-accelerated processes for +planar extraction, enabling the rapid generation of globally consistent +semantic maps. We utilize an anisotropic diffusion filter on depth images to +effectively minimize noise from gradient jumps while preserving essential edge +details, enhancing normal vector images' accuracy and smoothness. Both the +anisotropic diffusion and the RANSAC-based plane extraction processes are +optimized for parallel processing on GPUs, significantly enhancing +computational efficiency. Our approach achieves real-time performance, +processing single frames at rates exceeding $30~Hz$, which facilitates detailed +plane extraction and map management swiftly and efficiently. Extensive testing +underscores the algorithm's capabilities in real-time scenarios and +demonstrates its practical application in humanoid robot gait planning, +significantly improving its ability to navigate dynamic environments. + +
+
+ comment: Accepted by The 2024 IEEE-RAS International Conference on Humanoid + Robots. The code: https://github.com/BTFrontier/polygon_mapping +
+
+
+
+
+ + ☆ RoboCrowd: Scaling Robot Data Collection through Crowdsourcing + + +
+ In recent years, imitation learning from large-scale human demonstrations has +emerged as a promising paradigm for training robot policies. However, the +burden of collecting large quantities of human demonstrations is significant in +terms of collection time and the need for access to expert operators. We +introduce a new data collection paradigm, RoboCrowd, which distributes the +workload by utilizing crowdsourcing principles and incentive design. RoboCrowd +helps enable scalable data collection and facilitates more efficient learning +of robot policies. We build RoboCrowd on top of ALOHA (Zhao et al. 2023) -- a +bimanual platform that supports data collection via puppeteering -- to explore +the design space for crowdsourcing in-person demonstrations in a public +environment. We propose three classes of incentive mechanisms to appeal to +users' varying sources of motivation for interacting with the system: material +rewards, intrinsic interest, and social comparison. We instantiate these +incentives through tasks that include physical rewards, engaging or challenging +manipulations, as well as gamification elements such as a leaderboard. We +conduct a large-scale, two-week field experiment in which the platform is +situated in a university cafe. We observe significant engagement with the +system -- over 200 individuals independently volunteered to provide a total of +over 800 interaction episodes. Our findings validate the proposed incentives as +mechanisms for shaping users' data quantity and quality. Further, we +demonstrate that the crowdsourced data can serve as useful pre-training data +for policies fine-tuned on expert demonstrations -- boosting performance up to +20% compared to when this data is not available. These results suggest the +potential for RoboCrowd to reduce the burden of robot data collection by +carefully implementing crowdsourcing and incentive design principles. + +
+
+ comment: 21 pages, 25 figures +
+
+
+
+
+ + ☆ Traffic and Safety Rule Compliance of Humans in Diverse Driving + Situations + + +
+ The increasing interest in autonomous driving systems has highlighted the
+need for an in-depth analysis of human driving behavior in diverse scenarios.
+Analyzing human data is crucial for developing autonomous systems that
+replicate safe driving practices and ensure seamless integration into
+human-dominated environments. This paper presents a comparative evaluation of
+human compliance with traffic and safety rules across multiple trajectory
+prediction datasets, including Argoverse 2, nuPlan, Lyft, and DeepUrban. By
+defining and leveraging existing safety and behavior-related metrics, such as
+time to collision, adherence to speed limits, and interactions with other
+traffic participants, we aim to provide a comprehensive understanding of each
+dataset's strengths and limitations. Our analysis focuses on the distribution
+of data samples, identifying noise, outliers, and undesirable behaviors
+exhibited by human drivers in both the training and validation sets. The
+results underscore the need for applying robust filtering techniques to certain
+datasets due to high levels of noise and the presence of such undesirable
+behaviors.
+
+
+
+ comment: 8 pages, CoRL 2024 Workshop SAFE-ROL +
+
+
+
+
+ + ☆ Improving Trust Estimation in Human-Robot Collaboration Using Beta + Reputation at Fine-grained Timescales + + +
+ When interacting with each other, humans adjust their behavior based on +perceived trust. However, to achieve similar adaptability, robots must +accurately estimate human trust at sufficiently granular timescales during the +human-robot collaboration task. A beta reputation is a popular way to formalize +a mathematical estimation of human trust. However, it relies on binary +performance, which updates trust estimations only after each task concludes. +Additionally, manually crafting a reward function is the usual method of +building a performance indicator, which is labor-intensive and time-consuming. +These limitations prevent efficiently capturing continuous changes in trust at +more granular timescales throughout the collaboration task. Therefore, this +paper presents a new framework for the estimation of human trust using a beta +reputation at fine-grained timescales. To achieve granularity in beta +reputation, we utilize continuous reward values to update trust estimations at +each timestep of a task. We construct a continuous reward function using +maximum entropy optimization to eliminate the need for the laborious +specification of a performance indicator. The proposed framework improves trust +estimations by increasing accuracy, eliminating the need for manually crafting +a reward function, and advancing toward developing more intelligent robots. The +source code is publicly available. +https://github.com/resuldagdanov/robot-learning-human-trust + +
+
+ comment: 8 pages, 7 figures, 1 table. This work has been submitted to the IEEE + for possible publication +
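+ The fine-grained beta-reputation update amounts to accumulating continuous
+evidence at every timestep; a minimal sketch, where the reward r in [0, 1]
+comes from the learned reward function and the decay factor is an assumption:
+
+    def update_trust(alpha, beta, r, decay=0.99):
+        alpha = decay * alpha + r          # evidence of good performance
+        beta = decay * beta + (1.0 - r)    # evidence of poor performance
+        trust = alpha / (alpha + beta)     # mean of Beta(alpha, beta)
+        return alpha, beta, trust
+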
+
+
+
+
+ + ManiBox: Enhancing Spatial Grasping Generalization via Scalable + Simulation Data Generation + + +
+ Learning a precise robotic grasping policy is crucial for embodied agents +operating in complex real-world manipulation tasks. Despite significant +advancements, most models still struggle with accurate spatial positioning of +objects to be grasped. We first show that this spatial generalization challenge +stems primarily from the extensive data requirements for adequate spatial +understanding. However, collecting such data with real robots is prohibitively +expensive, and relying on simulation data often leads to visual generalization +gaps upon deployment. To overcome these challenges, we then focus on +state-based policy generalization and present \textbf{ManiBox}, a novel +bounding-box-guided manipulation method built on a simulation-based +teacher-student framework. The teacher policy efficiently generates scalable +simulation data using bounding boxes, which are proven to uniquely determine +the objects' spatial positions. The student policy then utilizes these +low-dimensional spatial states to enable zero-shot transfer to real robots. +Through comprehensive evaluations in simulated and real-world environments, +ManiBox demonstrates a marked improvement in spatial grasping generalization +and adaptability to diverse objects and backgrounds. Further, our empirical +study into scaling laws for policy performance indicates that spatial volume +generalization scales positively with data volume. For a certain level of +spatial volume, the success rate of grasping empirically follows +Michaelis-Menten kinetics relative to data volume, showing a saturation effect +as data increases. Our videos and code are available in +https://thkkk.github.io/manibox. + +
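+ For reference, the Michaelis-Menten form referred to above relates grasping
+success rate to data volume n as
+
+    \mathrm{SuccessRate}(n) \;\approx\; \frac{S_{\max}\, n}{K + n},
+
+ where S_max is the saturation success rate and K the data volume at
+half-saturation; the fitted constants are reported in the paper, so the symbols
+here only name the functional form.
+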
+
+
+
+
+ + ☆ Toward Integrating Semantic-aware Path Planning and Reliable + Localization for UAV Operations + + +
+ Localization is one of the most crucial tasks for Unmanned Aerial Vehicle +systems (UAVs) directly impacting overall performance, which can be achieved +with various sensors and applied to numerous tasks related to search and rescue +operations, object tracking, construction, etc. However, due to the negative +effects of challenging environments, UAVs may lose signals for localization. In +this paper, we present an effective path-planning system leveraging semantic +segmentation information to navigate around texture-less and problematic areas +like lakes, oceans, and high-rise buildings using a monocular camera. We +introduce a real-time semantic segmentation architecture and a novel keyframe +decision pipeline to optimize image inputs based on pixel distribution, +reducing processing time. A hierarchical planner based on the Dynamic Window +Approach (DWA) algorithm, integrated with a cost map, is designed to facilitate +efficient path planning. The system is implemented in a photo-realistic +simulation environment using Unity, aligning with segmentation model +parameters. Comprehensive qualitative and quantitative evaluations validate the +effectiveness of our approach, showing significant improvements in the +reliability and efficiency of UAV localization in challenging environments. + +
+
+ comment: In The 24th International Conference on Control, Automation, and + Systems (ICCAS 2024), Jeju, Korea +
+
+
+
+
+ + ☆ Enhancing Social Robot Navigation with Integrated Motion Prediction and + Trajectory Planning in Dynamic Human Environments + + +
+ Navigating safely in dynamic human environments is crucial for mobile service
+robots, and social navigation is a key aspect of this process. In this paper,
+we propose an integrative approach that combines motion prediction and
+trajectory planning to enable safe and socially-aware robot navigation. The
+main idea of the proposed method is to leverage the advantages of Socially
+Acceptable trajectory prediction and Timed Elastic Band (TEB) by incorporating
+human interactive information including position, orientation, and motion into
+the objective function of the TEB algorithm. In addition, we designed social
+constraints to ensure the safety of robot navigation. The proposed system is
+evaluated through physical simulation using both quantitative and qualitative
+metrics, demonstrating its superior performance in avoiding humans and dynamic
+obstacles, thereby ensuring safe navigation. The implementation is open source
+at https://github.com/thanhnguyencanh/SGan-TEB.git
+
+
+
+ comment: In the 24th International Conference on Control, Automation, and + Systems (ICCAS 2024), Jeju, Korea +
+
+
+
+
+ + ☆ So You Think You Can Scale Up Autonomous Robot Data Collection? + + +
+ A long-standing goal in robot learning is to develop methods for robots to +acquire new skills autonomously. While reinforcement learning (RL) comes with +the promise of enabling autonomous data collection, it remains challenging to +scale in the real-world partly due to the significant effort required for +environment design and instrumentation, including the need for designing reset +functions or accurate success detectors. On the other hand, imitation learning +(IL) methods require little to no environment design effort, but instead +require significant human supervision in the form of collected demonstrations. +To address these shortcomings, recent works in autonomous IL start with an +initial seed dataset of human demonstrations that an autonomous policy can +bootstrap from. While autonomous IL approaches come with the promise of +addressing the challenges of autonomous RL as well as pure IL strategies, in +this work, we posit that such techniques do not deliver on this promise and are +still unable to scale up autonomous data collection in the real world. Through +a series of real-world experiments, we demonstrate that these approaches, when +scaled up to realistic settings, face much of the same scaling challenges as +prior attempts in RL in terms of environment design. Further, we perform a +rigorous study of autonomous IL methods across different data scales and 7 +simulation and real-world tasks, and demonstrate that while autonomous data +collection can modestly improve performance, simply collecting more human data +often provides significantly more improvement. Our work suggests a negative +result: that scaling up autonomous data collection for learning robot policies +for real-world tasks is more challenging and impractical than what is suggested +in prior work. We hope these insights about the core challenges of scaling up +data collection help inform future efforts in autonomous learning. + +
+
+ comment: 21 pages, 25 figures. Conference on Robot Learning (CoRL) 2024 +
+
+
+
+
+ + ☆ Semantic Masking and Visual Feature Matching for Robust Localization + + +
+ We are interested in long-term deployments of autonomous robots to aid +astronauts with maintenance and monitoring operations in settings such as the +International Space Station. Unfortunately, such environments tend to be highly +dynamic and unstructured, and their frequent reconfiguration poses a challenge +for robust long-term localization of robots. Many state-of-the-art visual +feature-based localization algorithms are not robust towards spatial scene +changes, and SLAM algorithms, while promising, cannot run within the +low-compute budget available to space robots. To address this gap, we present a +computationally efficient semantic masking approach for visual feature matching +that improves the accuracy and robustness of visual localization systems during +long-term deployment in changing environments. Our method introduces a +lightweight check that enforces matches to be within long-term static objects +and have consistent semantic classes. We evaluate this approach using both +map-based relocalization and relative pose estimation and show that it improves +Absolute Trajectory Error (ATE) and correct match ratios on the publicly +available Astrobee dataset. While this approach was originally developed for +microgravity robotic freeflyers, it can be applied to any visual feature +matching pipeline to improve robustness. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Constrained Human-AI Cooperation: An Inclusive Embodied Social + Intelligence Challenge NeurIPS 2024 + + +
+ We introduce Constrained Human-AI Cooperation (CHAIC), an inclusive embodied +social intelligence challenge designed to test social perception and +cooperation in embodied agents. In CHAIC, the goal is for an embodied agent +equipped with egocentric observations to assist a human who may be operating +under physical constraints -- e.g., unable to reach high places or confined to +a wheelchair -- in performing common household or outdoor tasks as efficiently +as possible. To achieve this, a successful helper must: (1) infer the human's +intents and constraints by following the human and observing their behaviors +(social perception), and (2) make a cooperative plan tailored to the human +partner to solve the task as quickly as possible, working together as a team +(cooperative planning). To benchmark this challenge, we create four new agents +with real physical constraints and eight long-horizon tasks featuring both +indoor and outdoor scenes with various constraints, emergency events, and +potential risks. We benchmark planning- and learning-based baselines on the +challenge and introduce a new method that leverages large language models and +behavior modeling. Empirical evaluations demonstrate the effectiveness of our +benchmark in enabling systematic assessment of key aspects of machine social +intelligence. Our benchmark and code are publicly available at this URL: +https://github.com/UMass-Foundation-Model/CHAIC. + +
+
+ comment: NeurIPS 2024 Dataset and Benchmark Track. Project at this URL: + https://github.com/UMass-Foundation-Model/CHAIC +
+
+
+
+
+ + ☆ Eurekaverse: Environment Curriculum Generation via Large Language Models + + +
+ Recent work has demonstrated that a promising strategy for teaching robots a +wide range of complex skills is by training them on a curriculum of +progressively more challenging environments. However, developing an effective +curriculum of environment distributions currently requires significant +expertise, which must be repeated for every new domain. Our key insight is that +environments are often naturally represented as code. Thus, we probe whether +effective environment curriculum design can be achieved and automated via code +generation by large language models (LLM). In this paper, we introduce +Eurekaverse, an unsupervised environment design algorithm that uses LLMs to +sample progressively more challenging, diverse, and learnable environments for +skill training. We validate Eurekaverse's effectiveness in the domain of +quadrupedal parkour learning, in which a quadruped robot must traverse through +a variety of obstacle courses. The automatic curriculum designed by Eurekaverse +enables gradual learning of complex parkour skills in simulation and can +successfully transfer to the real-world, outperforming manual training courses +designed by humans. + +
+
+ comment: Conference on Robot Learning (CoRL), 2024. Project website and code: + https://eureka-research.github.io/eurekaverse +
+
+
+
+
+ + ☆ A Probabilistic Formulation of LiDAR Mapping with Neural Radiance Fields + + +
+ In this paper we reexamine the process through which a Neural Radiance Field +(NeRF) can be trained to produce novel LiDAR views of a scene. Unlike image +applications where camera pixels integrate light over time, LiDAR pulses arrive +at specific times. As such, multiple LiDAR returns are possible for any given +detector and the classification of these returns is inherently probabilistic. +Applying a traditional NeRF training routine can result in the network learning +phantom surfaces in free space between conflicting range measurements, similar +to how floater aberrations may be produced by an image model. We show that by +formulating loss as an integral of probability (rather than as an integral of +optical density) the network can learn multiple peaks for a given ray, allowing +the sampling of first, nth, or strongest returns from a single output channel. +Code is available at https://github.com/mcdermatt/PLINK + +
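+ A toy illustration of why a per-ray probability formulation is useful: once a ray carries a probability mass over range bins rather than a single density-weighted depth, the first, strongest, or nth return can all be read off the same output. The peak-picking rule and parameters below are assumptions for illustration, not the paper's loss or sampling procedure.
```python
import numpy as np

def returns_from_ray(p, t, peak_thresh=0.05):
    """p: (S,) per-bin return probabilities along one ray (summing to <= 1),
    t: (S,) bin ranges in meters. Returns (first, strongest, all_peaks)."""
    p = np.asarray(p, dtype=float)
    # local maxima above a threshold count as candidate returns
    peaks = [i for i in range(1, len(p) - 1)
             if p[i] > peak_thresh and p[i] >= p[i - 1] and p[i] >= p[i + 1]]
    if not peaks:
        return None, None, []
    first = t[peaks[0]]                                  # earliest return
    strongest = t[max(peaks, key=lambda i: p[i])]
    return first, strongest, [t[i] for i in peaks]

# Two conflicting range measurements -> two peaks on the same ray
t = np.linspace(0.0, 30.0, 121)
p = 0.4 * np.exp(-0.5 * ((t - 8.0) / 0.3) ** 2) + 0.25 * np.exp(-0.5 * ((t - 21.0) / 0.3) ** 2)
p /= p.sum() / 0.9                                       # pretend 90% of pulses return
print(returns_from_ray(p, t))
```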
+
+
+
+
+ + ☆ Multi-Transmotion: Pre-trained Model for Human Motion Prediction + + +
+ The ability of intelligent systems to predict human behaviors is crucial, +particularly in fields such as autonomous vehicle navigation and social +robotics. However, the complexity of human motion has prevented the +development of a standardized dataset for human motion prediction, thereby +hindering the establishment of pre-trained models. In this paper, we address +these limitations by integrating multiple datasets, encompassing both +trajectory and 3D pose keypoints, to propose a pre-trained model for human +motion prediction. We merge seven distinct datasets across varying modalities +and standardize their formats. To facilitate multimodal pre-training, we +introduce Multi-Transmotion, an innovative transformer-based model designed for +cross-modality pre-training. Additionally, we present a novel masking strategy +to capture rich representations. Our methodology demonstrates competitive +performance across various datasets on several downstream tasks, including +trajectory prediction in the NBA and JTA datasets, as well as pose prediction +in the AMASS and 3DPW datasets. The code is publicly available: +https://github.com/vita-epfl/multi-transmotion + +
+
+ comment: CoRL 2024 +
+
+
+
+
+ + ☆ Intelligent Magnetic Inspection Robot for Enhanced Structural Health + Monitoring of Ferromagnetic Infrastructure + + +
+ This paper presents an innovative solution to the issue of infrastructure +deterioration in the U.S., where a significant portion of facilities are in +poor condition, and over 130,000 steel bridges have exceeded their lifespan. +Aging steel structures face corrosion and hidden defects, posing major safety +risks. The Silver Bridge collapse, resulting from an undetected flaw, +highlights the limitations of manual inspection methods, which often miss +subtle or concealed defects. Addressing the need for improved inspection +technology, this work introduces an AI-powered magnetic inspection robot. +Equipped with magnetic wheels, the robot adheres to and navigates complex +ferromagnetic surfaces, including challenging areas like vertical inclines and +internal corners, enabling thorough, large-scale inspections. Utilizing +MobileNetV2, a deep learning model trained on steel surface defects, the system +achieved an 85% precision rate across six defect types. This AI-driven +inspection process enhances accuracy and reliability, outperforming traditional +methods in defect detection and efficiency. The findings suggest that combining +robotic mobility with AI-based image analysis offers a scalable, automated +approach to infrastructure inspection, reducing human labor while improving +detection precision and the safety of critical assets. + +
+
+ comment: 10 pages, 17 figures +
+
+
+
+
+ + ☆ Enhancing Indoor Mobility with Connected Sensor Nodes: A Real-Time, + Delay-Aware Cooperative Perception Approach + + +
+ This paper presents a novel real-time, delay-aware cooperative perception +system designed for intelligent mobility platforms operating in dynamic indoor +environments. The system contains a network of multi-modal sensor nodes and a +central node that collectively provide perception services to mobility +platforms. The proposed Hierarchical Clustering Considering the Scanning +Pattern and Ground Contacting Feature based Lidar Camera Fusion improves +intra-node perception in crowded environments. The system also features +delay-aware global perception to synchronize and aggregate data across nodes. +To validate our approach, we introduce the Indoor Pedestrian Tracking dataset, +compiled from data captured by two indoor sensor nodes. Our experiments +demonstrate significant improvements over baselines in detection +accuracy and robustness against delays. The dataset is available in the +repository: https://github.com/NingMingHao/MVSLab-IndoorCooperativePerception + +
+
+
+
+
+ + ☆ Tracking Tumors under Deformation from Partial Point Clouds using + Occupancy Networks IROS 2024 + + +
+ To track tumors during surgery, information from preoperative CT scans is +used to determine their position. However, as the surgeon operates, the tumor +may be deformed, which presents a major hurdle for accurately resecting the +tumor, and can lead to surgical inaccuracy, increased operation time, and +excessive margins. This issue is particularly pronounced in robot-assisted +partial nephrectomy (RAPN), where the kidney undergoes significant deformations +during operation. Toward addressing this, we introduce an occupancy +network-based method for the localization of tumors within kidney phantoms +undergoing deformations at interactive speeds. We validate our method by +introducing a 3D hydrogel kidney phantom embedded with exophytic and endophytic +renal tumors. It closely mimics real tissue mechanics to simulate kidney +deformation during in vivo surgery, providing excellent contrast and clear +delineation of tumor margins to enable automatic threshold-based segmentation. +Our findings indicate that the proposed method can localize tumors in +moderately deforming kidneys with a margin of 6 mm to 10 mm, while providing +essential volumetric 3D information at over 60 Hz. This capability directly +enables downstream tasks such as robotic resection. + +
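+ A simplified sketch of how an occupancy-style representation yields a tumor location and volume at interactive rates: query occupancy on a voxel grid, threshold, and take the centroid and occupied volume. The analytic sphere below stands in for a trained occupancy network; the bounds, resolution, and units are illustrative assumptions.
```python
import numpy as np

def localize_tumor(occupancy_fn, bounds, res=0.002, thresh=0.5):
    """Query an occupancy function on a regular grid; return the centroid (m)
    and volume (cm^3) of the occupied region. occupancy_fn maps (N,3) -> (N,) in [0,1]."""
    lo, hi = np.asarray(bounds[0]), np.asarray(bounds[1])
    axes = [np.arange(l, h, res) for l, h in zip(lo, hi)]
    grid = np.stack(np.meshgrid(*axes, indexing="ij"), axis=-1).reshape(-1, 3)
    occ = occupancy_fn(grid) > thresh
    if not occ.any():
        return None, 0.0
    centroid = grid[occ].mean(axis=0)
    volume_cm3 = occ.sum() * (res ** 3) * 1e6
    return centroid, volume_cm3

# Stand-in for a trained occupancy network: a 10 mm radius sphere centered at (0, 0, 0.05)
sphere = lambda x: (np.linalg.norm(x - np.array([0.0, 0.0, 0.05]), axis=1) < 0.01).astype(float)
print(localize_tumor(sphere, ([-0.03, -0.03, 0.02], [0.03, 0.03, 0.08])))
```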
+
+ comment: Accepted at IROS 2024 +
+
+
+
+
+ + ☆ Advanced XR-Based 6-DOF Catheter Tracking System for Immersive Cardiac + Intervention Training + + +
+ Extended Reality (XR) technologies are gaining traction as effective tools +for medical training and procedural guidance, particularly in complex cardiac +interventions. This paper presents a novel system for real-time 3D tracking and +visualization of intracardiac echocardiography (ICE) catheters, with precise +measurement of the roll angle. A custom 3D-printed setup, featuring orthogonal +cameras, captures biplane video of the catheter, while a specialized computer +vision algorithm reconstructs its 3D trajectory, localizing the tip with +sub-millimeter accuracy and tracking the roll angle in real-time. The system's +data is integrated into an interactive Unity-based environment, rendered +through the Meta Quest 3 XR headset, combining a dynamically tracked catheter +with a patient-specific 3D heart model. This immersive environment allows the +testing of the importance of 3D depth perception, in comparison to 2D +projections, as a form of visualization in XR. Our experimental study, +conducted using the ICE catheter with six participants, suggests that 3D +visualization is not necessarily beneficial compared to the 2D views offered by the XR +system, although all cardiologists saw its utility for pre-operative training, +planning, and intra-operative guidance. The proposed system qualitatively shows +great promise in transforming catheter-based interventions, particularly ICE +procedures, by improving visualization, interactivity, and skill development. + +
+
+
+
+
+ + ☆ Vocal Sandbox: Continual Learning and Adaptation for Situated + Human-Robot Collaboration + + +
+ We introduce Vocal Sandbox, a framework for enabling seamless human-robot +collaboration in situated environments. Systems in our framework are +characterized by their ability to adapt and continually learn at multiple +levels of abstraction from diverse teaching modalities such as spoken dialogue, +object keypoints, and kinesthetic demonstrations. To enable such adaptation, we +design lightweight and interpretable learning algorithms that allow users to +build an understanding and co-adapt to a robot's capabilities in real-time, as +they teach new behaviors. For example, after demonstrating a new low-level +skill for "tracking around" an object, users are provided with trajectory +visualizations of the robot's intended motion when asked to track a new object. +Similarly, users teach high-level planning behaviors through spoken dialogue, +using pretrained language models to synthesize behaviors such as "packing an +object away" as compositions of low-level skills $-$ concepts that can be +reused and built upon. We evaluate Vocal Sandbox in two settings: collaborative +gift bag assembly and LEGO stop-motion animation. In the first setting, we run +systematic ablations and user studies with 8 non-expert participants, +highlighting the impact of multi-level teaching. Across 23 hours of total robot +interaction time, users teach 17 new high-level behaviors with an average of 16 +novel low-level skills, requiring 22.1% less active supervision compared to +baselines and yielding more complex autonomous performance (+19.7%) with fewer +failures (-67.1%). Qualitatively, users strongly prefer Vocal Sandbox systems +due to their ease of use (+20.6%) and overall performance (+13.9%). Finally, we +pair an experienced system-user with a robot to film a stop-motion animation; +over two hours of continuous collaboration, the user teaches progressively more +complex motion skills to shoot a 52 second (232 frame) movie. + +
+
+ comment: Published at CoRL 2024. 24 pages, 8 figures. Project Page: + https://vocal-sandbox.github.io +
+
+
+
+
+ + ☆ Map++: Towards User-Participatory Visual SLAM Systems with Efficient Map + Expansion and Sharing + + +
+ Constructing precise 3D maps is crucial for the development of future +map-based systems such as self-driving and navigation. However, generating +these maps in complex environments, such as multi-level parking garages or +shopping malls, remains a formidable challenge. In this paper, we introduce a +participatory sensing approach that delegates map-building tasks to map users, +thereby enabling cost-effective and continuous data collection. The proposed +method harnesses the collective efforts of users, facilitating the expansion +and ongoing update of the maps as the environment evolves. + We realized this approach by developing Map++, an efficient system that +functions as a plug-and-play extension, supporting participatory map-building +based on existing SLAM algorithms. Map++ addresses a plethora of scalability +issues in this participatory map-building system by proposing a set of +lightweight, application-layer protocols. We evaluated Map++ in four +representative settings: an indoor garage, an outdoor plaza, a public SLAM +benchmark, and a simulated environment. The results demonstrate that Map++ can +reduce traffic volume by approximately 46% with negligible degradation in +mapping accuracy, i.e., less than 0.03m compared to the baseline system. It can +support approximately $2 \times$ as many concurrent users as the baseline under +the same network bandwidth. Additionally, for users who travel on +already-mapped trajectories, they can directly utilize the existing maps for +localization and save 47% of the CPU usage. + +
+
+ comment: 15 pages, 15 figures. Accepted by MobiCom 2024 +
+
+
+
+
+ + ☆ Modeling Uncertainty in 3D Gaussian Splatting through Continuous + Semantic Splatting + + +
+ In this paper, we present a novel algorithm for probabilistically updating +and rasterizing semantic maps within 3D Gaussian Splatting (3D-GS). Although +previous methods have introduced algorithms which learn to rasterize features +in 3D-GS for enhanced scene understanding, 3D-GS can fail without warning which +presents a challenge for safety-critical robotic applications. To address this +gap, we propose a method which advances the literature of continuous semantic +mapping from voxels to ellipsoids, combining the precise structure of 3D-GS +with the ability to quantify uncertainty of probabilistic robotic maps. Given a +set of images, our algorithm performs a probabilistic semantic update directly +on the 3D ellipsoids to obtain an expectation and variance through the use of +conjugate priors. We also propose a probabilistic rasterization which returns +per-pixel segmentation predictions with quantifiable uncertainty. We compare +our method with similar probabilistic voxel-based methods to verify our +extension to 3D ellipsoids, and perform ablation studies on uncertainty +quantification and temporal smoothing. + +
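+ The conjugate-prior update mentioned above can be illustrated with a per-primitive Dirichlet distribution over semantic classes: observed labels add pseudo-counts, and the posterior yields both a class expectation and a variance. This is a generic sketch of a Dirichlet-categorical update, not the paper's rasterization pipeline; the class and primitive counts are illustrative.
```python
import numpy as np

class DirichletSemantics:
    """Per-primitive Dirichlet over K semantic classes (conjugate to categorical counts)."""
    def __init__(self, n_primitives, n_classes, prior=1.0):
        self.alpha = np.full((n_primitives, n_classes), prior, dtype=float)

    def update(self, prim_ids, class_ids):
        # Conjugate update: add one pseudo-count per (primitive, observed class) pair
        np.add.at(self.alpha, (prim_ids, class_ids), 1.0)

    def expectation(self):
        return self.alpha / self.alpha.sum(axis=1, keepdims=True)

    def variance(self):
        a0 = self.alpha.sum(axis=1, keepdims=True)
        m = self.alpha / a0
        return m * (1.0 - m) / (a0 + 1.0)

sem = DirichletSemantics(n_primitives=2, n_classes=3)
sem.update(prim_ids=np.array([0, 0, 0, 1]), class_ids=np.array([2, 2, 1, 0]))
print(sem.expectation())   # primitive 0 leans toward class 2; primitive 1 toward class 0
print(sem.variance())      # shrinks as more observations accumulate
```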
+
+
+
+
+ + ☆ SPACE: 3D Spatial Co-operation and Exploration Framework for Robust + Mapping and Coverage with Multi-Robot Systems + + +
+ In indoor environments, multi-robot visual (RGB-D) mapping and exploration +hold immense potential for application in domains such as domestic service and +logistics, where deploying multiple robots in the same environment can +significantly enhance efficiency. However, there are two primary challenges: +(1) the "ghosting trail" effect, which occurs due to overlapping views of +robots impacting the accuracy and quality of point cloud reconstruction, and +(2) the oversight of visual reconstructions in selecting the most effective +frontiers for exploration. Given these challenges are interrelated, we address +them together by proposing a new semi-distributed framework (SPACE) for spatial +cooperation in indoor environments that enables enhanced coverage and 3D +mapping. SPACE leverages geometric techniques, including "mutual awareness" and +a "dynamic robot filter," to overcome spatial mapping constraints. +Additionally, we introduce a novel spatial frontier detection system and map +merger, integrated with an adaptive frontier assigner for optimal coverage +balancing the exploration and reconstruction objectives. In extensive +ROS-Gazebo simulations, SPACE demonstrated superior performance over +state-of-the-art approaches in both exploration and mapping metrics. + +
+
+
+
+
+ + ☆ NeRF-Aug: Data Augmentation for Robotics with Neural Radiance Fields + + +
+ Training a policy that can generalize to unknown objects is a long-standing +challenge within the field of robotics. The performance of a policy often drops +significantly in situations where an object in the scene was not seen during +training. To solve this problem, we present NeRF-Aug, a novel method that is +capable of teaching a policy to interact with objects that are not present in +the dataset. This approach differs from existing approaches by leveraging the +speed and photorealism of a neural radiance field for augmentation. NeRF-Aug +both creates more photorealistic data and runs 3.83 times faster than existing +methods. We demonstrate the effectiveness of our method on 4 tasks with 11 +novel objects that have no expert demonstration data. We achieve an average +69.1% success rate increase over existing methods. See video results at +https://nerf-aug.github.io. + +
+
+
+
+
+ + ☆ Digitizing Touch with an Artificial Multimodal Fingertip + + +
+ Touch is a crucial sensing modality that provides rich information about +object properties and interactions with the physical environment. Humans and +robots both benefit from using touch to perceive and interact with the +surrounding environment (Johansson and Flanagan, 2009; Li et al., 2020; +Calandra et al., 2017). However, no existing systems provide rich, multi-modal +digital touch-sensing capabilities through a hemispherical compliant +embodiment. Here, we describe several conceptual and technological innovations +to improve the digitization of touch. These advances are embodied in an +artificial finger-shaped sensor with advanced sensing capabilities. +Significantly, this fingertip contains high-resolution sensors (~8.3 million +taxels) that respond to omnidirectional touch, capture multi-modal signals, and +use on-device artificial intelligence to process the data in real time. +Evaluations show that the artificial fingertip can resolve spatial features as +small as 7 um, sense normal and shear forces with a resolution of 1.01 mN and +1.27 mN, respectively, perceive vibrations up to 10 kHz, sense heat, and even +sense odor. Furthermore, it embeds an on-device AI neural network accelerator +that acts as a peripheral nervous system on a robot and mimics the reflex arc +found in humans. These results demonstrate the possibility of digitizing touch +with superhuman performance. The implications are profound, and we anticipate +potential applications in robotics (industrial, medical, agricultural, and +consumer-level), virtual reality and telepresence, prosthetics, and e-commerce. +Toward digitizing touch at scale, we open-source a modular platform to +facilitate future research on the nature of touch. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ Modeling and Simulation of a Multi Robot System Architecture + + +
+ A Multi Robot System (MRS) is the infrastructure of an intelligent +cyberphysical system, where the robots understand the need of the human, and +hence cooperate to fulfill this need. Modeling an MRS is a crucial +aspect of designing the proper system architecture, because this model can be +used to simulate and measure the performance of the proposed architecture. +However, modeling an MRS solution architecture is a very difficult problem, as +it contains many dependent behaviors that dynamically change due to the current +status of the overall system. In this paper, we introduce a general purpose MRS +case study, where the humans initiate requests that are achieved by the +available robots. These requests require different plans that use the current +capabilities of the available robots. After proposing an architecture that +defines the solution components, three steps are followed. The first is to model +these components via the Business Process Model and Notation (BPMN) language. BPMN +provides a graphical notation to precisely represent the behaviors of every +component, which is essential for modeling the solution. The second is to +simulate these components' behaviors and interactions in the form of software agents. +The Java Agent DEvelopment (JADE) middleware has been used to develop and simulate +the proposed model. JADE is based on a reactive agent approach and can therefore +dynamically represent the interaction among the solution components. +The third is to analyze the performance of the solution by defining a number of +quantitative measurements, which can be obtained while simulating the system +model in the JADE middleware, so that the solution can be analyzed and compared +to alternative architectures. + +
+
+
+
+
+ + ♻ ☆ From Imitation to Refinement -- Residual RL for Precise Assembly + + +
+ Recent advances in behavior cloning (BC), like action-chunking and diffusion, +have led to impressive progress. Still, imitation alone remains insufficient +for tasks requiring reliable and precise movements, such as aligning and +inserting objects. Our key insight is that chunked BC policies function as +trajectory planners, enabling long-horizon tasks. Conversely, as they execute +action chunks open-loop, they lack the fine-grained reactivity necessary for +reliable execution. Further, we find that the performance of BC policies +saturates despite increasing data. Reinforcement learning (RL) is a natural way +to overcome this, but it is not straightforward to apply directly to +action-chunked models like diffusion policies. We present a simple yet +effective method, ResiP (Residual for Precise Manipulation), that sidesteps +these challenges by augmenting a frozen, chunked BC model with a fully +closed-loop residual policy trained with RL. The residual policy is trained via +on-policy RL, addressing distribution shifts and introducing reactivity without +altering the BC trajectory planner. Evaluation on high-precision manipulation +tasks demonstrates strong performance of ResiP over BC methods and direct RL +fine-tuning. Videos, code, and data are available at +\url{https://residual-assembly.github.io}. + +
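+ The division of labor described above, an open-loop chunked planner plus a closed-loop corrective policy, can be sketched as follows. The stub policies and the replanning scheme are illustrative stand-ins, not the ResiP implementation.
```python
import numpy as np

def chunked_bc_policy(obs):
    """Stand-in for a frozen behavior-cloning model: returns an H-step action chunk."""
    return np.tile(obs[:2], (8, 1)) * 0.1          # (H=8, action_dim=2), arbitrary values

def residual_policy(obs, planned_action):
    """Stand-in for the RL-trained residual: a small closed-loop correction."""
    return 0.05 * np.tanh(obs[:2] - planned_action)

def rollout_step(env_obs_stream):
    """Plan a chunk once, then execute it step by step with residual corrections."""
    obs = next(env_obs_stream)
    chunk = chunked_bc_policy(obs)                 # open-loop plan from the frozen BC model
    for planned in chunk:
        obs = next(env_obs_stream)                 # fresh observation every control step
        yield planned + residual_policy(obs, planned)

stream = iter(np.random.default_rng(0).normal(size=(16, 4)))
print(list(rollout_step(stream))[:2])
```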
+
+
+
+
+ + ♻ ☆ EMMA: End-to-End Multimodal Model for Autonomous Driving + + +
+ We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving. +Built on a multi-modal large language model foundation, EMMA directly maps raw +camera sensor data into various driving-specific outputs, including planner +trajectories, perception objects, and road graph elements. EMMA maximizes the +utility of world knowledge from the pre-trained large language models, by +representing all non-sensor inputs (e.g. navigation instructions and ego +vehicle status) and outputs (e.g. trajectories and 3D locations) as natural +language text. This approach allows EMMA to jointly process various driving +tasks in a unified language space, and generate the outputs for each task using +task-specific prompts. Empirically, we demonstrate EMMA's effectiveness by +achieving state-of-the-art performance in motion planning on nuScenes as well +as competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also +yields competitive results for camera-primary 3D object detection on the Waymo +Open Dataset (WOD). We show that co-training EMMA with planner trajectories, +object detection, and road graph tasks yields improvements across all three +domains, highlighting EMMA's potential as a generalist model for autonomous +driving applications. However, EMMA also exhibits certain limitations: it can +process only a small amount of image frames, does not incorporate accurate 3D +sensing modalities like LiDAR or radar and is computationally expensive. We +hope that our results will inspire further research to mitigate these issues +and to further evolve the state of the art in autonomous driving model +architectures. + +
+
+ comment: Blog post: https://waymo.com/blog/2024/10/introducing-emma/ +
+
+
+
+
+ + ♻ ☆ Taxonomy-Aware Continual Semantic Segmentation in Hyperbolic Spaces for + Open-World Perception + + +
+ Semantic segmentation models are typically trained on a fixed set of classes, +limiting their applicability in open-world scenarios. Class-incremental +semantic segmentation aims to update models with emerging new classes while +preventing catastrophic forgetting of previously learned ones. However, +existing methods impose strict rigidity on old classes, reducing their +effectiveness in learning new incremental classes. In this work, we propose +Taxonomy-Oriented Poincar\'e-regularized Incremental-Class Segmentation +(TOPICS) that learns feature embeddings in hyperbolic space following explicit +taxonomy-tree structures. This supervision provides plasticity for old classes, +updating ancestors based on new classes while integrating new classes at +fitting positions. Additionally, we maintain implicit class relational +constraints on the geometric basis of the Poincar\'e ball. This ensures that +the latent space can continuously adapt to new constraints while maintaining a +robust structure to combat catastrophic forgetting. We also establish eight +realistic incremental learning protocols for autonomous driving scenarios, +where novel classes can originate from known classes or the background. +Extensive evaluations of TOPICS on the Cityscapes and Mapillary Vistas 2.0 +benchmarks demonstrate that it achieves state-of-the-art performance. We make +the code and trained models publicly available at +http://topics.cs.uni-freiburg.de. + +
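+ For reference, the hyperbolic geometry underlying such taxonomy embeddings is straightforward to compute with: the sketch below evaluates the Poincaré-ball geodesic distance and shows how a coarse ancestor, its children, and an unrelated class might be placed. The example coordinates are illustrative, not learned embeddings from the paper.
```python
import numpy as np

def poincare_distance(u, v, eps=1e-9):
    """Geodesic distance on the Poincare ball (points with norm < 1)."""
    uu, vv = 1.0 - np.dot(u, u), 1.0 - np.dot(v, v)
    delta = np.dot(u - v, u - v)
    return np.arccosh(1.0 + 2.0 * delta / max(uu * vv, eps))

# Illustrative taxonomy placement: a coarse ancestor near the origin,
# its fine-grained children farther out but in the same direction.
vehicle = np.array([0.10, 0.00])
car     = np.array([0.60, 0.05])
truck   = np.array([0.62, -0.04])
person  = np.array([-0.55, 0.30])
print(poincare_distance(vehicle, car))    # ancestor-child: moderate
print(poincare_distance(car, truck))      # siblings under the same ancestor: small
print(poincare_distance(car, person))     # unrelated branches: large
```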
+
+
+
+
+ + ♻ ☆ High-density Electromyography for Effective Gesture-based Control of + Physically Assistive Mobile Manipulators + + +
+ High-density electromyography (HDEMG) can detect myoelectric activity as +control inputs to a variety of electronically-controlled devices. Furthermore, +HDEMG sensors may be built into a variety of clothing, allowing for a +non-intrusive myoelectric interface that is integrated into a user's routine. +In our work, we introduce an easily-producible HDEMG device that interfaces +with the control of a mobile manipulator to perform a range of household and +physically assistive tasks. Mobile manipulators can operate throughout the home +and are applicable for a spectrum of assistive and daily tasks in the home. We +evaluate the use of real-time myoelectric gesture recognition using our device +to enable precise control over the intricate mobility and manipulation +functionalities of an 8 degree-of-freedom mobile manipulator. Our evaluation, +involving 13 participants engaging in challenging self-care and household +activities, demonstrates the potential of our wearable HDEMG system to control +a mobile manipulator in the home. + +
+
+
+
+
+ + ♻ ☆ Knowledge Transfer from Simple to Complex: A Safe and Efficient + Reinforcement Learning Framework for Autonomous Driving Decision-Making + + +
+ A safe and efficient decision-making system is crucial for autonomous +vehicles. However, the complexity of driving environments limits the +effectiveness of many rule-based and machine learning approaches. Reinforcement +Learning (RL), with its robust self-learning capabilities and environmental +adaptability, offers a promising solution to these challenges. Nevertheless, +safety and efficiency concerns during training hinder its widespread +application. To address these concerns, we propose a novel RL framework, Simple +to Complex Collaborative Decision (S2CD). First, we rapidly train the teacher +model in a lightweight simulation environment. In the more complex and +realistic environment, the teacher intervenes when the student agent exhibits +suboptimal behavior, assessing the value of its actions to avert dangers. We also +introduce an RL algorithm called Adaptive Clipping Proximal Policy Optimization +Plus, which combines samples from both teacher and student policies and employs +dynamic clipping strategies based on sample importance. This approach improves +sample efficiency while effectively alleviating data imbalance. Additionally, +we employ the Kullback-Leibler divergence as a policy constraint, transforming +it into an unconstrained problem with the Lagrangian method to accelerate the +student's learning. Finally, a gradual weaning strategy ensures that the +student learns to explore independently over time, overcoming the teacher's +limitations and maximizing performance. Simulation experiments in highway +lane-change scenarios show that the S2CD framework enhances learning +efficiency, reduces training costs, and significantly improves safety compared +to state-of-the-art algorithms. This framework also ensures effective knowledge +transfer between teacher and student models; even with suboptimal teachers, the +student achieves superior performance, demonstrating the robustness and +effectiveness of S2CD. + +
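+ As a rough illustration of clipping that adapts per sample, the sketch below widens a PPO-style clip range for samples deemed more important (e.g., teacher-generated ones). The adaptation rule and the importance values are assumptions for illustration; the paper's Adaptive Clipping Proximal Policy Optimization Plus may differ in its details.
```python
import numpy as np

def adaptive_clip_loss(ratio, advantage, importance, eps_lo=0.1, eps_hi=0.3):
    """PPO-style clipped surrogate with a per-sample clip range.
    ratio: pi_new/pi_old; importance in [0, 1], larger -> wider clip (illustrative rule)."""
    eps = eps_lo + (eps_hi - eps_lo) * importance
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantage
    return -np.minimum(unclipped, clipped).mean()

# Mix teacher- and student-generated samples; teacher samples get higher importance here.
ratio = np.array([1.4, 0.7, 1.1, 0.95])
adv = np.array([2.0, -1.0, 0.5, 1.5])
importance = np.array([1.0, 1.0, 0.2, 0.2])   # first two assumed to come from the teacher policy
print(adaptive_clip_loss(ratio, adv, importance))
```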
+
+
+
+
+ + ♻ ☆ Simplification of Robotic System Model Analysis by Petri Net Meta-Model + Property Transfer + + +
+ This paper presents a simplification of robotic system model analysis due to +the transfer of Robotic System Hierarchical Petri Net (RSHPN) meta-model +properties onto the model of a designed system. Key contributions include: 1) +analysis of RSHPN meta-model properties; 2) decomposition of RSHPN analysis +into analysis of individual Petri nets, thus the reduction of state space +explosion; and 3) transfer of RSHPN meta-model properties onto the produced +models, hence elimination of the need for full re-analysis of the RSHPN model +when creating new robotic systems. Only task-dependent parts of the model need +to be analyzed. This approach streamlines the analysis thus reducing the design +time. Moreover, it produces a specification which is a solid foundation for the +implementation of the system. The obtained results highlight the potential of +Petri nets as a valuable formal framework for analyzing robotic system +properties. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Predicting Human Impressions of Robot Performance During Navigation + Tasks + + +
+ Human impressions of robot performance are often measured through surveys. As +a more scalable and cost-effective alternative, we investigate the possibility +of predicting people's impressions of robot behavior using non-verbal +behavioral cues and machine learning techniques. To this end, we first +contribute the SEAN TOGETHER Dataset consisting of observations of an +interaction between a person and a mobile robot in a VR simulation, together +with impressions of robot performance provided by users on a 5-point scale. +Second, we contribute analyses of how well humans and supervised learning +techniques can predict perceived robot performance based on different +observation types (like facial expression features, and features that describe +the navigation behavior of the robot and pedestrians). Our results suggest that +facial expressions alone provide useful information about human impressions of +robot performance; but in the navigation scenarios that we considered, +reasoning about spatial features in context is critical for the prediction +task. Also, supervised learning techniques showed promise because they +outperformed humans' predictions of robot performance in most cases. Further, +when predicting robot performance as a binary classification task on unseen +users' data, the F1 Score of machine learning models more than doubled in +comparison to predicting performance on a 5-point scale. This suggested that +the models can have good generalization capabilities, although they are better +at telling the directionality of robot performance than predicting exact +performance ratings. Based on our findings in simulation, we conducted a +real-world demonstration in which a mobile robot uses a machine learning model +to predict how a human that follows it perceives it. Finally, we discuss the +implications of our results for implementing such supervised learning models in +real-world navigation scenarios. + +
+
+
+
+
+ + ♻ ☆ RoLD: Robot Latent Diffusion for Multi-task Policy Modeling + + +
+ Modeling generalized robot control policies poses ongoing challenges for +language-guided robot manipulation tasks. Existing methods often struggle to +efficiently utilize cross-dataset resources or rely on resource-intensive +vision-language models, thus limiting their multi-task performance and +practical applications. In this study, we propose a novel approach that +decouples robot action trajectory encoding and control policy generation by +leveraging latent action trajectory spaces, enhancing the generalization +ability of policy generation on multi-task manipulation tasks. First, we +pre-train a task-agnostic auto-encoder to project an action trajectory of +several frames accompanied with observations into a latent action trajectory +space on large-scale datasets collected with multiple embodiments in various +environments. Then we propose learning a diffusion model based on the latent +action trajectory space to generate actions of next steps. Through experiments +on two widely used benchmarks, results demonstrate that our proposed method +outperforms baselines by 7%-29% in terms of average success rate across eight +tasks. Our method can consistently benefit from pre-training while baselines +cannot. Our method is more than two times faster than our baseline. + +
+
+
+
+
+ + ♻ ☆ Tactile Ergodic Coverage on Curved Surfaces + + +
+ In this article, we present a feedback control method for tactile coverage +tasks, such as cleaning or surface inspection. These tasks are challenging to +plan due to complex continuous physical interactions. In these tasks, the +coverage target and progress can be easily measured using a camera and encoded +in a point cloud. We propose an ergodic coverage method that operates directly +on point clouds, guiding the robot to spend more time on regions requiring more +coverage. For robot control and contact behavior, we use geometric algebra to +formulate a task-space impedance controller that tracks a line while +simultaneously exerting a desired force along that line. We evaluate the +performance of our method in kinematic simulations and demonstrate its +applicability in real-world experiments on kitchenware. Our source codes, +experimental data, and videos are available as open access at +https://sites.google.com/view/tactile-ergodic-control/ + +
+
+
+
+
+ + ♻ ☆ A Robust Quadruped Robot with Twisting Waist for Flexible Motions + + +
+ The waist plays a crucial role in the agile movement of many animals in +nature. It provides the torso with additional degrees of freedom and +flexibility, inspiring researchers to incorporate this biological feature into +robotic structures to enhance robot locomotion. This paper presents a +cost-effective and low-complexity waist mechanism integrated into the structure +of the open-source robot solo8, adding a new degree of freedom (DOF) to its +torso. We refer to this novel robot as solo9. Additionally, we propose a +full-body control method for the waist-equipped quadruped robot based on +generative adversarial imitation learning (GAIL). During training, the +discriminator is used as input for iterative optimization of the policy and +dataset, enabling solo9 to achieve flexible steering maneuvers across various +gaits. Extensive tests of solo9's steering capabilities, terrain adaptability, +and robustness are conducted in both simulation and real-world scenarios, with +detailed comparisons to solo8 and solo12, demonstrating the effectiveness of +the control algorithm and the advantages of the waist mechanism. + +
+
+
+
+
+ + ♻ ☆ RobotKeyframing: Learning Locomotion with High-Level Objectives via + Mixture of Dense and Sparse Rewards + + +
+ This paper presents a novel learning-based control framework that uses +keyframing to incorporate high-level objectives in natural locomotion for +legged robots. These high-level objectives are specified as a variable number +of partial or complete pose targets that are spaced arbitrarily in time. Our +proposed framework utilizes a multi-critic reinforcement learning algorithm to +effectively handle the mixture of dense and sparse rewards. Additionally, it +employs a transformer-based encoder to accommodate a variable number of input +targets, each associated with specific time-to-arrivals. Throughout simulation +and hardware experiments, we demonstrate that our framework can effectively +satisfy the target keyframe sequence at the required times. In the experiments, +the multi-critic method significantly reduces the effort of hyperparameter +tuning compared to the standard single-critic alternative. Moreover, the +proposed transformer-based architecture enables robots to anticipate future +goals, which results in quantitative improvements in their ability to reach +their targets. + +
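+ The mixture of dense and sparse rewards can be handled by giving each reward group its own critic and normalizing advantages per group before combining them; the sketch below shows this pattern with a plain GAE pass per group. It is a generic multi-critic illustration under assumed rewards and value estimates, not the paper's exact algorithm.
```python
import numpy as np

def combined_advantage(rew_dense, rew_sparse, v_dense, v_sparse, gamma=0.99, lam=0.95):
    """One GAE pass per reward group with its own critic, then normalize and sum."""
    def gae(rewards, values):
        adv, last = np.zeros_like(rewards), 0.0
        for t in reversed(range(len(rewards))):
            next_v = values[t + 1] if t + 1 < len(values) else 0.0
            delta = rewards[t] + gamma * next_v - values[t]
            last = delta + gamma * lam * last
            adv[t] = last
        return (adv - adv.mean()) / (adv.std() + 1e-8)   # per-group normalization
    return gae(rew_dense, v_dense) + gae(rew_sparse, v_sparse)

T = 6
rew_dense = np.random.default_rng(0).uniform(size=T)     # e.g. a smooth tracking reward
rew_sparse = np.zeros(T); rew_sparse[-1] = 10.0          # keyframe reached at the final step
print(combined_advantage(rew_dense, rew_sparse, np.zeros(T), np.zeros(T)))
```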
+
+ comment: This paper has been accepted to 8th Conference on Robot Learning + (CoRL 2024). Project website: https://sites.google.com/view/robot-keyframing +
+
+
+
+
+ + ♻ ☆ Modular Quantization-Aware Training for 6D Object Pose Estimation + + +
+ Edge applications, such as collaborative robotics and spacecraft rendezvous, +demand efficient 6D object pose estimation on resource-constrained embedded +platforms. Existing 6D pose estimation networks are often too large for such +deployments, necessitating compression while maintaining reliable performance. +To address this challenge, we introduce Modular Quantization-Aware Training +(MQAT), an adaptive and mixed-precision quantization-aware training strategy +that exploits the modular structure of modern 6D pose estimation architectures. +MQAT guides a systematic gradated modular quantization sequence and determines +module-specific bit precisions, leading to quantized models that outperform +those produced by state-of-the-art uniform and mixed-precision quantization +techniques. Our experiments showcase the generality of MQAT across datasets, +architectures, and quantization algorithms. Remarkably, MQAT-trained quantized +models achieve a significant accuracy boost (>7%) over the baseline +full-precision network while reducing model size by a factor of 4x or more. Our +project website is at: https://saqibjaved1.github.io/MQAT_/ + +
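+ The module-wise idea can be illustrated with a simple fake-quantization pass that assigns each module its own bit width and quantizes modules one at a time. Module names and bit assignments below are illustrative; in actual quantization-aware training, fine-tuning would run between these steps.
```python
import numpy as np

def fake_quantize(w, bits):
    """Uniform symmetric fake quantization: quantize then dequantize the weights."""
    qmax = 2 ** (bits - 1) - 1
    scale = np.abs(w).max() / qmax
    return np.clip(np.round(w / scale), -qmax, qmax) * scale

rng = np.random.default_rng(0)
model = {"backbone": rng.normal(size=(4, 4)), "neck": rng.normal(size=(4, 4)),
         "pose_head": rng.normal(size=(4, 4))}
bit_plan = {"backbone": 4, "neck": 6, "pose_head": 8}   # module-specific precisions (illustrative)

# Gradated, module-by-module quantization sequence.
for name in ["backbone", "neck", "pose_head"]:
    quantized = fake_quantize(model[name], bit_plan[name])
    print(name, bit_plan[name], "bits, mean abs error:", np.abs(quantized - model[name]).mean())
    model[name] = quantized
```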
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR), 2024 +
+
+
+
+
+ + ♻ ☆ Prävention und Beseitigung von Fehlerursachen im Kontext von + unbemannten Fahrzeugen + + +
+ Mobile robots, becoming increasingly autonomous, are capable of operating in +diverse and unknown environments. This flexibility allows them to fulfill goals +independently and to adapt their actions dynamically without rigidly predefined +control codes. However, their autonomous behavior complicates guaranteeing +safety and reliability due to the limited ability of a human operator to +accurately supervise and verify each robot's actions. To ensure autonomous +mobile robots' safety and reliability, which are aspects of dependability, +methods are needed both in the planning and execution of missions for +autonomous mobile robots. In this article, a twofold approach is presented that +ensures fault removal in the context of mission planning and fault prevention +during mission execution for autonomous mobile robots. First, the approach +consists of a concept based on formal verification applied during the planning +phase of missions. Second, the approach consists of a rule-based concept +applied during mission execution. A use case applying the approach is +presented, discussing how the two concepts complement each other and what +contribution they make to certain aspects of dependability. + +
+
+ comment: Language: German. This contribution is to be submitted in: + "dtec.bw-Beiträge der Helmut-Schmidt-Universität/Universität der + Bundeswehr Hamburg: Forschungsaktivitäten im Zentrum für + Digitalisierungs- und Technologieforschung der Bundeswehr dtec.bw" +
+
+
+
+
+ + ♻ ☆ MetaLoco: Universal Quadrupedal Locomotion with Meta-Reinforcement + Learning and Motion Imitation + + +
+ This work presents a meta-reinforcement learning approach to develop a +universal locomotion control policy capable of zero-shot generalization across +diverse quadrupedal platforms. The proposed method trains an RL agent equipped +with a memory unit to imitate reference motions using a small set of +procedurally generated quadruped robots. Through comprehensive simulation and +real-world hardware experiments, we demonstrate the efficacy of our approach in +achieving locomotion across various robots without requiring robot-specific +fine-tuning. Furthermore, we highlight the critical role of the memory unit in +enabling generalization, facilitating rapid adaptation to changes in the robot +properties, and improving sample efficiency. + +
+
+ comment: The supplementary video is available at + https://youtu.be/PaFRUDOrh_U?si=hfdbng3Wxo_GnxIA +
+
+
+
+
+ + ♻ ☆ Distance-based Multiple Non-cooperative Ground Target Encirclement for + Complex Environments + + +
+ This paper proposes a comprehensive strategy for complex +multi-target-multi-drone encirclement in an obstacle-rich and GPS-denied +environment, motivated by practical scenarios such as pursuing vehicles or +humans in urban canyons. The drones have omnidirectional range sensors that can +robustly detect ground targets and obtain noisy relative distances. After each +drone task is assigned, a novel distance-based target state estimator (DTSE) is +proposed, which estimates the measurement output noise variance and applies a +Kalman filter. By integrating anti-synchronization techniques and pseudo-force +functions, an acceleration controller enables two tasking drones to +cooperatively encircle a target from opposing positions while navigating +obstacles. The algorithm's effectiveness for the discrete-time double-integrator +system is established theoretically, particularly regarding observability. +Moreover, the versatility of the algorithm is showcased in aerial-to-ground +scenarios, supported by compelling simulation results. Experimental validation +demonstrates the effectiveness of the proposed approach. + +
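+ A minimal stand-in for a noise-adaptive Kalman filter of the kind described above: a 1D constant-velocity filter whose measurement variance is re-estimated from recent innovations. The adaptation rule, dimensions, and parameters are illustrative assumptions, not the paper's DTSE.
```python
import numpy as np

def kf_with_estimated_R(z, dt=0.1, q=0.5, window=20):
    """1D constant-velocity Kalman filter; measurement variance R is estimated
    online from the innovation sequence (illustrative noise-adaptive scheme)."""
    F = np.array([[1.0, dt], [0.0, 1.0]])
    H = np.array([[1.0, 0.0]])
    Q = q * np.array([[dt ** 3 / 3, dt ** 2 / 2], [dt ** 2 / 2, dt]])
    x, P, R = np.zeros(2), np.eye(2), 1.0
    innovations = []
    for zk in z:
        x, P = F @ x, F @ P @ F.T + Q                        # predict
        y = zk - H @ x                                       # innovation
        innovations.append(float(y))
        if len(innovations) >= window:                       # adapt R from recent innovations
            R = max(np.var(innovations[-window:]) - float(H @ P @ H.T), 1e-4)
        S = float(H @ P @ H.T) + R
        K = (P @ H.T) / S
        x = x + (K * y).ravel()
        P = (np.eye(2) - K @ H) @ P
    return x, R

rng = np.random.default_rng(1)
true_d = 10.0 + 0.5 * np.arange(200) * 0.1                   # target range drifting away
print(kf_with_estimated_R(true_d + rng.normal(0, 0.8, size=200)))
```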
+
+
+
+
+ + ♻ ☆ RaLF: Flow-based Global and Metric Radar Localization in LiDAR Maps + + +
+ Localization is paramount for autonomous robots. While camera and LiDAR-based +approaches have been extensively investigated, they are affected by adverse +illumination and weather conditions. Therefore, radar sensors have recently +gained attention due to their intrinsic robustness to such conditions. In this +paper, we propose RaLF, a novel deep neural network-based approach for +localizing radar scans in a LiDAR map of the environment, by jointly learning +to address both place recognition and metric localization. RaLF is composed of +radar and LiDAR feature encoders, a place recognition head that generates +global descriptors, and a metric localization head that predicts the 3-DoF +transformation between the radar scan and the map. We tackle the place +recognition task by learning a shared embedding space between the two +modalities via cross-modal metric learning. Additionally, we perform metric +localization by predicting pixel-level flow vectors that align the query radar +scan with the LiDAR map. We extensively evaluate our approach on multiple +real-world driving datasets and show that RaLF achieves state-of-the-art +performance for both place recognition and metric localization. Moreover, we +demonstrate that our approach can effectively generalize to different cities +and sensor setups than the ones used during training. We make the code and +trained models publicly available at http://ralf.cs.uni-freiburg.de. + +
+
+
+
+
+ + ♻ ☆ Automatic Target-Less Camera-LiDAR Calibration From Motion and Deep + Point Correspondences + + +
+ Sensor setups of robotic platforms commonly include both camera and LiDAR as +they provide complementary information. However, fusing these two modalities +typically requires a highly accurate calibration between them. In this paper, +we propose MDPCalib which is a novel method for camera-LiDAR calibration that +requires neither human supervision nor any specific target objects. Instead, we +utilize sensor motion estimates from visual and LiDAR odometry as well as deep +learning-based 2D-pixel-to-3D-point correspondences that are obtained without +in-domain retraining. We represent camera-LiDAR calibration as an optimization +problem and minimize the costs induced by constraints from sensor motion and +point correspondences. In extensive experiments, we demonstrate that our +approach yields highly accurate extrinsic calibration parameters and is robust +to random initialization. Additionally, our approach generalizes to a wide +range of sensor setups, which we demonstrate by employing it on various robotic +platforms including a self-driving perception car, a quadruped robot, and a +UAV. To make our calibration method publicly accessible, we release the code on +our project website at http://calibration.cs.uni-freiburg.de. + +
+
+
+
+
+ + ♻ ☆ BodySLAM: A Generalized Monocular Visual SLAM Framework for Surgical + Applications + + +
+ Endoscopic surgery relies on two-dimensional views, posing challenges for +surgeons in depth perception and instrument manipulation. While Monocular +Visual Simultaneous Localization and Mapping (MVSLAM) has emerged as a +promising solution, its implementation in endoscopic procedures faces +significant challenges due to hardware limitations, such as the use of a +monocular camera and the absence of odometry sensors. This study presents +BodySLAM, a robust deep learning-based MVSLAM approach that addresses these +challenges through three key components: CycleVO, a novel unsupervised +monocular pose estimation module; the integration of the state-of-the-art Zoe +architecture for monocular depth estimation; and a 3D reconstruction module +creating a coherent surgical map. The approach is rigorously evaluated using +three publicly available datasets (Hamlyn, EndoSLAM, and SCARED) spanning +laparoscopy, gastroscopy, and colonoscopy scenarios, and benchmarked against +four state-of-the-art methods. Results demonstrate that CycleVO exhibited +competitive performance with the lowest inference time among pose estimation +methods, while maintaining robust generalization capabilities, whereas Zoe +significantly outperformed existing algorithms for depth estimation in +endoscopy. BodySLAM's strong performance across diverse endoscopic scenarios +demonstrates its potential as a viable MVSLAM solution for endoscopic +applications. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A generic approach for reactive stateful mitigation of application + failures in distributed robotics systems deployed with Kubernetes + + +
+ Offloading computationally expensive algorithms to the edge or even cloud +offers an attractive option to tackle limitations regarding on-board +computational and energy resources of robotic systems. In cloud-native +applications deployed with the container management system Kubernetes (K8s), +one key problem is ensuring resilience against various types of failures. +However, complex robotic systems interacting with the physical world pose a +very specific set of challenges and requirements that are not yet covered by +failure mitigation approaches from the cloud-native domain. In this paper, we +therefore propose a novel approach for robotic system monitoring and stateful, +reactive failure mitigation for distributed robotic systems deployed using +Kubernetes (K8s) and the Robot Operating System (ROS2). By employing the +generic substrate of Behaviour Trees, our approach can be applied to any +robotic workload and supports arbitrarily complex monitoring and failure +mitigation strategies. We demonstrate the effectiveness and +application-agnosticism of our approach on two example applications, namely +Autonomous Mobile Robot (AMR) navigation and robotic manipulation in a +simulated environment. + +
+
+
+
+
+ + ♻ ☆ 3D Equivariant Pose Regression via Direct Wigner-D Harmonics Prediction NeurIPS 2024 + + +
+ Determining the 3D orientations of an object in an image, known as +single-image pose estimation, is a crucial task in 3D vision applications. +Existing methods typically learn 3D rotations parametrized in the spatial +domain using Euler angles or quaternions, but these representations often +introduce discontinuities and singularities. SO(3)-equivariant networks enable +the structured capture of pose patterns with data-efficient learning, but the +parametrizations in spatial domain are incompatible with their architecture, +particularly spherical CNNs, which operate in the frequency domain to enhance +computational efficiency. To overcome these issues, we propose a +frequency-domain approach that directly predicts Wigner-D coefficients for 3D +rotation regression, aligning with the operations of spherical CNNs. Our +SO(3)-equivariant pose harmonics predictor overcomes the limitations of spatial +parameterizations, ensuring consistent pose estimation under arbitrary +rotations. Trained with a frequency-domain regression loss, our method achieves +state-of-the-art results on benchmarks such as ModelNet10-SO(3) and PASCAL3D+, +with significant improvements in accuracy, robustness, and data efficiency. + +
+
+ comment: Accepted to NeurIPS 2024, Project webpage at + http://cvlab.postech.ac.kr/research/3D_EquiPose +
+
+
+
+
+ + ♻ ☆ Perceptive Pedipulation with Local Obstacle Avoidance + + +
+ Pedipulation leverages the feet of legged robots for mobile manipulation, +eliminating the need for dedicated robotic arms. While previous works have +showcased blind and task-specific pedipulation skills, they fail to account for +static and dynamic obstacles in the environment. To address this limitation, we +introduce a reinforcement learning-based approach to train a whole-body +obstacle-aware policy that tracks foot position commands while simultaneously +avoiding obstacles. Despite training the policy in only five different static +scenarios in simulation, we show that it generalizes to unknown environments +with different numbers and types of obstacles. We analyze the performance of +our method through a set of simulation experiments and successfully deploy the +learned policy on the ANYmal quadruped, demonstrating its capability to follow +foot commands while navigating around static and dynamic obstacles. Videos of +the experiments are available at +sites.google.com/leggedrobotics.com/perceptive-pedipulation. + +
+
+ comment: Accepted to the IEEE International Conference on Humanoid Robots 2024 + Videos available at + sites.google.com/leggedrobotics.com/perceptive-pedipulation +
+
+
+
+
+ + ♻ ☆ S3PT: Scene Semantics and Structure Guided Clustering to Boost + Self-Supervised Pre-Training for Autonomous Driving + + +
+ Recent self-supervised clustering-based pre-training techniques like DINO and +Cribo have shown impressive results for downstream detection and segmentation +tasks. However, real-world applications such as autonomous driving face +challenges with imbalanced object class and size distributions and complex +scene geometries. In this paper, we propose S3PT, a novel scene semantics and +structure guided clustering to provide more scene-consistent objectives for +self-supervised training. Specifically, our contributions are threefold: First, +we incorporate semantic distribution consistent clustering to encourage better +representation of rare classes such as motorcycles or animals. Second, we +introduce object diversity consistent spatial clustering to handle imbalanced +and diverse object sizes, ranging from large background areas to small objects +such as pedestrians and traffic signs. Third, we propose a depth-guided spatial +clustering to regularize learning based on geometric information of the scene, +thus further refining region separation on the feature level. Our learned +representations significantly improve performance in downstream semantic +segmentation and 3D object detection tasks on the nuScenes, nuImages, and +Cityscapes datasets and show promising domain translation properties. + +
+
+ comment: Accepted for WACV 2025 +
+
+
+
+
+ + ♻ ☆ Aerial Robots Carrying Flexible Cables: Dynamic Shape Optimal Control + via Spectral Method Model + + +
+ In this work, we present a model-based optimal boundary control design for an +aerial robotic system composed of a quadrotor carrying a flexible cable. The +whole system is modeled by partial differential equations (PDEs) combined with +boundary conditions described by ordinary differential equations (ODEs). The +proper orthogonal decomposition (POD) method is adopted to project the original +infinite-dimensional system on a finite low-dimensional space spanned by +orthogonal basis functions. Based on such a reduced order model, nonlinear +model predictive control (NMPC) is implemented online to realize both position +and shape trajectory tracking of the flexible cable in an optimal predictive +fashion. The proposed POD-based reduced modeling and optimal control paradigms +are verified in simulation using an accurate high-dimensional FDM-based model +and experimentally using a real quadrotor and a cable. The results show the +viability of the POD-based predictive control approach (allowing closing the +control loop on the full system state) and its superior performance compared to +an optimally tuned PID controller (allowing closing the control loop on the +quadrotor state only). + +
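+ The POD projection at the heart of such a reduced-order model amounts to an SVD of a snapshot matrix; the sketch below extracts the leading modes of toy cable-shape snapshots and reports the captured energy. It is a generic POD example on assumed data, not the paper's FDM-based model.
```python
import numpy as np

def pod_basis(snapshots, r):
    """Proper orthogonal decomposition of a snapshot matrix (n_dof x n_snapshots):
    returns the r leading orthonormal modes and the captured energy fraction."""
    U, s, _ = np.linalg.svd(snapshots - snapshots.mean(axis=1, keepdims=True),
                            full_matrices=False)
    energy = np.sum(s[:r] ** 2) / np.sum(s ** 2)
    return U[:, :r], energy

# Toy "cable shape" snapshots: two dominant spatial modes plus noise
x = np.linspace(0.0, 1.0, 200)
t = np.linspace(0.0, 5.0, 80)
snaps = (np.outer(np.sin(np.pi * x), np.sin(2 * t))
         + 0.3 * np.outer(np.sin(2 * np.pi * x), np.cos(3 * t))
         + 0.01 * np.random.default_rng(0).normal(size=(200, 80)))
modes, energy = pod_basis(snaps, r=2)
print(modes.shape, round(float(energy), 4))    # (200, 2) and close to 1.0
# Reduced coordinates for a predictive controller would be q(t) = modes.T @ (state - mean)
```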
+
+
+
+
+ + ♻ ☆ Mixed Strategy Nash Equilibrium for Crowd Navigation IJRR + + +
+ Robots navigating in crowded areas should negotiate free space with humans +rather than fully controlling collision avoidance, as this can lead to freezing +behavior. Game theory provides a framework for the robot to reason about +potential cooperation from humans for collision avoidance during path planning. +In particular, the mixed strategy Nash equilibrium captures the negotiation +behavior under uncertainty, making it well suited for crowd navigation. +However, computing the mixed strategy Nash equilibrium is often prohibitively +expensive for real-time decision-making. In this paper, we propose an iterative +Bayesian update scheme over probability distributions of trajectories. The +algorithm simultaneously generates a stochastic plan for the robot and +probabilistic predictions of other pedestrians' paths. We prove that the +proposed algorithm is equivalent to solving a mixed strategy game for crowd +navigation, and the algorithm guarantees the recovery of the global Nash +equilibrium of the game. We name our algorithm Bayesian Recursive Nash +Equilibrium (BRNE) and develop a real-time model prediction crowd navigation +framework. Since BRNE is not solving a general-purpose mixed strategy Nash +equilibrium but a tailored formula specifically for crowd navigation, it can +compute the solution in real-time on a low-power embedded computer. We evaluate +BRNE in both simulated environments and real-world pedestrian datasets. BRNE +consistently outperforms non-learning and learning-based methods regarding +safety and navigation efficiency. It also reaches human-level crowd navigation +performance in the pedestrian dataset benchmark. Lastly, we demonstrate the +practicality of our algorithm with real humans on an untethered quadruped robot +with fully onboard perception and computation. + +
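+ A heavily simplified illustration of the flavor of update described above, alternating probabilistic reweighting over sampled trajectories: each agent's trajectory weights are rescaled by the expected proximity cost under the other agent's current distribution. This is not the BRNE algorithm itself; the sampler, cost, and parameters are illustrative assumptions.
```python
import numpy as np

rng = np.random.default_rng(0)

def sample_paths(start, goal, n=64, steps=10, noise=0.15):
    """Noisy straight-line candidate trajectories from start to goal, shape (n, steps, 2)."""
    line = np.linspace(start, goal, steps)
    return line + noise * rng.normal(size=(n, steps, 2)).cumsum(axis=1)

def collision_cost(paths_a, paths_b, w_b, radius=0.5):
    """Expected proximity cost of each path in A against B's current distribution."""
    d = np.linalg.norm(paths_a[:, None, :, :] - paths_b[None, :, :, :], axis=-1)  # (nA, nB, T)
    per_pair = np.exp(-(d / radius) ** 2).sum(axis=-1)                            # (nA, nB)
    return per_pair @ w_b

robot = sample_paths(np.array([0.0, 0.0]), np.array([5.0, 0.0]))
human = sample_paths(np.array([5.0, 0.2]), np.array([0.0, 0.2]))
w_r = np.full(len(robot), 1.0 / len(robot))
w_h = np.full(len(human), 1.0 / len(human))
for _ in range(10):                       # alternating Bayesian-style reweighting
    w_r = w_r * np.exp(-collision_cost(robot, human, w_h)); w_r /= w_r.sum()
    w_h = w_h * np.exp(-collision_cost(human, robot, w_r)); w_h /= w_h.sum()
plan = (w_r[:, None, None] * robot).sum(axis=0)   # robot's expected (negotiated) trajectory
print(plan[:3])
```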
+
+ comment: Accepted to The International Journal of Robotics Research (IJRR) +
+
+
+
+
+ + ♻ ☆ A Degree of Flowability for Virtual Tubes + + +
+ With the rapid development of robotics swarm technology, there are more tasks +that require the swarm to pass through complicated environments safely and +efficiently. Virtual tube technology is a novel way to achieve this goal. +Virtual tubes are free spaces connecting two places that provide safety +boundaries and direction of motion for swarm robotics. How to determine the +design quality of a virtual tube is a fundamental problem. For such a purpose, +this paper presents a degree of flowability (DOF) for two-dimensional virtual +tubes according to a minimum energy principle. After that, methods to calculate +DOF are proposed with a feasibility analysis. Simulations of swarm robotics in +different kinds of two-dimensional virtual tubes are performed to demonstrate +the effectiveness of the proposed method of calculating DOF. + +
+
+ comment: 22 pages, 16 figures. This is a preprint, currently under review for + publication in Robotics and Autonomous Systems, Elsevier. Version 2 is + submitted to fix the rendering fault in HTML and correct spelling mistakes in + the abstract and the references +
+
+
+
+
+ + ♻ ☆ A Fast and Model Based Approach for Evaluating Task-Competence of + Antagonistic Continuum Arms + + +
+ Soft robot arms have made significant progress towards completing human-scale +tasks, but designing arms for tasks with specific load and workspace +requirements remains difficult. A key challenge is the lack of model-based +design tools, forcing advancement to occur through empirical iteration and +observation. Existing models are focused on control and rely on parameter fits, +which means they cannot provide general conclusions about the mapping between +design and performance or the influence of factors outside the fitting data. As +a first step toward model-based design tools, we introduce a novel method of +analyzing whether a proposed arm design can complete desired tasks. Our method +is informative, interpretable, and fast; it provides novel metrics for +quantifying a proposed arm design's ability to perform a task, it yields a +graphical interpretation of performance through segment forces, and computing +it is over 80x faster than optimization based methods. Our formulation focuses +on antagonistic, pneumatically-driven soft arms. We demonstrate our approach +through example analysis, and also through consideration of antagonistic vs +non-antagonistic designs. Our method enables fast, direct and task-specific +comparison of these two architectures, and provides a new visualization of the +comparative mechanics. While only a first step, the proposed approach will +support advancement of model-based design tools, leading to highly capable soft +arms. + +
+
+ comment: 8 pages, 7 figures. Submission for the 8th IEEE-RAS International + Conference on Soft Robotics (RoboSoft 2025). For code, proofs, and other + supplementary information, see + https://github.com/wfan19/antagonistic-task-competency +
+
+
+
+
+ + ♻ ☆ ManiWAV: Learning Robot Manipulation from In-the-Wild Audio-Visual Data + + +
+ Audio signals provide rich information about robot-object interaction and object properties through contact. This information can surprisingly ease the learning of contact-rich robot manipulation skills, especially when the visual information alone is ambiguous or incomplete. However, the use of audio data in robot manipulation has been constrained to teleoperated demonstrations collected by attaching a microphone to either the robot or the object, which significantly limits its usage in robot learning pipelines. In this work, we introduce ManiWAV: an 'ear-in-hand' data collection device to collect in-the-wild human demonstrations with synchronous audio and visual feedback, and a corresponding policy interface to learn robot manipulation policies directly from the demonstrations. We demonstrate the capabilities of our system through four contact-rich manipulation tasks that require either passively sensing the contact events and modes, or actively sensing the object surface materials and states. In addition, we show that our system can generalize to unseen in-the-wild environments by learning from diverse in-the-wild human demonstrations. + +
+
+ comment: Conference on Robot Learning (CoRL) 2024; Project website: + https://maniwav.github.io/ +
+
+
+
+
+ + ♻ ☆ Intelligent Mobility System with Integrated Motion Planning and Control + Utilizing Infrastructure Sensor Nodes + + +
+ This paper introduces a framework for an indoor autonomous mobility system +that can perform patient transfers and materials handling. Unlike traditional +systems that rely on onboard perception sensors, the proposed approach +leverages a global perception and localization (PL) through Infrastructure +Sensor Nodes (ISNs) and cloud computing technology. Using the global PL, an +integrated Model Predictive Control (MPC)-based local planning and tracking +controller augmented with Artificial Potential Field (APF) is developed, +enabling reliable and efficient motion planning and obstacle avoidance ability +while tracking predefined reference motions. Simulation results demonstrate the +effectiveness of the proposed MPC controller in smoothly navigating around both +static and dynamic obstacles. The proposed system has the potential to extend +to intelligent connected autonomous vehicles, such as electric or cargo +transport vehicles with four-wheel independent drive/steering (4WID-4WIS) +configurations. + +
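To make the controller structure concrete, here is a minimal sketch of an MPC-style tracking cost augmented with an Artificial Potential Field term, solved by generic nonlinear optimization over a short horizon. The point-mass model, obstacle positions, and weights are invented for illustration and are not the paper's MPC formulation.

```python
import numpy as np
from scipy.optimize import minimize

dt, H = 0.1, 15                                    # step size and prediction horizon
obstacles = np.array([[2.0, 0.2]])                 # hypothetical static obstacle
ref = np.stack([np.linspace(0, 4, H), np.zeros(H)], axis=1)   # straight reference path

def rollout(u, x0):
    """Double-integrator rollout: state = [px, py, vx, vy], input = acceleration."""
    u = u.reshape(H, 2)
    x, traj = x0.copy(), []
    for a in u:
        x[2:] += a * dt
        x[:2] += x[2:] * dt
        traj.append(x[:2].copy())
    return np.array(traj)

def cost(u, x0):
    p = rollout(u, x0)
    track = np.sum((p - ref) ** 2)                 # reference tracking term
    d = np.linalg.norm(p[:, None, :] - obstacles[None, :, :], axis=-1)
    apf = np.sum(1.0 / (d ** 2 + 1e-3))            # APF-style repulsive term
    effort = 1e-2 * np.sum(u ** 2)
    return track + 0.5 * apf + effort

x0 = np.array([0.0, 0.0, 0.0, 0.0])
res = minimize(cost, np.zeros(2 * H), args=(x0,), method="L-BFGS-B")
u_opt = res.x.reshape(H, 2)                        # apply u_opt[0], then re-plan (receding horizon)
```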
+
+
+
+
+ + ♻ ☆ MOSAIC: A Modular System for Assistive and Interactive Cooking + + +
+ We present MOSAIC, a modular architecture for home robots to perform complex +collaborative tasks, such as cooking with everyday users. MOSAIC tightly +collaborates with humans, interacts with users using natural language, +coordinates multiple robots, and manages an open vocabulary of everyday +objects. At its core, MOSAIC employs modularity: it leverages multiple +large-scale pre-trained models for general tasks like language and image +recognition, while using streamlined modules designed for task-specific +control. We extensively evaluate MOSAIC on 60 end-to-end trials where two +robots collaborate with a human user to cook a combination of 6 recipes. We +also extensively test individual modules with 180 episodes of visuomotor +picking, 60 episodes of human motion forecasting, and 46 online user +evaluations of the task planner. We show that MOSAIC is able to efficiently +collaborate with humans by running the overall system end-to-end with a real +human user, completing 68.3% (41/60) collaborative cooking trials of 6 +different recipes with a subtask completion rate of 91.6%. Finally, we discuss +the limitations of the current system and exciting open challenges in this +domain. The project's website is at https://portal-cornell.github.io/MOSAIC/ + +
+
+ comment: 22 pages, 13 figures; CoRL 2024 +
+
+
+
+
+ + ♻ ☆ Incorporating Control Inputs in Continuous-Time Gaussian Process State + Estimation for Robotics + + +
+ Continuous-time batch state estimation using Gaussian processes is an efficient approach to estimate the trajectories of robots over time. In the past, relatively simple physics-motivated priors have been considered for such approaches, using assumptions such as constant velocity or acceleration. This paper presents an approach to incorporating exogenous control inputs, such as velocity or acceleration commands, into the continuous Gaussian process state-estimation framework. It is shown that this approach generalizes across different domains in robotics, making it applicable to both the estimation of continuous-time trajectories for mobile robots and the estimation of quasi-static continuum robot shapes. Results show that incorporating control inputs leads to more informed priors, potentially requiring fewer measurements and estimation nodes to obtain accurate estimates. This makes the approach particularly useful in situations in which limited sensing is available. + +
+
+ comment: 17 pages, 5 figures, submitted to Robotica +
+
+
+
+
+ + ♻ ☆ AutoJoin: Efficient Adversarial Training against Gradient-Free + Perturbations for Robust Maneuvering via Denoising Autoencoder and Joint + Learning + + +
+ With the growing use of machine learning algorithms and ubiquitous sensors, +many `perception-to-control' systems are being developed and deployed. To +ensure their trustworthiness, improving their robustness through adversarial +training is one potential approach. We propose a gradient-free adversarial +training technique, named AutoJoin, to effectively and efficiently produce +robust models for image-based maneuvering. Compared to other state-of-the-art +methods with testing on over 5M images, AutoJoin achieves significant +performance increases up to the 40% range against perturbations while improving +on clean performance up to 300%. AutoJoin is also highly efficient, saving up +to 86% time per training epoch and 90% training data over other +state-of-the-art techniques. The core idea of AutoJoin is to use a decoder +attachment to the original regression model creating a denoising autoencoder +within the architecture. This architecture allows the tasks `maneuvering' and +`denoising sensor input' to be jointly learnt and reinforce each other's +performance. + +
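A minimal sketch of the architecture idea stated in the abstract: a decoder is attached to the steering-regression encoder so that denoising and maneuvering are trained with a joint loss. Layer sizes, image resolution, and the perturbation model are placeholders, not the AutoJoin implementation.

```python
import torch
import torch.nn as nn

class JointModel(nn.Module):
    """Shared encoder with a steering head and a denoising decoder (illustrative sizes)."""
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, 5, stride=2, padding=2), nn.ReLU(),
            nn.Conv2d(16, 32, 5, stride=2, padding=2), nn.ReLU(),
            nn.AdaptiveAvgPool2d(8), nn.Flatten(),
        )
        self.steer_head = nn.Linear(32 * 8 * 8, 1)                    # maneuvering (regression)
        self.decoder = nn.Sequential(nn.Linear(32 * 8 * 8, 3 * 32 * 32), nn.Sigmoid())

    def forward(self, x):
        z = self.encoder(x)
        return self.steer_head(z), self.decoder(z).view(-1, 3, 32, 32)

model = JointModel()
clean = torch.rand(8, 3, 32, 32)
perturbed = (clean + 0.2 * torch.randn_like(clean)).clamp(0, 1)       # gradient-free perturbation
steer_gt = torch.randn(8, 1)

steer_pred, recon = model(perturbed)
loss = nn.functional.mse_loss(steer_pred, steer_gt) + nn.functional.mse_loss(recon, clean)
loss.backward()   # the two tasks share the encoder and reinforce each other
```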
+
+
+
+
+ + ♻ ☆ Collision probability reduction method for tracking control in automatic + docking / berthing using reinforcement learning + + +
+ Automation of berthing maneuvers in shipping is a pressing issue as the +berthing maneuver is one of the most stressful tasks seafarers undertake. +Berthing control problems are often tackled via tracking a predefined +trajectory or path. Maintaining a tracking error of zero under an uncertain +environment is impossible; the tracking controller is nonetheless required to +bring vessels close to desired berths. The tracking controller must prioritize +the avoidance of tracking errors that may cause collisions with obstacles. This +paper proposes a training method based on reinforcement learning for a +trajectory tracking controller that reduces the probability of collisions with +static obstacles. Via numerical simulations, we show that the proposed method +reduces the probability of collisions during berthing maneuvers. Furthermore, +this paper shows the tracking performance in a model experiment. + +
+
+ comment: 14 pages, 15 figures, Published by Journal of Marine Science and + Technology +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 142 + +
+
+
+ + ☆ Adaptive Caching for Faster Video Generation with Diffusion Transformers + + +
+ Generating temporally-consistent high-fidelity videos can be computationally expensive, especially over longer temporal spans. More recent Diffusion Transformers (DiTs), despite making significant headway in this context, have only heightened such challenges as they rely on larger models and heavier attention mechanisms, resulting in slower inference speeds. In this paper, we introduce a training-free method to accelerate video DiTs, termed Adaptive Caching (AdaCache), which is motivated by the fact that "not all videos are created equal": some videos require fewer denoising steps than others to attain a reasonable quality. Building on this, we not only cache computations through the diffusion process, but also devise a caching schedule tailored to each video generation, maximizing the quality-latency trade-off. We further introduce a Motion Regularization (MoReg) scheme to utilize video information within AdaCache, essentially controlling the compute allocation based on motion content. Altogether, our plug-and-play contributions grant significant inference speedups (e.g. up to 4.7x on Open-Sora 720p - 2s video generation) without sacrificing the generation quality, across multiple video DiT baselines. + +
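The snippet below illustrates the general caching pattern described above in a toy sampling loop: an expensive block's output is reused while the latent changes little, and the reuse threshold is tightened when more frame-to-frame "motion" is present. Everything here (the stand-in block, the update rule, the thresholds) is a simplified illustration, not the AdaCache or MoReg implementation.

```python
import torch

def expensive_block(x):                       # stand-in for a heavy DiT block
    return torch.tanh(x @ torch.eye(x.shape[-1]))

def sample(x, n_steps=50, base_thresh=0.05):
    cached_out, cached_x = None, None
    for _ in range(n_steps):
        motion = x.diff(dim=0).abs().mean() if x.shape[0] > 1 else torch.tensor(0.0)
        thresh = base_thresh / (1.0 + motion)             # more motion -> recompute more often
        if cached_x is not None and (x - cached_x).abs().mean() < thresh:
            out = cached_out                              # cache hit: skip the heavy block
        else:
            out = expensive_block(x)                      # cache miss: recompute and store
            cached_out, cached_x = out, x.clone()
        x = x - 0.02 * out                                # placeholder denoising update
    return x

latents = torch.randn(4, 64)    # e.g., 4 "frames" of a toy latent video
result = sample(latents)
```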
+
+ comment: Project-page is available at https://adacache-dit.github.io +
+
+
+
+
+ + ☆ AutoVFX: Physically Realistic Video Editing from Natural Language + Instructions + + +
+ Modern visual effects (VFX) software has made it possible for skilled artists +to create imagery of virtually anything. However, the creation process remains +laborious, complex, and largely inaccessible to everyday users. In this work, +we present AutoVFX, a framework that automatically creates realistic and +dynamic VFX videos from a single video and natural language instructions. By +carefully integrating neural scene modeling, LLM-based code generation, and +physical simulation, AutoVFX is able to provide physically-grounded, +photorealistic editing effects that can be controlled directly using natural +language instructions. We conduct extensive experiments to validate AutoVFX's +efficacy across a diverse spectrum of videos and instructions. Quantitative and +qualitative results suggest that AutoVFX outperforms all competing methods by a +large margin in generative quality, instruction alignment, editing versatility, +and physical plausibility. + +
+
+ comment: Project page: https://haoyuhsu.github.io/autovfx-website/ +
+
+
+
+
+ + ☆ Training-free Regional Prompting for Diffusion Transformers + + +
+ Diffusion models have demonstrated excellent capabilities in text-to-image generation. Their semantic understanding (i.e., prompt following) ability has also been greatly improved with large language models (e.g., T5, Llama). However, existing models cannot perfectly handle long and complex text prompts, especially when the text prompts contain various objects with numerous attributes and interrelated spatial relationships. While many regional prompting methods have been proposed for UNet-based models (SD1.5, SDXL), there are still no implementations based on the recent Diffusion Transformer (DiT) architecture, such as SD3 and FLUX.1. In this report, we propose and implement regional prompting for FLUX.1 based on attention manipulation, which equips DiT with fine-grained compositional text-to-image generation capability in a training-free manner. Code is available at https://github.com/antonioo-c/Regional-Prompting-FLUX. + +
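A small sketch of regional prompting via attention manipulation: each image token is only allowed to attend to the text tokens of its assigned regional prompt through a mask on the cross-attention scores. The single-head attention and region assignments below are illustrative, not the released FLUX.1 code.

```python
import torch
import torch.nn.functional as F

def regional_cross_attention(q_img, k_txt, v_txt, region_of_token, region_of_prompt):
    """q_img: (N_img, d); k_txt/v_txt: (N_txt, d);
    region_of_token: (N_img,) region id per image token;
    region_of_prompt: (N_txt,) region id per text token."""
    scores = q_img @ k_txt.t() / q_img.shape[-1] ** 0.5            # (N_img, N_txt)
    allowed = region_of_token[:, None] == region_of_prompt[None, :]
    scores = scores.masked_fill(~allowed, float("-inf"))           # block cross-region attention
    return F.softmax(scores, dim=-1) @ v_txt

d, n_img, n_txt = 64, 16, 10
q = torch.randn(n_img, d); k = torch.randn(n_txt, d); v = torch.randn(n_txt, d)
tok_region = torch.tensor([0] * 8 + [1] * 8)          # left half / right half of the image
txt_region = torch.tensor([0] * 5 + [1] * 5)          # prompt A tokens / prompt B tokens
out = regional_cross_attention(q, k, v, tok_region, txt_region)
```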
+
+ comment: Code is available at + https://github.com/antonioo-c/Regional-Prompting-FLUX +
+
+
+
+
+ + ☆ Adaptive Length Image Tokenization via Recurrent Allocation + + +
+ Current vision systems typically assign fixed-length representations to +images, regardless of the information content. This contrasts with human +intelligence - and even large language models - which allocate varying +representational capacities based on entropy, context and familiarity. Inspired +by this, we propose an approach to learn variable-length token representations +for 2D images. Our encoder-decoder architecture recursively processes 2D image +tokens, distilling them into 1D latent tokens over multiple iterations of +recurrent rollouts. Each iteration refines the 2D tokens, updates the existing +1D latent tokens, and adaptively increases representational capacity by adding +new tokens. This enables compression of images into a variable number of +tokens, ranging from 32 to 256. We validate our tokenizer using reconstruction +loss and FID metrics, demonstrating that token count aligns with image entropy, +familiarity and downstream task requirements. Recurrent token processing with +increasing representational capacity in each iteration shows signs of token +specialization, revealing potential for object / part discovery. + +
+
+ comment: Code at: https://github.com/ShivamDuggal4/adaptive-length-tokenizer +
+
+
+
+
+ + ☆ How Far is Video Generation from World Model: A Physical Law Perspective + + +
+ OpenAI's Sora highlights the potential of video generation for developing +world models that adhere to fundamental physical laws. However, the ability of +video generation models to discover such laws purely from visual data without +human priors can be questioned. A world model learning the true law should give +predictions robust to nuances and correctly extrapolate on unseen scenarios. In +this work, we evaluate across three key scenarios: in-distribution, +out-of-distribution, and combinatorial generalization. We developed a 2D +simulation testbed for object movement and collisions to generate videos +deterministically governed by one or more classical mechanics laws. This +provides an unlimited supply of data for large-scale experimentation and +enables quantitative evaluation of whether the generated videos adhere to +physical laws. We trained diffusion-based video generation models to predict +object movements based on initial frames. Our scaling experiments show perfect +generalization within the distribution, measurable scaling behavior for +combinatorial generalization, but failure in out-of-distribution scenarios. +Further experiments reveal two key insights about the generalization mechanisms +of these models: (1) the models fail to abstract general physical rules and +instead exhibit "case-based" generalization behavior, i.e., mimicking the +closest training example; (2) when generalizing to new cases, models are +observed to prioritize different factors when referencing training data: color +> size > velocity > shape. Our study suggests that scaling alone is +insufficient for video generation models to uncover fundamental physical laws, +despite its role in Sora's broader success. See our project page at +https://phyworld.github.io + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Learning General-Purpose Biomedical Volume Representations using + Randomized Synthesis + + +
+ Current volumetric biomedical foundation models struggle to generalize as +public 3D datasets are small and do not cover the broad diversity of medical +procedures, conditions, anatomical regions, and imaging protocols. We address +this by creating a representation learning method that instead anticipates +strong domain shifts at training time itself. We first propose a data engine +that synthesizes highly variable training samples that enable generalization to +new biomedical contexts. To then train a single 3D network for any voxel-level +task, we develop a contrastive learning method that pretrains the network to be +stable against nuisance imaging variation simulated by the data engine, a key +inductive bias for generalization. This network's features can be used as +robust representations of input images for downstream tasks and its weights +provide a strong, dataset-agnostic initialization for finetuning on new +datasets. As a result, we set new standards across both multimodality +registration and few-shot segmentation, a first for any 3D biomedical vision +model, all without (pre-)training on any existing dataset of real images. + +
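As a schematic of "pretraining to be stable against nuisance imaging variation," the snippet below pulls together two randomly re-imaged versions of the same sample with an InfoNCE loss. The augmentation, the tiny MLP encoder, and the flattened-patch inputs are placeholders; the paper's data engine and 3D network are not reproduced here.

```python
import torch
import torch.nn.functional as F

def nuisance_augment(x):
    """Placeholder nuisance variation: random intensity nonlinearity plus noise."""
    gamma = torch.empty(x.shape[0], 1).uniform_(0.5, 2.0)
    noise = 0.05 * torch.randn_like(x)
    return (x.clamp(min=0) ** gamma) + noise

encoder = torch.nn.Sequential(torch.nn.Linear(1024, 256), torch.nn.ReLU(),
                              torch.nn.Linear(256, 128))

volumes = torch.rand(32, 1024)                  # stand-in for synthesized image patches
z1 = F.normalize(encoder(nuisance_augment(volumes)), dim=1)
z2 = F.normalize(encoder(nuisance_augment(volumes)), dim=1)

logits = z1 @ z2.t() / 0.1                      # temperature-scaled similarities
labels = torch.arange(len(volumes))             # matching augmented pair = positive
loss = F.cross_entropy(logits, labels)
loss.backward()
```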
+
+ comment: Code and model weights available at + https://github.com/neel-dey/anatomix +
+
+
+
+
+ + ☆ Machine learning identification of maternal inflammatory response and histologic chorioamnionitis from placental membrane whole slide images + + +
+ The placenta forms a critical barrier to infection through pregnancy, labor, and delivery. Inflammatory processes in the placenta have short-term and long-term consequences for offspring health. Digital pathology and machine learning can play an important role in understanding placental inflammation, yet there have been very few investigations into methods for predicting and understanding Maternal Inflammatory Response (MIR). This work investigates the potential of using machine learning to understand MIR based on whole slide images (WSI) and establishes early benchmarks. To that end, we use a Multiple Instance Learning (MIL) framework with three feature extractors: ImageNet-based EfficientNet-v2s and two histopathology foundation models, UNI and Phikon, to investigate the predictability of MIR stage from histopathology WSIs. We also interpret predictions from these models using their learned attention maps. We further use the MIL framework for predicting white blood cell count (WBC) and maximum fever temperature ($T_{max}$). Attention-based MIL models are able to classify MIR with a balanced accuracy of up to 88.5% and a Cohen's Kappa ($\kappa$) of up to 0.772. Furthermore, we found that the pathology foundation models (UNI and Phikon) both achieve higher balanced accuracy and $\kappa$ than the ImageNet-based feature extractor (EfficientNet-v2s). For WBC and $T_{max}$ prediction, we found a mild correlation between actual values and those predicted from histopathology WSIs. We used the MIL framework for predicting MIR stage from WSIs and compared the effectiveness of foundation models as feature extractors with that of an ImageNet-based model. We further investigated model failure cases and found them to be either edge cases prone to interobserver variability, examples of pathologist overreach, or cases mislabeled due to processing errors. + +
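A compact sketch of attention-based multiple-instance learning over patch embeddings from a whole-slide image, the general mechanism referenced above. The feature dimension, bag size, and class count are invented; in practice the patch features would come from an extractor such as EfficientNet-v2s, UNI, or Phikon.

```python
import torch
import torch.nn as nn

class AttentionMIL(nn.Module):
    """Attention-weighted pooling of patch features into a slide-level prediction."""
    def __init__(self, feat_dim=768, hidden=128, n_classes=3):
        super().__init__()
        self.attn = nn.Sequential(nn.Linear(feat_dim, hidden), nn.Tanh(),
                                  nn.Linear(hidden, 1))
        self.classifier = nn.Linear(feat_dim, n_classes)

    def forward(self, bag):                          # bag: (n_patches, feat_dim)
        a = torch.softmax(self.attn(bag), dim=0)     # (n_patches, 1) attention weights
        slide_feat = (a * bag).sum(dim=0)            # attention-weighted pooling
        return self.classifier(slide_feat), a.squeeze(-1)

bag = torch.randn(500, 768)                          # e.g., 500 patch embeddings from one WSI
logits, attention = AttentionMIL()(bag)              # attention can be mapped back onto the slide
```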
+
+
+
+
+ + ☆ Physically Based Neural Bidirectional Reflectance Distribution Function + + +
+ We introduce the physically based neural bidirectional reflectance +distribution function (PBNBRDF), a novel, continuous representation for +material appearance based on neural fields. Our model accurately reconstructs +real-world materials while uniquely enforcing physical properties for realistic +BRDFs, specifically Helmholtz reciprocity via reparametrization and energy +passivity via efficient analytical integration. We conduct a systematic +analysis demonstrating the benefits of adhering to these physical laws on the +visual quality of reconstructed materials. Additionally, we enhance the color +accuracy of neural BRDFs by introducing chromaticity enforcement supervising +the norms of RGB channels. Through both qualitative and quantitative +experiments on multiple databases of measured real-world BRDFs, we show that +adhering to these physical constraints enables neural fields to more faithfully +and stably represent the original data and achieve higher rendering quality. + +
+
+
+
+
+ + ☆ MVPaint: Synchronized Multi-View Diffusion for Painting Anything 3D + + +
+ Texturing is a crucial step in the 3D asset production workflow, which +enhances the visual appeal and diversity of 3D assets. Despite recent +advancements in Text-to-Texture (T2T) generation, existing methods often yield +subpar results, primarily due to local discontinuities, inconsistencies across +multiple views, and their heavy dependence on UV unwrapping outcomes. To tackle +these challenges, we propose a novel generation-refinement 3D texturing +framework called MVPaint, which can generate high-resolution, seamless textures +while emphasizing multi-view consistency. MVPaint mainly consists of three key +modules. 1) Synchronized Multi-view Generation (SMG). Given a 3D mesh model, +MVPaint first simultaneously generates multi-view images by employing an SMG +model, which leads to coarse texturing results with unpainted parts due to +missing observations. 2) Spatial-aware 3D Inpainting (S3I). To ensure complete +3D texturing, we introduce the S3I method, specifically designed to effectively +texture previously unobserved areas. 3) UV Refinement (UVR). Furthermore, +MVPaint employs a UVR module to improve the texture quality in the UV space, +which first performs a UV-space Super-Resolution, followed by a Spatial-aware +Seam-Smoothing algorithm for revising spatial texturing discontinuities caused +by UV unwrapping. Moreover, we establish two T2T evaluation benchmarks: the +Objaverse T2T benchmark and the GSO T2T benchmark, based on selected +high-quality 3D meshes from the Objaverse dataset and the entire GSO dataset, +respectively. Extensive experimental results demonstrate that MVPaint surpasses +existing state-of-the-art methods. Notably, MVPaint could generate +high-fidelity textures with minimal Janus issues and highly enhanced cross-view +consistency. + +
+
+ comment: Project Page: https://mvpaint.github.io +
+
+
+
+
+ + ☆ Diffusion-based Generative Multicasting with Intent-aware Semantic + Decomposition + + +
+ Generative diffusion models (GDMs) have recently shown great success in +synthesizing multimedia signals with high perceptual quality enabling highly +efficient semantic communications in future wireless networks. In this paper, +we develop an intent-aware generative semantic multicasting framework utilizing +pre-trained diffusion models. In the proposed framework, the transmitter +decomposes the source signal to multiple semantic classes based on the +multi-user intent, i.e. each user is assumed to be interested in details of +only a subset of the semantic classes. The transmitter then sends to each user +only its intended classes, and multicasts a highly compressed semantic map to +all users over shared wireless resources that allows them to locally synthesize +the other classes, i.e. non-intended classes, utilizing pre-trained diffusion +models. The signal retrieved at each user is thereby partially reconstructed +and partially synthesized utilizing the received semantic map. This improves +utilization of the wireless resources, with better preserving privacy of the +non-intended classes. We design a communication/computation-aware scheme for +per-class adaptation of the communication parameters, such as the transmission +power and compression rate to minimize the total latency of retrieving signals +at multiple receivers, tailored to the prevailing channel conditions as well as +the users reconstruction/synthesis distortion/perception requirements. The +simulation results demonstrate significantly reduced per-user latency compared +with non-generative and intent-unaware multicasting benchmarks while +maintaining high perceptual quality of the signals retrieved at the users. + +
+
+
+
+
+ + ☆ PPLLaVA: Varied Video Sequence Understanding With Prompt Guidance + + +
+ The past year has witnessed the significant advancement of video-based large language models. However, the challenge of developing a unified model for both short and long video understanding remains unresolved. Most existing video LLMs cannot handle hour-long videos, while methods tailored to long videos tend to be ineffective for shorter videos and images. In this paper, we identify the key issue as the redundant content in videos. To address this, we propose a novel pooling strategy that simultaneously achieves token compression and instruction-aware visual feature aggregation. Our model is termed Prompt-guided Pooling LLaVA, or PPLLaVA for short. Specifically, PPLLaVA consists of three core components: the CLIP-based visual-prompt alignment that extracts visual information relevant to the user's instructions, the prompt-guided pooling that compresses the visual sequence to arbitrary scales using convolution-style pooling, and the clip context extension designed for the lengthy prompts common in visual dialogue. Moreover, our codebase also integrates the most advanced video Direct Preference Optimization (DPO) and visual interleave training. Extensive experiments have validated the performance of our model. With superior throughput and only a 1024-token visual context, PPLLaVA achieves better results on image benchmarks as a video LLM, while achieving state-of-the-art performance across various video benchmarks, excelling in tasks ranging from caption generation to multiple-choice questions, and handling video lengths from seconds to hours. Code is available at https://github.com/farewellthree/PPLLaVA. + +
+
+
+
+
+ + ☆ GenXD: Generating Any 3D and 4D Scenes + + +
+ Recent developments in 2D visual generation have been remarkably successful. +However, 3D and 4D generation remain challenging in real-world applications due +to the lack of large-scale 4D data and effective model design. In this paper, +we propose to jointly investigate general 3D and 4D generation by leveraging +camera and object movements commonly observed in daily life. Due to the lack of +real-world 4D data in the community, we first propose a data curation pipeline +to obtain camera poses and object motion strength from videos. Based on this +pipeline, we introduce a large-scale real-world 4D scene dataset: CamVid-30K. +By leveraging all the 3D and 4D data, we develop our framework, GenXD, which +allows us to produce any 3D or 4D scene. We propose multiview-temporal modules, +which disentangle camera and object movements, to seamlessly learn from both 3D +and 4D data. Additionally, GenXD employs masked latent conditions to support a +variety of conditioning views. GenXD can generate videos that follow the camera +trajectory as well as consistent 3D views that can be lifted into 3D +representations. We perform extensive evaluations across various real-world and +synthetic datasets, demonstrating GenXD's effectiveness and versatility +compared to previous methods in 3D and 4D generation. + +
+
+
+
+
+ + ☆ Grouped Discrete Representation for Object-Centric Learning + + +
+ Object-Centric Learning (OCL) can discover objects in images or videos by +simply reconstructing the input. For better object discovery, representative +OCL methods reconstruct the input as its Variational Autoencoder (VAE) +intermediate representation, which suppresses pixel noises and promotes object +separability by discretizing continuous super-pixels with template features. +However, treating features as units overlooks their composing attributes, thus +impeding model generalization; indexing features with scalar numbers loses +attribute-level similarities and differences, thus hindering model convergence. +We propose \textit{Grouped Discrete Representation} (GDR) for OCL. We decompose +features into combinatorial attributes via organized channel grouping, and +compose these attributes into discrete representation via tuple indexes. +Experiments show that our GDR improves both Transformer- and Diffusion-based +OCL methods consistently on various datasets. Visualizations show that our GDR +captures better object separability. + +
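The grouped-discretization idea can be pictured as splitting channels into groups and quantizing each group against its own small codebook, so a feature is indexed by a tuple of per-group codes rather than a single scalar. The sketch below shows that mechanism with invented codebooks and sizes; it is not the paper's VAE pipeline.

```python
import torch

def grouped_discretize(feats, codebooks):
    """feats: (N, C); codebooks: list of G tensors, each (K, C // G)."""
    G = len(codebooks)
    chunks = feats.chunk(G, dim=1)                    # organized channel grouping
    indices, quantized = [], []
    for x, cb in zip(chunks, codebooks):
        d = torch.cdist(x, cb)                        # (N, K) distances to the group's codes
        idx = d.argmin(dim=1)                         # attribute-level index for this group
        indices.append(idx)
        quantized.append(cb[idx])
    return torch.stack(indices, dim=1), torch.cat(quantized, dim=1)

feats = torch.randn(64, 256)
codebooks = [torch.randn(32, 64) for _ in range(4)]   # 4 groups, 32 codes each (made up)
tuple_idx, feats_q = grouped_discretize(feats, codebooks)
print(tuple_idx.shape)   # (64, 4): each feature is represented by a tuple of 4 attribute codes
```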
+
+
+
+
+ + ☆ Hunyuan3D-1.0: A Unified Framework for Text-to-3D and Image-to-3D + Generation + + +
+ While 3D generative models have greatly improved artists' workflows, the existing diffusion models for 3D generation suffer from slow generation and poor generalization. To address this issue, we propose a two-stage approach named Hunyuan3D-1.0, including a lite version and a standard version, both of which support text- and image-conditioned generation. In the first stage, we employ a multi-view diffusion model that efficiently generates multi-view RGB in approximately 4 seconds. These multi-view images capture rich details of the 3D asset from different viewpoints, relaxing the tasks from single-view to multi-view reconstruction. In the second stage, we introduce a feed-forward reconstruction model that rapidly and faithfully reconstructs the 3D asset given the generated multi-view images in approximately 7 seconds. The reconstruction network learns to handle noise and inconsistency introduced by the multi-view diffusion and leverages the available information from the condition image to efficiently recover the 3D structure. Extensive experimental results demonstrate the effectiveness of Hunyuan3D-1.0 in generating high-quality 3D assets. Our framework involves the text-to-image model, i.e., Hunyuan-DiT, making it a unified framework to support both text- and image-conditioned 3D generation. Our standard version has $10\times$ more parameters than our lite version and other existing models. Our Hunyuan3D-1.0 achieves an impressive balance between speed and quality, significantly reducing generation time while maintaining the quality and diversity of the produced assets. + +
+
+
+
+
+ + ☆ Conformal-in-the-Loop for Learning with Imbalanced Noisy Data + + +
+ Class imbalance and label noise are pervasive in large-scale datasets, yet +much of machine learning research assumes well-labeled, balanced data, which +rarely reflects real world conditions. Existing approaches typically address +either label noise or class imbalance in isolation, leading to suboptimal +results when both issues coexist. In this work, we propose +Conformal-in-the-Loop (CitL), a novel training framework that addresses both +challenges with a conformal prediction-based approach. CitL evaluates sample +uncertainty to adjust weights and prune unreliable examples, enhancing model +resilience and accuracy with minimal computational cost. Our extensive +experiments include a detailed analysis showing how CitL effectively emphasizes +impactful data in noisy, imbalanced datasets. Our results show that CitL +consistently boosts model performance, achieving up to a 6.1% increase in +classification accuracy and a 5.0 mIoU improvement in segmentation. Our code is +publicly available: CitL. + +
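One generic way to turn conformal prediction into per-sample weights and pruning, in the spirit described above but not the CitL algorithm itself, is split conformal calibration followed by prediction-set sizes on training samples: large sets signal uncertainty and receive lower weight, and samples whose true label falls outside the calibrated region are pruned. The placeholder probabilities below stand in for model outputs.

```python
import numpy as np

rng = np.random.default_rng(0)
n_cal, n_train, n_classes = 500, 1000, 10
cal_probs = rng.dirichlet(np.ones(n_classes), size=n_cal)       # placeholder model outputs
cal_labels = rng.integers(0, n_classes, size=n_cal)
tr_probs = rng.dirichlet(np.ones(n_classes), size=n_train)
tr_labels = rng.integers(0, n_classes, size=n_train)

alpha = 0.1
cal_scores = 1.0 - cal_probs[np.arange(n_cal), cal_labels]      # nonconformity scores
qhat = np.quantile(cal_scores, np.ceil((n_cal + 1) * (1 - alpha)) / n_cal)

pred_sets = tr_probs >= (1.0 - qhat)                            # (n_train, n_classes) sets
set_size = pred_sets.sum(axis=1)
weights = 1.0 / np.maximum(set_size, 1)                         # more uncertain -> lower weight
keep = 1.0 - tr_probs[np.arange(n_train), tr_labels] <= qhat    # prune likely-noisy labels
print(weights[keep].shape)
```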
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Unified Speech Recognition: A Single Model for Auditory, Visual, and + Audiovisual Inputs NeurIPS 2024 + + +
+ Research in auditory, visual, and audiovisual speech recognition (ASR, VSR, +and AVSR, respectively) has traditionally been conducted independently. Even +recent self-supervised studies addressing two or all three tasks simultaneously +tend to yield separate models, leading to disjoint inference pipelines with +increased memory requirements and redundancies. This paper proposes unified +training strategies for these systems. We demonstrate that training a single +model for all three tasks enhances VSR and AVSR performance, overcoming typical +optimisation challenges when training from scratch. Moreover, we introduce a +greedy pseudo-labelling approach to more effectively leverage unlabelled +samples, addressing shortcomings in related self-supervised methods. Finally, +we develop a self-supervised pre-training method within our framework, proving +its effectiveness alongside our semi-supervised approach. Despite using a +single model for all tasks, our unified approach achieves state-of-the-art +performance compared to recent methods on LRS3 and LRS2 for ASR, VSR, and AVSR, +as well as on the newly released WildVSR dataset. Code and models are available +at https://github.com/ahaliassos/usr. + +
+
+ comment: NeurIPS 2024. Code: https://github.com/ahaliassos/usr +
+
+
+
+
+ + ☆ 3D Audio-Visual Segmentation NeurIPS 2024 + + +
+ Recognizing the sounding objects in scenes is a longstanding objective in +embodied AI, with diverse applications in robotics and AR/VR/MR. To that end, +Audio-Visual Segmentation (AVS), taking as condition an audio signal to +identify the masks of the target sounding objects in an input image with +synchronous camera and microphone sensors, has been recently advanced. However, +this paradigm is still insufficient for real-world operation, as the mapping +from 2D images to 3D scenes is missing. To address this fundamental limitation, +we introduce a novel research problem, 3D Audio-Visual Segmentation, extending +the existing AVS to the 3D output space. This problem poses more challenges due +to variations in camera extrinsics, audio scattering, occlusions, and diverse +acoustics across sounding object categories. To facilitate this research, we +create the very first simulation based benchmark, 3DAVS-S34-O7, providing +photorealistic 3D scene environments with grounded spatial audio under +single-instance and multi-instance settings, across 34 scenes and 7 object +categories. This is made possible by re-purposing the Habitat simulator to +generate comprehensive annotations of sounding object locations and +corresponding 3D masks. Subsequently, we propose a new approach, EchoSegnet, +characterized by integrating the ready-to-use knowledge from pretrained 2D +audio-visual foundation models synergistically with 3D visual scene +representation through spatial audio-aware mask alignment and refinement. +Extensive experiments demonstrate that EchoSegnet can effectively segment +sounding objects in 3D space on our new benchmark, representing a significant +advancement in the field of embodied AI. Project page: +https://surrey-uplab.github.io/research/3d-audio-visual-segmentation/ + +
+
+ comment: Accepted at the NeurIPS 2024 Workshop on Audio Imagination +
+
+
+
+
+ + ☆ FewViewGS: Gaussian Splatting with Few View Matching and Multi-stage + Training NeurIPS2024 + + +
+ The field of novel view synthesis from images has seen rapid advancements +with the introduction of Neural Radiance Fields (NeRF) and more recently with +3D Gaussian Splatting. Gaussian Splatting became widely adopted due to its +efficiency and ability to render novel views accurately. While Gaussian +Splatting performs well when a sufficient amount of training images are +available, its unstructured explicit representation tends to overfit in +scenarios with sparse input images, resulting in poor rendering performance. To +address this, we present a 3D Gaussian-based novel view synthesis method using +sparse input images that can accurately render the scene from the viewpoints +not covered by the training images. We propose a multi-stage training scheme +with matching-based consistency constraints imposed on the novel views without +relying on pre-trained depth estimation or diffusion models. This is achieved +by using the matches of the available training images to supervise the +generation of the novel views sampled between the training frames with color, +geometry, and semantic losses. In addition, we introduce a locality preserving +regularization for 3D Gaussians which removes rendering artifacts by preserving +the local color structure of the scene. Evaluation on synthetic and real-world +datasets demonstrates competitive or superior performance of our method in +few-shot novel view synthesis compared to existing state-of-the-art methods. + +
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ☆ SIRA: Scalable Inter-frame Relation and Association for Radar Perception CVPR2024 + + +
+ Conventional radar feature extraction faces limitations due to low spatial +resolution, noise, multipath reflection, the presence of ghost targets, and +motion blur. Such limitations can be exacerbated by nonlinear object motion, +particularly from an ego-centric viewpoint. It becomes evident that to address +these challenges, the key lies in exploiting temporal feature relation over an +extended horizon and enforcing spatial motion consistency for effective +association. To this end, this paper proposes SIRA (Scalable Inter-frame +Relation and Association) with two designs. First, inspired by Swin +Transformer, we introduce extended temporal relation, generalizing the existing +temporal relation layer from two consecutive frames to multiple inter-frames +with temporally regrouped window attention for scalability. Second, we propose +motion consistency track with the concept of a pseudo-tracklet generated from +observational data for better trajectory prediction and subsequent object +association. Our approach achieves 58.11 mAP@0.5 for oriented object detection +and 47.79 MOTA for multiple object tracking on the Radiate dataset, surpassing +previous state-of-the-art by a margin of +4.11 mAP@0.5 and +9.94 MOTA, +respectively. + +
+
+ comment: 25 pages, Accepted to CVPR2024 +
+
+
+
+
+ + ☆ One VLM to Keep it Learning: Generation and Balancing for Data-free + Continual Visual Question Answering + + +
+ Vision-Language Models (VLMs) have shown significant promise in Visual Question Answering (VQA) tasks by leveraging web-scale multimodal datasets. However, these models often struggle with continual learning due to catastrophic forgetting when adapting to new tasks. As an effective remedy for catastrophic forgetting, the rehearsal strategy reuses data from past tasks when learning a new task. However, such a strategy requires storing past data, which might not be feasible due to hardware constraints or privacy concerns. In this work, we propose the first data-free method that leverages the language generation capability of a VLM, instead of relying on external models, to produce pseudo-rehearsal data for addressing continual VQA. Our proposal, named GaB, generates pseudo-rehearsal data by posing previous task questions on new task data. Yet, despite being effective, the distribution of generated questions skews towards the most frequently posed questions due to the limited and task-specific training data. To mitigate this issue, we introduce a pseudo-rehearsal balancing module that aligns the generated data towards the ground-truth data distribution using either the question meta-statistics or an unsupervised clustering method. We evaluate our proposed method on two recent benchmarks, i.e., the VQACL-VQAv2 and CLOVE-function benchmarks. GaB outperforms all the data-free baselines with substantial improvement in maintaining VQA performance across evolving tasks, while being on par with methods that have access to the past data. + +
+
+
+
+
+ + ☆ Digi2Real: Bridging the Realism Gap in Synthetic Data Face Recognition + via Foundation Models + + +
+ The accuracy of face recognition systems has improved significantly in the past few years, thanks to the large amount of data collected and the advancement in neural network architectures. However, these large-scale datasets are often collected without explicit consent, raising ethical and privacy concerns. To address this, there have been proposals to use synthetic datasets for training face recognition models. Yet, such models still rely on real data to train the generative models and generally exhibit inferior performance compared to those trained on real datasets. One of these datasets, DigiFace, uses a graphics pipeline to generate different identities and different intra-class variations without using real data in training the models. However, the performance of this approach is poor on face recognition benchmarks, possibly due to the lack of realism in the images generated from the graphics pipeline. In this work, we introduce a novel framework for realism transfer aimed at enhancing the realism of synthetically generated face images. Our method leverages a large-scale face foundation model, and we adapt the pipeline for realism enhancement. By integrating the controllable aspects of the graphics pipeline with our realism enhancement technique, we generate a large number of realistic variations, combining the advantages of both approaches. Our empirical evaluations demonstrate that models trained using our enhanced dataset significantly improve the performance of face recognition systems over the baseline. The source code and datasets will be made available publicly. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Double Descent Meets Out-of-Distribution Detection: Theoretical Insights + and Empirical Analysis on the role of model complexity + + +
+ While overparameterization is known to benefit generalization, its impact on Out-Of-Distribution (OOD) detection is less understood. This paper investigates the influence of model complexity in OOD detection. We propose an expected OOD risk metric to evaluate classifiers' confidence on both training and OOD samples. Leveraging Random Matrix Theory, we derive bounds for the expected OOD risk of binary least-squares classifiers applied to Gaussian data. We show that the OOD risk exhibits an infinite peak when the number of parameters equals the number of samples, which we associate with the double descent phenomenon. Our experimental study on different OOD detection methods across multiple neural architectures extends our theoretical insights and highlights a double descent curve. Our observations suggest that overparameterization does not necessarily lead to better OOD detection. Using the Neural Collapse framework, we provide insights to better understand this behavior. To facilitate reproducibility, our code will be made publicly available upon publication. + +
+
+
+
+
+ + ☆ Detect an Object At Once without Fine-tuning + + +
+ When presented with one or a few photos of a previously unseen object, humans +can instantly recognize it in different scenes. Although the human brain +mechanism behind this phenomenon is still not fully understood, this work +introduces a novel technical realization of this task. It consists of two +phases: (1) generating a Similarity Density Map (SDM) by convolving the scene +image with the given object image patch(es) so that the highlight areas in the +SDM indicate the possible locations; (2) obtaining the object occupied areas in +the scene through a Region Alignment Network (RAN). The RAN is constructed on a +backbone of Deep Siamese Network (DSN), and different from the traditional +DSNs, it aims to obtain the object accurate regions by regressing the location +and area differences between the ground truths and the predicted ones indicated +by the highlight areas in SDM. By pre-learning from labels annotated in +traditional datasets, the SDM-RAN can detect previously unknown objects without +fine-tuning. Experiments were conducted on the MS COCO, PASCAL VOC datasets. +The results indicate that the proposed method outperforms state-of-the-art +methods on the same task. + +
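Phase (1) can be illustrated with plain normalized cross-correlation: convolving the scene with the object patch yields a map whose peaks mark candidate locations. The sketch below shows only this step with random data; the Region Alignment Network of phase (2) is not shown.

```python
import torch
import torch.nn.functional as F

def similarity_density_map(scene, patch):
    """scene: (1, 3, H, W); patch: (1, 3, h, w); returns (1, 1, H-h+1, W-w+1)."""
    scene = (scene - scene.mean()) / (scene.std() + 1e-6)
    patch = (patch - patch.mean()) / (patch.std() + 1e-6)
    sdm = F.conv2d(scene, patch)                     # correlate the patch over the scene
    return (sdm - sdm.min()) / (sdm.max() - sdm.min() + 1e-6)

scene = torch.rand(1, 3, 128, 128)
patch = scene[:, :, 40:64, 40:64].clone()            # pretend this crop is the query object
sdm = similarity_density_map(scene, patch)
peak = torch.nonzero(sdm[0, 0] == sdm.max())[0]      # highlight location (top-left of best window)
```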
+
+
+
+
+ + ☆ CleAR: Robust Context-Guided Generative Lighting Estimation for Mobile + Augmented Reality + + +
+ High-quality environment lighting is the foundation of creating immersive +user experiences in mobile augmented reality (AR) applications. However, +achieving visually coherent environment lighting estimation for Mobile AR is +challenging due to several key limitations associated with AR device sensing +capabilities, including limitations in device camera FoV and pixel dynamic +ranges. Recent advancements in generative AI, which can generate high-quality +images from different types of prompts, including texts and images, present a +potential solution for high-quality lighting estimation. Still, to effectively +use generative image diffusion models, we must address their key limitations of +generation hallucination and slow inference process. To do so, in this work, we +design and implement a generative lighting estimation system called CleAR that +can produce high-quality and diverse environment maps in the format of +360$^\circ$ images. Specifically, we design a two-step generation pipeline +guided by AR environment context data to ensure the results follow physical +environment visual context and color appearances. To improve the estimation +robustness under different lighting conditions, we design a real-time +refinement component to adjust lighting estimation results on AR devices. To +train and test our generative models, we curate a large-scale environment +lighting estimation dataset with diverse lighting conditions. Through +quantitative evaluation and user study, we show that CleAR outperforms +state-of-the-art lighting estimation methods on both estimation accuracy and +robustness. Moreover, CleAR supports real-time refinement of lighting +estimation results, ensuring robust and timely environment lighting updates for +AR applications. Our end-to-end generative estimation takes as fast as 3.2 +seconds, outperforming state-of-the-art methods by 110x. + +
+
+
+
+
+ + ☆ SAFE: Slow and Fast Parameter-Efficient Tuning for Continual Learning + with Pre-Trained Models NeurIPS 2024 + + +
+ Continual learning aims to incrementally acquire new concepts in data streams +while resisting forgetting previous knowledge. With the rise of powerful +pre-trained models (PTMs), there is a growing interest in training incremental +learning systems using these foundation models, rather than learning from +scratch. Existing works often view PTMs as a strong initial point and directly +apply parameter-efficient tuning (PET) in the first session for adapting to +downstream tasks. In the following sessions, most methods freeze model +parameters for tackling forgetting issues. However, applying PET directly to +downstream data cannot fully explore the inherent knowledge in PTMs. +Additionally, freezing the parameters in incremental sessions hinders models' +plasticity to novel concepts not covered in the first session. To solve the +above issues, we propose a Slow And Fast parameter-Efficient tuning (SAFE) +framework. In particular, to inherit general knowledge from foundation models, +we include a transfer loss function by measuring the correlation between the +PTM and the PET-applied model. After calibrating in the first session, the slow +efficient tuning parameters can capture more informative features, improving +generalization to incoming classes. Moreover, to further incorporate novel +concepts, we strike a balance between stability and plasticity by fixing slow +efficient tuning parameters and continuously updating the fast ones. +Specifically, a cross-classification loss with feature alignment is proposed to +circumvent catastrophic forgetting. During inference, we introduce an +entropy-based aggregation strategy to dynamically utilize the complementarity +in the slow and fast learners. Extensive experiments on seven benchmark +datasets verify the effectiveness of our method by significantly surpassing the +state-of-the-art. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Improving Domain Generalization in Self-supervised Monocular Depth Estimation via Stabilized Adversarial Training + + +
+ Learning a self-supervised Monocular Depth Estimation (MDE) model with great +generalization remains significantly challenging. Despite the success of +adversarial augmentation in the supervised learning generalization, naively +incorporating it into self-supervised MDE models potentially causes +over-regularization, suffering from severe performance degradation. In this +paper, we conduct qualitative analysis and illuminate the main causes: (i) +inherent sensitivity in the UNet-alike depth network and (ii) dual optimization +conflict caused by over-regularization. To tackle these issues, we propose a +general adversarial training framework, named Stabilized Conflict-optimization +Adversarial Training (SCAT), integrating adversarial data augmentation into +self-supervised MDE methods to achieve a balance between stability and +generalization. Specifically, we devise an effective scaling depth network that +tunes the coefficients of long skip connection and effectively stabilizes the +training process. Then, we propose a conflict gradient surgery strategy, which +progressively integrates the adversarial gradient and optimizes the model +toward a conflict-free direction. Extensive experiments on five benchmarks +demonstrate that SCAT can achieve state-of-the-art performance and +significantly improve the generalization capability of existing self-supervised +MDE methods. + +
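The "conflict gradient surgery" idea is reminiscent of PCGrad-style projection: when the adversarial-training gradient points against the clean task gradient, its conflicting component is removed before the update. The sketch below shows that generic projection on a toy model; the exact SCAT procedure and its scaled depth network are not reproduced.

```python
import torch

def surgery(g_task, g_adv):
    """Project the adversarial gradient away from the task gradient when they conflict."""
    flat_t = torch.cat([g.flatten() for g in g_task])
    flat_a = torch.cat([g.flatten() for g in g_adv])
    dot = torch.dot(flat_a, flat_t)
    if dot < 0:                                    # gradients conflict
        flat_a = flat_a - dot / flat_t.norm() ** 2 * flat_t
    combined = flat_t + flat_a
    out, i = [], 0
    for g in g_task:                               # unflatten back to parameter shapes
        out.append(combined[i:i + g.numel()].view_as(g)); i += g.numel()
    return out

model = torch.nn.Linear(10, 1)
x, x_adv, y = torch.randn(4, 10), torch.randn(4, 10), torch.randn(4, 1)
g_task = torch.autograd.grad(torch.nn.functional.mse_loss(model(x), y), model.parameters())
g_adv = torch.autograd.grad(torch.nn.functional.mse_loss(model(x_adv), y), model.parameters())
for p, g in zip(model.parameters(), surgery(g_task, g_adv)):
    p.data -= 1e-3 * g                             # update along a conflict-free direction
```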
+
+
+
+
+ + ☆ Advanced computer vision for extracting georeferenced vehicle + trajectories from drone imagery + + +
+ This paper presents a framework for extracting georeferenced vehicle +trajectories from high-altitude drone footage, addressing key challenges in +urban traffic monitoring and limitations of traditional ground-based systems. +We employ state-of-the-art computer vision and deep learning to create an +end-to-end pipeline that enhances vehicle detection, tracking, and trajectory +stabilization. Conducted in the Songdo International Business District, South +Korea, the study used a multi-drone experiment over 20 intersections, capturing +approximately 12TB of 4K video data over four days. We developed a novel track +stabilization method that uses detected vehicle bounding boxes as exclusion +masks during image registration, which, combined with advanced georeferencing +techniques, accurately transforms vehicle coordinates into real-world +geographical data. Additionally, our framework includes robust vehicle +dimension estimation and detailed road segmentation for in-depth traffic +analysis. The framework produced two high-quality datasets: the Songdo Traffic +dataset, comprising nearly 1 million unique vehicle trajectories, and the +Songdo Vision dataset, containing over 5,000 human-annotated frames with about +300,000 vehicle instances in four classes. Comparisons between drone-derived +data and high-precision sensor data from an instrumented probe vehicle +highlight the accuracy and consistency of our framework's extraction in dense +urban settings. By publicly releasing these datasets and the pipeline source +code, this work sets new benchmarks for data quality, reproducibility, and +scalability in traffic research. Results demonstrate the potential of +integrating drone technology with advanced computer vision for precise, +cost-effective urban traffic monitoring, providing valuable resources for the +research community to develop intelligent transportation systems and improve +traffic management strategies. + +
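The stabilization idea of using detections as exclusion masks can be sketched with standard OpenCV registration: features are extracted only outside the vehicle bounding boxes, matched across frames, and a RANSAC homography aligns the frames. The box coordinates and toy frames below are invented; this is not the paper's pipeline.

```python
import cv2
import numpy as np

def register(prev_gray, curr_gray, boxes):
    """Estimate a background homography, ignoring detected (moving) vehicle regions."""
    mask = np.full(prev_gray.shape, 255, dtype=np.uint8)
    for x, y, w, h in boxes:                      # zero out vehicle bounding boxes
        mask[y:y + h, x:x + w] = 0
    orb = cv2.ORB_create(2000)
    k1, d1 = orb.detectAndCompute(prev_gray, mask)
    k2, d2 = orb.detectAndCompute(curr_gray, mask)
    matches = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True).match(d1, d2)
    src = np.float32([k1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
    dst = np.float32([k2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
    H, _ = cv2.findHomography(src, dst, cv2.RANSAC, 3.0)
    return H                                      # apply with cv2.warpPerspective to stabilize

prev_f = np.random.randint(0, 255, (720, 1280), dtype=np.uint8)
curr_f = np.roll(prev_f, 2, axis=1)               # toy 2-pixel horizontal camera drift
H = register(prev_f, curr_f, boxes=[(600, 300, 80, 40)])
```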
+
+
+
+
+ + ☆ Advancements and limitations of LLMs in replicating human color-word + associations + + +
+ Color-word associations play a fundamental role in human cognition and design applications. Large Language Models (LLMs) have become widely available and demonstrated intelligent behaviors in various benchmarks with natural conversation skills. However, their ability to replicate human color-word associations remains understudied. We compared multiple generations of LLMs (from GPT-3 to GPT-4o) against human color-word associations using data collected from over 10,000 Japanese participants, involving 17 colors and words from eight categories in Japanese. Our findings reveal a clear progression in LLM performance across generations, with GPT-4o achieving the highest accuracy in predicting the best-voted word for each color and category, particularly when using visual inputs rather than text-based color codes. However, the highest median performance was approximately 50% even for GPT-4o with visual inputs (chance level is 10%), and the performance levels varied significantly across word categories and colors, indicating a failure to fully replicate human color-word associations. On the other hand, color discrimination ability estimated from our color-word association data showed that LLMs demonstrated high correlation with human color discrimination patterns, similarly to previous studies. Our study highlights both the advancements in LLM capabilities and their persistent limitations, suggesting differences in semantic memory structures between humans and LLMs in representing color-word associations. + +
+
+ comment: 20 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ Multi-modal biometric authentication: Leveraging shared layer + architectures for enhanced security + + +
+ In this study, we introduce a novel multi-modal biometric authentication +system that integrates facial, vocal, and signature data to enhance security +measures. Utilizing a combination of Convolutional Neural Networks (CNNs) and +Recurrent Neural Networks (RNNs), our model architecture uniquely incorporates +dual shared layers alongside modality-specific enhancements for comprehensive +feature extraction. The system undergoes rigorous training with a joint loss +function, optimizing for accuracy across diverse biometric inputs. +Feature-level fusion via Principal Component Analysis (PCA) and classification +through Gradient Boosting Machines (GBM) further refine the authentication +process. Our approach demonstrates significant improvements in authentication +accuracy and robustness, paving the way for advanced secure identity +verification solutions. + +
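The fusion-and-classification stage described above can be sketched with scikit-learn: modality features are concatenated, reduced with PCA, and classified with Gradient Boosting. The random placeholder embeddings below stand in for the CNN/RNN features, so the printed accuracy is near chance.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
n = 600
face, voice, sig = rng.normal(size=(n, 128)), rng.normal(size=(n, 64)), rng.normal(size=(n, 32))
labels = rng.integers(0, 2, size=n)                     # 1 = genuine, 0 = impostor

fused = np.concatenate([face, voice, sig], axis=1)      # feature-level fusion

X_tr, X_te, y_tr, y_te = train_test_split(fused, labels, test_size=0.25, random_state=0)
pca = PCA(n_components=32).fit(X_tr)                    # fit PCA on training data only
clf = GradientBoostingClassifier().fit(pca.transform(X_tr), y_tr)
print("accuracy:", clf.score(pca.transform(X_te), y_te))
```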
+
+
+
+
+ + ☆ Deep Learning on 3D Semantic Segmentation: A Detailed Review + + +
+ In this paper, an exhaustive review and comprehensive analysis of recent and former deep learning methods in 3D Semantic Segmentation (3DSS) is presented. In the related literature, the taxonomy scheme used for the classification of the 3DSS deep learning methods is ambiguous. Based on the taxonomy schemes of 9 existing review papers, a new taxonomy scheme of the 3DSS deep learning methods is proposed, aiming to standardize it and improve the comparability and clarity across related studies. Furthermore, an extensive overview of the available 3DSS indoor and outdoor datasets is provided along with their links. The core part of the review is the detailed presentation of recent and former 3DSS deep learning methods and their classification using the proposed taxonomy scheme along with their GitHub repositories. Additionally, a brief but informative analysis of the evaluation metrics and loss functions used in 3DSS is included. Finally, a fruitful discussion of the examined 3DSS methods and datasets is presented to foster new research directions and applications in the field of 3DSS. Supplementary to this review, a GitHub repository is provided (https://github.com/thobet/Deep-Learning-on-3D-Semantic-Segmentation-a-Detailed-Review), including a quick classification of over 400 3DSS methods using the proposed taxonomy scheme. + +
+
+
+
+
+ + ☆ Differentially Private Integrated Decision Gradients (IDG-DP) for + Radar-based Human Activity Recognition + + +
+ Human motion analysis offers significant potential for healthcare monitoring and early detection of diseases. Radar-based sensing systems have captured the spotlight because they operate without physical contact and can integrate with pre-existing Wi-Fi networks. They are also seen as less privacy-invasive than camera-based systems. However, recent research has shown high accuracy in recognizing subjects or gender from radar gait patterns, raising privacy concerns. This study addresses these issues by investigating privacy vulnerabilities in radar-based Human Activity Recognition (HAR) systems and proposing a novel method for privacy preservation using Differential Privacy (DP) driven by attributions derived with the Integrated Decision Gradient (IDG) algorithm. We investigate black-box Membership Inference Attack (MIA) models in HAR settings across various levels of attacker-accessible information. We extensively evaluated the effectiveness of the proposed IDG-DP method by designing a CNN-based HAR model and rigorously assessing its resilience against MIAs. Experimental results demonstrate the potential of IDG-DP in mitigating privacy attacks while maintaining utility across all settings, particularly excelling against label-only and shadow-model black-box MIAs. This work represents a crucial step towards balancing the need for effective radar-based HAR with robust privacy protection in healthcare environments.
+
+
+
+
+ + ☆ The evolution of volumetric video: A survey of smart transcoding and + compression approaches + + +
+ Volumetric video, the capture and display of three-dimensional (3D) imagery, +has emerged as a revolutionary technology poised to transform the media +landscape, enabling immersive experiences that transcend the limitations of +traditional 2D video. One of the key challenges in this domain is the efficient +delivery of these high-bandwidth, data-intensive volumetric video streams, +which requires innovative transcoding and compression techniques. This research +paper explores the state-of-the-art in volumetric video compression and +delivery, with a focus on the potential of AI-driven solutions to address the +unique challenges posed by this emerging medium. + +
+
+
+
+
+ + ☆ GraphVL: Graph-Enhanced Semantic Modeling via Vision-Language Models for + Generalized Class Discovery + + +
+ Generalized Category Discovery (GCD) aims to cluster unlabeled images into +known and novel categories using labeled images from known classes. To address +the challenge of transferring features from known to unknown classes while +mitigating model bias, we introduce GraphVL, a novel approach for +vision-language modeling in GCD, leveraging CLIP. Our method integrates a graph +convolutional network (GCN) with CLIP's text encoder to preserve class +neighborhood structure. We also employ a lightweight visual projector for image +data, ensuring discriminative features through margin-based contrastive losses +for image-text mapping. This neighborhood preservation criterion effectively +regulates the semantic space, making it less sensitive to known classes. +Additionally, we learn textual prompts from known classes and align them to +create a more contextually meaningful semantic feature space for the GCN layer +using a contextual similarity loss. Finally, we represent unlabeled samples +based on their semantic distance to class prompts from the GCN, enabling +semi-supervised clustering for class discovery and minimizing errors. Our +experiments on seven benchmark datasets consistently demonstrate the +superiority of GraphVL when integrated with the CLIP backbone. + +
+
+ comment: Accepted in ACM ICVGIP 2024 +
+
+
+
+
+ + ☆ Model Integrity when Unlearning with T2I Diffusion Models + + +
+ The rapid advancement of text-to-image Diffusion Models has led to their +widespread public accessibility. However these models, trained on large +internet datasets, can sometimes generate undesirable outputs. To mitigate +this, approximate Machine Unlearning algorithms have been proposed to modify +model weights to reduce the generation of specific types of images, +characterized by samples from a ``forget distribution'', while preserving the +model's ability to generate other images, characterized by samples from a +``retain distribution''. While these methods aim to minimize the influence of +training data in the forget distribution without extensive additional +computation, we point out that they can compromise the model's integrity by +inadvertently affecting generation for images in the retain distribution. +Recognizing the limitations of FID and CLIPScore in capturing these effects, we +introduce a novel retention metric that directly assesses the perceptual +difference between outputs generated by the original and the unlearned models. +We then propose unlearning algorithms that demonstrate superior effectiveness +in preserving model integrity compared to existing baselines. Given their +straightforward implementation, these algorithms serve as valuable benchmarks +for future advancements in approximate Machine Unlearning for Diffusion Models. + +
+
+
+
+
+ + ☆ AM Flow: Adapters for Temporal Processing in Action Recognition + + +
+ Deep learning models, in particular \textit{image} models, have recently gained generalisability and robustness. In this work, we propose to exploit such advances in the realm of \textit{video} classification. Video foundation models suffer from the requirement of extensive pretraining and long training times. Towards mitigating such limitations, we propose "\textit{Attention Map (AM) Flow}" for image models, a method for identifying pixels relevant to motion in each input video frame. In this context, we propose two methods to compute AM flow, depending on camera motion. AM flow allows the separation of spatial and temporal processing, while providing improved results over combined spatio-temporal processing (as in video models). Adapters, one of the popular techniques in parameter-efficient transfer learning, facilitate the incorporation of AM flow into pretrained image models, mitigating the need for full finetuning. We extend adapters to "\textit{temporal processing adapters}" by incorporating a temporal processing unit into the adapters. Our work achieves faster convergence, therefore reducing the number of epochs needed for training. Moreover, we endow an image model with the ability to achieve state-of-the-art results on popular action recognition datasets. This reduces training time and simplifies pretraining. We present experiments on the Kinetics-400, Something-Something v2, and Toyota Smarthome datasets, showcasing state-of-the-art or comparable results.
+
+
+
+
+ + ☆ Exploiting Unlabeled Data with Multiple Expert Teachers for Open + Vocabulary Aerial Object Detection and Its Orientation Adaptation + + +
+ In recent years, aerial object detection has been increasingly pivotal in +various earth observation applications. However, current algorithms are limited +to detecting a set of pre-defined object categories, demanding sufficient +annotated training samples, and fail to detect novel object categories. In this +paper, we put forth a novel formulation of the aerial object detection problem, +namely open-vocabulary aerial object detection (OVAD), which can detect objects +beyond training categories without costly collecting new labeled data. We +propose CastDet, a CLIP-activated student-teacher detection framework that +serves as the first OVAD detector specifically designed for the challenging +aerial scenario, where objects often exhibit weak appearance features and +arbitrary orientations. Our framework integrates a robust localization teacher +along with several box selection strategies to generate high-quality proposals +for novel objects. Additionally, the RemoteCLIP model is adopted as an +omniscient teacher, which provides rich knowledge to enhance classification +capabilities for novel categories. A dynamic label queue is devised to maintain +high-quality pseudo-labels during training. By doing so, the proposed CastDet +boosts not only novel object proposals but also classification. Furthermore, we +extend our approach from horizontal OVAD to oriented OVAD with tailored +algorithm designs to effectively manage bounding box representation and +pseudo-label generation. Extensive experiments for both tasks on multiple +existing aerial object detection datasets demonstrate the effectiveness of our +approach. The code is available at https://github.com/lizzy8587/CastDet. + +
+
+
+
+
+ + ☆ Addressing Representation Collapse in Vector Quantized Models with One + Linear Layer + + +
+ Vector Quantization (VQ) is a widely used method for converting continuous +representations into discrete codes, which has become fundamental in +unsupervised representation learning and latent generative models. However, VQ +models are often hindered by the problem of representation collapse in the +latent space, which leads to low codebook utilization and limits the +scalability of the codebook for large-scale training. Existing methods designed +to mitigate representation collapse typically reduce the dimensionality of +latent space at the expense of model capacity, which do not fully resolve the +core issue. In this study, we conduct a theoretical analysis of representation +collapse in VQ models and identify its primary cause as the disjoint +optimization of the codebook, where only a small subset of code vectors are +updated through gradient descent. To address this issue, we propose +\textbf{SimVQ}, a novel method which reparameterizes the code vectors through a +linear transformation layer based on a learnable latent basis. This +transformation optimizes the \textit{entire linear space} spanned by the +codebook, rather than merely updating \textit{the code vector} selected by the +nearest-neighbor search in vanilla VQ models. Although it is commonly +understood that the multiplication of two linear matrices is equivalent to +applying a single linear layer, our approach works surprisingly well in +resolving the collapse issue in VQ models with just one linear layer. We +validate the efficacy of SimVQ through extensive experiments across various +modalities, including image and audio data with different model architectures. +Our code is available at \url{https://github.com/youngsheen/SimVQ}. + +
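+ The codebook-reparameterization idea described above can be sketched in a few lines of PyTorch: the effective code vectors are the product of a learnable latent basis and a single linear layer, so every gradient step moves the whole linear span rather than only the selected codes. The shapes, losses, initialization, and straight-through estimator below are illustrative assumptions rather than the released SimVQ code.
+# Hedged sketch of the codebook reparameterization: the whole codebook is the
+# product of a learnable latent basis and one linear layer, so a gradient step
+# moves the entire linear span rather than only the nearest-neighbour codes.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SimVQSketch(nn.Module):
+    def __init__(self, num_codes=1024, dim=64):
+        super().__init__()
+        self.basis = nn.Parameter(torch.randn(num_codes, dim))  # latent basis
+        self.proj = nn.Linear(dim, dim, bias=False)             # the single linear layer
+
+    def codebook(self):
+        # Effective code vectors: a linear transform of the basis.
+        return self.proj(self.basis)
+
+    def forward(self, z):                      # z: (batch, dim) continuous latents
+        codes = self.codebook()
+        d = torch.cdist(z, codes)              # nearest-neighbour assignment
+        idx = d.argmin(dim=1)
+        z_q = codes[idx]
+        # Straight-through estimator so the encoder still receives gradients.
+        z_st = z + (z_q - z).detach()
+        commit = F.mse_loss(z, z_q.detach()) + F.mse_loss(z_q, z.detach())
+        return z_st, idx, commit
+
+vq = SimVQSketch()
+z = torch.randn(8, 64)
+z_q, idx, loss = vq(z)
+print(z_q.shape, loss.item())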
+
+
+
+
+ + ☆ Tree level change detection over Ahmedabad city using very high + resolution satellite images and Deep Learning + + +
+ In this study, 0.5 m high-resolution satellite datasets over an Indian urban region were used to demonstrate the applicability of deep learning models over Ahmedabad, India. A YOLOv7 instance segmentation model was trained on a well-curated tree canopy dataset (6,500 images) in order to carry out change detection. During training, the bounding box and mask regression losses and the mean average precision (mAP) were monitored, and the stochastic gradient descent algorithm was used to optimize the model. After 500 epochs, mAP values of 0.715 and 0.699 were obtained for individual tree detection and tree canopy mask segmentation, respectively. By further tuning the hyperparameters of the model, a maximum tree detection accuracy of 80% with a false segmentation rate of 2% was obtained.
+
+
+
+
+ + ☆ QCS:Feature Refining from Quadruplet Cross Similarity for Facial + Expression Recognition + + +
+ On facial expression datasets with complex and numerous feature types, where the significance and dominance of labeled features are difficult to predict, facial expression recognition (FER) encounters the challenges of inter-class similarity and intra-class variance, making it difficult to mine effective features. We aim to solely leverage the feature similarity among facial samples to address this. We introduce the Cross Similarity Attention (CSA), an input-output position-sensitive attention mechanism that harnesses feature similarity across different images to compute the corresponding global spatial attention. Based on this, we propose a four-branch circular framework, called Quadruplet Cross Similarity (QCS), to extract discriminative features from the same class and eliminate redundant ones from different classes synchronously to refine cleaner features. The symmetry of the network ensures balanced and stable training and reduces the number of CSA interaction matrices. Contrastive residual distillation is utilized to transfer the information learned in the cross module back to the base network. The cross-attention module exists only during training, and only one base branch is retained during inference. Our proposed QCS model outperforms state-of-the-art methods on several popular FER datasets, without requiring additional landmark information or other extra training data. The code is available at https://github.com/birdwcp/QCS.
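+ A simplified, hedged reading of the Cross Similarity Attention idea is sketched below: spatial features of one image attend to another image using their pairwise cosine similarity as attention logits. The actual position-sensitive formulation and the four-branch QCS framework are not reproduced; tensor shapes are placeholders.
+# Rough sketch of attention driven by cross-image feature similarity, in the
+# spirit of the CSA idea above; only a simplified illustration.
+import torch
+import torch.nn.functional as F
+
+def cross_similarity_attention(feat_a, feat_b):
+    """feat_*: (B, C, H, W) feature maps from a shared backbone."""
+    B, C, H, W = feat_a.shape
+    a = feat_a.flatten(2).transpose(1, 2)          # (B, HW, C)
+    b = feat_b.flatten(2).transpose(1, 2)          # (B, HW, C)
+    sim = torch.einsum("bic,bjc->bij", F.normalize(a, dim=-1),
+                       F.normalize(b, dim=-1))     # cosine similarity map
+    attn = sim.softmax(dim=-1)                     # attend over positions of B
+    out = torch.einsum("bij,bjc->bic", attn, b)    # similarity-weighted features
+    return out.transpose(1, 2).reshape(B, C, H, W)
+
+fa, fb = torch.randn(2, 256, 14, 14), torch.randn(2, 256, 14, 14)
+print(cross_similarity_attention(fa, fb).shape)   # torch.Size([2, 256, 14, 14])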
+
+
+
+
+ + ☆ Typicalness-Aware Learning for Failure Detection NeurIPS 2024 + + +
+ Deep neural networks (DNNs) often suffer from the overconfidence issue, where +incorrect predictions are made with high confidence scores, hindering the +applications in critical systems. In this paper, we propose a novel approach +called Typicalness-Aware Learning (TAL) to address this issue and improve +failure detection performance. We observe that, with the cross-entropy loss, +model predictions are optimized to align with the corresponding labels via +increasing logit magnitude or refining logit direction. However, regarding +atypical samples, the image content and their labels may exhibit disparities. +This discrepancy can lead to overfitting on atypical samples, ultimately +resulting in the overconfidence issue that we aim to address. To tackle the +problem, we have devised a metric that quantifies the typicalness of each +sample, enabling the dynamic adjustment of the logit magnitude during the +training process. By allowing atypical samples to be adequately fitted while +preserving reliable logit direction, the problem of overconfidence can be +mitigated. TAL has been extensively evaluated on benchmark datasets, and the +results demonstrate its superiority over existing failure detection methods. +Specifically, TAL achieves a more than 5% improvement on CIFAR100 in terms of +the Area Under the Risk-Coverage Curve (AURC) compared to the state-of-the-art. +Code is available at https://github.com/liuyijungoon/TAL. + +
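+ One possible (assumed) realization of the dynamic logit-magnitude adjustment is sketched below: the logit direction is kept while a per-sample typicalness score controls the logit magnitude before cross-entropy. This is a guess at the mechanism for illustration only, not the paper's exact loss; the magnitude range and the typicalness estimator are placeholders.
+# Assumed sketch of typicalness-aware logit scaling: atypical samples get a
+# smaller logit magnitude so they cannot inflate confidence arbitrarily.
+import torch
+import torch.nn.functional as F
+
+def typicalness_aware_ce(logits, targets, typicalness,
+                         mag_min=5.0, mag_max=30.0):
+    """logits: (B, K); typicalness: (B,) in [0, 1], higher = more typical."""
+    direction = F.normalize(logits, dim=1)                   # unit-norm logit direction
+    magnitude = mag_min + typicalness * (mag_max - mag_min)  # dynamic magnitude
+    scaled = direction * magnitude.unsqueeze(1)
+    return F.cross_entropy(scaled, targets)
+
+logits = torch.randn(4, 100)
+targets = torch.randint(0, 100, (4,))
+t = torch.rand(4)                                            # placeholder typicalness scores
+print(typicalness_aware_ce(logits, targets, t).item())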
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ SPECTRUM: Semantic Processing and Emotion-informed video-Captioning + Through Retrieval and Understanding Modalities + + +
+ Capturing a video's meaning and critical concepts by analyzing its subtle details is a fundamental yet challenging task in video captioning. Identifying the dominant emotional tone in a video significantly enhances the perception of its context. Despite a strong emphasis on video captioning, existing models often fail to adequately address emotional themes, resulting in suboptimal captioning results. To address these limitations, this paper proposes a novel Semantic Processing and Emotion-informed video-Captioning Through Retrieval and Understanding Modalities (SPECTRUM) framework to empower the generation of emotionally and semantically credible captions. Leveraging our pioneering structure, SPECTRUM discerns multimodal semantics and emotional themes using Visual Text Attribute Investigation (VTAI) and determines the orientation of descriptive captions through a Holistic Concept-Oriented Theme (HCOT), expressing emotionally informed and field-acquainted references. These components exploit video-to-text retrieval capabilities and the multifaceted nature of video content to estimate the emotional probabilities of candidate captions. Then, the dominant theme of the video is determined by appropriately weighting embedded attribute vectors and applying coarse- and fine-grained emotional concepts, which define the video's contextual alignment. Furthermore, using two loss functions, SPECTRUM is optimized to integrate emotional information and minimize prediction errors. Extensive experiments on the EmVidCap, MSVD, and MSRVTT video captioning datasets demonstrate that our model significantly surpasses state-of-the-art methods. Quantitative and qualitative evaluations highlight the model's ability to accurately capture and convey video emotions and multimodal attributes.
+
+
+
+
+ + ☆ Active Gaze Behavior Boosts Self-Supervised Object Learning + + +
+ Due to significant variations in the projection of the same object from +different viewpoints, machine learning algorithms struggle to recognize the +same object across various perspectives. In contrast, toddlers quickly learn to +recognize objects from different viewpoints with almost no supervision. Recent +works argue that toddlers develop this ability by mapping close-in-time visual +inputs to similar representations while interacting with objects. High acuity +vision is only available in the central visual field, which may explain why +toddlers (much like adults) constantly move their gaze around during such +interactions. It is unclear whether/how much toddlers curate their visual +experience through these eye movements to support learning object +representations. In this work, we explore whether a bio inspired visual +learning model can harness toddlers' gaze behavior during a play session to +develop view-invariant object recognition. Exploiting head-mounted eye tracking +during dyadic play, we simulate toddlers' central visual field experience by +cropping image regions centered on the gaze location. This visual stream feeds +a time-based self-supervised learning algorithm. Our experiments demonstrate +that toddlers' gaze strategy supports the learning of invariant object +representations. Our analysis also reveals that the limited size of the central +visual field where acuity is high is crucial for this. We further find that +toddlers' visual experience elicits more robust representations compared to +adults' mostly because toddlers look at objects they hold themselves for longer +bouts. Overall, our work reveals how toddlers' gaze behavior supports +self-supervised learning of view-invariant object recognition. + +
+
+ comment: 16 pages, 11 figures +
+
+
+
+
+ + ☆ UnSegMedGAT: Unsupervised Medical Image Segmentation using Graph + Attention Networks Clustering + + +
+ The data-intensive nature of supervised classification drives researchers' interest towards unsupervised approaches, especially for problems such as medical image segmentation, where labeled data is scarce. Building on the recent advancements of Vision Transformers (ViTs) in computer vision, we propose an unsupervised segmentation framework using a pre-trained Dino-ViT. In the proposed method, we leverage the inherent graph structure within the image to realize a significant performance gain for segmentation in medical images. For this, we introduce a modularity-based loss function coupled with a Graph Attention Network (GAT) to effectively capture the inherent graph topology within the image. Our method achieves state-of-the-art performance, significantly surpassing or matching that of existing (semi-)supervised techniques such as MedSAM, a Segment Anything Model for medical images. We demonstrate this using two challenging medical image datasets, ISIC-2018 and CVC-ColonDB. This work underscores the potential of unsupervised approaches in advancing medical image analysis in scenarios where labeled data is scarce. The GitHub repository of the code is available at https://github.com/mudit-adityaja/UnSegMedGAT.
+
+
+
+
+ + ☆ Deep Learning for Leopard Individual Identification: An Adaptive Angular + Margin Approach + + +
+ Accurate identification of individual leopards across camera trap images is +critical for population monitoring and ecological studies. This paper +introduces a deep learning framework to distinguish between individual leopards +based on their unique spot patterns. This approach employs a novel adaptive +angular margin method in the form of a modified CosFace architecture. In +addition, I propose a preprocessing pipeline that combines RGB channels with an +edge detection channel to underscore the critical features learned by the +model. + This approach significantly outperforms the Triplet Network baseline, +achieving a Dynamic Top-5 Average Precision of 0.8814 and a Top-5 Rank Match +Detection of 0.9533, demonstrating its potential for open-set learning in +wildlife identification. While not surpassing the performance of the SIFT-based +Hotspotter algorithm, this method represents a substantial advancement in +applying deep learning to patterned wildlife identification. + This research contributes to the field of computer vision and provides a +valuable tool for biologists aiming to study and protect leopard populations. +It also serves as a stepping stone for applying the power of deep learning in +Capture-Recapture studies for other patterned species. + +
+
+
+
+
+ + ☆ Robust plug-and-play methods for highly accelerated non-Cartesian MRI + reconstruction + + +
+ Achieving high-quality Magnetic Resonance Imaging (MRI) reconstruction at accelerated acquisition rates remains challenging due to the inherent ill-posed nature of the inverse problem. Traditional Compressed Sensing (CS) methods, while robust across varying acquisition settings, struggle to maintain good reconstruction quality at high acceleration factors ($\ge$ 8). Recent advances in deep learning have improved reconstruction quality, but purely data-driven methods are prone to overfitting and hallucination effects, notably when the acquisition setting varies. Plug-and-Play (PnP) approaches have been proposed to mitigate the pitfalls of both frameworks. In a nutshell, PnP algorithms amount to replacing suboptimal handcrafted CS priors with powerful denoising deep neural networks (DNNs). However, in MRI reconstruction, existing PnP methods often yield suboptimal results due to instabilities in the proximal gradient descent (PGD) schemes and the lack of curated, noiseless datasets for training robust denoisers. In this work, we propose a fully unsupervised preprocessing pipeline to generate clean, noiseless complex MRI signals from multicoil data, enabling the training of a high-performance denoising DNN. Furthermore, we introduce an annealed Half-Quadratic Splitting (HQS) algorithm to address the instability issues, leading to significant improvements over existing PnP algorithms. When combined with preconditioning techniques, our approach achieves state-of-the-art results, providing a robust and efficient solution for high-quality MRI reconstruction.
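+ A generic annealed half-quadratic splitting plug-and-play loop, in the spirit of the algorithm described above, is sketched below. The data-fidelity proximal operator (which in MRI would involve the multicoil forward model) and the denoiser are user-supplied placeholders, and the annealing schedule is illustrative rather than the authors' choice.
+# Generic sketch of an annealed HQS plug-and-play loop with placeholder
+# `data_prox` and `denoiser` callables.
+import numpy as np
+
+def pnp_hqs(y, data_prox, denoiser, x0, n_iters=30,
+            sigma_start=0.1, sigma_end=0.01):
+    """Alternate a data-consistency prox step and a denoising step while the
+    assumed noise level sigma is annealed from sigma_start to sigma_end."""
+    x = x0.copy()
+    sigmas = np.geomspace(sigma_start, sigma_end, n_iters)
+    for sigma in sigmas:
+        mu = 1.0 / (sigma ** 2)          # HQS penalty weight tied to the noise level
+        z = data_prox(x, y, mu)          # z = argmin_z 0.5*||A z - y||^2 + mu/2*||z - x||^2
+        x = denoiser(z, sigma)           # plug-in denoiser acting as the prior
+    return x
+
+# Toy usage with an identity forward model and a crude smoothing "denoiser".
+y = np.random.randn(64, 64)
+data_prox = lambda x, y, mu: (y + mu * x) / (1.0 + mu)
+denoiser = lambda z, s: 0.5 * z + 0.5 * np.roll(z, 1, axis=0)
+print(pnp_hqs(y, data_prox, denoiser, x0=np.zeros_like(y)).shape)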
+
+
+
+
+ + ☆ Learning Where to Edit Vision Transformers + + +
+ Model editing aims to data-efficiently correct predictive errors of large +pre-trained models while ensuring generalization to neighboring failures and +locality to minimize unintended effects on unrelated examples. While +significant progress has been made in editing Transformer-based large language +models, effective strategies for editing vision Transformers (ViTs) in computer +vision remain largely untapped. In this paper, we take initial steps towards +correcting predictive errors of ViTs, particularly those arising from +subpopulation shifts. Taking a locate-then-edit approach, we first address the +where-to-edit challenge by meta-learning a hypernetwork on CutMix-augmented +data generated for editing reliability. This trained hypernetwork produces +generalizable binary masks that identify a sparse subset of structured model +parameters, responsive to real-world failure samples. Afterward, we solve the +how-to-edit problem by simply fine-tuning the identified parameters using a +variant of gradient descent to achieve successful edits. To validate our +method, we construct an editing benchmark that introduces subpopulation shifts +towards natural underrepresented images and AI-generated images, thereby +revealing the limitations of pre-trained ViTs for object recognition. Our +approach not only achieves superior performance on the proposed benchmark but +also allows for adjustable trade-offs between generalization and locality. Our +code is available at https://github.com/hustyyq/Where-to-Edit. + +
+
+
+
+
+ + ☆ Exploiting Contextual Uncertainty of Visual Data for Efficient Training + of Deep Models + + +
+ Objects in the real world rarely occur in isolation and exhibit typical arrangements governed by their independent utility and their expected interaction with humans and other objects in the context. For example, a chair is expected near a table, and a computer is expected on top of it. Humans use this spatial context and relative placement as an important cue for visual recognition in case of ambiguities. Similar to humans, DNNs exploit contextual information from data to learn representations. Our research focuses on harnessing the contextual aspects of visual data to optimize data annotation and enhance the training of deep networks. Our contributions can be summarized as follows: (1) we introduce the notion of contextual diversity for active learning (CDAL) and show its applicability in three different visual tasks: semantic segmentation, object detection, and image classification; (2) we propose a data repair algorithm to curate contextually fair data to reduce model bias, enabling the model to detect objects out of their obvious context; (3) we propose class-based annotation, where contextually relevant classes are selected that are complementary for model training under domain shift. Understanding the importance of well-curated data, we also emphasize the necessity of involving humans in the loop to achieve accurate annotations and to develop novel interaction strategies that allow humans to serve as fact-checkers. In line with this, we are working on developing an image retrieval system for wildlife camera trap images and a reliable warning system for poor-quality rural roads. For large-scale annotation, we are employing a strategic combination of human expertise and zero-shot models, while also integrating human input at various stages for continuous feedback.
+
+ comment: ICVGIP, Young Researchers Symposium +
+
+
+
+
+ + ☆ Real-Time Polygonal Semantic Mapping for Humanoid Robot Stair Climbing + + +
+ We present a novel algorithm for real-time planar semantic mapping tailored +for humanoid robots navigating complex terrains such as staircases. Our method +is adaptable to any odometry input and leverages GPU-accelerated processes for +planar extraction, enabling the rapid generation of globally consistent +semantic maps. We utilize an anisotropic diffusion filter on depth images to +effectively minimize noise from gradient jumps while preserving essential edge +details, enhancing normal vector images' accuracy and smoothness. Both the +anisotropic diffusion and the RANSAC-based plane extraction processes are +optimized for parallel processing on GPUs, significantly enhancing +computational efficiency. Our approach achieves real-time performance, +processing single frames at rates exceeding $30~Hz$, which facilitates detailed +plane extraction and map management swiftly and efficiently. Extensive testing +underscores the algorithm's capabilities in real-time scenarios and +demonstrates its practical application in humanoid robot gait planning, +significantly improving its ability to navigate dynamic environments. + +
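+ The edge-preserving smoothing step mentioned above can be illustrated with a CPU sketch of Perona-Malik-style anisotropic diffusion on a depth image; the paper's GPU-parallel implementation and its parameter settings are not reproduced, and kappa, the step size and the iteration count below are arbitrary.
+# CPU sketch of Perona-Malik-style anisotropic diffusion on a depth image,
+# smoothing noise while keeping large depth jumps (edges) sharp.
+import numpy as np
+
+def anisotropic_diffusion(depth, n_iters=10, kappa=0.05, step=0.2):
+    d = depth.astype(np.float64).copy()
+    for _ in range(n_iters):
+        # Finite-difference gradients towards the four neighbours.
+        dn = np.roll(d, -1, axis=0) - d
+        ds = np.roll(d, 1, axis=0) - d
+        de = np.roll(d, -1, axis=1) - d
+        dw = np.roll(d, 1, axis=1) - d
+        # Conduction coefficients: small across large depth jumps (edges).
+        cn, cs = np.exp(-(dn / kappa) ** 2), np.exp(-(ds / kappa) ** 2)
+        ce, cw = np.exp(-(de / kappa) ** 2), np.exp(-(dw / kappa) ** 2)
+        d += step * (cn * dn + cs * ds + ce * de + cw * dw)
+    return d
+
+depth = np.random.rand(120, 160)          # placeholder depth frame
+print(anisotropic_diffusion(depth).shape)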
+
+ comment: Accepted by The 2024 IEEE-RAS International Conference on Humanoid + Robots. The code: https://github.com/BTFrontier/polygon_mapping +
+
+
+
+
+ + ☆ Masked Autoencoders are Parameter-Efficient Federated Continual Learners + + +
+ Federated learning is a specific distributed learning paradigm in which a +central server aggregates updates from multiple clients' local models, thereby +enabling the server to learn without requiring clients to upload their private +data, maintaining data privacy. While existing federated learning methods are +primarily designed for static data, real-world applications often require +clients to learn new categories over time. This challenge necessitates the +integration of continual learning techniques, resulting in federated continual +learning (FCL). Although advanced prompt-based continual learning methods +leverage pre-trained transformers to mitigate catastrophic forgetting, they do +not adequately address the non-IID challenges in federated learning. To address +both catastrophic forgetting and non-IID issues, we propose to use masked +autoencoders (MAEs) as parameter-efficient federated continual learners, called +pMAE. pMAE learns reconstructive prompt on the client side through image +reconstruction using MAEs. On the server side, it reconstructs the uploaded +restore information to capture the data distribution across previous tasks and +different clients, using these reconstructed images to finetune discriminative +prompt and classifier parameters designed for classification, thereby +alleviating catastrophic forgetting and non-IID challenges on a global scale. +Experimental results demonstrate that pMAE achieves performance comparable to +existing prompt-based methods and can enhance their effectiveness, particularly +when using self-supervised pre-trained transformers as the backbone. Code is +available at: https://github.com/ycheoo/pMAE. + +
+
+
+
+
+ + ☆ FPPL: An Efficient and Non-IID Robust Federated Continual Learning + Framework + + +
+ Federated continual learning (FCL) aims to learn from sequential data stream +in the decentralized federated learning setting, while simultaneously +mitigating the catastrophic forgetting issue in classical continual learning. +Existing FCL methods usually employ typical rehearsal mechanisms, which could +result in privacy violations or additional onerous storage and computational +burdens. In this work, an efficient and non-IID robust federated continual +learning framework, called Federated Prototype-Augmented Prompt Learning +(FPPL), is proposed. The FPPL can collaboratively learn lightweight prompts +augmented by prototypes without rehearsal. On the client side, a fusion +function is employed to fully leverage the knowledge contained in task-specific +prompts for alleviating catastrophic forgetting. Additionally, global +prototypes aggregated from the server are used to obtain unified representation +through contrastive learning, mitigating the impact of non-IID-derived data +heterogeneity. On the server side, locally uploaded prototypes are utilized to +perform debiasing on the classifier, further alleviating the performance +degradation caused by both non-IID and catastrophic forgetting. Empirical +evaluations demonstrate the effectiveness of FPPL, achieving notable +performance with an efficient design while remaining robust to diverse non-IID +degrees. Code is available at: https://github.com/ycheoo/FPPL. + +
+
+
+
+
+ + ☆ MBDRes-U-Net: Multi-Scale Lightweight Brain Tumor Segmentation Network + + +
+ Accurate segmentation of brain tumors plays a key role in the diagnosis and treatment of brain tumor diseases. It serves as a critical technology for quantifying tumors and extracting their features. With the increasing application of deep learning methods, the computational burden has become progressively heavier. To achieve a lightweight model with good segmentation performance, this study proposes the MBDRes-U-Net model using the three-dimensional (3D) U-Net codec framework, which integrates multibranch residual blocks and fused attention into the model. The computational burden of the model is reduced by the branch strategy, which effectively uses the rich local features in multimodal images and enhances the segmentation performance of subtumor regions. Additionally, during encoding, an adaptive weighted expansion convolution layer is introduced into the multi-branch residual block, which enriches the feature expression and improves the segmentation accuracy of the model. Experiments on the Brain Tumor Segmentation (BraTS) Challenge 2018 and 2019 datasets show that the architecture maintains a high precision of brain tumor segmentation while considerably reducing the computational overhead. Our code is released at https://github.com/Huaibei-normal-university-cv-laboratory/mbdresunet
+
+ comment: Brain tumor segmentation, lightweight model, Brain Tumor Segmentation + (BraTS) Challenge, group convolution +
+
+
+
+
+ + ☆ A Global Depth-Range-Free Multi-View Stereo Transformer Network with + Pose Embedding + + +
+ In this paper, we propose a novel multi-view stereo (MVS) framework that gets +rid of the depth range prior. Unlike recent prior-free MVS methods that work in +a pair-wise manner, our method simultaneously considers all the source images. +Specifically, we introduce a Multi-view Disparity Attention (MDA) module to +aggregate long-range context information within and across multi-view images. +Considering the asymmetry of the epipolar disparity flow, the key to our method +lies in accurately modeling multi-view geometric constraints. We integrate pose +embedding to encapsulate information such as multi-view camera poses, providing +implicit geometric constraints for multi-view disparity feature fusion +dominated by attention. Additionally, we construct corresponding hidden states +for each source image due to significant differences in the observation quality +of the same pixel in the reference frame across multiple source frames. We +explicitly estimate the quality of the current pixel corresponding to sampled +points on the epipolar line of the source image and dynamically update hidden +states through the uncertainty estimation module. Extensive results on the DTU +dataset and Tanks&Temple benchmark demonstrate the effectiveness of our method. +The code is available at our project page: +https://zju3dv.github.io/GD-PoseMVS/. + +
+
+
+
+
+ + ☆ LiDAttack: Robust Black-box Attack on LiDAR-based Object Detection + + +
+ Since DNNs are vulnerable to carefully crafted adversarial examples, adversarial attacks on LiDAR sensors have been extensively studied. We introduce a robust black-box attack dubbed LiDAttack. It utilizes a genetic algorithm with a simulated annealing strategy to strictly limit the location and number of perturbation points, achieving a stealthy and effective attack. It also simulates scanning deviations, allowing it to adapt to dynamic changes in real-world scenarios. Extensive experiments are conducted on 3 datasets (i.e., KITTI, nuScenes, and self-constructed data) with 3 dominant object detection models (i.e., PointRCNN, PointPillar, and PV-RCNN++). The results reveal the efficiency of LiDAttack when targeting a wide range of object detection models, with an attack success rate (ASR) of up to 90%.
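+ The search strategy described above can be sketched as a genetic loop with a simulated-annealing acceptance rule over a small set of perturbation points. The fitness function, which in the paper queries the black-box LiDAR detector on the perturbed point cloud, is a placeholder here, and the scanning-deviation simulation is omitted.
+# Skeleton of a genetic search with simulated-annealing acceptance over a small
+# set of 3D perturbation points; the detector-in-the-loop fitness is a stub.
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def fitness(points):
+    # Placeholder: the real attack would run the detector on the perturbed
+    # point cloud and score how much the target detection degrades.
+    return -np.linalg.norm(points)
+
+def mutate(points, scale=0.05):
+    return points + rng.normal(scale=scale, size=points.shape)
+
+def attack(n_points=10, pop_size=16, n_gens=50, t0=1.0, cooling=0.95):
+    pop = [rng.uniform(-1, 1, size=(n_points, 3)) for _ in range(pop_size)]
+    temp = t0
+    for _ in range(n_gens):
+        new_pop = []
+        for parent in pop:
+            child = mutate(parent)
+            delta = fitness(child) - fitness(parent)
+            # Simulated-annealing rule: keep worse children with prob e^(delta/T).
+            if delta > 0 or rng.random() < np.exp(delta / temp):
+                new_pop.append(child)
+            else:
+                new_pop.append(parent)
+        pop = sorted(new_pop, key=fitness, reverse=True)[: pop_size // 2] * 2
+        temp *= cooling
+    return max(pop, key=fitness)
+
+best = attack()
+print(best.shape, fitness(best))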
+
+
+
+
+ + ☆ Mining and Transferring Feature-Geometry Coherence for Unsupervised + Point Cloud Registration NeurIPS2024 + + +
+ Point cloud registration, a fundamental task in 3D vision, has achieved +remarkable success with learning-based methods in outdoor environments. +Unsupervised outdoor point cloud registration methods have recently emerged to +circumvent the need for costly pose annotations. However, they fail to +establish reliable optimization objectives for unsupervised training, either +relying on overly strong geometric assumptions, or suffering from poor-quality +pseudo-labels due to inadequate integration of low-level geometric and +high-level contextual information. We have observed that in the feature space, +latent new inlier correspondences tend to cluster around respective positive +anchors that summarize features of existing inliers. Motivated by this +observation, we propose a novel unsupervised registration method termed INTEGER +to incorporate high-level contextual information for reliable pseudo-label +mining. Specifically, we propose the Feature-Geometry Coherence Mining module +to dynamically adapt the teacher for each mini-batch of data during training +and discover reliable pseudo-labels by considering both high-level feature +representations and low-level geometric cues. Furthermore, we propose +Anchor-Based Contrastive Learning to facilitate contrastive learning with +anchors for a robust feature space. Lastly, we introduce a Mixed-Density +Student to learn density-invariant features, addressing challenges related to +density variation and low overlap in the outdoor scenario. Extensive +experiments on KITTI and nuScenes datasets demonstrate that our INTEGER +achieves competitive performance in terms of accuracy and generalizability. + +
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ☆ A Novel Deep Learning Tractography Fiber Clustering Framework for + Functionally Consistent White Matter Parcellation Using Multimodal Diffusion + MRI and Functional MRI + + +
+ Tractography fiber clustering using diffusion MRI (dMRI) is a crucial +strategy for white matter (WM) parcellation. Current methods primarily use the +geometric information of fibers (i.e., the spatial trajectories) to group +similar fibers into clusters, overlooking the important functional signals +present along the fiber tracts. There is increasing evidence that neural +activity in the WM can be measured using functional MRI (fMRI), offering +potentially valuable multimodal information for fiber clustering. In this +paper, we develop a novel deep learning fiber clustering framework, namely Deep +Multi-view Fiber Clustering (DMVFC), that uses joint dMRI and fMRI data to +enable functionally consistent WM parcellation. DMVFC can effectively integrate +the geometric characteristics of the WM fibers with the fMRI BOLD signals along +the fiber tracts. It includes two major components: 1) a multi-view pretraining +module to compute embedding features from fiber geometric information and +functional signals separately, and 2) a collaborative fine-tuning module to +simultaneously refine the two kinds of embeddings. In the experiments, we +compare DMVFC with two state-of-the-art fiber clustering methods and +demonstrate superior performance in achieving functionally meaningful and +consistent WM parcellation results. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ GVKF: Gaussian Voxel Kernel Functions for Highly Efficient Surface + Reconstruction in Open Scenes NeurIPS 2024 + + +
+ In this paper we present a novel method for efficient and effective 3D surface reconstruction in open scenes. Existing Neural Radiance Fields (NeRF) based works typically require extensive training and rendering time due to the adopted implicit representations. In contrast, 3D Gaussian splatting (3DGS) uses an explicit and discrete representation, hence the reconstructed surface is built from a huge number of Gaussian primitives, which leads to excessive memory consumption and rough surface details in sparse Gaussian areas. To address these issues, we propose Gaussian Voxel Kernel Functions (GVKF), which establish a continuous scene representation based on discrete 3DGS through kernel regression. The GVKF integrates fast 3DGS rasterization and highly effective scene implicit representations, achieving high-fidelity open scene surface reconstruction. Experiments on challenging scene datasets demonstrate the efficiency and effectiveness of our proposed GVKF, featuring high reconstruction quality, real-time rendering speed, and significant savings in storage and training memory consumption.
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Silver medal Solution for Image Matching Challenge 2024 + + +
+ Image Matching Challenge 2024 is a competition focused on building 3D maps +from diverse image sets, requiring participants to solve fundamental computer +vision challenges in image matching across varying angles, lighting, and +seasonal changes. This project develops a Pipeline method that combines +multiple advanced techniques: using pre-trained EfficientNet-B7 for initial +feature extraction and cosine distance-based image pair filtering, employing +both KeyNetAffNetHardNet and SuperPoint for keypoint feature extraction, +utilizing AdaLAM and SuperGlue for keypoint matching, and finally applying +Pycolmap for 3D spatial analysis. The methodology achieved an excellent score +of 0.167 on the private leaderboard, with experimental results demonstrating +that the combination of KeyNetAffNetHardNet and SuperPoint provides significant +advantages in keypoint detection and matching, particularly when dealing with +challenging variations in surface texture and environmental conditions that +typically degrade traditional algorithm performance. + +
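+ The pair-shortlisting step of the pipeline can be illustrated as below: global image descriptors (e.g. from a pre-trained EfficientNet-B7, not shown here) are compared by cosine similarity and only sufficiently similar pairs are forwarded to the expensive keypoint matching stage. The similarity threshold and descriptor dimensionality are illustrative assumptions.
+# Sketch of cosine-similarity image-pair shortlisting over precomputed
+# global descriptors.
+import numpy as np
+
+def shortlist_pairs(embeddings, sim_threshold=0.5):
+    """embeddings: (N, D) global descriptors, one per image."""
+    e = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+    sim = e @ e.T
+    pairs = [(i, j) for i in range(len(e)) for j in range(i + 1, len(e))
+             if sim[i, j] >= sim_threshold]
+    return pairs
+
+emb = np.random.randn(6, 2560)        # placeholder descriptors (illustrative dimension)
+print(shortlist_pairs(emb))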
+
+
+
+
+ + ☆ KptLLM: Unveiling the Power of Large Language Model for Keypoint + Comprehension NeurIPS 2024 + + +
+ Recent advancements in Multimodal Large Language Models (MLLMs) have greatly +improved their abilities in image understanding. However, these models often +struggle with grasping pixel-level semantic details, e.g., the keypoints of an +object. To bridge this gap, we introduce the novel challenge of Semantic +Keypoint Comprehension, which aims to comprehend keypoints across different +task scenarios, including keypoint semantic understanding, visual prompt-based +keypoint detection, and textual prompt-based keypoint detection. Moreover, we +introduce KptLLM, a unified multimodal model that utilizes an +identify-then-detect strategy to effectively address these challenges. KptLLM +underscores the initial discernment of semantics in keypoints, followed by the +precise determination of their positions through a chain-of-thought process. +With several carefully designed modules, KptLLM adeptly handles various +modality inputs, facilitating the interpretation of both semantic contents and +keypoint locations. Our extensive experiments demonstrate KptLLM's superiority +in various keypoint detection benchmarks and its unique semantic capabilities +in interpreting keypoints. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ OwMatch: Conditional Self-Labeling with Consistency for Open-World + Semi-Supervised Learning NeurIPS 2024 + + +
+ Semi-supervised learning (SSL) offers a robust framework for harnessing the +potential of unannotated data. Traditionally, SSL mandates that all classes +possess labeled instances. However, the emergence of open-world SSL (OwSSL) +introduces a more practical challenge, wherein unlabeled data may encompass +samples from unseen classes. This scenario leads to misclassification of unseen +classes as known ones, consequently undermining classification accuracy. To +overcome this challenge, this study revisits two methodologies from +self-supervised and semi-supervised learning, self-labeling and consistency, +tailoring them to address the OwSSL problem. Specifically, we propose an +effective framework called OwMatch, combining conditional self-labeling and +open-world hierarchical thresholding. Theoretically, we analyze the estimation +of class distribution on unlabeled data through rigorous statistical analysis, +thus demonstrating that OwMatch can ensure the unbiasedness of the self-label +assignment estimator with reliability. Comprehensive empirical analyses +demonstrate that our method yields substantial performance enhancements across +both known and unknown classes in comparison to previous studies. Code is +available at https://github.com/niusj03/OwMatch. + +
+
+ comment: NeurIPS 2024 camera-ready (10 pages, 4 figures) with the appendices + (10 pages, 7 figures) +
+
+
+
+
+ + ☆ Distribution alignment based transfer fusion frameworks on quantum + devices for seeking quantum advantages + + +
+ The scarcity of labelled data is a particularly urgent challenge in the field of quantum machine learning (QML). Two transfer fusion frameworks are proposed in this paper to predict the labels of target domain data by aligning its distribution to a different but related labelled source domain on quantum devices. The frameworks fuse the quantum data from two different, but related, domains through a quantum information infusion channel. The prediction tasks in the target domain can be achieved with quantum advantages by post-processing quantum measurement results. One framework, the quantum basic linear algebra subroutines (QBLAS) based implementation, can theoretically achieve the procedure of transfer fusion with quadratic speedup on a universal quantum computer. The other framework, a hardware-scalable architecture, is implemented on noisy intermediate-scale quantum (NISQ) devices through a variational hybrid quantum-classical procedure. Numerical experiments on synthetic and handwritten digit datasets demonstrate that the variational transfer fusion (TF) framework can reach the performance of state-of-the-art (SOTA) quantum domain adaptation (DA) methods.
+
+
+
+
+ + ☆ DiffuMask-Editor: A Novel Paradigm of Integration Between the + Segmentation Diffusion Model and Image Editing to Improve Segmentation + Ability + + +
+ Semantic segmentation models, like mask2former, often demand a substantial +amount of manually annotated data, which is time-consuming and inefficient to +acquire. Leveraging state-of-the-art text-to-image models like Midjourney and +Stable Diffusion has emerged as an effective strategy for automatically +generating synthetic data instead of human annotations. However, prior +approaches have been constrained to synthesizing single-instance images due to +the instability inherent in generating multiple instances with Stable +Diffusion. To expand the domains and diversity of synthetic datasets, this +paper introduces a novel paradigm named DiffuMask-Editor, which combines the +Diffusion Model for Segmentation with Image Editing. By integrating multiple +objects into images using Text2Image models, our method facilitates the +creation of more realistic datasets that closely resemble open-world settings +while simultaneously generating accurate masks. Our approach significantly +reduces the laborious effort associated with manual annotation while ensuring +precise mask generation. Experimental results demonstrate that synthetic data +generated by DiffuMask-Editor enable segmentation methods to achieve superior +performance compared to real data. Particularly in zero-shot backgrounds, +DiffuMask-Editor achieves new state-of-the-art results on Unseen classes of VOC +2012. The code and models will be publicly available soon. + +
+
+ comment: 13 pages,4 figures +
+
+
+
+
+ + ☆ Bootstrapping Top-down Information for Self-modulating Slot Attention NeurIPS 2024 + +
+ Object-centric learning (OCL) aims to learn representations of individual +objects within visual scenes without manual supervision, facilitating efficient +and effective visual reasoning. Traditional OCL methods primarily employ +bottom-up approaches that aggregate homogeneous visual features to represent +objects. However, in complex visual environments, these methods often fall +short due to the heterogeneous nature of visual features within an object. To +address this, we propose a novel OCL framework incorporating a top-down +pathway. This pathway first bootstraps the semantics of individual objects and +then modulates the model to prioritize features relevant to these semantics. By +dynamically modulating the model based on its own output, our top-down pathway +enhances the representational quality of objects. Our framework achieves +state-of-the-art performance across multiple synthetic and real-world +object-discovery benchmarks. + +
+
+ comment: Accepted to NeurIPS 2024
+
+
+
+
+ + ☆ Expanding Sparse Tuning for Low Memory Usage NeurIPS 2024 + + +
+ Parameter-efficient fine-tuning (PEFT) is an effective method for adapting +pre-trained vision models to downstream tasks by tuning a small subset of +parameters. Among PEFT methods, sparse tuning achieves superior performance by +only adjusting the weights most relevant to downstream tasks, rather than +densely tuning the whole weight matrix. However, this performance improvement +has been accompanied by increases in memory usage, which stems from two +factors, i.e., the storage of the whole weight matrix as learnable parameters +in the optimizer and the additional storage of tunable weight indexes. In this +paper, we propose a method named SNELL (Sparse tuning with kerNELized LoRA) for +sparse tuning with low memory usage. To achieve low memory usage, SNELL +decomposes the tunable matrix for sparsification into two learnable low-rank +matrices, saving from the costly storage of the whole original matrix. A +competition-based sparsification mechanism is further proposed to avoid the +storage of tunable weight indexes. To maintain the effectiveness of sparse +tuning with low-rank matrices, we extend the low-rank decomposition by applying +nonlinear kernel functions to the whole-matrix merging. Consequently, we gain +an increase in the rank of the merged matrix, enhancing the ability of SNELL in +adapting the pre-trained models to downstream tasks. Extensive experiments on +multiple downstream tasks show that SNELL achieves state-of-the-art performance +with low memory usage, endowing PEFT with sparse tuning to large-scale models. +Codes are available at https://github.com/ssfgunner/SNELL. + +
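+ A hedged sketch of the ingredients described above follows: the tunable update is stored as two low-rank factors, merged through a nonlinear kernel so the effective update is not rank-limited, and sparsified by a magnitude-based competition (top-k) instead of stored indexes. The specific kernel, competition rule, and initialization used by SNELL are assumptions here, not the released code.
+# Sketch: low-rank factors merged with a nonlinear kernel and pruned by a
+# top-k magnitude competition; applied on top of a frozen pre-trained weight.
+import torch
+import torch.nn as nn
+
+class SparseKernelAdapter(nn.Module):
+    def __init__(self, weight, rank=8, keep_ratio=0.05):
+        super().__init__()
+        out_f, in_f = weight.shape
+        self.register_buffer("w0", weight.detach().clone())   # frozen pre-trained weight
+        self.a = nn.Parameter(torch.randn(out_f, rank) * 0.01)
+        self.b = nn.Parameter(torch.randn(rank, in_f) * 0.01)
+        self.keep_ratio = keep_ratio
+
+    def merged_update(self):
+        # Nonlinear "kernelized" merge (tanh here, as an assumption) so the
+        # effective update is not limited to rank `rank`.
+        delta = torch.tanh(self.a @ self.b)
+        # Competition-based sparsification: keep only the largest-magnitude entries.
+        k = max(1, int(self.keep_ratio * delta.numel()))
+        thresh = delta.abs().flatten().topk(k).values.min()
+        return delta * (delta.abs() >= thresh).to(delta.dtype)
+
+    def forward(self, x):
+        return x @ (self.w0 + self.merged_update()).t()
+
+layer = SparseKernelAdapter(torch.randn(256, 128))
+print(layer(torch.randn(4, 128)).shape)     # torch.Size([4, 256])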
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ AIWR: Aerial Image Water Resource Dataset for Segmentation Analysis + + +
+ Effective water resource management is crucial in agricultural regions like northeastern Thailand, where limited water retention in sandy soils poses significant challenges. In response to this issue, the Aerial Image Water Resource (AIWR) dataset was developed, comprising 800 aerial images focused on natural and artificial water bodies in this region. The dataset was created using Bing Maps and follows the standards of the Fundamental Geographic Data Set (FGDS). It includes ground truth annotations validated by experts in remote sensing, making it an invaluable resource for researchers in geoinformatics, computer vision, and artificial intelligence. The AIWR dataset presents considerable segmentation challenges due to variations in the size, color, and shape of water bodies, which often resemble other land-use categories.
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Non rigid geometric distortions correction -- Application to atmospheric + turbulence stabilization + + +
+ A novel approach is presented to recover an image degraded by atmospheric +turbulence. Given a sequence of frames affected by turbulence, we construct a +variational model to characterize the static image. The optimization problem is +solved by Bregman Iteration and the operator splitting method. Our algorithm is +simple, efficient, and can be easily generalized for different scenarios. + +
+
+
+
+
+ + ☆ MSTA3D: Multi-scale Twin-attention for 3D Instance Segmentation + + +
+ Recently, transformer-based techniques incorporating superpoints have become prevalent in 3D instance segmentation. However, they often encounter an over-segmentation problem, especially noticeable with large objects. Additionally, unreliable mask predictions stemming from superpoint mask prediction further compound this issue. To address these challenges, we propose a novel framework called MSTA3D. It leverages multi-scale feature representations and introduces a twin-attention mechanism to capture them effectively. Furthermore, MSTA3D integrates a box query with a box regularizer, offering a complementary spatial constraint alongside semantic queries. Experimental evaluations on the ScanNetV2, ScanNet200 and S3DIS datasets demonstrate that our approach surpasses state-of-the-art 3D instance segmentation methods.
+
+ comment: 14 pages, 9 figures, 7 tables, conference +
+
+
+
+
+ + ☆ Learning predictable and robust neural representations by straightening + image sequences NeurIPS 2024 + + +
+ Prediction is a fundamental capability of all living organisms, and has been +proposed as an objective for learning sensory representations. Recent work +demonstrates that in primate visual systems, prediction is facilitated by +neural representations that follow straighter temporal trajectories than their +initial photoreceptor encoding, which allows for prediction by linear +extrapolation. Inspired by these experimental findings, we develop a +self-supervised learning (SSL) objective that explicitly quantifies and +promotes straightening. We demonstrate the power of this objective in training +deep feedforward neural networks on smoothly-rendered synthetic image sequences +that mimic commonly-occurring properties of natural videos. The learned model +contains neural embeddings that are predictive, but also factorize the +geometric, photometric, and semantic attributes of objects. The representations +also prove more robust to noise and adversarial attacks compared to previous +SSL methods that optimize for invariance to random augmentations. Moreover, +these beneficial properties can be transferred to other training procedures by +using the straightening objective as a regularizer, suggesting a broader +utility for straightening as a principle for robust unsupervised learning. + +
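+ One natural formalization of the straightening objective is to penalize the curvature of a representation trajectory, i.e. to maximize the cosine similarity between consecutive displacement vectors, as sketched below; the paper's exact objective and any additional regularization terms may differ.
+# Sketch of a straightening (curvature) loss on embeddings of consecutive frames.
+import torch
+import torch.nn.functional as F
+
+def curvature_loss(z):
+    """z: (B, T, D) embeddings of T consecutive frames; lower = straighter."""
+    v = z[:, 1:] - z[:, :-1]                                 # displacement vectors (B, T-1, D)
+    cos = F.cosine_similarity(v[:, 1:], v[:, :-1], dim=-1)   # (B, T-2)
+    return (1.0 - cos).mean()                                # 0 when the trajectory is a straight line
+
+z = torch.randn(8, 12, 128, requires_grad=True)
+loss = curvature_loss(z)
+loss.backward()
+print(loss.item())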
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ ARN-LSTM: A Multi-Stream Attention-Based Model for Action Recognition + with Temporal Dynamics + + +
+ This paper presents ARN-LSTM, a novel multi-stream action recognition model +designed to address the challenge of simultaneously capturing spatial motion +and temporal dynamics in action sequences. Traditional methods often focus +solely on spatial or temporal features, limiting their ability to comprehend +complex human activities fully. Our proposed model integrates joint, motion, +and temporal information through a multi-stream fusion architecture. +Specifically, it comprises a joint stream for extracting skeleton features, a +temporal stream for capturing dynamic temporal features, and an ARN-LSTM block +that utilizes Time-Distributed Long Short-Term Memory (TD-LSTM) layers followed +by an Attention Relation Network (ARN) to model temporal relations. The outputs +from these streams are fused in a fully connected layer to provide the final +action prediction. Evaluations on the NTU RGB+D 60 and NTU RGB+D 120 datasets +demonstrate the effectiveness of our model, achieving effective performance, +particularly in group activity recognition. + +
+
+
+
+
+ + ☆ Automatic Structured Pruning for Efficient Architecture in Federated + Learning + + +
+ In Federated Learning (FL), training is conducted on client devices, typically with limited computational resources and storage capacity. To address these constraints, we propose an automatic pruning scheme tailored for FL systems. Our solution improves computation efficiency on client devices while minimizing communication costs. One of the challenges of tuning pruning hyperparameters in FL systems is the restricted access to local data. Thus, we introduce an automatic pruning paradigm that dynamically determines pruning boundaries. Additionally, we utilize a structured pruning algorithm optimized for mobile devices that lack hardware support for sparse computations. Experimental results demonstrate the effectiveness of our approach, achieving accuracy comparable to existing methods. Our method notably reduces the number of parameters by 89% and FLOPS by 90%, with minimal impact on accuracy for the FEMNIST and CelebFaces datasets. Furthermore, our pruning method decreases communication overhead by up to 5x and halves inference time when deployed on Android devices.
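+ The kind of structured (whole-filter) pruning that mobile hardware without sparse-compute support can exploit is illustrated below: convolution filters are ranked by L1 norm and a narrower layer is rebuilt. The automatic determination of pruning boundaries described above is replaced by a fixed keep ratio in this sketch.
+# Minimal structured (filter-level) pruning example: keep the conv filters with
+# the largest L1 norms and rebuild a narrower Conv2d.
+import torch
+import torch.nn as nn
+
+def prune_conv_filters(conv, keep_ratio=0.5):
+    """Return a new Conv2d keeping the filters with the largest L1 norms."""
+    with torch.no_grad():
+        scores = conv.weight.abs().sum(dim=(1, 2, 3))        # one score per output filter
+        n_keep = max(1, int(keep_ratio * conv.out_channels))
+        keep = scores.topk(n_keep).indices.sort().values
+        new_conv = nn.Conv2d(conv.in_channels, n_keep, conv.kernel_size,
+                             stride=conv.stride, padding=conv.padding,
+                             bias=conv.bias is not None)
+        new_conv.weight.copy_(conv.weight[keep])
+        if conv.bias is not None:
+            new_conv.bias.copy_(conv.bias[keep])
+    return new_conv, keep     # `keep` tells the next layer which inputs remain
+
+conv = nn.Conv2d(3, 32, 3, padding=1)
+pruned, kept = prune_conv_filters(conv, keep_ratio=0.25)
+print(pruned.weight.shape)    # torch.Size([8, 3, 3, 3])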
+
+
+
+
+ + ☆ Disentangled PET Lesion Segmentation + + +
+ PET imaging is an invaluable tool in clinical settings as it captures the +functional activity of both healthy anatomy and cancerous lesions. Developing +automatic lesion segmentation methods for PET images is crucial since manual +lesion segmentation is laborious and prone to inter- and intra-observer +variability. We propose PET-Disentangler, a 3D disentanglement method that uses +a 3D UNet-like encoder-decoder architecture to disentangle disease and normal +healthy anatomical features with losses for segmentation, reconstruction, and +healthy component plausibility. A critic network is used to encourage the +healthy latent features to match the distribution of healthy samples and thus +encourages these features to not contain any lesion-related features. Our +quantitative results show that PET-Disentangler is less prone to incorrectly +declaring healthy and high tracer uptake regions as cancerous lesions, since +such uptake pattern would be assigned to the disentangled healthy component. + +
+
+ comment: 4 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ ChatTracker: Enhancing Visual Tracking Performance via Chatting with + Multimodal Large Language Model + + +
+ Visual object tracking aims to locate a targeted object in a video sequence
+ based on an initial bounding box. Recently, Vision-Language (VL) trackers have
+ been proposed to utilize additional natural language descriptions to enhance
+ versatility in various applications. However, VL trackers are still inferior to
+ State-of-The-Art (SoTA) visual trackers in terms of tracking performance. We
+ found that this inferiority primarily results from their heavy reliance on
+ manual textual annotations, which often provide ambiguous language
+ descriptions. In this paper, we propose ChatTracker to leverage the wealth of
+ world knowledge in the Multimodal Large Language Model (MLLM) to generate
+ high-quality language descriptions and enhance tracking performance. To this
+ end, we propose a novel reflection-based prompt optimization module to
+ iteratively refine the ambiguous and inaccurate descriptions of the target with
+ tracking feedback. To further utilize semantic information produced by MLLM, a
+ simple yet effective VL tracking framework is proposed and can be easily
+ integrated as a plug-and-play module to boost the performance of both VL and
+ visual trackers. Experimental results show that our proposed ChatTracker
+ achieves a performance comparable to existing methods.
+
+
+
+
+
+ + ☆ Multi-task Geometric Estimation of Depth and Surface Normal from + Monocular 360° Images + + +
+ Geometric estimation is required for scene understanding and analysis in
+ panoramic 360° images. Current methods usually predict a single feature, such
+ as depth or surface normal. These methods can lack robustness, especially when
+ dealing with intricate textures or complex object surfaces. We introduce a
+ novel multi-task learning (MTL) network that simultaneously estimates depth and
+ surface normals from 360° images. Our first innovation is our MTL architecture,
+ which enhances predictions for both tasks by integrating geometric information
+ from depth and surface normal estimation, enabling a deeper understanding of 3D
+ scene structure. Another innovation is our fusion module, which bridges the two
+ tasks, allowing the network to learn shared representations that improve
+ accuracy and robustness. Experimental results demonstrate that our MTL
+ architecture significantly outperforms state-of-the-art methods in both depth
+ and surface normal estimation, showing superior performance in complex and
+ diverse scenes. Our model's effectiveness and generalizability, particularly in
+ handling intricate surface textures, establish it as a new benchmark in 360°
+ image geometric estimation. The code and model are available at
+ https://github.com/huangkun101230/360MTLGeometricEstimation.
+
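+
+ For orientation only, a joint depth-and-surface-normal objective of the kind
+ described above typically sums per-task losses computed from shared features;
+ the toy heads, loss weights, and tensor shapes below are assumptions for
+ illustration rather than the paper's architecture.
+
+     import torch
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     class DepthNormalHeads(nn.Module):
+         def __init__(self, feat_dim=64):
+             super().__init__()
+             self.depth_head = nn.Conv2d(feat_dim, 1, 1)    # per-pixel depth
+             self.normal_head = nn.Conv2d(feat_dim, 3, 1)   # per-pixel unit normal
+
+         def forward(self, feats):
+             depth = self.depth_head(feats)
+             normal = F.normalize(self.normal_head(feats), dim=1)
+             return depth, normal
+
+     def mtl_loss(depth, normal, depth_gt, normal_gt, w_depth=1.0, w_normal=1.0):
+         l_depth = F.l1_loss(depth, depth_gt)
+         l_normal = (1.0 - F.cosine_similarity(normal, normal_gt, dim=1)).mean()
+         return w_depth * l_depth + w_normal * l_normal
+
+     feats = torch.randn(2, 64, 128, 256)          # shared equirectangular features
+     depth, normal = DepthNormalHeads()(feats)
+     target_n = F.normalize(torch.randn_like(normal), dim=1)
+     loss = mtl_loss(depth, normal, torch.rand_like(depth), target_n)
+     loss.backward()
+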
+
+ comment: 18 pages; this paper has been accepted by the Computational Visual
+ Media Journal (CVMJ) but is not yet published
+
+
+
+
+ + ☆ Rotation Perturbation Robustness in Point Cloud Analysis: A Perspective + of Manifold Distillation + + +
+ Point cloud is often regarded as a discrete sampling of a Riemannian manifold
+ and plays a pivotal role in 3D image interpretation. In particular, rotation
+ perturbation, an unexpected small change in rotation caused by various factors
+ (such as equipment offset, system instability, and measurement errors), can
+ easily lead to inferior results in point cloud learning tasks. However,
+ classical point cloud learning methods are sensitive to rotation perturbation,
+ and the existing networks with rotation robustness also have much room for
+ improvement in terms of performance and noise tolerance. Given this, this paper
+ remodels the point cloud from the perspective of manifolds and designs a
+ manifold distillation method to achieve robustness to rotation perturbation
+ without any coordinate transformation. In brief, during the training phase, we
+ introduce a teacher network to learn the rotation robustness information and
+ transfer this information to the student network through online distillation.
+ In the inference phase, the student network directly utilizes the original 3D
+ coordinate information to achieve robustness to rotation perturbation.
+ Experiments carried out on four different datasets verify the effectiveness of
+ our method. On average, on the ModelNet40 and ScanObjectNN classification
+ datasets with random rotation perturbations, our method improves classification
+ accuracy by 4.92% and 4.41%, respectively, compared to popular rotation-robust
+ networks; on the ShapeNet and S3DIS segmentation datasets, the improvements in
+ mIoU over the rotation-robust networks are 7.36% and 4.82%, respectively. The
+ experimental results also show that the proposed algorithm performs well in
+ resisting noise and outliers.
+
+
+ comment: 13 pages, 8 figures, submitted to TCSVT +
+
+
+
+
+
+ ☆ Learning from Convolution-based Unlearnable Datasets
+
+
+
+ The construction of large datasets for deep learning has raised concerns +regarding unauthorized use of online data, leading to increased interest in +protecting data from third-parties who want to use it for training. The +Convolution-based Unlearnable DAtaset (CUDA) method aims to make data +unlearnable by applying class-wise blurs to every image in the dataset so that +neural networks learn relations between blur kernels and labels, as opposed to +informative features for classifying clean data. In this work, we evaluate +whether CUDA data remains unlearnable after image sharpening and frequency +filtering, finding that this combination of simple transforms improves the +utility of CUDA data for training. In particular, we observe a substantial +increase in test accuracy over adversarial training for models trained with +CUDA unlearnable data from CIFAR-10, CIFAR-100, and ImageNet-100. In training +models to high accuracy using unlearnable data, we underscore the need for +ongoing refinement in data poisoning techniques to ensure data privacy. Our +method opens new avenues for enhancing the robustness of unlearnable datasets +by highlighting that simple methods such as sharpening and frequency filtering +are capable of breaking convolution-based unlearnable datasets. + +
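+
+ A minimal sketch of the counter-transforms discussed above (sharpening followed
+ by low-pass frequency filtering), assuming images as torch tensors in [0, 1];
+ the 3x3 kernel and frequency cut-off are illustrative choices rather than the
+ paper's exact settings.
+
+     import torch
+     import torch.nn.functional as F
+
+     def sharpen(img):
+         # img: (C, H, W); classic 3x3 sharpening kernel applied per channel
+         k = torch.tensor([[0., -1., 0.], [-1., 5., -1.], [0., -1., 0.]])
+         k = k.view(1, 1, 3, 3).repeat(img.shape[0], 1, 1, 1)
+         return F.conv2d(img.unsqueeze(0), k, padding=1, groups=img.shape[0]).squeeze(0)
+
+     def lowpass(img, cutoff=0.25):
+         # keep only low spatial frequencies via an FFT mask
+         fy = torch.fft.fftfreq(img.shape[1]).abs().view(-1, 1)
+         fx = torch.fft.fftfreq(img.shape[2]).abs().view(1, -1)
+         mask = ((fy <= cutoff) & (fx <= cutoff)).to(img.dtype)
+         return torch.fft.ifft2(torch.fft.fft2(img) * mask).real.clamp(0, 1)
+
+     x = torch.rand(3, 32, 32)                       # a CIFAR-sized image
+     x_recovered = lowpass(sharpen(x).clamp(0, 1))   # transform images before training
+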
+
+
+
+
+ + ☆ Not Just Object, But State: Compositional Incremental Learning without + Forgetting + + +
+ Most incremental learners excessively prioritize coarse classes of objects
+ while neglecting various kinds of states (e.g. color and material) attached to
+ the objects. As a result, they are limited in their ability to reason about the
+ fine-grained compositionality of state-object pairs. To remedy this limitation,
+ we propose a novel task called Compositional Incremental Learning
+ (composition-IL), enabling the model to recognize state-object compositions as
+ a whole in an incremental learning fashion. Given the lack of suitable
+ benchmarks, we re-organize two existing datasets and tailor them for
+ composition-IL. Then, we propose a prompt-based Composition Incremental Learner
+ (CompILer) to overcome the ambiguous composition boundary problem, which poses
+ a major challenge for composition-IL. Specifically, we exploit multi-pool
+ prompt learning, which is regularized by inter-pool prompt discrepancy and
+ intra-pool prompt diversity. Besides, we devise object-injected state prompting
+ by using object prompts to guide the selection of state prompts. Furthermore,
+ we fuse the selected prompts by a generalized-mean strategy, to eliminate
+ irrelevant information learned in the prompts. Extensive experiments on two
+ datasets exhibit state-of-the-art performance achieved by CompILer.
+
+
+
+
+
+ + ☆ Next Best View For Point-Cloud Model Acquisition: Bayesian Approximation + and Uncertainty Analysis + + +
+ The Next Best View problem is a computer vision problem widely studied in
+ robotics. To solve it, several methodologies have been proposed over the years.
+ Some of the more recent ones propose the use of deep learning models.
+ Predictions obtained with the help of deep learning models naturally have some
+ uncertainty associated with them. Despite this, standard models do not allow
+ for its quantification. However, Bayesian estimation theory has shown that
+ dropout layers can be used to estimate prediction uncertainty in neural
+ networks.
+   This work adapts the point-net-based neural network for Next-Best-View
+ (PC-NBV). It incorporates dropout layers into the model's architecture, thus
+ allowing the computation of the uncertainty estimate associated with its
+ predictions. The aim of the work is to improve the network's accuracy in
+ correctly predicting the next best viewpoint, proposing a way to make the 3D
+ reconstruction process more efficient.
+   Two uncertainty measurements capable of reflecting the prediction's error and
+ accuracy, respectively, were obtained. These enabled the reduction of the
+ model's error and an increase in its accuracy from 30% to 80% by identifying
+ and disregarding predictions with high uncertainty. Another method that
+ directly uses these uncertainty metrics to improve the final prediction was
+ also proposed; however, it yielded only marginal improvements.
+
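+
+ The dropout-based uncertainty estimation described above follows the generic
+ Monte-Carlo dropout recipe sketched below; the stand-in network and the
+ thresholding rule are assumptions for illustration, not the PC-NBV
+ architecture itself.
+
+     import torch
+     import torch.nn as nn
+
+     def mc_dropout_predict(model: nn.Module, x: torch.Tensor, n_samples: int = 30):
+         """Keep dropout active at inference and aggregate stochastic forward passes."""
+         model.eval()
+         for m in model.modules():                  # re-enable only the dropout layers
+             if isinstance(m, nn.Dropout):
+                 m.train()
+         with torch.no_grad():
+             preds = torch.stack([model(x) for _ in range(n_samples)])   # (S, B, V)
+         mean = preds.mean(dim=0)
+         uncertainty = preds.var(dim=0).mean(dim=-1)                     # one value per sample
+         return mean, uncertainty
+
+     # toy stand-in: scores over a fixed set of 33 candidate viewpoints
+     net = nn.Sequential(nn.Linear(1024, 256), nn.ReLU(), nn.Dropout(0.3), nn.Linear(256, 33))
+     mean, unc = mc_dropout_predict(net, torch.randn(4, 1024))
+     keep = unc < unc.median()          # disregard the most uncertain predictions
+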
+
+
+
+
+ + ☆ A Probabilistic Formulation of LiDAR Mapping with Neural Radiance Fields + + +
+ In this paper we reexamine the process through which a Neural Radiance Field +(NeRF) can be trained to produce novel LiDAR views of a scene. Unlike image +applications where camera pixels integrate light over time, LiDAR pulses arrive +at specific times. As such, multiple LiDAR returns are possible for any given +detector and the classification of these returns is inherently probabilistic. +Applying a traditional NeRF training routine can result in the network learning +phantom surfaces in free space between conflicting range measurements, similar +to how floater aberrations may be produced by an image model. We show that by +formulating loss as an integral of probability (rather than as an integral of +optical density) the network can learn multiple peaks for a given ray, allowing +the sampling of first, nth, or strongest returns from a single output channel. +Code is available at https://github.com/mcdermatt/PLINK + +
+
+
+
+
+ + ☆ Multi-Transmotion: Pre-trained Model for Human Motion Prediction + + +
+ The ability of intelligent systems to predict human behaviors is crucial,
+ particularly in fields such as autonomous vehicle navigation and social
+ robotics. However, the complexity of human motion has prevented the development
+ of a standardized dataset for human motion prediction, thereby hindering the
+ establishment of pre-trained models. In this paper, we address these
+ limitations by integrating multiple datasets, encompassing both trajectory and
+ 3D pose keypoints, to propose a pre-trained model for human motion prediction.
+ We merge seven distinct datasets across varying modalities and standardize
+ their formats. To facilitate multimodal pre-training, we introduce
+ Multi-Transmotion, an innovative transformer-based model designed for
+ cross-modality pre-training. Additionally, we present a novel masking strategy
+ to capture rich representations. Our methodology demonstrates competitive
+ performance across various datasets on several downstream tasks, including
+ trajectory prediction in the NBA and JTA datasets, as well as pose prediction
+ in the AMASS and 3DPW datasets. The code is publicly available:
+ https://github.com/vita-epfl/multi-transmotion
+
+
+ comment: CoRL 2024 +
+
+
+
+
+ + ☆ Semantic-Aligned Adversarial Evolution Triangle for High-Transferability + Vision-Language Attack + + +
+ Vision-language pre-training (VLP) models excel at interpreting both images +and text but remain vulnerable to multimodal adversarial examples (AEs). +Advancing the generation of transferable AEs, which succeed across unseen +models, is key to developing more robust and practical VLP models. Previous +approaches augment image-text pairs to enhance diversity within the adversarial +example generation process, aiming to improve transferability by expanding the +contrast space of image-text features. However, these methods focus solely on +diversity around the current AEs, yielding limited gains in transferability. To +address this issue, we propose to increase the diversity of AEs by leveraging +the intersection regions along the adversarial trajectory during optimization. +Specifically, we propose sampling from adversarial evolution triangles composed +of clean, historical, and current adversarial examples to enhance adversarial +diversity. We provide a theoretical analysis to demonstrate the effectiveness +of the proposed adversarial evolution triangle. Moreover, we find that +redundant inactive dimensions can dominate similarity calculations, distorting +feature matching and making AEs model-dependent with reduced transferability. +Hence, we propose to generate AEs in the semantic image-text feature contrast +space, which can project the original feature space into a semantic corpus +subspace. The proposed semantic-aligned subspace can reduce the image feature +redundancy, thereby improving adversarial transferability. Extensive +experiments across different datasets and models demonstrate that the proposed +method can effectively improve adversarial transferability and outperform +state-of-the-art adversarial attack methods. The code is released at +https://github.com/jiaxiaojunQAQ/SA-AET. + +
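+
+ For illustration, sampling from an "adversarial evolution triangle" can be read
+ as drawing convex combinations of the clean, historical, and current
+ adversarial examples; the Dirichlet-weighted sampler below is one simple way to
+ do that and is an assumption, not the authors' exact procedure.
+
+     import torch
+
+     def sample_from_triangle(x_clean, x_hist, x_adv, n=4):
+         """Convex combinations of three images, with weights drawn on the 2-simplex."""
+         corners = torch.stack([x_clean, x_hist, x_adv])                    # (3, C, H, W)
+         w = torch.distributions.Dirichlet(torch.ones(3)).sample((n,))      # (n, 3)
+         return torch.einsum('nk,kchw->nchw', w, corners)
+
+     x_clean = torch.rand(3, 224, 224)
+     x_hist = (x_clean + 0.03 * torch.randn_like(x_clean)).clamp(0, 1)      # earlier AE
+     x_adv = (x_clean + 0.05 * torch.randn_like(x_clean)).clamp(0, 1)       # current AE
+     candidates = sample_from_triangle(x_clean, x_hist, x_adv)              # (4, 3, 224, 224)
+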
+
+
+
+
+
+ ☆ Active Prompt Tuning Enables GPT-4o To Do Efficient Classification Of
+ Microscopy Images
+
+
+
+ Traditional deep learning-based methods for classifying cellular features in
+ microscopy images require time- and labor-intensive processes for training
+ models. Current limitations include the major time commitment required from
+ domain experts for accurate ground truth preparation and the need for a large
+ amount of input image data. We previously proposed a solution that overcomes
+ these challenges using OpenAI's GPT-4(V) model on a pilot dataset (Iba-1
+ immuno-stained tissue sections from 11 mouse brains). Results on the pilot
+ dataset were equivalent in accuracy to a baseline using a traditional
+ Convolutional Neural Network (CNN)-based approach, with a substantial
+ improvement in throughput efficiency.
+   The present study builds upon this framework using a second unique and
+ substantially larger dataset of microscopy images. Our current approach uses a
+ newer and faster model, GPT-4o, along with improved prompts. It was evaluated
+ on a microscopy image dataset captured at low (10x) magnification from
+ cresyl-violet-stained sections through the cerebellum of a total of 18 mouse
+ brains (9 Lurcher mice, 9 wild-type controls). We used our approach to classify
+ these images as either control or Lurcher mutant. With 6 mice in the prompt
+ set, 11 of the remaining 12 mice (92%) were classified correctly, with 96%
+ higher efficiency, reduced image requirements, and lower demands on the time
+ and effort of domain experts compared to the baseline method (a snapshot
+ ensemble of CNN models). These results confirm that our approach is effective
+ across multiple datasets from different brain regions and magnifications, with
+ minimal overhead.
+
+
+
+
+
+ + ☆ FUSECAPS: Investigating Feature Fusion Based Framework for Capsule + Endoscopy Image Classification + + +
+ In order to improve model accuracy and generalization and to address class
+ imbalance, this work offers a robust methodology for classifying endoscopic
+ images. We suggest a hybrid feature extraction method that combines
+ convolutional neural networks (CNNs), multi-layer perceptrons (MLPs), and
+ radiomics. This combination enables rich, multi-scale feature extraction,
+ capturing both deep and handcrafted representations. These features are then
+ used by a classification head to classify diseases, producing a model with
+ higher generalization and accuracy. Within this framework, we achieve a
+ validation accuracy of 76.2% on the capsule endoscopy video frame
+ classification task.
+
+
+
+
+
+ + ☆ Data-Driven Hierarchical Open Set Recognition ICRA + + +
+ This paper presents a novel data-driven hierarchical approach to open set
+ recognition (OSR) for robust perception in robotics and computer vision. It
+ utilizes constrained agglomerative clustering to automatically build a
+ hierarchy of known classes in embedding space, without requiring manual
+ relational information. Demonstrated on the Animals with Attributes 2 (AwA2)
+ dataset, the method achieves competitive results with an AUC ROC score of 0.82
+ and a utility score of 0.85, while introducing two classification approaches
+ (score-based and traversal-based) and a new Concentration Centrality (CC)
+ metric for measuring hierarchical classification consistency. Although it does
+ not surpass existing models in accuracy, the approach provides valuable
+ additional information about unknown classes through automatically generated
+ hierarchies and requires no supplementary information beyond typical supervised
+ model requirements. It also introduces the Class Concentration Centrality (CCC)
+ metric for evaluating unknown class placement consistency. Future work aims at
+ improving accuracy, validating the CC metric, and expanding to Large-Scale
+ Open-Set Classification Protocols for ImageNet.
+
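+
+ A minimal sketch of building a class hierarchy from embeddings with
+ agglomerative clustering, using scikit-learn as an assumed dependency; the
+ synthetic embeddings, ten-class setup, and average linkage are illustrative
+ choices rather than the paper's configuration.
+
+     import numpy as np
+     from sklearn.cluster import AgglomerativeClustering
+
+     rng = np.random.default_rng(0)
+     embeddings = rng.normal(size=(500, 64))          # one embedding per sample
+     labels = rng.integers(0, 10, size=500)           # known-class labels
+
+     # one prototype per known class
+     prototypes = np.stack([embeddings[labels == c].mean(axis=0) for c in range(10)])
+
+     # full merge tree over class prototypes; children_ encodes the hierarchy
+     agg = AgglomerativeClustering(n_clusters=None, distance_threshold=0.0,
+                                   linkage='average', compute_full_tree=True)
+     agg.fit(prototypes)
+     for step, (a, b) in enumerate(agg.children_):
+         print(f"merge step {step}: node {a} + node {b}")
+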
+
+ comment: Accepted as Extended Abstract to the IEEE ICRA@40 2024 +
+
+
+
+
+ + ♻ ☆ EMMA: End-to-End Multimodal Model for Autonomous Driving + + +
+ We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving. +Built on a multi-modal large language model foundation, EMMA directly maps raw +camera sensor data into various driving-specific outputs, including planner +trajectories, perception objects, and road graph elements. EMMA maximizes the +utility of world knowledge from the pre-trained large language models, by +representing all non-sensor inputs (e.g. navigation instructions and ego +vehicle status) and outputs (e.g. trajectories and 3D locations) as natural +language text. This approach allows EMMA to jointly process various driving +tasks in a unified language space, and generate the outputs for each task using +task-specific prompts. Empirically, we demonstrate EMMA's effectiveness by +achieving state-of-the-art performance in motion planning on nuScenes as well +as competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also +yields competitive results for camera-primary 3D object detection on the Waymo +Open Dataset (WOD). We show that co-training EMMA with planner trajectories, +object detection, and road graph tasks yields improvements across all three +domains, highlighting EMMA's potential as a generalist model for autonomous +driving applications. However, EMMA also exhibits certain limitations: it can +process only a small amount of image frames, does not incorporate accurate 3D +sensing modalities like LiDAR or radar and is computationally expensive. We +hope that our results will inspire further research to mitigate these issues +and to further evolve the state of the art in autonomous driving model +architectures. + +
+
+ comment: Blog post: https://waymo.com/blog/2024/10/introducing-emma/ +
+
+
+
+
+ + ♻ ☆ Taxonomy-Aware Continual Semantic Segmentation in Hyperbolic Spaces for + Open-World Perception + + +
+ Semantic segmentation models are typically trained on a fixed set of classes,
+ limiting their applicability in open-world scenarios. Class-incremental
+ semantic segmentation aims to update models with emerging new classes while
+ preventing catastrophic forgetting of previously learned ones. However,
+ existing methods impose strict rigidity on old classes, reducing their
+ effectiveness in learning new incremental classes. In this work, we propose
+ Taxonomy-Oriented Poincaré-regularized Incremental-Class Segmentation
+ (TOPICS) that learns feature embeddings in hyperbolic space following explicit
+ taxonomy-tree structures. This supervision provides plasticity for old classes,
+ updating ancestors based on new classes while integrating new classes at
+ fitting positions. Additionally, we maintain implicit class relational
+ constraints on the geometric basis of the Poincaré ball. This ensures that
+ the latent space can continuously adapt to new constraints while maintaining a
+ robust structure to combat catastrophic forgetting. We also establish eight
+ realistic incremental learning protocols for autonomous driving scenarios,
+ where novel classes can originate from known classes or the background.
+ Extensive evaluations of TOPICS on the Cityscapes and Mapillary Vistas 2.0
+ benchmarks demonstrate that it achieves state-of-the-art performance. We make
+ the code and trained models publicly available at
+ http://topics.cs.uni-freiburg.de.
+
+
+
+
+
+ + ♻ ☆ Interpreting CLIP with Sparse Linear Concept Embeddings (SpLiCE) NeurIPS 2024 + + +
+ CLIP embeddings have demonstrated remarkable performance across a wide range +of multimodal applications. However, these high-dimensional, dense vector +representations are not easily interpretable, limiting our understanding of the +rich structure of CLIP and its use in downstream applications that require +transparency. In this work, we show that the semantic structure of CLIP's +latent space can be leveraged to provide interpretability, allowing for the +decomposition of representations into semantic concepts. We formulate this +problem as one of sparse recovery and propose a novel method, Sparse Linear +Concept Embeddings, for transforming CLIP representations into sparse linear +combinations of human-interpretable concepts. Distinct from previous work, +SpLiCE is task-agnostic and can be used, without training, to explain and even +replace traditional dense CLIP representations, maintaining high downstream +performance while significantly improving their interpretability. We also +demonstrate significant use cases of SpLiCE representations including detecting +spurious correlations and model editing. + +
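+
+ The sparse-recovery formulation can be illustrated with a generic
+ L1-regularized decomposition of an embedding over a concept dictionary; the
+ random dictionary and Lasso solver below are stand-ins for exposition and are
+ not the released SpLiCE code.
+
+     import numpy as np
+     from sklearn.linear_model import Lasso
+
+     rng = np.random.default_rng(0)
+     concepts = rng.normal(size=(1000, 512))                    # one row per concept
+     concepts /= np.linalg.norm(concepts, axis=1, keepdims=True)
+
+     embedding = rng.normal(size=512)                           # stand-in CLIP embedding
+     embedding /= np.linalg.norm(embedding)
+
+     # sparse, non-negative combination of concepts approximating the embedding
+     lasso = Lasso(alpha=0.01, positive=True, max_iter=5000)
+     lasso.fit(concepts.T, embedding)                           # columns = concept directions
+     weights = lasso.coef_
+     active = np.nonzero(weights)[0]
+     print(f"{len(active)} active concepts out of {len(weights)}")
+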
+
+ comment: 25 pages, 15 figures, NeurIPS 2024. Code is provided at + https://github.com/AI4LIFE-GROUP/SpLiCE +
+
+
+
+
+ + ♻ ☆ xMIL: Insightful Explanations for Multiple Instance Learning in + Histopathology + + +
+ Multiple instance learning (MIL) is an effective and widely used approach for +weakly supervised machine learning. In histopathology, MIL models have achieved +remarkable success in tasks like tumor detection, biomarker prediction, and +outcome prognostication. However, MIL explanation methods are still lagging +behind, as they are limited to small bag sizes or disregard instance +interactions. We revisit MIL through the lens of explainable AI (XAI) and +introduce xMIL, a refined framework with more general assumptions. We +demonstrate how to obtain improved MIL explanations using layer-wise relevance +propagation (LRP) and conduct extensive evaluation experiments on three toy +settings and four real-world histopathology datasets. Our approach consistently +outperforms previous explanation attempts with particularly improved +faithfulness scores on challenging biomarker prediction tasks. Finally, we +showcase how xMIL explanations enable pathologists to extract insights from MIL +models, representing a significant advance for knowledge discovery and model +debugging in digital histopathology. Codes are available at: +https://github.com/tubml-pathology/xMIL. + +
+
+
+
+
+ + ♻ ☆ Fashion-VDM: Video Diffusion Model for Virtual Try-On SIGGRAPH + + +
+ We present Fashion-VDM, a video diffusion model (VDM) for generating virtual +try-on videos. Given an input garment image and person video, our method aims +to generate a high-quality try-on video of the person wearing the given +garment, while preserving the person's identity and motion. Image-based virtual +try-on has shown impressive results; however, existing video virtual try-on +(VVT) methods are still lacking garment details and temporal consistency. To +address these issues, we propose a diffusion-based architecture for video +virtual try-on, split classifier-free guidance for increased control over the +conditioning inputs, and a progressive temporal training strategy for +single-pass 64-frame, 512px video generation. We also demonstrate the +effectiveness of joint image-video training for video try-on, especially when +video data is limited. Our qualitative and quantitative experiments show that +our approach sets the new state-of-the-art for video virtual try-on. For +additional results, visit our project page: +https://johannakarras.github.io/Fashion-VDM. + +
+
+ comment: Accepted to SIGGRAPH Asia 2024 +
+
+
+
+
+ + ♻ ☆ SPEAK: Speech-Driven Pose and Emotion-Adjustable Talking Head Generation + + +
+ Most earlier research on talking face generation has focused on the
+ synchronization of lip motion and speech content. However, head pose and facial
+ emotions are equally important characteristics of natural faces. While
+ audio-driven talking face generation has seen notable advancements, existing
+ methods either overlook facial emotions or are limited to specific individuals
+ and cannot be applied to arbitrary subjects. In this paper, we propose a novel
+ one-shot Talking Head Generation framework (SPEAK) that distinguishes itself
+ from general Talking Face Generation by enabling emotional and postural
+ control. Specifically, we introduce an Inter-Reconstructed Feature
+ Disentanglement (IRFD) module to decouple facial features into three latent
+ spaces. We then design a face editing module that maps speech content and
+ facial latent codes into a single latent space. Subsequently, we present a
+ novel generator that employs modified latent codes derived from the editing
+ module to regulate emotional expression, head poses, and speech content in
+ synthesizing facial animations. Extensive trials demonstrate that our method
+ ensures lip synchronization with the audio while enabling decoupled control of
+ facial features; it can generate realistic talking heads with coordinated lip
+ motions, authentic facial emotions, and smooth head movements. The demo video
+ is available at https://anonymous.4open.science/r/SPEAK-8A22
+
+
+
+
+
+ + ♻ ☆ GSCo: Towards Generalizable AI in Medicine via Generalist-Specialist + Collaboration + + +
+ Generalist foundation models (GFMs) are renowned for their exceptional +capability and flexibility in effectively generalizing across diverse tasks and +modalities. In the field of medicine, while GFMs exhibit superior +generalizability based on their extensive intrinsic knowledge as well as +proficiency in instruction following and in-context learning, specialist models +excel in precision due to their domain knowledge. In this work, for the first +time, we explore the synergy between the GFM and specialist models, to enable +precise medical image analysis on a broader scope. Specifically, we propose a +cooperative framework, Generalist-Specialist Collaboration (GSCo), which +consists of two stages, namely the construction of GFM and specialists, and +collaborative inference on downstream tasks. In the construction stage, we +develop MedDr, the largest open-source GFM tailored for medicine, showcasing +exceptional instruction-following and in-context learning capabilities. +Meanwhile, a series of lightweight specialists are crafted for downstream tasks +with low computational cost. In the collaborative inference stage, we introduce +two cooperative mechanisms, Mixture-of-Expert Diagnosis and Retrieval-Augmented +Diagnosis, to harvest the generalist's in-context learning abilities alongside +the specialists' domain expertise. For a comprehensive evaluation, we curate a +large-scale benchmark featuring 28 datasets and about 250,000 images. Extensive +results demonstrate that MedDr consistently outperforms state-of-the-art GFMs +on downstream datasets. Furthermore, GSCo exceeds both GFMs and specialists +across all out-of-domain disease diagnosis datasets. These findings indicate a +significant paradigm shift in the application of GFMs, transitioning from +separate models for specific tasks to a collaborative approach between GFMs and +specialists, thereby advancing the frontiers of generalizable AI in medicine. + +
+
+
+
+
+ + ♻ ☆ Fast yet Safe: Early-Exiting with Risk Control + + +
+ Scaling machine learning models significantly improves their performance. +However, such gains come at the cost of inference being slow and +resource-intensive. Early-exit neural networks (EENNs) offer a promising +solution: they accelerate inference by allowing intermediate layers to exit and +produce a prediction early. Yet a fundamental issue with EENNs is how to +determine when to exit without severely degrading performance. In other words, +when is it 'safe' for an EENN to go 'fast'? To address this issue, we +investigate how to adapt frameworks of risk control to EENNs. Risk control +offers a distribution-free, post-hoc solution that tunes the EENN's exiting +mechanism so that exits only occur when the output is of sufficient quality. We +empirically validate our insights on a range of vision and language tasks, +demonstrating that risk control can produce substantial computational savings, +all the while preserving user-specified performance goals. + +
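+
+ As a simplified sketch of the exiting mechanism being tuned post hoc, the scan
+ below picks the loosest confidence threshold whose early exits disagree with
+ the full model on at most a user-specified fraction of calibration samples.
+ This is only an empirical-risk version for intuition; the risk-control
+ framework described above adds distribution-free statistical guarantees on top
+ of such a calibration step, which this sketch does not implement.
+
+     import numpy as np
+
+     def calibrate_exit_threshold(conf, early_pred, final_pred, max_risk=0.05):
+         """Smallest confidence threshold whose exited samples keep empirical risk low."""
+         for t in np.linspace(0.0, 1.0, 101):          # scan from loose to strict
+             exited = conf >= t
+             if not exited.any():
+                 break
+             risk = np.mean(early_pred[exited] != final_pred[exited])
+             if risk <= max_risk:
+                 return t
+         return None
+
+     rng = np.random.default_rng(0)
+     conf = rng.uniform(size=2000)
+     final_pred = rng.integers(0, 10, size=2000)
+     early_pred = np.where(rng.uniform(size=2000) < conf, final_pred,
+                           rng.integers(0, 10, size=2000))   # higher conf -> more agreement
+     print("exit threshold:", calibrate_exit_threshold(conf, early_pred, final_pred))
+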
+
+ comment: 27 pages, 13 figures, 4 tables (incl. appendix) +
+
+
+
+
+ + ♻ ☆ Manipulation Facing Threats: Evaluating Physical Vulnerabilities in + End-to-End Vision Language Action Models + + +
+ Recently, driven by advancements in Multimodal Large Language Models (MLLMs), +Vision Language Action Models (VLAMs) are being proposed to achieve better +performance in open-vocabulary scenarios for robotic manipulation tasks. Since +manipulation tasks involve direct interaction with the physical world, ensuring +robustness and safety during the execution of this task is always a very +critical issue. In this paper, by synthesizing current safety research on MLLMs +and the specific application scenarios of the manipulation task in the physical +world, we comprehensively evaluate VLAMs in the face of potential physical +threats. Specifically, we propose the Physical Vulnerability Evaluating +Pipeline (PVEP) that can incorporate as many visual modal physical threats as +possible for evaluating the physical robustness of VLAMs. The physical threats +in PVEP specifically include Out-of-Distribution, Typography-based Visual +Prompts, and Adversarial Patch Attacks. By comparing the performance +fluctuations of VLAMs before and after being attacked, we provide generalizable +Analyses of how VLAMs respond to different physical security threats. Our +project page is in this link: +https://chaducheng.github.io/Manipulat-Facing-Threats/. + +
+
+
+
+
+ + ♻ ☆ UniRGB-IR: A Unified Framework for RGB-Infrared Semantic Tasks via + Adapter Tuning + + +
+ Semantic analysis of visible (RGB) and infrared (IR) images has gained
+ attention for its ability to be more accurate and robust under low-illumination
+ and complex weather conditions. Due to the lack of pre-trained foundation
+ models on large-scale infrared image datasets, existing methods prefer to
+ design task-specific frameworks and directly fine-tune them with pre-trained
+ foundation models on their RGB-IR semantic relevance datasets, which results in
+ poor scalability and limited generalization. In this work, we propose a general
+ and efficient framework called UniRGB-IR to unify RGB-IR semantic tasks, in
+ which a novel adapter is developed to efficiently introduce richer RGB-IR
+ features into the pre-trained RGB-based foundation model. Specifically, our
+ framework consists of an RGB-based foundation model, a Multi-modal Feature Pool
+ (MFP) module and a Supplementary Feature Injector (SFI) module. The MFP and SFI
+ modules cooperate with each other as an adapter to effectively complement the
+ RGB-based features with the rich RGB-IR features. During the training process,
+ we freeze the entire foundation model to inherit prior knowledge and only
+ optimize the proposed adapter. Furthermore, to verify the effectiveness of our
+ framework, we utilize the vanilla vision transformer (ViT-Base) as the
+ pre-trained foundation model to perform extensive experiments. Experimental
+ results on various RGB-IR downstream tasks demonstrate that our method can
+ achieve state-of-the-art performance. The source code and results are available
+ at https://github.com/PoTsui99/UniRGB-IR.git.
+
+
+
+
+
+ + ♻ ☆ Modular Quantization-Aware Training for 6D Object Pose Estimation + + +
+ Edge applications, such as collaborative robotics and spacecraft rendezvous, +demand efficient 6D object pose estimation on resource-constrained embedded +platforms. Existing 6D pose estimation networks are often too large for such +deployments, necessitating compression while maintaining reliable performance. +To address this challenge, we introduce Modular Quantization-Aware Training +(MQAT), an adaptive and mixed-precision quantization-aware training strategy +that exploits the modular structure of modern 6D pose estimation architectures. +MQAT guides a systematic gradated modular quantization sequence and determines +module-specific bit precisions, leading to quantized models that outperform +those produced by state-of-the-art uniform and mixed-precision quantization +techniques. Our experiments showcase the generality of MQAT across datasets, +architectures, and quantization algorithms. Remarkably, MQAT-trained quantized +models achieve a significant accuracy boost (>7%) over the baseline +full-precision network while reducing model size by a factor of 4x or more. Our +project website is at: https://saqibjaved1.github.io/MQAT_/ + +
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR), 2024 +
+
+
+
+
+ + ♻ ☆ PointNCBW: Towards Dataset Ownership Verification for Point Clouds via + Negative Clean-label Backdoor Watermark + + +
+ Recently, point clouds have been widely used in computer vision, whereas +their collection is time-consuming and expensive. As such, point cloud datasets +are the valuable intellectual property of their owners and deserve protection. +To detect and prevent unauthorized use of these datasets, especially for +commercial or open-sourced ones that cannot be sold again or used commercially +without permission, we intend to identify whether a suspicious third-party +model is trained on our protected dataset under the black-box setting. We +achieve this goal by designing a scalable clean-label backdoor-based dataset +watermark for point clouds that ensures both effectiveness and stealthiness. +Unlike existing clean-label watermark schemes, which are susceptible to the +number of categories, our method could watermark samples from all classes +instead of only from the target one. Accordingly, it can still preserve high +effectiveness even on large-scale datasets with many classes. Specifically, we +perturb selected point clouds with non-target categories in both shape-wise and +point-wise manners before inserting trigger patterns without changing their +labels. The features of perturbed samples are similar to those of benign +samples from the target class. As such, models trained on the watermarked +dataset will have a distinctive yet stealthy backdoor behavior, i.e., +misclassifying samples from the target class whenever triggers appear, since +the trained DNNs will treat the inserted trigger pattern as a signal to deny +predicting the target label. We also design a hypothesis-test-guided dataset +ownership verification based on the proposed watermark. Extensive experiments +on benchmark datasets are conducted, verifying the effectiveness of our method +and its resistance to potential removal methods. + +
+
+ comment: This paper was accepted by IEEE Transactions on Information Forensics + and Security (TIFS), 2024. 16 pages +
+
+
+
+
+ + ♻ ☆ FilterViT and DropoutViT: Lightweight Vision Transformer Models for + Efficient Attention Mechanisms + + +
+ In this study, we introduce FilterViT, an enhanced version of MobileViT, +which leverages an attention-based mechanism for early-stage downsampling. +Traditional QKV operations on high-resolution feature maps are computationally +intensive due to the abundance of tokens. To address this, we propose a filter +attention mechanism using a convolutional neural network (CNN) to generate an +importance mask, focusing attention on key image regions. The method +significantly reduces computational complexity while maintaining +interpretability, as it highlights essential image areas. Experimental results +show that FilterViT achieves substantial gains in both efficiency and accuracy +compared to other models. We also introduce DropoutViT, a variant that uses a +stochastic approach for pixel selection, further enhancing robustness. + +
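+
+ To make the filter-attention idea concrete, the sketch below uses a small CNN
+ to score feature-map positions and runs multi-head attention only over the
+ top-k scoring tokens; all shapes, the module name, and the choice of k are
+ illustrative assumptions rather than the FilterViT architecture.
+
+     import torch
+     import torch.nn as nn
+
+     class FilterAttention(nn.Module):
+         def __init__(self, dim=64, k=128, heads=4):
+             super().__init__()
+             self.mask_cnn = nn.Conv2d(dim, 1, kernel_size=3, padding=1)   # importance mask
+             self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+             self.k = k
+
+         def forward(self, x):                         # x: (B, C, H, W)
+             B, C, H, W = x.shape
+             scores = self.mask_cnn(x).flatten(1)      # (B, H*W)
+             idx = scores.topk(self.k, dim=1).indices  # k most important positions
+             tokens = x.flatten(2).transpose(1, 2)     # (B, H*W, C)
+             sel = torch.gather(tokens, 1, idx.unsqueeze(-1).expand(-1, -1, C))
+             out, _ = self.attn(sel, sel, sel)         # attention over the reduced token set
+             return out                                # (B, k, C)
+
+     out = FilterAttention()(torch.randn(2, 64, 32, 32))
+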
+
+
+
+
+ + ♻ ☆ Advanced Vision Transformers and Open-Set Learning for Robust Mosquito + Classification: A Novel Approach to Entomological Studies + + +
+ Mosquito-related diseases pose a significant threat to global public health, +necessitating efficient and accurate mosquito classification for effective +surveillance and control. This work presents an innovative approach to mosquito +classification by leveraging state-of-the-art vision transformers and open-set +learning techniques. A novel framework has been introduced that integrates +Transformer-based deep learning models with comprehensive data augmentation and +preprocessing methods, enabling robust and precise identification of ten +mosquito species. The Swin Transformer model achieves the best performance for +traditional closed-set learning with 99.80% accuracy and 0.998 F1 score. The +lightweight MobileViT technique attains an almost similar accuracy of 98.90% +with significantly reduced parameters and model complexities. Next, the applied +deep learning models' adaptability and generalizability in a static environment +have been enhanced by using new classes of data samples during the inference +stage that have not been included in the training set. The proposed framework's +ability to handle unseen classes like insects similar to mosquitoes, even +humans, through open-set learning further enhances its practical applicability +by employing the OpenMax technique and Weibull distribution. The traditional +CNN model, Xception, outperforms the latest transformer with higher accuracy +and F1 score for open-set learning. The study's findings highlight the +transformative potential of advanced deep-learning architectures in entomology, +providing a strong groundwork for future research and development in mosquito +surveillance and vector control. The implications of this work extend beyond +mosquito classification, offering valuable insights for broader ecological and +environmental monitoring applications. + +
+
+ comment: 23 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Model Pairing Using Embedding Translation for Backdoor Attack Detection + on Open-Set Classification Tasks NeurIPS 2024 + + +
+ Backdoor attacks allow an attacker to embed a specific vulnerability in a
+ machine learning algorithm, activated when an attacker-chosen pattern is
+ presented, causing a specific misprediction. The need to identify backdoors in
+ biometric scenarios has led us to propose a novel technique with different
+ trade-offs. In this paper we propose to use model pairs on open-set
+ classification tasks for detecting backdoors. Using a simple linear operation
+ to project embeddings from a probe model's embedding space to a reference
+ model's embedding space, we can compare both embeddings and compute a
+ similarity score. We show that this score can be an indicator for the presence
+ of a backdoor even when the models have different architectures, have been
+ trained independently, and have used different datasets. This technique allows
+ for the detection of backdoors on models designed for open-set classification
+ tasks, which is little studied in the literature. Additionally, we show that
+ backdoors can be detected even when both models are backdoored. The source code
+ is made available for reproducibility purposes.
+
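+
+ An illustrative reading of the "simple linear operation": fit a least-squares
+ map from the probe model's embedding space to the reference model's on paired
+ samples, then score pairs by cosine similarity after translation. The synthetic
+ embeddings below stand in for real model outputs and are not the authors'
+ setup.
+
+     import numpy as np
+
+     rng = np.random.default_rng(0)
+     probe = rng.normal(size=(500, 128))                    # probe-model embeddings
+     W_true = rng.normal(size=(128, 256))
+     reference = probe @ W_true + 0.01 * rng.normal(size=(500, 256))   # reference model, same inputs
+
+     # linear translation from probe space to reference space (least squares)
+     W, *_ = np.linalg.lstsq(probe, reference, rcond=None)
+
+     def paired_similarity(e_probe, e_ref):
+         t = e_probe @ W                                    # translate, then compare
+         return float(t @ e_ref / (np.linalg.norm(t) * np.linalg.norm(e_ref)))
+
+     score = paired_similarity(probe[0], reference[0])      # low scores may flag a backdoor
+     print(f"similarity after translation: {score:.3f}")
+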
+
+ comment: Accepted in NeurIPS 2024 Safe Generative AI Workshop (oral + presentation) +
+
+
+
+
+ + ♻ ☆ Framer: Interactive Frame Interpolation + + +
+ We propose Framer for interactive frame interpolation, which targets +producing smoothly transitioning frames between two images as per user +creativity. Concretely, besides taking the start and end frames as inputs, our +approach supports customizing the transition process by tailoring the +trajectory of some selected keypoints. Such a design enjoys two clear benefits. +First, incorporating human interaction mitigates the issue arising from +numerous possibilities of transforming one image to another, and in turn +enables finer control of local motions. Second, as the most basic form of +interaction, keypoints help establish the correspondence across frames, +enhancing the model to handle challenging cases (e.g., objects on the start and +end frames are of different shapes and styles). It is noteworthy that our +system also offers an "autopilot" mode, where we introduce a module to estimate +the keypoints and refine the trajectory automatically, to simplify the usage in +practice. Extensive experimental results demonstrate the appealing performance +of Framer on various applications, such as image morphing, time-lapse video +generation, cartoon interpolation, etc. The code, the model, and the interface +will be released to facilitate further research. + +
+
+ comment: Project page: https://aim-uofa.github.io/Framer/ +
+
+
+
+
+ + ♻ ☆ SegEarth-OV: Towards Training-Free Open-Vocabulary Segmentation for + Remote Sensing Images + + +
+ Remote sensing imagery plays an irreplaceable role in fields such as
+ agriculture, water resources, military, and disaster relief. Pixel-level
+ interpretation is a critical aspect of remote sensing image applications;
+ however, a prevalent limitation remains the need for extensive manual
+ annotation. To address this, we introduce open-vocabulary semantic segmentation
+ (OVSS) into the remote sensing context. However, due to the sensitivity of
+ remote sensing images to low-resolution features, distorted target shapes and
+ ill-fitting boundaries are exhibited in the prediction mask. To tackle this
+ issue, we propose a simple and general upsampler, SimFeatUp, to restore lost
+ spatial information in deep features in a training-free style. Further, based
+ on the observation of the abnormal response of local patch tokens to the [CLS]
+ token in CLIP, we propose to execute a straightforward subtraction operation to
+ alleviate the global bias in patch tokens. Extensive experiments are conducted
+ on 17 remote sensing datasets spanning semantic segmentation, building
+ extraction, road detection, and flood detection tasks. Our method achieves an
+ average improvement of 5.8%, 8.2%, 4.0%, and 15.3% over state-of-the-art
+ methods on the 4 tasks. All code is released at
+ https://earth-insights.github.io/SegEarth-OV.
+
+
+
+
+
+ + ♻ ☆ CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding + Benchmark + + +
+ As the capabilities of large multimodal models (LMMs) continue to advance, +evaluating the performance of LMMs emerges as an increasing need. Additionally, +there is an even larger gap in evaluating the advanced knowledge and reasoning +abilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU, +a new Chinese Massive Multi-discipline Multimodal Understanding benchmark +designed to evaluate LMMs on tasks demanding college-level subject knowledge +and deliberate reasoning in a Chinese context. CMMMU is inspired by and +strictly follows the annotation and analysis pattern of MMMU. CMMMU includes +12k manually collected multimodal questions from college exams, quizzes, and +textbooks, covering six core disciplines: Art & Design, Business, Science, +Health & Medicine, Humanities & Social Science, and Tech & Engineering, like +its companion, MMMU. These questions span 30 subjects and comprise 39 highly +heterogeneous image types, such as charts, diagrams, maps, tables, music +sheets, and chemical structures. CMMMU focuses on complex perception and +reasoning with domain-specific knowledge in the Chinese context. We evaluate 11 +open-source LLMs and one proprietary GPT-4V(ision). Even GPT-4V only achieves +accuracies of 42%, indicating a large space for improvement. CMMMU will boost +the community to build the next-generation LMMs towards expert artificial +intelligence and promote the democratization of LMMs by providing diverse +language contexts. + +
+
+
+
+
+ + ♻ ☆ BiVLC: Extending Vision-Language Compositionality Evaluation with + Text-to-Image Retrieval NeurIPS 24 + + +
+ Existing Vision-Language Compositionality (VLC) benchmarks like SugarCrepe +are formulated as image-to-text retrieval problems, where, given an image, the +models need to select between the correct textual description and a synthetic +hard negative text. In this work, we present the Bidirectional Vision-Language +Compositionality (BiVLC) dataset. The novelty of BiVLC is to add a synthetic +hard negative image generated from the synthetic text, resulting in two +image-to-text retrieval examples (one for each image) and, more importantly, +two text-to-image retrieval examples (one for each text). Human annotators +filter out ill-formed examples ensuring the validity of the benchmark. The +experiments on BiVLC uncover a weakness of current multimodal models, as they +perform poorly in the text-to-image direction. In fact, when considering both +retrieval directions, the conclusions obtained in previous works change +significantly. In addition to the benchmark, we show that a contrastive model +trained using synthetic images and texts significantly improves over the base +model in SugarCrepe and in BiVLC for both retrieval directions. The gap to +human performance in BiVLC confirms that Vision-Language Compositionality is +still a challenging problem. BiVLC and code are available at +https://imirandam.github.io/BiVLC_project_page. + +
+
+ comment: Accepted to NeurIPS 24 Datasets and Benchmarks Track; Project page + at: https://imirandam.github.io/BiVLC_project_page/ +
+
+
+
+
+ + ♻ ☆ Hyper-SD: Trajectory Segmented Consistency Model for Efficient Image + Synthesis NeurIPS 2024 + + +
+ Recently, a series of diffusion-aware distillation algorithms have emerged to +alleviate the computational overhead associated with the multi-step inference +process of Diffusion Models (DMs). Current distillation techniques often +dichotomize into two distinct aspects: i) ODE Trajectory Preservation; and ii) +ODE Trajectory Reformulation. However, these approaches suffer from severe +performance degradation or domain shifts. To address these limitations, we +propose Hyper-SD, a novel framework that synergistically amalgamates the +advantages of ODE Trajectory Preservation and Reformulation, while maintaining +near-lossless performance during step compression. Firstly, we introduce +Trajectory Segmented Consistency Distillation to progressively perform +consistent distillation within pre-defined time-step segments, which +facilitates the preservation of the original ODE trajectory from a higher-order +perspective. Secondly, we incorporate human feedback learning to boost the +performance of the model in a low-step regime and mitigate the performance loss +incurred by the distillation process. Thirdly, we integrate score distillation +to further improve the low-step generation capability of the model and offer +the first attempt to leverage a unified LoRA to support the inference process +at all steps. Extensive experiments and user studies demonstrate that Hyper-SD +achieves SOTA performance from 1 to 8 inference steps for both SDXL and SD1.5. +For example, Hyper-SDXL surpasses SDXL-Lightning by +0.68 in CLIP Score and ++0.51 in Aes Score in the 1-step inference. + +
+
+ comment: Accepted by NeurIPS 2024 (Camera-Ready Version). Project Page: + https://hyper-sd.github.io/ +
+
+
+
+
+ + ♻ ☆ RaLF: Flow-based Global and Metric Radar Localization in LiDAR Maps + + +
+ Localization is paramount for autonomous robots. While camera and LiDAR-based +approaches have been extensively investigated, they are affected by adverse +illumination and weather conditions. Therefore, radar sensors have recently +gained attention due to their intrinsic robustness to such conditions. In this +paper, we propose RaLF, a novel deep neural network-based approach for +localizing radar scans in a LiDAR map of the environment, by jointly learning +to address both place recognition and metric localization. RaLF is composed of +radar and LiDAR feature encoders, a place recognition head that generates +global descriptors, and a metric localization head that predicts the 3-DoF +transformation between the radar scan and the map. We tackle the place +recognition task by learning a shared embedding space between the two +modalities via cross-modal metric learning. Additionally, we perform metric +localization by predicting pixel-level flow vectors that align the query radar +scan with the LiDAR map. We extensively evaluate our approach on multiple +real-world driving datasets and show that RaLF achieves state-of-the-art +performance for both place recognition and metric localization. Moreover, we +demonstrate that our approach can effectively generalize to different cities +and sensor setups than the ones used during training. We make the code and +trained models publicly available at http://ralf.cs.uni-freiburg.de. + +
+
+
+
+
+ + ♻ ☆ Local Concept Embeddings for Analysis of Concept Distributions in DNN + Feature Spaces + + +
+ Insights into the learned latent representations are imperative for verifying
+ deep neural networks (DNNs) in critical computer vision (CV) tasks. Therefore,
+ state-of-the-art supervised Concept-based eXplainable Artificial Intelligence
+ (C-XAI) methods associate user-defined concepts like "car" each with a single
+ vector in the DNN latent space (concept embedding vector). In the case of
+ concept segmentation, these linearly separate between activation map pixels
+ belonging to a concept and those belonging to background. Existing methods for
+ concept segmentation, however, fall short of capturing sub-concepts (e.g.,
+ "proximate car" and "distant car"), and concept overlap (e.g., between "bus"
+ and "truck"). In other words, they do not capture the full distribution of
+ concept representatives in latent space. For the first time, this work shows
+ that these simplifications are frequently broken and that distribution
+ information can be particularly useful for understanding DNN-learned notions of
+ sub-concepts, concept confusion, and concept outliers. To allow exploration of
+ learned concept distributions, we propose a novel local concept analysis
+ framework. Instead of optimizing a single global concept vector on the complete
+ dataset, it generates a local concept embedding (LoCE) vector for each
+ individual sample. We use the distribution formed by LoCEs to explore the
+ latent concept distribution by fitting Gaussian mixture models (GMMs),
+ hierarchical clustering, and concept-level information retrieval and outlier
+ detection. Despite its context sensitivity, our method's concept segmentation
+ performance is competitive to global baselines. Analysis results are obtained
+ on two datasets and five diverse vision DNN architectures, including vision
+ transformers (ViTs).
+
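+
+ To convey the distribution-level analysis, the sketch below fits a Gaussian
+ mixture to a set of per-sample concept vectors and uses the mixture
+ log-likelihood to flag concept outliers; the synthetic vectors and the
+ two-component choice are illustrative, not results from the paper.
+
+     import numpy as np
+     from sklearn.mixture import GaussianMixture
+
+     rng = np.random.default_rng(0)
+     # stand-in local concept embeddings with two sub-concepts (e.g. near vs. distant cars)
+     loces = np.vstack([rng.normal(0.0, 1.0, size=(300, 32)),
+                        rng.normal(3.0, 1.0, size=(300, 32))])
+
+     gmm = GaussianMixture(n_components=2, random_state=0).fit(loces)
+     sub_concept = gmm.predict(loces)            # sub-concept id per sample
+     log_lik = gmm.score_samples(loces)          # low values = concept outliers
+     outliers = np.argsort(log_lik)[:10]
+     print("sub-concept sizes:", np.bincount(sub_concept))
+     print("outlier indices:", outliers)
+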
+
+
+
+
+ + ♻ ☆ BodySLAM: A Generalized Monocular Visual SLAM Framework for Surgical + Applications + + +
+ Endoscopic surgery relies on two-dimensional views, posing challenges for +surgeons in depth perception and instrument manipulation. While Monocular +Visual Simultaneous Localization and Mapping (MVSLAM) has emerged as a +promising solution, its implementation in endoscopic procedures faces +significant challenges due to hardware limitations, such as the use of a +monocular camera and the absence of odometry sensors. This study presents +BodySLAM, a robust deep learning-based MVSLAM approach that addresses these +challenges through three key components: CycleVO, a novel unsupervised +monocular pose estimation module; the integration of the state-of-the-art Zoe +architecture for monocular depth estimation; and a 3D reconstruction module +creating a coherent surgical map. The approach is rigorously evaluated using +three publicly available datasets (Hamlyn, EndoSLAM, and SCARED) spanning +laparoscopy, gastroscopy, and colonoscopy scenarios, and benchmarked against +four state-of-the-art methods. Results demonstrate that CycleVO exhibited +competitive performance with the lowest inference time among pose estimation +methods, while maintaining robust generalization capabilities, whereas Zoe +significantly outperformed existing algorithms for depth estimation in +endoscopy. BodySLAM's strong performance across diverse endoscopic scenarios +demonstrates its potential as a viable MVSLAM solution for endoscopic +applications. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ PitRSDNet: Predicting Intra-operative Remaining Surgery Duration in + Endoscopic Pituitary Surgery + + +
+ Accurate intra-operative Remaining Surgery Duration (RSD) predictions allow
+ anaesthetists to more accurately decide when to administer anaesthetic agents
+ and drugs, as well as to notify hospital staff to send in the next patient.
+ Therefore RSD plays an important role in improving patient care and minimising
+ surgical theatre costs via efficient scheduling. In endoscopic pituitary
+ surgery, RSD prediction is uniquely challenging due to variable workflow
+ sequences with a selection of optional steps contributing to high variability
+ in surgery duration. This paper presents PitRSDNet for predicting RSD during
+ pituitary surgery, a spatio-temporal neural network model that learns from
+ historical data focusing on workflow sequences. PitRSDNet integrates workflow
+ knowledge into RSD prediction in two forms: 1) multi-task learning for
+ concurrently predicting step and RSD; and 2) incorporating prior steps as
+ context in temporal learning and inference. PitRSDNet is trained and evaluated
+ on a new endoscopic pituitary surgery dataset with 88 videos and shows
+ competitive performance improvements over previous statistical and machine
+ learning methods. The findings also highlight how PitRSDNet improves RSD
+ precision on outlier cases by utilising the knowledge of prior steps.
+
+
+ comment: Accepted to the Augmented Environments for Computer-Assisted + Interventions (AE-CAI) Workshop at the Medical Image Computing and + Computer-Assisted Interventions (MICCAI) Conference 2024 +
+
+
+
+
+ + ♻ ☆ DualDn: Dual-domain Denoising via Differentiable ISP ECCV 2024 + + +
+ Image denoising is a critical component in a camera's Image Signal Processing +(ISP) pipeline. There are two typical ways to inject a denoiser into the ISP +pipeline: applying a denoiser directly to captured raw frames (raw domain) or +to the ISP's output sRGB images (sRGB domain). However, both approaches have +their limitations. Residual noise from raw-domain denoising can be amplified by +the subsequent ISP processing, and the sRGB domain struggles to handle +spatially varying noise since it only sees noise distorted by the ISP. +Consequently, most raw or sRGB domain denoising works only for specific noise +distributions and ISP configurations. To address these challenges, we propose +DualDn, a novel learning-based dual-domain denoising. Unlike previous +single-domain denoising, DualDn consists of two denoising networks: one in the +raw domain and one in the sRGB domain. The raw domain denoising adapts to +sensor-specific noise as well as spatially varying noise levels, while the sRGB +domain denoising adapts to ISP variations and removes residual noise amplified +by the ISP. Both denoising networks are connected with a differentiable ISP, +which is trained end-to-end and discarded during the inference stage. With this +design, DualDn achieves greater generalizability compared to most +learning-based denoising methods, as it can adapt to different unseen noises, +ISP parameters, and even novel ISP pipelines. Experiments show that DualDn +achieves state-of-the-art performance and can adapt to different denoising +architectures. Moreover, DualDn can be used as a plug-and-play denoising module +with real cameras without retraining, and still demonstrate better performance +than commercial on-camera denoising. The project website is available at: +https://openimaginglab.github.io/DualDn/ + +
+
+ comment: Accepted at ECCV 2024, Project page: + https://openimaginglab.github.io/DualDn/ +
+
+
+
+
+ + ♻ ☆ A citizen science toolkit to collect human perceptions of urban + environments using open street view images + + +
+ Street View Imagery (SVI) is a valuable data source for studies (e.g., environmental assessments, green space identification or land cover classification). While commercial SVI is available, such providers commonly restrict copying or reuse in ways necessary for research. Open SVI datasets are readily available from less restrictive sources, such as Mapillary, but due to the heterogeneity of the images, these require substantial preprocessing, filtering, and careful quality checks. We present an efficient method for automated downloading, processing, cropping, and filtering open SVI, to be used in a survey of human perceptions of the streets portrayed in these images. We demonstrate our open-source reusable SVI preparation and smartphone-friendly perception-survey software with Amsterdam (Netherlands) as the case study. Using a citizen science approach, we collected 22,637 ratings from 331 people about their perceptions of these streets across various criteria. We have published our software in a public repository for future re-use and reproducibility. + 
+
+
+
+
+ + ♻ ☆ MV2Cyl: Reconstructing 3D Extrusion Cylinders from Multi-View Images NeurIPS 2024 + + +
+ We present MV2Cyl, a novel method for reconstructing 3D objects from 2D multi-view images, not merely as a field or raw geometry but as a sketch-extrude CAD model. Extracting extrusion cylinders from raw 3D geometry has been extensively researched in computer vision, while the processing of 3D data through neural networks has remained a bottleneck. Since 3D scans are generally accompanied by multi-view images, leveraging 2D convolutional neural networks allows these images to be exploited as a rich source for extracting extrusion cylinder information. However, we observe that extracting and utilizing only the surface information of the extrudes results in suboptimal outcomes due to challenges with occlusion and surface segmentation. By combining the surface information with the extracted base curve information, we achieve the optimal reconstruction result with the best accuracy in 2D sketch and extrude parameter estimation. Our experiments, comparing our method with previous work that takes a raw 3D point cloud as input, demonstrate the effectiveness of our approach by taking advantage of multi-view images. + 
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Private Attribute Inference from Images with Vision-Language Models + + +
+ As large language models (LLMs) become ubiquitous in our daily tasks and +digital interactions, associated privacy risks are increasingly in focus. While +LLM privacy research has primarily focused on the leakage of model training +data, it has recently been shown that LLMs can make accurate privacy-infringing +inferences from previously unseen texts. With the rise of vision-language +models (VLMs), capable of understanding both images and text, a key question is +whether this concern transfers to the previously unexplored domain of benign +images posted online. To answer this question, we compile an image dataset with +human-annotated labels of the image owner's personal attributes. In order to +understand the privacy risks posed by VLMs beyond traditional human attribute +recognition, our dataset consists of images where the inferable private +attributes do not stem from direct depictions of humans. On this dataset, we +evaluate 7 state-of-the-art VLMs, finding that they can infer various personal +attributes at up to 77.6% accuracy. Concerningly, we observe that accuracy +scales with the general capabilities of the models, implying that future models +can be misused as stronger inferential adversaries, establishing an imperative +for the development of adequate defenses. + +
+
+
+
+
+ + ♻ ☆ Visual Self-supervised Learning Scheme for Dense Prediction Tasks on + X-ray Images + + +
+ Recently, significant advancements in artificial intelligence have been attributed to the integration of self-supervised learning (SSL) schemes. While SSL has shown impressive achievements in natural language processing (NLP), its progress in computer vision has comparatively lagged behind. However, the incorporation of contrastive learning into existing visual SSL models has led to considerable progress, often surpassing supervised counterparts. Nonetheless, these improvements have been mostly limited to classification tasks. Moreover, few studies have evaluated visual SSL models in real-world scenarios, as most have focused on datasets with class-wise portrait images, notably ImageNet. Here, we focus on dense prediction tasks using security inspection x-ray images to evaluate our proposed model, Segment Localization (SegLoc). Based upon the Instance Localization (InsLoc) model, SegLoc addresses one of the key challenges of contrastive learning, i.e., false negative pairs of query embeddings. Our pre-training dataset is synthesized by cutting, transforming, and pasting labeled segments from an existing labeled dataset (PIDray) as foregrounds onto instances from an unlabeled dataset (SIXray) as backgrounds. Furthermore, we fully leverage the labeled data by incorporating the concept of one queue per class into the MoCo-v2 memory bank, thereby avoiding false negative pairs. In our experiments, SegLoc outperformed random initialization by 3% to 6% while underperforming supervised initialization, in terms of AR and AP metrics across different IoU values over 20 to 30 pre-training epochs. + 
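+ The "one queue per class" idea can be sketched as follows: negatives for a query are drawn only from queues of other classes, so embeddings of the same class are never treated as negatives. The queue size, embedding dimension, and API below are illustrative assumptions, not the paper's implementation.

    from collections import deque
    import torch

    class PerClassQueue:
        """Hypothetical MoCo-style memory bank with one FIFO queue per class."""
        def __init__(self, num_classes, queue_len=1024):
            self.queues = {c: deque(maxlen=queue_len) for c in range(num_classes)}

        def enqueue(self, keys, labels):
            # keys: (N, D) momentum-encoder embeddings; labels: (N,) class ids.
            for k, c in zip(keys.detach(), labels.tolist()):
                self.queues[c].append(k)

        def negatives_for(self, label):
            # Exclude the query's own class to avoid false negative pairs.
            negs = [k for c, q in self.queues.items() if c != label for k in q]
            return torch.stack(negs) if negs else torch.empty(0)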
+
+
+
+
+ + ♻ ☆ PT43D: A Probabilistic Transformer for Generating 3D Shapes from Single + Highly-Ambiguous RGB Images + + +
+ Generating 3D shapes from single RGB images is essential in various applications such as robotics. Current approaches typically target images containing clear and complete visual descriptions of the object, without considering common realistic cases in which the observed object is largely occluded or truncated. We thus propose a transformer-based autoregressive model to generate the probabilistic distribution of 3D shapes conditioned on an RGB image containing potentially highly ambiguous observations of the object. To handle realistic scenarios such as occlusion or field-of-view truncation, we create simulated image-to-shape training pairs that enable improved fine-tuning for real-world scenarios. We then adopt cross-attention to effectively identify the most relevant region of interest from the input image for shape generation. This enables inference of sampled shapes with reasonable diversity and strong alignment with the input image. We train and test our model on our synthetic data, then fine-tune and test it on real-world data. Experiments demonstrate that our model outperforms the state of the art in both scenarios. + 
+
+ comment: 10 pages, 6 figures. Accepted to BMVC 2024 +
+
+
+
+
+ + ♻ ☆ 3D Equivariant Pose Regression via Direct Wigner-D Harmonics Prediction NeurIPS 2024 + + +
+ Determining the 3D orientations of an object in an image, known as +single-image pose estimation, is a crucial task in 3D vision applications. +Existing methods typically learn 3D rotations parametrized in the spatial +domain using Euler angles or quaternions, but these representations often +introduce discontinuities and singularities. SO(3)-equivariant networks enable +the structured capture of pose patterns with data-efficient learning, but the +parametrizations in spatial domain are incompatible with their architecture, +particularly spherical CNNs, which operate in the frequency domain to enhance +computational efficiency. To overcome these issues, we propose a +frequency-domain approach that directly predicts Wigner-D coefficients for 3D +rotation regression, aligning with the operations of spherical CNNs. Our +SO(3)-equivariant pose harmonics predictor overcomes the limitations of spatial +parameterizations, ensuring consistent pose estimation under arbitrary +rotations. Trained with a frequency-domain regression loss, our method achieves +state-of-the-art results on benchmarks such as ModelNet10-SO(3) and PASCAL3D+, +with significant improvements in accuracy, robustness, and data efficiency. + +
+
+ comment: Accepted to NeurIPS 2024, Project webpage at + http://cvlab.postech.ac.kr/research/3D_EquiPose +
+
+
+
+
+ + ♻ ☆ QIS : Interactive Segmentation via Quasi-Conformal Mappings + + +
+ Image segmentation plays a crucial role in extracting important objects of +interest from images, enabling various applications. While existing methods +have shown success in segmenting clean images, they often struggle to produce +accurate segmentation results when dealing with degraded images, such as those +containing noise or occlusions. To address this challenge, interactive +segmentation has emerged as a promising approach, allowing users to provide +meaningful input to guide the segmentation process. However, an important +problem in interactive segmentation lies in determining how to incorporate +minimal yet meaningful user guidance into the segmentation model. In this +paper, we propose the quasi-conformal interactive segmentation (QIS) model, +which incorporates user input in the form of positive and negative clicks. +Users mark a few pixels belonging to the object region as positive clicks, +indicating that the segmentation model should include a region around these +clicks. Conversely, negative clicks are provided on pixels belonging to the +background, instructing the model to exclude the region near these clicks from +the segmentation mask. Additionally, the segmentation mask is obtained by +deforming a template mask with the same topology as the object of interest +using an orientation-preserving quasiconformal mapping. This approach helps to +avoid topological errors in the segmentation results. We provide a thorough +analysis of the proposed model, including theoretical support for the ability +of QIS to include or exclude regions of interest or disinterest based on the +user's indication. To evaluate the performance of QIS, we conduct experiments +on synthesized images, medical images, natural images and noisy natural images. +The results demonstrate the efficacy of our proposed method. + +
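+ For readers unfamiliar with the terminology, a quasi-conformal map, the kind of deformation used here to warp the template mask, is commonly characterized by the Beltrami equation; this is the standard textbook definition and not a detail taken from the paper itself:
\[
\frac{\partial f}{\partial \bar{z}} \;=\; \mu(z)\,\frac{\partial f}{\partial z}, \qquad \|\mu\|_{\infty} < 1,
\]
+ where $\mu$ is the Beltrami coefficient. Requiring $|\mu(z)| < 1$ everywhere makes the map orientation-preserving and bounds the local conformal distortion, which is what lets the deformed mask keep the template's topology.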
+
+ comment: 34 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Target Detection of Safety Protective Gear Using the Improved YOLOv5 + + +
+ In high-risk railway construction, personal protective equipment monitoring +is critical but challenging due to small and frequently obstructed targets. We +propose YOLO-EA, an innovative model that enhances safety measure detection by +integrating ECA into its backbone's convolutional layers, improving discernment +of minuscule objects like hardhats. YOLO-EA further refines target recognition +under occlusion by replacing GIoU with EIoU loss. YOLO-EA's effectiveness was +empirically substantiated using a dataset derived from real-world railway +construction site surveillance footage. It outperforms YOLOv5, achieving 98.9% +precision and 94.7% recall, up 2.5% and 0.5% respectively, while maintaining +real-time performance at 70.774 fps. This highly efficient and precise YOLO-EA +holds great promise for practical application in intricate construction +scenarios, enforcing stringent safety compliance during complex railway +construction projects. + +
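+ EIoU augments the IoU loss with separate penalties on the center distance and on the width/height differences, each normalized by the smallest enclosing box. The sketch below follows the commonly cited EIoU formulation; the exact variant and box encoding used inside YOLO-EA are assumptions here.

    import torch

    def eiou_loss(pred, target, eps=1e-7):
        """EIoU-style loss for boxes given as (x1, y1, x2, y2) tensors of shape (N, 4)."""
        # Intersection and union for plain IoU.
        x1 = torch.max(pred[:, 0], target[:, 0]); y1 = torch.max(pred[:, 1], target[:, 1])
        x2 = torch.min(pred[:, 2], target[:, 2]); y2 = torch.min(pred[:, 3], target[:, 3])
        inter = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)
        area_p = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
        area_t = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
        iou = inter / (area_p + area_t - inter + eps)

        # Smallest enclosing box.
        cw = torch.max(pred[:, 2], target[:, 2]) - torch.min(pred[:, 0], target[:, 0])
        ch = torch.max(pred[:, 3], target[:, 3]) - torch.min(pred[:, 1], target[:, 1])

        # Center-distance, width, and height penalties normalized by the enclosing box.
        pcx, pcy = (pred[:, 0] + pred[:, 2]) / 2, (pred[:, 1] + pred[:, 3]) / 2
        tcx, tcy = (target[:, 0] + target[:, 2]) / 2, (target[:, 1] + target[:, 3]) / 2
        rho2 = (pcx - tcx) ** 2 + (pcy - tcy) ** 2
        dw = (pred[:, 2] - pred[:, 0]) - (target[:, 2] - target[:, 0])
        dh = (pred[:, 3] - pred[:, 1]) - (target[:, 3] - target[:, 1])

        return (1 - iou
                + rho2 / (cw ** 2 + ch ** 2 + eps)
                + dw ** 2 / (cw ** 2 + eps)
                + dh ** 2 / (ch ** 2 + eps)).mean()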
+
+
+
+
+ + ♻ ☆ S3PT: Scene Semantics and Structure Guided Clustering to Boost + Self-Supervised Pre-Training for Autonomous Driving + + +
+ Recent self-supervised clustering-based pre-training techniques like DINO and Cribo have shown impressive results for downstream detection and segmentation tasks. However, real-world applications such as autonomous driving face challenges with imbalanced object class and size distributions and complex scene geometries. In this paper, we propose S3PT, a novel scene semantics and structure guided clustering approach that provides more scene-consistent objectives for self-supervised training. Specifically, our contributions are threefold: First, we incorporate semantic distribution consistent clustering to encourage better representation of rare classes such as motorcycles or animals. Second, we introduce object diversity consistent spatial clustering, to handle imbalanced and diverse object sizes, ranging from large background areas to small objects such as pedestrians and traffic signs. Third, we propose a depth-guided spatial clustering to regularize learning based on geometric information of the scene, thus further refining region separation on the feature level. Our learned representations significantly improve performance in downstream semantic segmentation and 3D object detection tasks on the nuScenes, nuImages, and Cityscapes datasets and show promising domain translation properties. + 
+
+ comment: Accepted for WACV 2025 +
+
+
+
+
+ + ♻ ☆ EfficientNet with Hybrid Attention Mechanisms for Enhanced Breast + Histopathology Classification: A Comprehensive Approach + + +
+ Breast cancer histopathology image classification is crucial for early cancer +detection, offering the potential to reduce mortality rates through timely +diagnosis. This paper introduces a novel approach integrating Hybrid +EfficientNet models with advanced attention mechanisms, including Convolutional +Block Attention Module (CBAM), Self-Attention, and Deformable Attention, to +enhance feature extraction and focus on critical image regions. We evaluate the +performance of our models across multiple magnification scales using publicly +available histopathological datasets. Our method achieves significant +improvements, with accuracy reaching 98.42% at 400X magnification, surpassing +several state-of-the-art models, including VGG and ResNet architectures. The +results are validated using metrics such as accuracy, F1-score, precision, and +recall, demonstrating the clinical potential of our model in improving +diagnostic accuracy. Furthermore, the proposed method shows increased +computational efficiency, making it suitable for integration into real-time +diagnostic workflows. + +
+
+
+
+
+ + ♻ ☆ VGA: Vision GUI Assistant -- Minimizing Hallucinations through + Image-Centric Fine-Tuning + + +
+ Recent advances in Large Vision-Language Models (LVLMs) have significantly improved performance in image comprehension tasks, such as formatted charts and rich-content images. Yet, Graphical User Interfaces (GUIs) pose a greater challenge due to their structured format and detailed textual information. Existing LVLMs often overly depend on internal knowledge and neglect image content, resulting in hallucinations and incorrect responses in GUI comprehension. To address these issues, we introduce VGA, a fine-tuned model designed for comprehensive GUI understanding. Our model aims to enhance the interpretation of visual data of GUIs and reduce hallucinations. We first construct a Vision Question Answering (VQA) dataset of 63.8k high-quality examples with our proposed Referent Method, which ensures the model's responses are highly dependent on the visual content within the image. We then design a two-stage fine-tuning method called Foundation and Advanced Comprehension (FAC) to enhance both the model's ability to extract information from image content and its alignment with human intent. Experiments show that our approach enhances the model's ability to extract information from images and achieves state-of-the-art results in GUI understanding tasks. Our dataset and fine-tuning script will be released soon. + 
+
+ comment: Accepted by EMNLP2024 +
+
+
+
+
+ + ♻ ☆ OneDiff: A Generalist Model for Image Difference Captioning + + +
+ In computer vision, Image Difference Captioning (IDC) is crucial for accurately describing variations between closely related images. Traditional IDC methods often rely on specialist models, which restrict their applicability across varied contexts. This paper introduces the OneDiff model, a novel generalist approach that utilizes a robust vision-language model architecture, integrating a siamese image encoder with a Visual Delta Module. This innovative configuration allows for the precise detection and articulation of fine-grained differences between image pairs. OneDiff is trained through a dual-phase strategy, encompassing Coupled Sample Training and multi-task learning across a diverse array of data types, supported by our newly developed DiffCap Dataset. This dataset merges real-world and synthetic data, enhancing the training process and bolstering the model's robustness. Extensive testing on diverse IDC benchmarks, such as Spot-the-Diff, Image-Editing-Request, and Birds-to-Words, shows that OneDiff consistently outperforms existing state-of-the-art models in accuracy and adaptability, achieving improvements of up to 97% in CIDEr points on average. By setting a new benchmark in IDC, OneDiff paves the way for more versatile and effective applications in detecting and describing visual differences. The code, models, and data will be made publicly available. + 
+
+
+
+
+ + ♻ ☆ GACL: Exemplar-Free Generalized Analytic Continual Learning + + +
+ Class incremental learning (CIL) trains a network on sequential tasks with +separated categories in each task but suffers from catastrophic forgetting, +where models quickly lose previously learned knowledge when acquiring new +tasks. The generalized CIL (GCIL) aims to address the CIL problem in a more +real-world scenario, where incoming data have mixed data categories and unknown +sample size distribution. Existing attempts for the GCIL either have poor +performance or invade data privacy by saving exemplars. In this paper, we +propose a new exemplar-free GCIL technique named generalized analytic continual +learning (GACL). The GACL adopts analytic learning (a gradient-free training +technique) and delivers an analytical (i.e., closed-form) solution to the GCIL +scenario. This solution is derived via decomposing the incoming data into +exposed and unexposed classes, thereby attaining a weight-invariant property, a +rare yet valuable property supporting an equivalence between incremental +learning and its joint training. Such an equivalence is crucial in GCIL +settings as data distributions among different tasks no longer pose challenges +to adopting our GACL. Theoretically, this equivalence property is validated +through matrix analysis tools. Empirically, we conduct extensive experiments +where, compared with existing GCIL methods, our GACL exhibits a consistently +leading performance across various datasets and GCIL settings. Source code is +available at https://github.com/CHEN-YIZHU/GACL. + +
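+ Analytic (closed-form) learning of the kind GACL builds on typically reduces to regularized least squares on frozen features, which can then be updated recursively as new data arrive. The sketch below shows only this generic building block under assumed shapes; the paper's exposed/unexposed-class decomposition and weight-invariant update are not reproduced here.

    import numpy as np

    def analytic_classifier(features, onehot_labels, gamma=1e-3):
        """Closed-form (ridge) solution W = (X^T X + gamma I)^-1 X^T Y on frozen features."""
        X, Y = features, onehot_labels                 # X: (N, D), Y: (N, C)
        d = X.shape[1]
        return np.linalg.solve(X.T @ X + gamma * np.eye(d), X.T @ Y)

    # Usage sketch: predictions for new features X_new are argmax over X_new @ W.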
+
+
+
+
+ + ♻ ☆ Visual Anchors Are Strong Information Aggregators For Multimodal Large + Language Model + + +
+ In the realm of Multimodal Large Language Models (MLLMs), the vision-language connector plays a crucial role in linking the pre-trained vision encoders with Large Language Models (LLMs). Despite its importance, the vision-language connector has been relatively less explored. In this study, we aim to propose a strong vision-language connector that enables MLLMs to achieve high accuracy while maintaining low computation cost. We first reveal the existence of the visual anchors in Vision Transformer and propose a cost-effective search algorithm to extract them. Building on these findings, we introduce the Anchor Former (AcFormer), a novel vision-language connector designed to leverage the rich prior knowledge obtained from these visual anchors during pretraining, guiding the aggregation of information. Through extensive experimentation, we demonstrate that the proposed method significantly reduces computational costs by nearly two-thirds compared with the baseline, while simultaneously outperforming baseline methods. This highlights the effectiveness and efficiency of AcFormer. Codes are available at https://github.com/liuhaogeng/Anchor-Former. + 
+
+
+
+
+ + ♻ ☆ TRACE: Temporal Grounding Video LLM via Causal Event Modeling + + +
+ Video Temporal Grounding (VTG) is a crucial capability for video understanding models and plays a vital role in downstream tasks such as video browsing and editing. To effectively handle various tasks simultaneously and enable zero-shot prediction, there is a growing trend in employing video LLMs for VTG tasks. However, current video LLM-based methods rely exclusively on natural language generation, lacking the ability to model the clear structure inherent in videos, which restricts their effectiveness in tackling VTG tasks. To address this issue, this paper first formally introduces the causal event modeling framework, which represents videos as sequences of events and predicts the current event using previous events, video inputs, and textual instructions. Each event consists of three components: timestamps, salient scores, and textual captions. We then propose a novel task-interleaved video LLM called TRACE to effectively implement the causal event modeling framework in practice. TRACE processes visual frames, timestamps, salient scores, and text as distinct tasks, employing various encoders and decoding heads for each. Task tokens are arranged in an interleaved sequence according to the causal event modeling framework's formulation. Extensive experiments on various VTG tasks and datasets demonstrate the superior performance of TRACE compared to state-of-the-art video LLMs. Our model and code are available at \url{https://github.com/gyxxyg/TRACE}. + 
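+ The causal event modeling described above can be written compactly as an autoregressive factorization; the formula below is a plain restatement of the abstract (events carrying timestamps, salient scores, and captions), with the symbol names chosen here for illustration:
\[
e_k = (t_k, s_k, c_k), \qquad p(e_{1:K} \mid F, q) \;=\; \prod_{k=1}^{K} p\big(e_k \mid e_{1:k-1}, F, q\big),
\]
+ where $F$ denotes the video frame inputs, $q$ the textual instruction, and $t_k$, $s_k$, $c_k$ are the timestamps, salient score, and caption of the $k$-th event.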
+
+
+
+
+ + ♻ ☆ DeSparsify: Adversarial Attack Against Token Sparsification Mechanisms + in Vision Transformers + + +
+ Vision transformers have contributed greatly to advancements in the computer vision domain, demonstrating state-of-the-art performance in diverse tasks (e.g., image classification, object detection). However, their high computational requirements grow quadratically with the number of tokens used. Token sparsification mechanisms have been proposed to address this issue. These mechanisms employ an input-dependent strategy, in which uninformative tokens are discarded from the computation pipeline, improving the model's efficiency. However, their dynamism and average-case assumption make them vulnerable to a new threat vector - carefully crafted adversarial examples capable of fooling the sparsification mechanism, resulting in worst-case performance. In this paper, we present DeSparsify, an attack targeting the availability of vision transformers that use token sparsification mechanisms. The attack aims to exhaust the operating system's resources, while maintaining its stealthiness. Our evaluation demonstrates the attack's effectiveness on three token sparsification mechanisms and examines the attack's transferability between them and its effect on the GPU resources. To mitigate the impact of the attack, we propose various countermeasures. + 
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ F-OAL: Forward-only Online Analytic Learning with Fast Training and Low + Memory Footprint in Class Incremental Learning + + +
+ Online Class Incremental Learning (OCIL) aims to train models incrementally, where data arrive in mini-batches, and previous data are not accessible. A major challenge in OCIL is Catastrophic Forgetting, i.e., the loss of previously learned knowledge. Among existing baselines, replay-based methods show competitive results but require extra memory for storing exemplars, while exemplar-free (i.e., data need not be stored for replay in production) methods are resource-friendly but often lack accuracy. In this paper, we propose an exemplar-free approach--Forward-only Online Analytic Learning (F-OAL). Unlike traditional methods, F-OAL does not rely on back-propagation and is forward-only, significantly reducing memory usage and computational time. Cooperating with a pre-trained frozen encoder with Feature Fusion, F-OAL only needs to update a linear classifier by recursive least squares. This approach simultaneously achieves high accuracy and low resource consumption. Extensive experiments on benchmark datasets demonstrate F-OAL's robust performance in OCIL scenarios. Code is available at https://github.com/liuyuchen-cz/F-OAL. + 
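+ A recursive least squares (RLS) update of the kind mentioned above maintains an inverse-covariance-like matrix and refreshes the linear classifier one sample (or mini-batch) at a time without back-propagation. This is the textbook RLS recursion on frozen features; F-OAL's Feature Fusion and its exact update rule may differ.

    import numpy as np

    class RLSClassifier:
        """Textbook recursive least squares for a linear head on frozen features."""
        def __init__(self, feat_dim, num_classes, reg=1.0):
            self.W = np.zeros((feat_dim, num_classes))
            self.P = np.eye(feat_dim) / reg               # running inverse of (X^T X + reg I)

        def update(self, phi, y_onehot):
            # phi: (feat_dim,) frozen-encoder feature; y_onehot: (num_classes,)
            phi = phi.reshape(-1, 1)
            k = self.P @ phi / (1.0 + phi.T @ self.P @ phi)    # gain vector
            self.W += k @ (y_onehot.reshape(1, -1) - phi.T @ self.W)
            self.P -= k @ (phi.T @ self.P)                     # Sherman-Morrison downdate

        def predict(self, phi):
            return int(np.argmax(phi @ self.W))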
+
+
+
+
+ + ♻ ☆ MemControl: Mitigating Memorization in Diffusion Models via Automated + Parameter Selection + + +
+ Diffusion models excel in generating images that closely resemble their training data but are also susceptible to data memorization, raising privacy, ethical, and legal concerns, particularly in sensitive domains such as medical imaging. We hypothesize that this memorization stems from the overparameterization of deep models and propose that regularizing model capacity during fine-tuning can mitigate this issue. Firstly, we empirically show that regulating the model capacity via parameter-efficient fine-tuning (PEFT) mitigates memorization to some extent; however, it further requires the identification of the exact parameter subsets to be fine-tuned for high-quality generation. To identify these subsets, we introduce a bi-level optimization framework, MemControl, that automates parameter selection using memorization and generation quality metrics as rewards during fine-tuning. The parameter subsets discovered through MemControl achieve a superior tradeoff between generation quality and memorization. For the task of medical image generation, our approach outperforms existing state-of-the-art memorization mitigation strategies by fine-tuning as few as 0.019% of model parameters. Moreover, we demonstrate that the discovered parameter subsets are transferable to non-medical domains. Our framework is scalable to large datasets, agnostic to reward functions, and can be integrated with existing approaches for further memorization mitigation. To the best of our knowledge, this is the first study to empirically evaluate memorization in medical images and propose a targeted yet universal mitigation strategy. The code is available at https://github.com/Raman1121/Diffusion_Memorization_HPO + 
+
+ comment: Accepted at WACV'25 (Applications Track) +
+
+
+
+
+ + ♻ ☆ CVQA: Culturally-diverse Multilingual Visual Question Answering + Benchmark NeurIPS + 2024 + + +
+ Visual Question Answering (VQA) is an important task in multimodal AI, and it +is often used to test the ability of vision-language models to understand and +reason on knowledge present in both visual and textual data. However, most of +the current VQA models use datasets that are primarily focused on English and a +few major world languages, with images that are typically Western-centric. +While recent efforts have tried to increase the number of languages covered on +VQA datasets, they still lack diversity in low-resource languages. More +importantly, although these datasets often extend their linguistic range via +translation or some other approaches, they usually keep images the same, +resulting in narrow cultural representation. To address these limitations, we +construct CVQA, a new Culturally-diverse multilingual Visual Question Answering +benchmark, designed to cover a rich set of languages and cultures, where we +engage native speakers and cultural experts in the data collection process. As +a result, CVQA includes culturally-driven images and questions from across 30 +countries on four continents, covering 31 languages with 13 scripts, providing +a total of 10k questions. We then benchmark several Multimodal Large Language +Models (MLLMs) on CVQA, and show that the dataset is challenging for the +current state-of-the-art models. This benchmark can serve as a probing +evaluation suite for assessing the cultural capability and bias of multimodal +models and hopefully encourage more research efforts toward increasing cultural +awareness and linguistic diversity in this field. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Decay Pruning Method: Smooth Pruning With a Self-Rectifying Procedure + + +
+ Current structured pruning methods often result in considerable accuracy +drops due to abrupt network changes and loss of information from pruned +structures. To address these issues, we introduce the Decay Pruning Method +(DPM), a novel smooth pruning approach with a self-rectifying mechanism. DPM +consists of two key components: (i) Smooth Pruning: It converts conventional +single-step pruning into multi-step smooth pruning, gradually reducing +redundant structures to zero over N steps with ongoing optimization. (ii) +Self-Rectifying: This procedure further enhances the aforementioned process by +rectifying sub-optimal pruning based on gradient information. Our approach +demonstrates strong generalizability and can be easily integrated with various +existing pruning methods. We validate the effectiveness of DPM by integrating +it with three popular pruning methods: OTOv2, Depgraph, and Gate Decorator. +Experimental results show consistent improvements in performance compared to +the original pruning methods, along with further reductions of FLOPs in most +scenarios. + +
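+ The "smooth pruning" idea, shrinking a structure to zero over N steps instead of removing it at once, can be pictured with a simple decay schedule. The linear schedule and masking mechanics below are illustrative assumptions; DPM's actual schedule and its gradient-based self-rectifying criterion are not reproduced here.

    import torch

    def decayed_weight(weight, prune_mask, step, num_steps):
        """Return the weight with to-be-pruned structures scaled by a factor
        that decays linearly from 1 to 0 over num_steps (smooth pruning)."""
        scale = max(0.0, 1.0 - step / float(num_steps))
        factor = torch.ones_like(weight)
        factor[prune_mask] = scale       # prune_mask marks redundant output channels (first dim)
        return weight * factor

    # Self-rectifying, conceptually: if gradients flowing into a scheduled structure stay
    # large while it decays, the schedule can be rolled back and the structure kept; the
    # exact criterion used by DPM is not reproduced here.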
+
+
+
+
+ + ♻ ☆ eMoE-Tracker: Environmental MoE-based Transformer for Robust + Event-guided Object Tracking + + +
+ The unique complementarity of frame-based and event cameras for high frame rate object tracking has recently inspired some research attempts to develop multi-modal fusion approaches. However, these methods directly fuse both modalities and thus ignore the environmental attributes, e.g., motion blur, illumination variance, occlusion, scale variation, etc. Meanwhile, insufficient interaction between search and template features makes distinguishing target objects and backgrounds difficult. As a result, performance degrades, especially in challenging conditions. This paper proposes a novel and effective Transformer-based event-guided tracking framework, called eMoE-Tracker, which achieves new SOTA performance under various conditions. Our key idea is to disentangle the environment into several learnable attributes to dynamically learn the attribute-specific features and strengthen the target information by improving the interaction between the target template and search regions. To achieve this goal, we first propose an environmental Mix-of-Experts (eMoE) module built from two parts: environmental Attributes Disentanglement, which learns attribute-specific features, and environmental Attributes Assembling, which dynamically combines these features using learnable attribute scores. The eMoE module acts as a lightweight router that prompt-tunes the transformer backbone more efficiently. We then introduce a contrastive relation modeling (CRM) module to emphasize target information by leveraging a contrastive learning strategy between the target template and search regions. Extensive experiments on diverse event-based benchmark datasets showcase the superior performance of our eMoE-Tracker compared to the prior arts. + 
+
+ comment: RGB-event single object tracking +
+
+
+
+
+ + ♻ ☆ Efficient Vectorized Backpropagation Algorithms for Training Feedforward + Networks Composed of Quadratic Neurons + + +
+ Higher order artificial neurons whose outputs are computed by applying an activation function to a higher order multinomial function of the inputs have been considered in the past, but did not gain acceptance due to the extra parameters and computational cost. However, higher order neurons have significantly greater learning capabilities since the decision boundaries of higher order neurons can be complex surfaces instead of just hyperplanes. The boundary of a single quadratic neuron can be a general hyper-quadric surface allowing it to learn many nonlinearly separable datasets. Since quadratic forms can be represented by symmetric matrices, only $\frac{n(n+1)}{2}$ additional parameters are needed instead of $n^2$. A quadratic Logistic regression model is first presented. Solutions to the XOR problem with a single quadratic neuron are considered. The complete vectorized equations for both forward and backward propagation in feedforward networks composed of quadratic neurons are derived. A reduced parameter quadratic neural network model with just $n$ additional parameters per neuron that provides a compromise between learning ability and computational cost is presented. Comparisons on benchmark classification datasets are used to demonstrate that a final layer of quadratic neurons enables networks to achieve higher accuracy with significantly fewer hidden layer neurons. In particular, this paper shows that any dataset composed of $\mathcal{C}$ bounded clusters can be separated with only a single layer of $\mathcal{C}$ quadratic neurons. + 
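+ A single quadratic neuron of the kind discussed above can be written as $y = \sigma(x^{T} Q x + w^{T} x + b)$ with $Q$ symmetric, so only $\frac{n(n+1)}{2}$ extra parameters are needed. The NumPy sketch below illustrates that forward pass; the parameterization details of the paper's reduced model are not reproduced.

    import numpy as np

    def quadratic_neuron(x, q_upper, w, b):
        """Forward pass of one quadratic neuron: sigmoid(x^T Q x + w^T x + b).

        q_upper holds the n(n+1)/2 free entries of the symmetric matrix Q.
        """
        n = x.shape[0]
        Q = np.zeros((n, n))
        iu = np.triu_indices(n)
        Q[iu] = q_upper
        Q = (Q + Q.T) - np.diag(np.diag(Q))   # symmetrize without doubling the diagonal
        z = x @ Q @ x + w @ x + b
        return 1.0 / (1.0 + np.exp(-z))

    # Example: the XOR-like boundary x1*x2 = 0 is realized with a purely off-diagonal Q,
    # something no single linear neuron can represent.
    x = np.array([1.0, -1.0])
    q_upper = np.array([0.0, 1.0, 0.0])       # entries (Q11, Q12, Q22)
    print(quadratic_neuron(x, q_upper, w=np.zeros(2), b=0.0))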
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Visual CoT: Advancing Multi-Modal Language Models with a Comprehensive + Dataset and Benchmark for Chain-of-Thought Reasoning + + +
+ Multi-Modal Large Language Models (MLLMs) have demonstrated impressive performance in various VQA tasks. However, they often lack interpretability and struggle with complex visual inputs, especially when the resolution of the input image is high or when the region of interest that could provide key information for answering the question is small. To address these challenges, we collect and introduce the large-scale Visual CoT dataset comprising 438k question-answer pairs, annotated with intermediate bounding boxes highlighting key regions essential for answering the questions. Additionally, about 98k pairs of them are annotated with detailed reasoning steps. Importantly, we propose a multi-turn processing pipeline that dynamically focuses on visual inputs and provides interpretable thoughts. We also introduce the related benchmark to evaluate the MLLMs in scenarios requiring specific local region identification. Extensive experiments demonstrate the effectiveness of our framework and shed light on better inference strategies. The Visual CoT dataset, benchmark, and pre-trained models are available on https://hao-shao.com/projects/viscot.html to support further research in this area. + 
+
+ comment: Project Page: https://hao-shao.com/projects/viscot.html +
+
+
+
+
+ + ♻ ☆ Understanding Generalizability of Diffusion Models Requires Rethinking + the Hidden Gaussian Structure + + +
+ In this work, we study the generalizability of diffusion models by looking +into the hidden properties of the learned score functions, which are +essentially a series of deep denoisers trained on various noise levels. We +observe that as diffusion models transition from memorization to +generalization, their corresponding nonlinear diffusion denoisers exhibit +increasing linearity. This discovery leads us to investigate the linear +counterparts of the nonlinear diffusion models, which are a series of linear +models trained to match the function mappings of the nonlinear diffusion +denoisers. Surprisingly, these linear denoisers are approximately the optimal +denoisers for a multivariate Gaussian distribution characterized by the +empirical mean and covariance of the training dataset. This finding implies +that diffusion models have the inductive bias towards capturing and utilizing +the Gaussian structure (covariance information) of the training dataset for +data generation. We empirically demonstrate that this inductive bias is a +unique property of diffusion models in the generalization regime, which becomes +increasingly evident when the model's capacity is relatively small compared to +the training dataset size. In the case that the model is highly +overparameterized, this inductive bias emerges during the initial training +phases before the model fully memorizes its training data. Our study provides +crucial insights into understanding the notable strong generalization +phenomenon recently observed in real-world diffusion models. + +
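+ For reference, the "optimal denoiser for a multivariate Gaussian" mentioned above has a standard closed form. If $x \sim \mathcal{N}(\mu, \Sigma)$ and the noisy observation is $y = x + \sigma \epsilon$ with $\epsilon \sim \mathcal{N}(0, I)$, the MMSE (posterior-mean) denoiser is
\[
\hat{x}(y) \;=\; \mu + \Sigma\,(\Sigma + \sigma^{2} I)^{-1}(y - \mu),
\]
+ an affine map that shrinks $y$ toward $\mu$ along the eigen-directions of $\Sigma$. This is a textbook identity rather than a result specific to the paper, but it is the linear counterpart the abstract refers to when $\mu$ and $\Sigma$ are the empirical mean and covariance of the training set.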
+
+
+
+
+ + ♻ ☆ Vision-Aware Text Features in Referring Image Segmentation: From Object + Understanding to Context Understanding + + +
+ Referring image segmentation is a challenging task that involves generating +pixel-wise segmentation masks based on natural language descriptions. The +complexity of this task increases with the intricacy of the sentences provided. +Existing methods have relied mostly on visual features to generate the +segmentation masks while treating text features as supporting components. +However, this under-utilization of text understanding limits the model's +capability to fully comprehend the given expressions. In this work, we propose +a novel framework that specifically emphasizes object and context comprehension +inspired by human cognitive processes through Vision-Aware Text Features. +Firstly, we introduce a CLIP Prior module to localize the main object of +interest and embed the object heatmap into the query initialization process. +Secondly, we propose a combination of two components: Contextual Multimodal +Decoder and Meaning Consistency Constraint, to further enhance the coherent and +consistent interpretation of language cues with the contextual understanding +obtained from the image. Our method achieves significant performance +improvements on three benchmark datasets RefCOCO, RefCOCO+ and G-Ref. Project +page: \url{https://vatex.hkustvgd.com/}. + +
+
+ comment: This paper is accepted in WACV 2025 +
+
+
+
+
+ + ♻ ☆ ChartGemma: Visual Instruction-tuning for Chart Reasoning in the Wild + + +
+ Given the ubiquity of charts as a data analysis, visualization, and +decision-making tool across industries and sciences, there has been a growing +interest in developing pre-trained foundation models as well as general purpose +instruction-tuned models for chart understanding and reasoning. However, +existing methods suffer crucial drawbacks across two critical axes affecting +the performance of chart representation models: they are trained on data +generated from underlying data tables of the charts, ignoring the visual trends +and patterns in chart images, and use weakly aligned vision-language backbone +models for domain-specific training, limiting their generalizability when +encountering charts in the wild. We address these important drawbacks and +introduce ChartGemma, a novel chart understanding and reasoning model developed +over PaliGemma. Rather than relying on underlying data tables, ChartGemma is +trained on instruction-tuning data generated directly from chart images, thus +capturing both high-level trends and low-level visual information from a +diverse set of charts. Our simple approach achieves state-of-the-art results +across $5$ benchmarks spanning chart summarization, question answering, and +fact-checking, and our elaborate qualitative studies on real-world charts show +that ChartGemma generates more realistic and factually correct summaries +compared to its contemporaries. We release the code, model checkpoints, +dataset, and demos at https://github.com/vis-nlp/ChartGemma. + +
+
+
+
+
+ + ♻ ☆ Survey on Adversarial Attack and Defense for Medical Image Analysis: + Methods and Challenges + + +
+ Deep learning techniques have achieved superior performance in computer-aided +medical image analysis, yet they are still vulnerable to imperceptible +adversarial attacks, resulting in potential misdiagnosis in clinical practice. +Oppositely, recent years have also witnessed remarkable progress in defense +against these tailored adversarial examples in deep medical diagnosis systems. +In this exposition, we present a comprehensive survey on recent advances in +adversarial attacks and defenses for medical image analysis with a systematic +taxonomy in terms of the application scenario. We also provide a unified +framework for different types of adversarial attack and defense methods in the +context of medical image analysis. For a fair comparison, we establish a new +benchmark for adversarially robust medical diagnosis models obtained by +adversarial training under various scenarios. To the best of our knowledge, +this is the first survey paper that provides a thorough evaluation of +adversarially robust medical diagnosis models. By analyzing qualitative and +quantitative results, we conclude this survey with a detailed discussion of +current challenges for adversarial attack and defense in medical image analysis +systems to shed light on future research directions. Code is available on +\href{https://github.com/tomvii/Adv_MIA}{\color{red}{GitHub}}. + +
+
+ comment: Accepted by ACM Computing Surveys (CSUR) (DOI: + https://doi.org/10.1145/3702638) +
+
+
+
+
+ + ♻ ☆ Camera-Based HRV Prediction for Remote Learning Environments + + +
+ In recent years, due to the widespread use of internet videos, remote photoplethysmography (rPPG) has gained more and more attention in the field of affective computing. Restoring blood volume pulse (BVP) signals from facial videos is a challenging task that involves a series of preprocessing, image algorithms, and postprocessing to restore waveforms. Not only is the heart rate metric utilized for affective computing, but the heart rate variability (HRV) metric is even more significant. The challenge in obtaining HRV indices through rPPG lies in the necessity for algorithms to precisely predict the BVP peak positions. In this paper, we collected the Remote Learning Affect and Physiology (RLAP) dataset, which includes over 32 hours of highly synchronized video and labels from 58 subjects. This is a public dataset whose BVP labels have been meticulously designed to better suit the training of HRV models. Using the RLAP dataset, we trained a new model called Seq-rPPG, a model based on one-dimensional convolution. Experimental results reveal that this structure is more suitable for handling HRV tasks: it outperformed all other baselines in HRV performance and also demonstrated significant advantages in computational efficiency. + 
+
+
+
+
+ + ♻ ☆ Neural Pose Representation Learning for Generating and Transferring + Non-Rigid Object Poses NeurIPS 2024 + + +
+ We propose a novel method for learning representations of poses for 3D +deformable objects, which specializes in 1) disentangling pose information from +the object's identity, 2) facilitating the learning of pose variations, and 3) +transferring pose information to other object identities. Based on these +properties, our method enables the generation of 3D deformable objects with +diversity in both identities and poses, using variations of a single object. It +does not require explicit shape parameterization such as skeletons or joints, +point-level or shape-level correspondence supervision, or variations of the +target object for pose transfer. To achieve pose disentanglement, compactness +for generative models, and transferability, we first design the pose extractor +to represent the pose as a keypoint-based hybrid representation and the pose +applier to learn an implicit deformation field. To better distill pose +information from the object's geometry, we propose the implicit pose applier to +output an intrinsic mesh property, the face Jacobian. Once the extracted pose +information is transferred to the target object, the pose applier is fine-tuned +in a self-supervised manner to better describe the target object's shapes with +pose variations. The extracted poses are also used to train a cascaded +diffusion model to enable the generation of novel poses. Our experiments with +the DeformThings4D and Human datasets demonstrate state-of-the-art performance +in pose transfer and the ability to generate diverse deformed shapes with +various objects and poses. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ManiWAV: Learning Robot Manipulation from In-the-Wild Audio-Visual Data + + +
+ Audio signals provide rich information about robot interactions and object properties through contact. This information can surprisingly ease the learning of contact-rich robot manipulation skills, especially when the visual information alone is ambiguous or incomplete. However, the usage of audio data in robot manipulation has been constrained to teleoperated demonstrations collected by either attaching a microphone to the robot or object, which significantly limits its usage in robot learning pipelines. In this work, we introduce ManiWAV: an 'ear-in-hand' data collection device to collect in-the-wild human demonstrations with synchronous audio and visual feedback, and a corresponding policy interface to learn robot manipulation policy directly from the demonstrations. We demonstrate the capabilities of our system through four contact-rich manipulation tasks that require either passively sensing the contact events and modes, or actively sensing the object surface materials and states. In addition, we show that our system can generalize to unseen in-the-wild environments by learning from diverse in-the-wild human demonstrations. + 
+
+ comment: Conference on Robot Learning (CoRL) 2024; Project website: + https://maniwav.github.io/ +
+
+
+
+
+ + ♻ ☆ SyncTweedies: A General Generative Framework Based on Synchronized + Diffusions NeurIPS 2024 + + +
+ We introduce a general framework for generating diverse visual content, including ambiguous images, panorama images, mesh textures, and Gaussian splat textures, by synchronizing multiple diffusion processes. We present an exhaustive investigation into all possible scenarios for synchronizing multiple diffusion processes through a canonical space and analyze their characteristics across applications. In doing so, we reveal a previously unexplored case: averaging the outputs of Tweedie's formula while conducting denoising in multiple instance spaces. This case also provides the best quality with the widest applicability to downstream tasks. We name this case SyncTweedies. In our experiments generating the aforementioned visual content, we demonstrate the superior quality of generation by SyncTweedies compared to other synchronization methods, as well as optimization-based and iterative-update-based methods. + 
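+ For context, Tweedie's formula referenced above states that for a noisy observation $y = x + \sigma \epsilon$ with $\epsilon \sim \mathcal{N}(0, I)$, the posterior mean of the clean signal can be written in terms of the marginal score:
\[
\mathbb{E}[x \mid y] \;=\; y + \sigma^{2} \nabla_{y} \log p(y).
\]
+ Per the abstract, SyncTweedies' defining choice is to average these posterior-mean estimates across the multiple instance-space denoising processes (through the canonical space) rather than averaging other intermediate quantities; the formula itself is standard.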
+
+ comment: Project page: https://synctweedies.github.io/ (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Convolutional Kolmogorov-Arnold Networks + + +
+ In this paper, we introduce Convolutional Kolmogorov-Arnold Networks (Convolutional KANs), an innovative alternative to the standard Convolutional Neural Networks (CNNs) that have revolutionized the field of computer vision. By integrating the learnable non-linear activation functions presented in Kolmogorov-Arnold Networks (KANs) into convolutions, we propose a new layer. Throughout the paper, we empirically validate the performance of Convolutional KANs against traditional architectures on the Fashion-MNIST dataset, finding that, in some cases, this new approach maintains a similar level of accuracy while using half the number of parameters. These experiments show that KAN convolutions seem to learn more per kernel, which opens up a new horizon of possibilities in deep learning for computer vision. + 
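+ The core idea, replacing each fixed kernel weight with a small learnable univariate function applied to the input value before summation, can be sketched as below. The Gaussian-RBF parameterization here is a simplified stand-in for the B-spline basis used in the KAN literature, and all sizes are illustrative assumptions rather than the paper's implementation.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class KANConv2d(nn.Module):
        """Sketch of a KAN-style convolution: each kernel entry is a learnable
        univariate function (here: SiLU base term + Gaussian-RBF expansion)."""
        def __init__(self, in_ch, out_ch, k=3, num_basis=8, x_range=2.0):
            super().__init__()
            self.k, self.out_ch = k, out_ch
            self.register_buffer("centers", torch.linspace(-x_range, x_range, num_basis))
            self.base = nn.Parameter(torch.randn(out_ch, in_ch * k * k) * 0.1)
            self.coef = nn.Parameter(torch.randn(out_ch, in_ch * k * k, num_basis) * 0.1)

        def forward(self, x):                                   # x: (B, in_ch, H, W)
            B, _, H, W = x.shape
            patches = F.unfold(x, self.k, padding=self.k // 2)  # (B, in_ch*k*k, L), L = H*W
            rbf = torch.exp(-(patches.unsqueeze(-1) - self.centers) ** 2)   # (B, J, L, M)
            out = torch.einsum("oj,bjl->bol", self.base, F.silu(patches)) \
                + torch.einsum("ojm,bjlm->bol", self.coef, rbf)
            return out.reshape(B, self.out_ch, H, W)

    # Usage sketch:
    # layer = KANConv2d(3, 16); y = layer(torch.randn(2, 3, 28, 28))   # -> (2, 16, 28, 28)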
+
+
+
+
+ + ♻ ☆ ReactFace: Online Multiple Appropriate Facial Reaction Generation in + Dyadic Interactions + + +
+ In dyadic interaction, predicting the listener's facial reactions is challenging as different reactions could be appropriate in response to the same speaker's behaviour. Previous approaches predominantly treated this task as an interpolation or fitting problem, emphasizing deterministic outcomes but ignoring the diversity and uncertainty of human facial reactions. Furthermore, these methods often failed to model short-range and long-range dependencies within the interaction context, leading to issues in the synchrony and appropriateness of the generated facial reactions. To address these limitations, this paper reformulates the task as an extrapolation or prediction problem, and proposes a novel framework (called ReactFace) to generate multiple different but appropriate facial reactions from a speaker's behaviour rather than merely replicating the corresponding listener's facial behaviours. Our ReactFace generates multiple different but appropriate photo-realistic human facial reactions by: (i) learning an appropriate facial reaction distribution representing multiple different but appropriate facial reactions; and (ii) synchronizing the generated facial reactions with the speaker's verbal and non-verbal behaviours at each time stamp, resulting in realistic 2D facial reaction sequences. Experimental results demonstrate the effectiveness of our approach in generating multiple diverse, synchronized, and appropriate facial reactions from each speaker's behaviour. The quality of the generated facial reactions is intimately tied to the speaker's speech and facial expressions, achieved through our novel speaker-listener interaction modules. Our code is made publicly available at \url{https://github.com/lingjivoo/ReactFace}. + 
+
+ comment: Accepted to IEEE Transactions on Visualization and Computer Graphics + (TVCG), 18 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ DeblurDiNAT: A Generalizable Transformer for Perceptual Image Deblurring + + +
+ Although prior state-of-the-art (SOTA) deblurring networks achieve high +metric scores on synthetic datasets, there are two challenges which prevent +them from perceptual image deblurring. First, a deblurring model overtrained on +synthetic datasets may collapse in a broad range of unseen real-world +scenarios. Second, the conventional metrics PSNR and SSIM may not correctly +reflect the perceptual quality observed by human eyes. To this end, we propose +DeblurDiNAT, a generalizable and efficient encoder-decoder Transformer which +restores clean images visually close to the ground truth. We adopt an +alternating dilation factor structure to capture local and global blur +patterns. We propose a local cross-channel learner to assist self-attention +layers to learn short-range cross-channel relationships. In addition, we +present a linear feed-forward network and a non-linear dual-stage feature +fusion module for faster feature propagation across the network. Compared to +nearest competitors, our model demonstrates the strongest generalization +ability and achieves the best perceptual quality on mainstream image deblurring +datasets with 3%-68% fewer parameters. + +
+
+
+
+
+ + ♻ ☆ Is A Picture Worth A Thousand Words? Delving Into Spatial Reasoning for + Vision Language Models NeurIPS 2024 + + +
+ Large language models (LLMs) and vision-language models (VLMs) have +demonstrated remarkable performance across a wide range of tasks and domains. +Despite this promise, spatial understanding and reasoning -- a fundamental +component of human cognition -- remains under-explored. We propose SpatialEval, +a novel benchmark that covers diverse aspects of spatial reasoning such as +relationship understanding, navigation, and counting. We conduct a +comprehensive evaluation of competitive language and vision-language models. +Our findings reveal several counter-intuitive insights that have been +overlooked in the literature: (1) Spatial reasoning poses significant +challenges where competitive models can fall behind random guessing; (2) +Despite additional visual input, VLMs often under-perform compared to their LLM +counterparts; (3) When both textual and visual information is available, +multi-modal language models become less reliant on visual information if +sufficient textual clues are provided. Additionally, we demonstrate that +leveraging redundancy between vision and text can significantly enhance model +performance. We hope our study will inform the development of multimodal models +to improve spatial intelligence and further close the gap with human +intelligence. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 19 + +
+
+
+ + ☆ Large-Scale Multi-Robot Coverage Path Planning on Grids with Path + Deconfliction + + +
+ We study Multi-Robot Coverage Path Planning (MCPP) on a 4-neighbor 2D grid G, +which aims to compute paths for multiple robots to cover all cells of G. +Traditional approaches are limited as they first compute coverage trees on a +quadrant coarsened grid H and then employ the Spanning Tree Coverage (STC) +paradigm to generate paths on G, making them inapplicable to grids with +partially obstructed 2x2 blocks. To address this limitation, we reformulate the +problem directly on G, revolutionizing grid-based MCPP solving and establishing +new NP-hardness results. We introduce Extended-STC (ESTC), a novel paradigm +that extends STC to ensure complete coverage with bounded suboptimality, even +when H includes partially obstructed blocks. Furthermore, we present LS-MCPP, a +new algorithmic framework that integrates ESTC with three novel types of +neighborhood operators within a local search strategy to optimize coverage +paths directly on G. Unlike prior grid-based MCPP work, our approach also +incorporates a versatile post-processing procedure that applies Multi-Agent +Path Finding (MAPF) techniques to MCPP for the first time, enabling a fusion of +these two important fields in multi-robot coordination. This procedure +effectively resolves inter-robot conflicts and accommodates turning costs by +solving a MAPF variant, making our MCPP solutions more practical for real-world +applications. Extensive experiments demonstrate that our approach significantly +improves solution quality and efficiency, managing up to 100 robots on grids as +large as 256x256 within minutes of runtime. Validation with physical robots +confirms the feasibility of our solutions under real-world conditions. + +
+
+ comment: Submitted to T-RO +
+
+
+
+
+ + ☆ Neural Inverse Source Problems + + +
+ Reconstructing unknown external source functions is an important perception +capability for a large range of robotics domains including manipulation, +aerial, and underwater robotics. In this work, we propose a Physics-Informed +Neural Network (PINN [1]) based approach for solving the inverse source +problems in robotics, jointly identifying unknown source functions and the +complete state of a system given partial and noisy observations. Our approach +demonstrates several advantages over prior works (Finite Element Methods (FEM) +and data-driven approaches): it offers flexibility in integrating diverse +constraints and boundary conditions; eliminates the need for complex +discretizations (e.g., meshing); easily accommodates gradients from real +measurements; and does not limit performance based on the diversity and quality +of training data. We validate our method across three simulation and real-world +scenarios involving up to 4th order partial differential equations (PDEs), +constraints such as Signorini and Dirichlet, and various regression losses +including Chamfer distance and L2 norm. + +
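+ As a concrete (and much simplified) illustration of the PINN-based inverse source idea, the sketch below jointly fits a state network u and a source network f to a 1D Poisson equation u''(x) = f(x) from sparse, noisy observations of u. The PDE, network sizes, and loss weights are assumptions for illustration; the paper tackles PDEs up to 4th order with richer constraints and measurement losses.

    import torch
    import torch.nn as nn

    def mlp(width=64):
        return nn.Sequential(nn.Linear(1, width), nn.Tanh(),
                             nn.Linear(width, width), nn.Tanh(),
                             nn.Linear(width, 1))

    u_net, f_net = mlp(), mlp()                 # state u(x) and unknown source f(x)
    opt = torch.optim.Adam(list(u_net.parameters()) + list(f_net.parameters()), lr=1e-3)

    x_obs = torch.rand(32, 1)                   # sparse, noisy observations of u
    u_obs = torch.sin(torch.pi * x_obs) + 0.01 * torch.randn_like(x_obs)

    for it in range(2000):
        # PDE residual u''(x) - f(x) on random collocation points.
        x = torch.rand(128, 1, requires_grad=True)
        u = u_net(x)
        du = torch.autograd.grad(u, x, torch.ones_like(u), create_graph=True)[0]
        d2u = torch.autograd.grad(du, x, torch.ones_like(du), create_graph=True)[0]
        loss = ((d2u - f_net(x)) ** 2).mean() + ((u_net(x_obs) - u_obs) ** 2).mean()
        opt.zero_grad(); loss.backward(); opt.step()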
+
+
+
+
+ + ☆ Know Where You're Uncertain When Planning with Multimodal Foundation + Models: A Formal Framework + + +
+ Multimodal foundation models offer a promising framework for robotic +perception and planning by processing sensory inputs to generate actionable +plans. However, addressing uncertainty in both perception (sensory +interpretation) and decision-making (plan generation) remains a critical +challenge for ensuring task reliability. We present a comprehensive framework +to disentangle, quantify, and mitigate these two forms of uncertainty. We first +introduce a framework for uncertainty disentanglement, isolating perception +uncertainty arising from limitations in visual understanding and decision +uncertainty relating to the robustness of generated plans. + To quantify each type of uncertainty, we propose methods tailored to the +unique properties of perception and decision-making: we use conformal +prediction to calibrate perception uncertainty and introduce +Formal-Methods-Driven Prediction (FMDP) to quantify decision uncertainty, +leveraging formal verification techniques for theoretical guarantees. Building +on this quantification, we implement two targeted intervention mechanisms: an +active sensing process that dynamically re-observes high-uncertainty scenes to +enhance visual input quality and an automated refinement procedure that +fine-tunes the model on high-certainty data, improving its capability to meet +task specifications. Empirical validation in real-world and simulated robotic +tasks demonstrates that our uncertainty disentanglement framework reduces +variability by up to 40% and enhances task success rates by 5% compared to +baselines. These improvements are attributed to the combined effect of both +interventions and highlight the importance of uncertainty disentanglement which +facilitates targeted interventions that enhance the robustness and reliability +of autonomous systems. + +
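+ For context on the calibration step, split conformal prediction turns any perception model's scores into prediction sets with a finite-sample coverage guarantee by thresholding nonconformity scores at a corrected quantile. The sketch below is the generic recipe, not the paper's FMDP component, and the score function is an assumption.

    import numpy as np

    def conformal_threshold(cal_probs, cal_labels, alpha=0.1):
        """Split conformal calibration: nonconformity = 1 - p(true class)."""
        n = len(cal_labels)
        scores = 1.0 - cal_probs[np.arange(n), cal_labels]
        q_level = np.ceil((n + 1) * (1 - alpha)) / n        # finite-sample correction
        return np.quantile(scores, min(q_level, 1.0), method="higher")

    def prediction_set(probs, qhat):
        # Keep every class whose nonconformity 1 - p stays below the threshold.
        return np.where(1.0 - probs <= qhat)[0]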
+
+ comment: Fine-tuned models, code, and datasets are available at + https://tinyurl.com/uncertainty-disentanglement +
+
+
+
+
+ + ☆ GITSR: Graph Interaction Transformer-based Scene Representation for + Multi Vehicle Collaborative Decision-making + + +
+ In this study, we propose GITSR, an effective framework for Graph Interaction +Transformer-based Scene Representation for multi-vehicle collaborative +decision-making in intelligent transportation systems. In the context of mixed +traffic where Connected Automated Vehicles (CAVs) and Human Driving Vehicles +(HDVs) coexist, in order to enhance the understanding of the environment by +CAVs and improve their decision-making capabilities, this framework focuses on +efficient scene representation and the modeling of spatial interaction +behaviors of traffic states. We first extract features of the driving +environment based on the background of intelligent networking. Subsequently, +the local scene representation, which is based on the agent-centric and dynamic +occupation grid, is calculated by the Transformer module. Besides, the feasible +region of the map is captured through the multi-head attention mechanism to +reduce collisions between vehicles. Notably, spatial interaction behaviors, based +on motion information, are modeled as graph structures and extracted via a Graph +Neural Network (GNN). Ultimately, the collaborative decision-making among +multiple vehicles is formulated as a Markov Decision Process (MDP), with +driving actions output by Reinforcement Learning (RL) algorithms. Our +algorithmic validation is executed within the extremely challenging scenario of +the highway off-ramp task, thereby substantiating the superiority of the +agent-centric approach to scene representation. Simulation results demonstrate that the GITSR +method can not only effectively capture scene representation but also extract +spatial interaction data, outperforming the baseline method across various +comparative metrics. + +
+
+
+
+
+ + ☆ An Aerial Transport System in Marine GNSS-Denied Environment + + +
+ This paper presents an autonomous aerial system specifically engineered for +operation in challenging marine GNSS-denied environments, aimed at transporting +small cargo from a target vessel. In these environments, characterized by +weakly textured sea surfaces with few feature points, chaotic deck oscillations +due to waves, and significant wind gusts, conventional navigation methods often +prove inadequate. Leveraging the DJI M300 platform, our system is designed to +autonomously navigate and transport cargo while overcoming these environmental +challenges. In particular, this paper proposes an anchor-based localization +method using ultra-wideband (UWB) and QR code facilities, which decouples the +UAV's attitude from that of the moving landing platform, thus reducing control +oscillations caused by platform movement. Additionally, a motor-driven +attachment mechanism for cargo is designed, which enhances the UAV's field of +view during descent and ensures a reliable attachment to the cargo upon +landing. The system's reliability and effectiveness were progressively enhanced +through multiple outdoor experimental iterations and were validated by the +successful cargo transport during the 2024 Mohamed Bin Zayed International +Robotics Challenge (MBZIRC2024) competition. Crucially, the system addresses +uncertainties and interferences inherent in maritime transportation missions +without prior knowledge of cargo locations on the deck and with strict +limitations on intervention throughout the transportation. + +
+
+
+
+
+ + ☆ Addressing Failures in Robotics using Vision-Based Language Models + (VLMs) and Behavior Trees (BT) + + +
+ In this paper, we propose an approach that combines Vision Language Models +(VLMs) and Behavior Trees (BTs) to address failures in robotics. Current +robotic systems can handle known failures with pre-existing recovery +strategies, but they are often ill-equipped to manage unknown failures or +anomalies. We introduce VLMs as a monitoring tool to detect and identify +failures during task execution. Additionally, VLMs generate missing conditions +or skill templates that are then incorporated into the BT, ensuring the system +can autonomously address similar failures in future tasks. We validate our +approach through simulations in several failure scenarios. + +
+
+
+
+
+ + ☆ Interaction-Aware Trajectory Prediction for Safe Motion Planning in + Autonomous Driving: A Transformer-Transfer Learning Approach + + +
+ A critical aspect of safe and efficient motion planning for autonomous +vehicles (AVs) is to handle the complex and uncertain behavior of surrounding +human-driven vehicles (HDVs). Despite intensive research on driver behavior +prediction, existing approaches typically overlook the interactions between AVs +and HDVs assuming that HDV trajectories are not affected by AV actions. To +address this gap, we present a transformer-transfer learning-based +interaction-aware trajectory predictor for safe motion planning of autonomous +driving, focusing on a vehicle-to-vehicle (V2V) interaction scenario consisting +of an AV and an HDV. Specifically, we construct a transformer-based +interaction-aware trajectory predictor using widely available datasets of HDV +trajectory data and further transfer the learned predictor using a small set of +AV-HDV interaction data. Then, to better incorporate the proposed trajectory +predictor into the motion planning module of AVs, we introduce an uncertainty +quantification method to characterize the errors of the predictor, which are +integrated into the path-planning process. Our experimental results demonstrate +the value of explicitly considering interactions and handling uncertainties. + +
+
+
+
+
+ + ☆ Exploring the Edges of Latent State Clusters for Goal-Conditioned + Reinforcement Learning NeurIPS2024 + + +
+ Exploring unknown environments efficiently is a fundamental challenge in +unsupervised goal-conditioned reinforcement learning. While selecting +exploratory goals at the frontier of previously explored states is an effective +strategy, the policy during training may still have limited capability of +reaching rare goals on the frontier, resulting in reduced exploratory behavior. +We propose "Cluster Edge Exploration" ($CE^2$), a new goal-directed exploration +algorithm that, when choosing goals in sparsely explored areas of the state +space, gives priority to goal states that remain accessible to the agent. The +key idea is to cluster, in a latent space, states that are easily reachable from one +another by the current policy under training, and to traverse to states holding +significant exploration potential on the boundary of these +clusters before performing exploratory behavior. In challenging robotics +environments including navigating a maze with a multi-legged ant robot, +manipulating objects with a robot arm on a cluttered tabletop, and rotating +objects in the palm of an anthropomorphic robotic hand, $CE^2$ demonstrates +superior efficiency in exploration compared to baseline methods and ablations. + +
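+ One way to picture the cluster-edge idea is the sketch below: cluster replay-buffer
+ latents with k-means and prefer candidate goals whose distance margin between the two
+ nearest centroids is small, i.e. states lying near a cluster boundary. The selection rule
+ and names here are illustrative assumptions, not the paper's exact criterion.
+
+ ```python
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ def select_edge_goals(latent_states, n_clusters=8, n_goals=16):
+     """Pick goal candidates lying near the boundaries of latent-state clusters."""
+     km = KMeans(n_clusters=n_clusters, n_init=10).fit(latent_states)
+     dists = km.transform(latent_states)             # distance of each state to every centroid
+     sorted_d = np.sort(dists, axis=1)
+     margin = sorted_d[:, 1] - sorted_d[:, 0]        # small margin -> close to a cluster edge
+     edge_idx = np.argsort(margin)[:n_goals]
+     return latent_states[edge_idx]
+
+ # Hypothetical replay-buffer latents: 2048 states embedded in a 32-D latent space.
+ goals = select_edge_goals(np.random.randn(2048, 32))
+ ```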
+
+ comment: NeurIPS2024 Poster +
+
+
+
+
+ + ☆ Wallbounce : Push wall to navigate with Contact-Implicit MPC + + +
+ In this work, we introduce a framework that enables highly maneuverable +locomotion using non-periodic contacts. This task is challenging for +traditional optimization and planning methods to handle due to difficulties in +specifying contact mode sequences in real time. To address this, we use a +bi-level contact-implicit planner and hybrid model predictive controller to +draft and execute a motion plan. We investigate how this method allows us to +plan arm contact events on the shmoobot, a smaller ballbot, which uses an +inverse mouse-ball drive to achieve dynamic balancing with a low number of +actuators. Through multiple experiments we show how the arms allow for +acceleration, deceleration and dynamic obstacle avoidance that are not +achievable with the mouse-ball drive alone. This demonstrates how a holistic +approach to locomotion can increase the control authority of unique robot +morphologies without additional hardware by leveraging robot arms that are +typically used only for manipulation. Project website: +https://cmushmoobot.github.io/Wallbounce + +
+
+
+
+
+ + ☆ Learning World Models for Unconstrained Goal Navigation NeurIPS2024 + + +
+ Learning world models offers a promising avenue for goal-conditioned +reinforcement learning with sparse rewards. By allowing agents to plan actions +or exploratory goals without direct interaction with the environment, world +models enhance exploration efficiency. The quality of a world model hinges on +the richness of data stored in the agent's replay buffer, with expectations of +reasonable generalization across the state space surrounding recorded +trajectories. However, challenges arise in generalizing learned world models to +state transitions backward along recorded trajectories or between states across +different trajectories, hindering their ability to accurately model real-world +dynamics. To address these challenges, we introduce a novel goal-directed +exploration algorithm, MUN (short for "World Models for Unconstrained Goal +Navigation"). This algorithm is capable of modeling state transitions between +arbitrary subgoal states in the replay buffer, thereby facilitating the +learning of policies to navigate between any "key" states. Experimental results +demonstrate that MUN strengthens the reliability of world models and +significantly improves the policy's capacity to generalize across new goal +settings. + +
+
+ comment: NeurIPS2024 Poster. arXiv admin note: substantial text overlap with + arXiv:2411.01396 +
+
+
+
+
+ + ♻ ☆ MapEx: Indoor Structure Exploration with Probabilistic Information Gain + from Global Map Predictions + + +
+ Exploration is a critical challenge in robotics, centered on understanding +unknown environments. In this work, we focus on robots exploring structured +indoor environments, which are often predictable and composed of repeating +patterns. Most existing approaches, such as conventional frontier approaches, +have difficulty leveraging the predictability and explore with simple +heuristics such as `closest first'. Recent works use deep learning techniques +to predict unknown regions of the map, using these predictions for information +gain calculation. However, these approaches are often sensitive to the +predicted map quality or do not reason over sensor coverage. To overcome these +issues, our key insight is to jointly reason over what the robot can observe +and its uncertainty to calculate probabilistic information gain. We introduce +MapEx, a new exploration framework that uses predicted maps to form a +probabilistic sensor model for information gain estimation. MapEx generates +multiple predicted maps based on observed information, and takes into +consideration both the computed variances of predicted maps and estimated +visible area to estimate the information gain of a given viewpoint. Experiments +on the real-world KTH dataset showed an average 12.4% improvement over +representative map-prediction based exploration and a 25.4% improvement over the +nearest-frontier approach. + +
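+ The core computation can be sketched as below: given an ensemble of predicted occupancy
+ maps and an estimated visibility mask for a candidate viewpoint, sum a per-cell uncertainty
+ term over the visible cells. The combination rule and weights are assumptions for
+ illustration, not MapEx's exact formulation.
+
+ ```python
+ import numpy as np
+
+ def viewpoint_information_gain(predicted_maps, visible_mask):
+     """predicted_maps: (K, H, W) ensemble of predicted occupancy probabilities.
+     visible_mask: (H, W) boolean mask of cells estimated visible from the viewpoint."""
+     mean_occ = predicted_maps.mean(axis=0)
+     var_occ = predicted_maps.var(axis=0)            # disagreement across the predictions
+     # Cells that are both uncertain and expected to be observable contribute the most.
+     cell_gain = var_occ + 1e-3 * (1.0 - np.abs(2.0 * mean_occ - 1.0))
+     return float((cell_gain * visible_mask).sum())
+
+ # Hypothetical ensemble of five 128x128 predicted maps and a ray-cast visibility mask.
+ maps = np.random.rand(5, 128, 128)
+ visible = np.zeros((128, 128), dtype=bool)
+ visible[40:80, 40:80] = True
+ gain = viewpoint_information_gain(maps, visible)
+ ```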
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ AnyRotate: Gravity-Invariant In-Hand Object Rotation with Sim-to-Real + Touch + + +
+ Human hands are capable of in-hand manipulation in the presence of different +hand motions. For a robot hand, harnessing rich tactile information to achieve +this level of dexterity still remains a significant challenge. In this paper, +we present AnyRotate, a system for gravity-invariant multi-axis in-hand object +rotation using dense featured sim-to-real touch. We tackle this problem by +training a dense tactile policy in simulation and present a sim-to-real method +for rich tactile sensing to achieve zero-shot policy transfer. Our formulation +allows the training of a unified policy to rotate unseen objects about +arbitrary rotation axes in any hand direction. In our experiments, we highlight +the benefit of capturing detailed contact information when handling objects of +varying properties. Interestingly, we found rich multi-fingered tactile sensing +can detect unstable grasps and provide a reactive behavior that improves the +robustness of the policy. The project website can be found at +https://maxyang27896.github.io/anyrotate/. + +
+
+ comment: Project website can be found at + https://maxyang27896.github.io/anyrotate/ +
+
+
+
+
+ + ♻ ☆ DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving + + +
+ We present a novel autonomous driving framework, DualAD, designed to imitate +human reasoning during driving. DualAD comprises two layers: a rule-based +motion planner at the bottom layer that handles routine driving tasks requiring +minimal reasoning, and an upper layer featuring a rule-based text encoder that +converts driving scenarios from absolute states into text description. This +text is then processed by a large language model (LLM) to make driving +decisions. The upper layer intervenes in the bottom layer's decisions when +potential danger is detected, mimicking human reasoning in critical situations. +Closed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained +model, significantly outperforms rule-based motion planners that lack reasoning +abilities. Our experiments also highlight the effectiveness of the text +encoder, which considerably enhances the model's scenario understanding. +Additionally, the integrated DualAD model improves with stronger LLMs, +indicating the framework's potential for further enhancement. Code and +benchmarks are available at github.com/TUM-AVS/DualAD. + +
+
+ comment: Autonomous Driving, Large Language Models (LLMs), Human Reasoning, + Critical Scenario +
+
+
+
+
+ + ♻ ☆ SuperVINS: A Real-Time Visual-Inertial SLAM Framework for Challenging + Imaging Conditions + + +
+ Traditional visual-inertial SLAM systems often struggle with stability +under low-light or motion-blur conditions, leading to potential loss of +trajectory tracking. High accuracy and robustness are essential for the +long-term and stable localization capabilities of SLAM systems. Addressing the +challenges of enhancing robustness and accuracy in visual-inertial SLAM, this +paper proposes SuperVINS, a real-time visual-inertial SLAM framework designed +for challenging imaging conditions. In contrast to geometric modeling, deep +learning features are capable of fully leveraging the implicit information +present in images, which is often not captured by geometric features. +Therefore, SuperVINS, developed as an enhancement of VINS-Fusion, integrates +the deep learning neural network model SuperPoint for feature point extraction +and loop closure detection. At the same time, the deep learning +LightGlue model for associating feature points is integrated into front-end +feature matching. A feature matching enhancement strategy based on the RANSAC +algorithm is proposed. The system is allowed to set different masks and RANSAC +thresholds for various environments, thereby balancing computational cost and +localization accuracy. Additionally, it allows for flexible training of a +specific SuperPoint bag of words tailored for loop closure detection in +particular environments. The system enables real-time localization and mapping. +Experimental validation on the well-known EuRoC dataset demonstrates that +SuperVINS is comparable to other visual-inertial SLAM systems in accuracy and +robustness across the most challenging sequences. This paper analyzes the +advantages of SuperVINS in terms of accuracy, real-time performance, and +robustness. To facilitate knowledge exchange within the field, we have made the +code for this paper publicly available. + +
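+ The RANSAC-based matching enhancement can be illustrated with a generic geometric
+ verification step like the sketch below (OpenCV): putative matches, in practice produced
+ by SuperPoint and LightGlue, are filtered by estimating a fundamental matrix with RANSAC
+ and keeping only the inliers. The threshold value is an assumption, and this is not the
+ paper's exact strategy.
+
+ ```python
+ import cv2
+ import numpy as np
+
+ def ransac_filter_matches(pts0, pts1, threshold=2.0):
+     """Keep matches consistent with a fundamental matrix estimated by RANSAC.
+     pts0, pts1: (N, 2) float32 arrays of matched pixel coordinates."""
+     F, inlier_mask = cv2.findFundamentalMat(pts0, pts1, cv2.FM_RANSAC, threshold, 0.999)
+     if F is None:
+         return np.zeros(len(pts0), dtype=bool)
+     return inlier_mask.ravel().astype(bool)
+
+ # Hypothetical putative matches between two frames (640x480 image assumed).
+ pts0 = (np.random.rand(200, 2) * [640, 480]).astype(np.float32)
+ pts1 = pts0 + np.random.randn(200, 2).astype(np.float32)          # noisy correspondences
+ inliers = ransac_filter_matches(pts0, pts1)
+ ```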
+
+
+
+
+ + ♻ ☆ Real-world Instance-specific Image Goal Navigation: Bridging Domain Gaps + via Contrastive Learning + + +
+ Improving instance-specific image goal navigation (InstanceImageNav), which +locates the identical object in a real-world environment from a query image, is +essential for robotic systems to assist users in finding desired objects. The +challenge lies in the domain gap between low-quality images observed by the +moving robot, characterized by motion blur and low resolution, and high-quality +query images provided by the user. Such domain gaps could significantly reduce +the task success rate but have not been the focus of previous work. To address +this, we propose a novel method called Few-shot Cross-quality Instance-aware +Adaptation (CrossIA), which employs contrastive learning with an instance +classifier to align features between a large set of low-quality images and a few high-quality images. +This approach effectively reduces the domain gap by bringing the latent +representations of cross-quality images closer on an instance basis. +Additionally, the system integrates an object image collection with a +pre-trained deblurring model to enhance the observed image quality. Our method +fine-tunes the SimSiam model, pre-trained on ImageNet, using CrossIA. We +evaluated our method's effectiveness through an InstanceImageNav task with 20 +different types of instances, where the robot identifies, in a +real-world environment, the same instance shown in a high-quality query image. Our experiments showed +that our method improves the task success rate by up to three times compared to +the baseline, a conventional approach based on SuperGlue. These findings +highlight the potential of leveraging contrastive learning and image +enhancement techniques to bridge the domain gap and improve object localization +in robotic applications. The project website is +https://emergentsystemlabstudent.github.io/DomainBridgingNav/. + +
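+ The instance-level alignment objective can be pictured with an InfoNCE-style sketch as
+ below (PyTorch): embeddings of low-quality robot observations are pulled toward
+ embeddings of high-quality images of the same instance and pushed away from other
+ instances. The temperature and batch layout are assumptions, not CrossIA's exact loss.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def instance_alignment_loss(z_low, z_high, instance_ids, temperature=0.1):
+     """z_low: (N, D) embeddings of low-quality observations, z_high: (N, D) embeddings of
+     high-quality images, instance_ids: (N,) instance label shared by the aligned rows."""
+     z_low = F.normalize(z_low, dim=1)
+     z_high = F.normalize(z_high, dim=1)
+     logits = z_low @ z_high.t() / temperature                      # (N, N) similarity matrix
+     positives = instance_ids.unsqueeze(1) == instance_ids.unsqueeze(0)
+     log_prob = F.log_softmax(logits, dim=1)
+     per_row = -(log_prob * positives.float()).sum(1) / positives.sum(1).clamp(min=1)
+     return per_row.mean()
+
+ # Hypothetical batch: 64 aligned low/high-quality pairs covering 20 instances, 128-D features.
+ ids = torch.randint(0, 20, (64,))
+ loss = instance_alignment_loss(torch.randn(64, 128), torch.randn(64, 128), ids)
+ ```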
+
+ comment: See website at + https://emergentsystemlabstudent.github.io/DomainBridgingNav/. Accepted to + IEEE IRC2024 +
+
+
+
+
+ + ♻ ☆ PVPUFormer: Probabilistic Visual Prompt Unified Transformer for + Interactive Image Segmentation + + +
+ Integration of diverse visual prompts like clicks, scribbles, and boxes in +interactive image segmentation significantly facilitates users' interaction as +well as improves interaction efficiency. However, existing studies primarily +encode the position or pixel regions of prompts without considering the +contextual areas around them, resulting in insufficient prompt feedback, which +is not conducive to performance acceleration. To tackle this problem, this +paper proposes a simple yet effective Probabilistic Visual Prompt Unified +Transformer (PVPUFormer) for interactive image segmentation, which allows users +to flexibly input diverse visual prompts with the probabilistic prompt encoding +and feature post-processing to excavate sufficient and robust prompt features +for performance boosting. Specifically, we first propose a Probabilistic +Prompt-unified Encoder (PPuE) to generate a unified one-dimensional vector by +exploring both prompt and non-prompt contextual information, offering richer +feedback cues to accelerate performance improvement. On this basis, we further +present a Prompt-to-Pixel Contrastive (P$^2$C) loss to accurately align both +prompt and pixel features, bridging the representation gap between them to +offer consistent feature representations for mask prediction. Moreover, our +approach designs a Dual-cross Merging Attention (DMA) module to implement +bidirectional feature interaction between image and prompt features, generating +notable features for performance improvement. A comprehensive variety of +experiments on several challenging datasets demonstrates that the proposed +components achieve consistent improvements, yielding state-of-the-art +interactive segmentation performance. Our code is available at +https://github.com/XuZhang1211/PVPUFormer. + +
+
+ comment: Accepted to IEEE Transactions on Image Processing (TIP). Code is + available at https://github.com/XuZhang1211/PVPUFormer +
+
+
+
+
+ + ♻ ☆ MSI-NeRF: Linking Omni-Depth with View Synthesis through Multi-Sphere + Image aided Generalizable Neural Radiance Field + + +
+ Panoramic observation using fisheye cameras is significant in virtual reality +(VR) and robot perception. However, panoramic images synthesized by traditional +methods lack depth information and can only provide three degrees-of-freedom +(3DoF) rotation rendering in VR applications. To fully preserve and exploit the +parallax information within the original fisheye cameras, we introduce +MSI-NeRF, which combines deep learning omnidirectional depth estimation and +novel view synthesis. We construct a multi-sphere image as a cost volume +through feature extraction and warping of the input images. We further build an +implicit radiance field using spatial points and interpolated 3D feature +vectors as input, which can simultaneously realize omnidirectional depth +estimation and 6DoF view synthesis. Leveraging the knowledge from depth +estimation task, our method can learn scene appearance by source view +supervision only. It does not require novel target views and can be trained +conveniently on existing panorama depth estimation datasets. Our network has +the generalization ability to reconstruct unknown scenes efficiently using only +four images. Experimental results show that our method outperforms existing +methods in both depth estimation and novel view synthesis tasks. + +
+
+ comment: 10 pages, 9 figures, Accepted to IEEE/CVF Winter Conference on + Applications of Computer Vision +
+
+
+
+
+ + ♻ ☆ Automatic Tissue Traction Using Miniature Force-Sensing Forceps for + Minimally Invasive Surgery + + +
+ A common limitation of autonomous tissue manipulation in robotic minimally +invasive surgery (MIS) is the absence of force sensing and control at the tool +level. Recently, our team has developed miniature force-sensing forceps that +can simultaneously measure the grasping and pulling forces during tissue +manipulation. Based on this design, here we further present a method to +automate tissue traction that comprises grasping and pulling stages. During +this process, the grasping and pulling forces can be controlled either +separately or simultaneously through force decoupling. The force controller is +built upon a static model of tissue manipulation, considering the interaction +between the force-sensing forceps and soft tissue. The efficacy of this force +control approach is validated through a series of experiments comparing +targeted, estimated, and actual reference forces. To verify the feasibility of +the proposed method in surgical applications, various tissue resections are +conducted on ex vivo tissues employing a dual-arm robotic setup. Finally, we +discuss the benefits of multi-force control in tissue traction, evidenced +through comparative analyses of various ex vivo tissue resections with and +without the proposed method, and the potential generalization with traction on +different tissues. The results affirm the feasibility of implementing automatic +tissue traction using miniature forceps with multi-force control, suggesting +its potential to promote autonomous MIS. A video demonstrating the experiments +can be found at https://youtu.be/f5gXuXe67Ak. + +
+
+ comment: 15 pages, 14 figures, accepted by T-RO +
+
+
+
+
+ + ♻ ☆ Digital Twin-Enhanced Wireless Indoor Navigation: Achieving Efficient + Environment Sensing with Zero-Shot Reinforcement Learning + + +
+ Millimeter-wave (mmWave) communication is a vital component of future +generations of mobile networks, offering not only high data rates but also +precise beams, making it ideal for indoor navigation in complex environments. +However, the challenges of multipath propagation and noisy signal measurements +in indoor spaces complicate the use of mmWave signals for navigation tasks. +Traditional physics-based methods, such as following the angle of arrival +(AoA), often fall short in complex scenarios, highlighting the need for more +sophisticated approaches. Digital twins, as virtual replicas of physical +environments, offer a powerful tool for simulating and optimizing mmWave signal +propagation in such settings. By creating detailed, physics-based models of +real-world spaces, digital twins enable the training of machine learning +algorithms in virtual environments, reducing the costs and limitations of +physical testing. Despite their advantages, current machine learning models +trained in digital twins often overfit specific virtual environments and +require costly retraining when applied to new scenarios. In this paper, we +propose a Physics-Informed Reinforcement Learning (PIRL) approach that +leverages the physical insights provided by digital twins to shape the +reinforcement learning (RL) reward function. By integrating physics-based +metrics such as signal strength, AoA, and path reflections into the learning +process, PIRL enables efficient learning and improved generalization to new +environments without retraining. Our experiments demonstrate that the proposed +PIRL, supported by digital twin simulations, outperforms traditional heuristics +and standard RL models, achieving zero-shot generalization in unseen +environments and offering a cost-effective, scalable solution for wireless +indoor navigation. + +
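+ The reward-shaping idea can be illustrated with a small function like the one below:
+ physics quantities exposed by the digital twin (signal strength, angle of arrival, number
+ of path reflections) are folded into the RL reward. The terms and weights are assumptions
+ for illustration, not the paper's exact reward.
+
+ ```python
+ import numpy as np
+
+ def physics_informed_reward(rssi_dbm, aoa_deg, heading_deg, n_reflections,
+                             w_rssi=0.05, w_aoa=1.0, w_refl=0.2):
+     """Reward stronger signal, better alignment between heading and angle of arrival,
+     and fewer reflections along the dominant path."""
+     aoa_error = np.abs((aoa_deg - heading_deg + 180.0) % 360.0 - 180.0)  # wrapped angle error
+     return (w_rssi * (rssi_dbm + 100.0)        # RSSI roughly in [-100, -40] dBm
+             + w_aoa * (1.0 - aoa_error / 180.0)
+             - w_refl * n_reflections)
+
+ # Hypothetical step: -62 dBm signal, AoA of 30 deg, agent heading 10 deg, 2 reflections.
+ r = physics_informed_reward(-62.0, 30.0, 10.0, 2)
+ ```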
+
+ comment: Submitted to IEEE Open Journal of the Communications Society +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 15 + +
+
+
+ + ☆ Rethinking Weight Decay for Robust Fine-Tuning of Foundation Models + + +
+ Modern optimizers such as AdamW, equipped with momentum and adaptive learning +rates, are designed to escape local minima and explore the vast parameter space. +This exploration is beneficial for finding good loss basins when training from +scratch. It is not necessarily ideal when resuming from a powerful foundation +model because it can lead to large deviations from the pre-trained +initialization and, consequently, worse robustness and generalization. At the +same time, strong regularization on all parameters can lead to under-fitting. +We hypothesize that selectively regularizing the parameter space is the key to +fitting and retaining the pre-trained knowledge. This paper proposes a new +weight decay technique, Selective Projection Decay (SPD), that selectively +imposes a strong penalty on certain layers while allowing others to change +freely. Intuitively, SPD expands and contracts the parameter search space for +layers with consistent and inconsistent loss reduction, respectively. +Experimentally, when equipped with SPD, Adam consistently provides better +in-distribution generalization and out-of-distribution robustness performance +on multiple popular vision and language benchmarks. Code is available +at~\url{https://github.com/GT-RIPL/Selective-Projection-Decay.git} + +
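+ A minimal sketch of the general idea of selective regularization toward the pre-trained
+ initialization is shown below (PyTorch). It is not the SPD update from the paper: the
+ layer-selection rule, the penalty form, and the strength are all illustrative assumptions.
+
+ ```python
+ import torch
+
+ def selective_decay_penalty(model, init_state, selected_layers, strength=0.01):
+     """Penalize drift from the pre-trained weights, but only for selected parameter names.
+     init_state: snapshot of pre-trained parameters taken before fine-tuning."""
+     penalty = torch.zeros((), device=next(model.parameters()).device)
+     for name, p in model.named_parameters():
+         if name in selected_layers:                 # e.g. layers whose loss reduction is inconsistent
+             penalty = penalty + (p - init_state[name].to(p.device)).pow(2).sum()
+     return strength * penalty
+
+ # Hypothetical usage inside a fine-tuning loop:
+ # init_state = {k: v.detach().clone() for k, v in model.named_parameters()}
+ # loss = task_loss + selective_decay_penalty(model, init_state, {"backbone.layer4.0.conv1.weight"})
+ ```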
+
+ comment: Accepted to Neurips 2024 +
+
+
+
+
+ + ☆ ROAD-Waymo: Action Awareness at Scale for Autonomous Driving + + +
+ Autonomous Vehicle (AV) perception systems require more than simply seeing, +via e.g., object detection or scene segmentation. They need a holistic +understanding of what is happening within the scene for safe interaction with +other road users. Few datasets exist for the purpose of developing and training +algorithms to comprehend the actions of other road users. This paper presents +ROAD-Waymo, an extensive dataset for the development and benchmarking of +techniques for agent, action, location and event detection in road scenes, +provided as a layer upon the (US) Waymo Open dataset. Considerably larger and +more challenging than any existing dataset (and encompassing multiple cities), +it comes with 198k annotated video frames, 54k agent tubes, 3.9M bounding boxes +and a total of 12.4M labels. The integrity of the dataset has been confirmed +and enhanced via a novel annotation pipeline designed for automatically +identifying violations of requirements specifically designed for this dataset. +As ROAD-Waymo is compatible with the original (UK) ROAD dataset, it provides +the opportunity to tackle domain adaptation between real-world road scenarios +in different countries within a novel benchmark: ROAD++. + +
+
+
+
+
+ + ☆ MamT$^4$: Multi-view Attention Networks for Mammography Cancer + Classification + + +
+ In this study, we introduce a novel method, called MamT$^4$, which is used +for simultaneous analysis of four mammography images. A decision is made based +on one image of a breast, with attention also devoted to three additional +images: another view of the same breast and two images of the other breast. +This approach enables the algorithm to closely replicate the practice of a +radiologist who reviews the entire set of mammograms for a patient. +Furthermore, this paper emphasizes the preprocessing of images, specifically +proposing a cropping model (U-Net based on ResNet-34) to help the method remove +image artifacts and focus on the breast region. To the best of our knowledge, +this study is the first to achieve a ROC-AUC of 84.0 $\pm$ 1.7 and an F1 score +of 56.0 $\pm$ 1.3 on an independent test dataset of Vietnam digital mammography +(VinDr-Mammo), which is preprocessed with the cropping model. + +
+
+ comment: The crop model is available here: + https://github.com/ispras/mammo_crop +
+
+
+
+
+ + ☆ Degradation-Aware Residual-Conditioned Optimal Transport for Unified + Image Restoration + + +
+ All-in-one image restoration has emerged as a practical and promising +low-level vision task for real-world applications. In this context, the key +issue lies in how to deal with different types of degraded images +simultaneously. In this work, we present a Degradation-Aware +Residual-Conditioned Optimal Transport (DA-RCOT) approach that models +(all-in-one) image restoration as an optimal transport (OT) problem for +unpaired and paired settings, introducing the transport residual as a +degradation-specific cue for both the transport cost and the transport map. +Specifically, we formalize image restoration with a residual-guided OT +objective by exploiting the degradation-specific patterns of the Fourier +residual in the transport cost. More crucially, we design the transport map for +restoration as a two-pass DA-RCOT map, in which the transport residual is +computed in the first pass and then encoded as multi-scale residual embeddings +to condition the second-pass restoration. This conditioning process injects +intrinsic degradation knowledge (e.g., degradation type and level) and +structural information from the multi-scale residual embeddings into the OT +map, which thereby can dynamically adjust its behaviors for all-in-one +restoration. Extensive experiments across five degradations demonstrate the +favorable performance of DA-RCOT as compared to state-of-the-art methods, in +terms of distortion measures, perceptual quality, and image structure +preservation. Notably, DA-RCOT delivers superior adaptability to real-world +scenarios even with multiple degradations and shows distinctive robustness to +both degradation levels and the number of degradations. + +
+
+
+
+
+ + ☆ Optimizing Gastrointestinal Diagnostics: A CNN-Based Model for VCE Image + Classification + + +
+ In recent years, the diagnosis of gastrointestinal (GI) diseases has advanced +greatly with the advent of high-tech video capsule endoscopy (VCE) technology, +which allows for non-invasive observation of the digestive system. The MisaHub +Capsule Vision Challenge encourages the development of vendor-independent +artificial intelligence models that can autonomously classify GI anomalies from +VCE images. This paper presents a CNN architecture designed specifically for +multiclass classification across ten gut categories: angioectasia, +bleeding, erosion, erythema, foreign bodies, lymphangiectasia, polyps, ulcers, +and worms, as well as the normal state. + +
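+ A minimal 10-way CNN classifier in the spirit of this setup could look like the sketch
+ below (PyTorch); the layer sizes and input resolution are assumptions and this is not the
+ architecture proposed in the paper.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class SmallVCEClassifier(nn.Module):
+     """Illustrative CNN for 10-class capsule-endoscopy frame classification."""
+     def __init__(self, num_classes=10):
+         super().__init__()
+         self.features = nn.Sequential(
+             nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
+             nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
+             nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(1),
+         )
+         self.head = nn.Linear(128, num_classes)
+
+     def forward(self, x):
+         return self.head(self.features(x).flatten(1))
+
+ logits = SmallVCEClassifier()(torch.randn(4, 3, 224, 224))         # -> shape (4, 10)
+ ```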
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ MapEx: Indoor Structure Exploration with Probabilistic Information Gain + from Global Map Predictions + + +
+ Exploration is a critical challenge in robotics, centered on understanding +unknown environments. In this work, we focus on robots exploring structured +indoor environments, which are often predictable and composed of repeating +patterns. Most existing approaches, such as conventional frontier approaches, +have difficulty leveraging the predictability and explore with simple +heuristics such as `closest first'. Recent works use deep learning techniques +to predict unknown regions of the map, using these predictions for information +gain calculation. However, these approaches are often sensitive to the +predicted map quality or do not reason over sensor coverage. To overcome these +issues, our key insight is to jointly reason over what the robot can observe +and its uncertainty to calculate probabilistic information gain. We introduce +MapEx, a new exploration framework that uses predicted maps to form a +probabilistic sensor model for information gain estimation. MapEx generates +multiple predicted maps based on observed information, and takes into +consideration both the computed variances of predicted maps and estimated +visible area to estimate the information gain of a given viewpoint. Experiments +on the real-world KTH dataset showed an average 12.4% improvement over +representative map-prediction based exploration and a 25.4% improvement over the +nearest-frontier approach. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ SynCo: Synthetic Hard Negatives in Contrastive Learning for Better + Unsupervised Visual Representations + + +
+ Contrastive learning has become a dominant approach in self-supervised visual +representation learning. Hard negatives - samples closely resembling the anchor +- are key to enhancing learned representations' discriminative power. However, +efficiently leveraging hard negatives remains challenging. We introduce SynCo +(Synthetic Negatives in Contrastive learning), a novel approach that improves +model performance by generating synthetic hard negatives on the representation +space. Building on the MoCo framework, SynCo introduces six strategies for +creating diverse synthetic hard negatives on-the-fly with minimal computational +overhead. SynCo achieves faster training and better representation learning, +reaching 67.9% top-1 accuracy on ImageNet ILSVRC-2012 linear evaluation after +200 pretraining epochs, surpassing MoCo's 67.5% using the same ResNet-50 +encoder. It also transfers more effectively to detection tasks: on PASCAL VOC, +it outperforms both the supervised baseline and MoCo with 82.5% AP; on COCO, it +sets new benchmarks with 40.9% AP for bounding box detection and 35.5% AP for +instance segmentation. Our synthetic hard negative generation approach +significantly enhances visual representations learned through self-supervised +contrastive learning. Code is available at +https://github.com/giakoumoglou/synco. + +
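+ One of the simplest ways to synthesize hard negatives in representation space is
+ interpolation between the anchor and its hardest existing negatives, sketched below
+ (PyTorch). This shows only one generic strategy with assumed names and mixing weight; the
+ paper proposes six distinct strategies that are not reproduced here.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def interpolated_hard_negatives(anchor, queue, k=32, alpha=0.5):
+     """Mix the anchor with its k most similar queue negatives (all L2-normalized)."""
+     anchor = F.normalize(anchor, dim=0)
+     queue = F.normalize(queue, dim=1)
+     sims = queue @ anchor                            # cosine similarity to the anchor
+     hard = queue[sims.topk(k).indices]               # hardest existing negatives
+     synth = F.normalize(alpha * anchor.unsqueeze(0) + (1 - alpha) * hard, dim=1)
+     return synth
+
+ # Hypothetical: a 128-D anchor and a MoCo-style queue of 4096 negative features.
+ synthetic = interpolated_hard_negatives(torch.randn(128), torch.randn(4096, 128))
+ ```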
+
+ comment: 10 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Distilling Invariant Representations with Dual Augmentation + + +
+ Knowledge distillation (KD) has been widely used to transfer knowledge from +large, accurate models (teachers) to smaller, efficient ones (students). Recent +methods have explored enforcing consistency by incorporating causal +interpretations to distill invariant representations. In this work, we extend +this line of research by introducing a dual augmentation strategy to promote +invariant feature learning in both teacher and student models. Our approach +leverages different augmentations applied to both models during distillation, +pushing the student to capture robust, transferable features. This dual +augmentation strategy complements invariant causal distillation by ensuring +that the learned representations remain stable across a wider range of data +variations and transformations. Extensive experiments on CIFAR-100 demonstrate +the effectiveness of this approach, achieving competitive results in +same-architecture KD. + +
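+ The dual augmentation objective can be sketched as a standard distillation loss in which
+ teacher and student see two different augmentations of the same batch, as below (PyTorch).
+ Temperature, loss weighting, and the choice of augmentations are assumptions.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def dual_augmentation_kd_loss(student, teacher, x_aug_s, x_aug_t, labels, T=4.0, beta=0.9):
+     """x_aug_s and x_aug_t are two different augmentations of the same images; the student
+     matches the teacher's softened predictions across the augmentation gap."""
+     with torch.no_grad():
+         t_logits = teacher(x_aug_t)
+     s_logits = student(x_aug_s)
+     kd = F.kl_div(F.log_softmax(s_logits / T, dim=1),
+                   F.softmax(t_logits / T, dim=1),
+                   reduction="batchmean") * T * T
+     ce = F.cross_entropy(s_logits, labels)
+     return beta * kd + (1 - beta) * ce
+ ```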
+
+ comment: This paper presents preliminary results from a project that we have + since discontinued, as our research focus has shifted to new directions +
+
+
+
+
+ + ♻ ☆ AnyV2V: A Tuning-Free Framework For Any Video-to-Video Editing Tasks + + +
+ In the dynamic field of digital content creation using generative models, +state-of-the-art video editing models still do not offer the level of quality +and control that users desire. Previous works on video editing either extended +from image-based generative models in a zero-shot manner or necessitated +extensive fine-tuning, which can hinder the production of fluid video edits. +Furthermore, these methods frequently rely on textual input as the editing +guidance, leading to ambiguities and limiting the types of edits they can +perform. Recognizing these challenges, we introduce AnyV2V, a novel tuning-free +paradigm designed to simplify video editing into two primary steps: (1) +employing an off-the-shelf image editing model to modify the first frame, (2) +utilizing an existing image-to-video generation model to generate the edited +video through temporal feature injection. AnyV2V can leverage any existing +image editing tools to support an extensive array of video editing tasks, +including prompt-based editing, reference-based style transfer, subject-driven +editing, and identity manipulation, which were unattainable by previous +methods. AnyV2V can also support any video length. Our evaluation shows that +AnyV2V achieved CLIP-scores comparable to other baseline methods. Furthermore, +AnyV2V significantly outperformed these baselines in human evaluations, +demonstrating notable improvements in visual consistency with the source video +while producing high-quality edits across all editing tasks. + +
+
+ comment: Published in Transactions on Machine Learning Research (TMLR 2024) + (11/2024) +
+
+
+
+
+ + ♻ ☆ ParallelEdits: Efficient Multi-Aspect Text-Driven Image Editing with + Attention Grouping + + +
+ Text-driven image synthesis has made significant advancements with the +development of diffusion models, transforming how visual content is generated +from text prompts. Despite these advances, text-driven image editing, a key +area in computer graphics, faces unique challenges. A major challenge is making +simultaneous edits across multiple objects or attributes. Applying these +methods sequentially for multi-attribute edits increases computational demands +and efficiency losses. In this paper, we address these challenges with +significant contributions. Our main contribution is the development of +ParallelEdits, a method that seamlessly manages simultaneous edits across +multiple attributes. In contrast to previous approaches, ParallelEdits not only +preserves the quality of single attribute edits but also significantly improves +the performance of multitasking edits. This is achieved through innovative +attention distribution mechanism and multi-branch design that operates across +several processing heads. Additionally, we introduce the PIE-Bench++ dataset, +an expansion of the original PIE-Bench dataset, to better support evaluating +image-editing tasks involving multiple objects and attributes simultaneously. +This dataset is a benchmark for evaluating text-driven image editing methods in +multifaceted scenarios. + +
+
+
+
+
+ + ♻ ☆ GenAI-Bench: Evaluating and Improving Compositional Text-to-Visual + Generation + + +
+ While text-to-visual models now produce photo-realistic images and videos, +they struggle with compositional text prompts involving attributes, +relationships, and higher-order reasoning such as logic and comparison. In this +work, we conduct an extensive human study on GenAI-Bench to evaluate the +performance of leading image and video generation models in various aspects of +compositional text-to-visual generation. We also compare automated evaluation +metrics against our collected human ratings and find that VQAScore -- a metric +measuring the likelihood that a VQA model views an image as accurately +depicting the prompt -- significantly outperforms previous metrics such as +CLIPScore. In addition, VQAScore can improve generation in a black-box manner +(without finetuning) via simply ranking a few (3 to 9) candidate images. +Ranking by VQAScore is 2x to 3x more effective than other scoring methods like +PickScore, HPSv2, and ImageReward at improving human alignment ratings for +DALL-E 3 and Stable Diffusion, especially on compositional prompts that require +advanced visio-linguistic reasoning. We release a new GenAI-Rank benchmark with +over 40,000 human ratings to evaluate scoring metrics on ranking images +generated from the same prompt. Lastly, we discuss promising areas for +improvement in VQAScore, such as addressing fine-grained visual details. We +will release all human ratings (over 80,000) to facilitate scientific +benchmarking of both generative models and automated metrics. + +
+
+ comment: We open-source our dataset, model, and code at: + https://linzhiqiu.github.io/papers/genai_bench ; Project page: + https://linzhiqiu.github.io/papers/genai_bench ; GenAI-Bench was first + introduced in arxiv:2404.01291. This article extends it with an additional + GenAI-Rank benchmark +
+
+
+
+
+ + ♻ ☆ LaB-GATr: geometric algebra transformers for large biomedical surface + and volume meshes + + +
+ Many anatomical structures can be described by surface or volume meshes. +Machine learning is a promising tool to extract information from these 3D +models. However, high-fidelity meshes often contain hundreds of thousands of +vertices, which creates unique challenges in building deep neural network +architectures. Furthermore, patient-specific meshes may not be canonically +aligned, which limits the generalisation of machine learning algorithms. We +propose LaB-GATr, a transformer neural network with geometric tokenisation that +can effectively learn with large-scale (bio-)medical surface and volume meshes +through sequence compression and interpolation. Our method extends the recently +proposed geometric algebra transformer (GATr) and thus respects all Euclidean +symmetries, i.e. rotation, translation and reflection, effectively mitigating +the problem of canonical alignment between patients. LaB-GATr achieves +state-of-the-art results on three tasks in cardiovascular hemodynamics +modelling and neurodevelopmental phenotype prediction, featuring meshes of up +to 200,000 vertices. Our results demonstrate that LaB-GATr is a powerful +architecture for learning with high-fidelity meshes which has the potential to +enable interesting downstream applications. Our implementation is publicly +available. + +
+
+ comment: First published in "Medical Image Computing and Computer Assisted + Intervention" (MICCAI), pp 185-195, 2024 by Springer Nature +
+
+
+
+
+ + ♻ ☆ Multimodality Helps Few-Shot 3D Point Cloud Semantic Segmentation + + +
+ Few-shot 3D point cloud segmentation (FS-PCS) aims at generalizing models to +segment novel categories with minimal annotated support samples. While existing +FS-PCS methods have shown promise, they primarily focus on unimodal point cloud +inputs, overlooking the potential benefits of leveraging multimodal +information. In this paper, we address this gap by introducing a cost-free +multimodal FS-PCS setup, utilizing textual labels and the potentially available +2D image modality. Under this easy-to-achieve setup, we present the MultiModal +Few-Shot SegNet (MM-FSS), a model effectively harnessing complementary +information from multiple modalities. MM-FSS employs a shared backbone with two +heads to extract intermodal and unimodal visual features, and a pretrained text +encoder to generate text embeddings. To fully exploit the multimodal +information, we propose a Multimodal Correlation Fusion (MCF) module to +generate multimodal correlations, and a Multimodal Semantic Fusion (MSF) module +to refine the correlations using text-aware semantic guidance. Additionally, we +propose a simple yet effective Test-time Adaptive Cross-modal Calibration +(TACC) technique to mitigate training bias, further improving generalization. +Experimental results on S3DIS and ScanNet datasets demonstrate significant +performance improvements achieved by our method. The efficacy of our approach +indicates the benefits of leveraging commonly-ignored free modalities for +FS-PCS, providing valuable insights for future research. The code is available +at https://github.com/ZhaochongAn/Multimodality-3D-Few-Shot + +
+
+
+
+
+ + ♻ ☆ Why are Visually-Grounded Language Models Bad at Image Classification? NeurIPS 2024 + + +
+ Image classification is one of the most fundamental capabilities of machine +vision intelligence. In this work, we revisit the image classification task +using visually-grounded language models (VLMs) such as GPT-4V and LLaVA. We +find that existing proprietary and public VLMs, despite often using CLIP as a +vision encoder and having many more parameters, significantly underperform CLIP +on standard image classification benchmarks like ImageNet. To understand the +reason, we explore several hypotheses concerning the inference algorithms, +training objectives, and data processing in VLMs. Our analysis reveals that the +primary cause is data-related: critical information for image classification is +encoded in the VLM's latent space but can only be effectively decoded with +enough training data. Specifically, there is a strong correlation between the +frequency of class exposure during VLM training and instruction-tuning and the +VLM's performance in those classes; when trained with sufficient data, VLMs can +match the accuracy of state-of-the-art classification models. Based on these +findings, we enhance a VLM by integrating classification-focused datasets into +its training, and demonstrate that the enhanced classification performance of +the VLM transfers to its general capabilities, resulting in an improvement of +11.8% on the newly collected ImageWikiQA dataset. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ BrepGen: A B-rep Generative Diffusion Model with Structured Latent + Geometry SIGGRAPH 2024 + + +
+ This paper presents BrepGen, a diffusion-based generative approach that +directly outputs a Boundary representation (B-rep) Computer-Aided Design (CAD) +model. BrepGen represents a B-rep model as a novel structured latent geometry +in a hierarchical tree. With the root node representing a whole CAD solid, each +element of a B-rep model (i.e., a face, an edge, or a vertex) progressively +turns into a child-node from top to bottom. B-rep geometry information goes +into the nodes as the global bounding box of each primitive along with a latent +code describing the local geometric shape. The B-rep topology information is +implicitly represented by node duplication. When two faces share an edge, the +edge curve will appear twice in the tree, and a T-junction vertex with three +incident edges appears six times in the tree with identical node features. +Starting from the root and progressing to the leaf, BrepGen employs +Transformer-based diffusion models to sequentially denoise node features while +duplicated nodes are detected and merged, recovering the B-Rep topology +information. Extensive experiments show that BrepGen advances the task of CAD +B-rep generation, surpassing existing methods on various benchmarks. Results on +our newly collected furniture dataset further showcase its exceptional +capability in generating complicated geometry. While previous methods were +limited to generating simple prismatic shapes, BrepGen incorporates free-form +and doubly-curved surfaces for the first time. Additional applications of +BrepGen include CAD autocomplete and design interpolation. The code, pretrained +models, and dataset are available at https://github.com/samxuxiang/BrepGen. + +
+
+ comment: Accepted to ACM SIGGRAPH 2024. Code at + https://github.com/samxuxiang/BrepGen +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 23 + +
+
+
+ + ☆ Use Digital Twins to Support Fault Diagnosis From System-level + Condition-monitoring Data + + +
+ Deep learning models have created great opportunities for data-driven fault +diagnosis but they require large amounts of labeled failure data for training. +In this paper, we propose to use a digital twin to support developing a +data-driven fault diagnosis model, reducing the amount of failure data used in +the training process. The developed fault diagnosis models are also able to +diagnose component-level failures based on system-level condition-monitoring +data. The proposed framework is evaluated on a real-world robot system. The +results showed that the deep learning model trained by digital twins is able to +diagnose the locations and modes of 9 faults/failures across 4 different motors. +However, the performance of the model trained by a digital twin can still be +improved, especially when the digital twin model has some discrepancy with the +real system. + +
+
+ comment: 6 pages, 4 figure. Paper submitted to 2025 22nd International + Multi-Conference on Systems, Signals & Devices (SSD) +
+
+
+
+
+ + ☆ The Role of Domain Randomization in Training Diffusion Policies for + Whole-Body Humanoid Control + + +
+ Humanoids have the potential to be the ideal embodiment in environments +designed for humans. Thanks to the structural similarity to the human body, +they benefit from rich sources of demonstration data, e.g., collected via +teleoperation, motion capture, or even using videos of humans performing tasks. +However, distilling a policy from demonstrations is still a challenging +problem. While Diffusion Policies (DPs) have shown impressive results in +robotic manipulation, their applicability to locomotion and humanoid control +remains underexplored. In this paper, we investigate how dataset diversity and +size affect the performance of DPs for humanoid whole-body control. In a +simulated IsaacGym environment, we generate synthetic demonstrations by +training Adversarial Motion Prior (AMP) agents under various Domain +Randomization (DR) conditions, and we compare DPs fitted to datasets of +different size and diversity. Our findings show that, although DPs can achieve +stable walking behavior, successful training of locomotion policies requires +significantly larger and more diverse datasets compared to manipulation tasks, +even in simple scenarios. + +
+
+ comment: Conference on Robot Learning, Workshop on Whole-Body Control and + Bimanual Manipulation +
+
+
+
+
+ + ☆ Control Strategies for Pursuit-Evasion Under Occlusion Using Visibility + and Safety Barrier Functions + + +
+ This paper develops a control strategy for pursuit-evasion problems in +environments with occlusions. We address the challenge of a mobile pursuer +keeping a mobile evader within its field of view (FoV) despite line-of-sight +obstructions. The signed distance function (SDF) of the FoV is used to +formulate visibility as a control barrier function (CBF) constraint on the +pursuer's control inputs. Similarly, obstacle avoidance is formulated as a CBF +constraint based on the SDF of the obstacle set. While the visibility and +safety CBFs are Lipschitz continuous, they are not differentiable everywhere, +necessitating the use of generalized gradients. To achieve non-myopic pursuit, +we generate reference control trajectories leading to evader visibility using a +sampling-based kinodynamic planner. The pursuer then tracks this reference via +convex optimization under the CBF constraints. We validate our approach in +CARLA simulations and real-world robot experiments, demonstrating successful +visibility maintenance using only onboard sensing, even under severe occlusions +and dynamic evader movements. + +
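+ The CBF filtering step can be pictured with the standard quadratic program below (cvxpy),
+ which minimally modifies a nominal pursuit command subject to linear barrier constraints.
+ A single-integrator model, the gamma gain, the input bound, and the example barrier values
+ and generalized gradients are all assumptions; the paper's planner and tracking setup are
+ not reproduced.
+
+ ```python
+ import cvxpy as cp
+ import numpy as np
+
+ def cbf_filter(u_ref, barriers, gamma=1.0, u_max=1.5):
+     """Solve min ||u - u_ref||^2 s.t. grad_h @ u + gamma * h >= 0 for every barrier.
+     barriers: list of (h_value, grad_h) pairs, e.g. visibility and obstacle SDF barriers."""
+     u = cp.Variable(2)
+     constraints = [grad_h @ u + gamma * h >= 0 for h, grad_h in barriers]
+     constraints.append(cp.norm(u, "inf") <= u_max)
+     cp.Problem(cp.Minimize(cp.sum_squares(u - u_ref)), constraints).solve()
+     return u.value
+
+ # Hypothetical barriers: visibility h = 0.4 with generalized gradient [0.8, -0.6],
+ # obstacle clearance h = 1.2 with gradient [0.0, 1.0].
+ u_safe = cbf_filter(np.array([1.0, 0.0]),
+                     [(0.4, np.array([0.8, -0.6])), (1.2, np.array([0.0, 1.0]))])
+ ```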
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Receding Hamiltonian-Informed Optimal Neural Control and State + Estimation for Closed-Loop Dynamical Systems + + +
+ This paper formalizes Hamiltonian-Informed Optimal Neural (Hion) controllers, +a novel class of neural network-based controllers for dynamical systems and +explicit non-linear model predictive control. Hion controllers estimate future +states and compute optimal control inputs using Pontryagin's Maximum Principle. +The proposed framework allows for customization of transient behavior, +addressing limitations of existing methods. The Taylored Multi-Faceted Approach +for Neural ODE and Optimal Control (T-mano) architecture facilitates training +and ensures accurate state estimation. Optimal control strategies are +demonstrated for both linear and non-linear dynamical systems. + +
+
+
+
+
+ + ☆ Mixed-Integer MPC-Based Motion Planning Using Hybrid Zonotopes with + Tight Relaxations + + +
+ Autonomous vehicle (AV) motion planning problems often involve non-convex +constraints, which present a major barrier to applying model predictive control +(MPC) in real time on embedded hardware. This paper presents an approach for +efficiently solving mixed-integer MPC motion planning problems using a hybrid +zonotope representation of the obstacle-free space. The MPC optimization +problem is formulated as a multi-stage mixed-integer quadratic program (MIQP) +using a hybrid zonotope representation of the non-convex constraints. +Risk-aware planning is supported by assigning costs to different regions of the +obstacle-free space within the MPC cost function. A multi-stage MIQP solver is +presented that exploits the structure of the hybrid zonotope constraints. For +some hybrid zonotope representations, it is shown that the convex relaxation is +tight, i.e., equal to the convex hull. In conjunction with logical constraints +derived from the AV motion planning context, this property is leveraged to +generate tight quadratic program (QP) sub-problems within a branch-and-bound +mixed-integer solver. The hybrid zonotope structure is further leveraged to +reduce the number of matrix factorizations that need to be computed within the +QP sub-problems. Simulation studies are presented for obstacle-avoidance and +risk-aware motion planning problems using polytopic maps and occupancy grids. +In most cases, the proposed solver finds the optimal solution an order of +magnitude faster than a state-of-the-art commercial solver. +Processor-in-the-loop studies demonstrate the utility of the solver for +real-time implementations on embedded hardware. + +
+
+
+
+
+ + ☆ Task-Oriented Hierarchical Object Decomposition for Visuomotor Control + + +
+ Good pre-trained visual representations could enable robots to learn +visuomotor policy efficiently. Still, existing representations take a +one-size-fits-all-tasks approach that comes with two important drawbacks: (1) +Being completely task-agnostic, these representations cannot effectively ignore +any task-irrelevant information in the scene, and (2) They often lack the +representational capacity to handle unconstrained/complex real-world scenes. +Instead, we propose to train a large combinatorial family of representations +organized by scene entities: objects and object parts. This hierarchical object +decomposition for task-oriented representations (HODOR) permits selectively +assembling different representations specific to each task while scaling in +representational capacity with the complexity of the scene and the task. In our +experiments, we find that HODOR outperforms prior pre-trained representations, +both scene vector representations and object-centric representations, for +sample-efficient imitation learning across 5 simulated and 5 real-world +manipulation tasks. We further find that the invariances captured in HODOR are +inherited into downstream policies, which can robustly generalize to +out-of-distribution test conditions, permitting zero-shot skill chaining. +Appendix, code, and videos: https://sites.google.com/view/hodor-corl24. + +
+
+
+
+
+ + ☆ Efficient Collaborative Navigation through Perception Fusion for + Multi-Robots in Unknown Environments + + +
+ For tasks conducted in unknown environments with efficiency requirements, +real-time navigation of multi-robot systems remains challenging due to +unfamiliarity with surroundings. In this paper, we propose a novel multi-robot +collaborative planning method that leverages the perception of different robots +to intelligently select search directions and improve planning efficiency. +Specifically, a foundational planner is employed to ensure reliable exploration +towards targets in unknown environments, and we introduce a Graph Attention +Architecture with Information Gain Weight (GIWT), which synthesizes the information +from the target robot and its teammates to facilitate effective navigation +around obstacles. In GIWT, after regionally encoding the relative positions of +the robots along with their perceptual features, we compute the shared +attention scores and incorporate the information gain obtained from neighboring +robots as a supplementary weight. We design a corresponding expert data +generation scheme to simulate real-world decision-making conditions for network +training. Simulation experiments and real robot tests demonstrate that the +proposed method significantly improves efficiency and enables collaborative +planning for multiple robots. Our method achieves approximately 82% accuracy on +the expert dataset and reduces the average path length by about 8% and 6% +across two types of tasks compared to the foundational planner in ROS tests, and +achieves a path length reduction of over 6% in real-world experiments. + +
+
+
+
+
+ + ☆ Rotational Odometry using Ultra Low Resolution Thermal Cameras + + +
+ This letter provides what is, to the best of our knowledge, a first study on
+the applicability of ultra-low-resolution thermal cameras for providing
+rotational odometry measurements to navigational devices such as rovers and
+drones. Our use of an ultra-low-resolution thermal camera instead of other
+modalities such as an RGB camera is motivated by its robustness to lighting
+conditions, while being an order of magnitude less expensive than
+higher-resolution thermal cameras. After setting up a custom data acquisition
+system and acquiring thermal camera data together with the associated
+rotational speed labels, we train a small 4-layer Convolutional Neural Network
+(CNN) for regressing the rotational speed from the thermal data. Experiments
+and ablation studies are conducted to determine the impact of thermal camera
+resolution and the number of successive frames on the CNN estimation precision.
+Finally, our novel dataset for the study of low-resolution thermal odometry is
+openly released with the hope of benefiting future research.
+
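+ A rough sketch of a small 4-conv-layer regressor over a stack of low-resolution thermal
+ frames; the layer widths, input resolution, and frame count are assumptions for
+ illustration, not the paper's architecture.
+
+   import torch
+   import torch.nn as nn
+
+   class ThermalOdomCNN(nn.Module):
+       def __init__(self, n_frames=4):
+           super().__init__()
+           self.features = nn.Sequential(
+               nn.Conv2d(n_frames, 16, 3, padding=1), nn.ReLU(),
+               nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(),
+               nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
+               nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
+               nn.AdaptiveAvgPool2d(1),
+           )
+           self.head = nn.Linear(64, 1)      # scalar rotational speed (rad/s)
+
+       def forward(self, x):                 # x: (B, n_frames, H, W)
+           return self.head(self.features(x).flatten(1))
+
+   model = ThermalOdomCNN()
+   speed = model(torch.randn(2, 4, 8, 8))    # two samples of 4 stacked 8x8 frames
+   print(speed.shape)                        # torch.Size([2, 1])
+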
+
+
+
+
+ + ☆ MonoPlane: Exploiting Monocular Geometric Cues for Generalizable 3D + Plane Reconstruction IROS 2024 + + +
+ This paper presents a generalizable 3D plane detection and reconstruction +framework named MonoPlane. Unlike previous robust estimator-based works (which +require multiple images or RGB-D input) and learning-based works (which suffer +from domain shift), MonoPlane combines the best of two worlds and establishes a +plane reconstruction pipeline based on monocular geometric cues, resulting in +accurate, robust and scalable 3D plane detection and reconstruction in the +wild. Specifically, we first leverage large-scale pre-trained neural networks +to obtain the depth and surface normals from a single image. These monocular +geometric cues are then incorporated into a proximity-guided RANSAC framework +to sequentially fit each plane instance. We exploit effective 3D point +proximity and model such proximity via a graph within RANSAC to guide the plane +fitting from noisy monocular depths, followed by image-level multi-plane joint +optimization to improve the consistency among all plane instances. We further +design a simple but effective pipeline to extend this single-view solution to +sparse-view 3D plane reconstruction. Extensive experiments on a list of +datasets demonstrate our superior zero-shot generalizability over baselines, +achieving state-of-the-art plane reconstruction performance in a transferring +setting. Our code is available at https://github.com/thuzhaowang/MonoPlane . + +
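+ For illustration, a plain RANSAC plane fit on 3D points is sketched below; the paper's
+ proximity graph, monocular normal cues, and multi-plane joint optimization are omitted,
+ so this shows only the core point-to-plane residual test.
+
+   import numpy as np
+
+   def ransac_plane(points, iters=200, thresh=0.02, rng=None):
+       if rng is None:
+           rng = np.random.default_rng(0)
+       best_inliers, best_plane = None, None
+       for _ in range(iters):
+           p0, p1, p2 = points[rng.choice(len(points), 3, replace=False)]
+           n = np.cross(p1 - p0, p2 - p0)
+           if np.linalg.norm(n) < 1e-9:          # degenerate (collinear) sample
+               continue
+           n = n / np.linalg.norm(n)
+           d = -n @ p0
+           dist = np.abs(points @ n + d)         # point-to-plane distances
+           inliers = dist < thresh
+           if best_inliers is None or inliers.sum() > best_inliers.sum():
+               best_inliers, best_plane = inliers, (n, d)
+       return best_plane, best_inliers
+
+   pts = np.random.default_rng(1).uniform(-1, 1, size=(500, 3))
+   pts[:400, 2] = 0.0                            # 400 points on the z = 0 plane
+   (n, d), inl = ransac_plane(pts)
+   print(np.round(n, 2), round(float(d), 2), int(inl.sum()))
+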
+
+ comment: IROS 2024 (oral) +
+
+
+
+
+ + ☆ GarmentLab: A Unified Simulation and Benchmark for Garment Manipulation NeurIPS 2024 + + +
+ Manipulating garments and fabrics has long been a critical endeavor in the
+development of home-assistant robots. However, due to complex dynamics and
+topological structures, garment manipulations pose significant challenges.
+Recent successes in reinforcement learning and vision-based methods offer
+promising avenues for learning garment manipulation. Nevertheless, these
+approaches are severely constrained by current benchmarks, which offer limited
+diversity of tasks and unrealistic simulation behavior. Therefore, we present
+GarmentLab, a content-rich benchmark and realistic simulation designed for
+deformable object and garment manipulation. Our benchmark encompasses a diverse
+range of garment types, robotic systems and manipulators. The abundant tasks in
+the benchmark further explore the interactions between garments, deformable
+objects, rigid bodies, fluids, and the human body. Moreover, by incorporating
+multiple simulation methods such as FEM and PBD, along with our proposed
+sim-to-real algorithms and real-world benchmark, we aim to significantly narrow
+the sim-to-real gap. We evaluate state-of-the-art vision methods, reinforcement
+learning, and imitation learning approaches on these tasks, highlighting the
+challenges faced by current algorithms, notably their limited generalization
+capabilities. Our proposed open-source environments and comprehensive analysis
+show a promising boost to future research in garment manipulation by unlocking
+the full potential of these methods. We guarantee that we will open-source our
+code as soon as possible. You can watch the videos in supplementary files to
+learn more about the details of our work. Our project page is available at:
+https://garmentlab.github.io/
+
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Generation of Conservative Dynamical Systems Based on Stiffness Encoding + + +
+ Dynamical systems (DSs) provide a framework for high flexibility, robustness, +and control reliability and are widely used in motion planning and physical +human-robot interaction. The properties of the DS directly determine the +robot's specific motion patterns and the performance of the closed-loop control +system. In this paper, we establish a quantitative relationship between +stiffness properties and DS. We propose a stiffness encoding framework to +modulate DS properties by embedding specific stiffnesses. In particular, from +the perspective of the closed-loop control system's passivity, a conservative +DS is learned by encoding a conservative stiffness. The generated DS has a +symmetric attraction behavior and a variable stiffness profile. The proposed +method is applicable to demonstration trajectories belonging to different +manifolds and types (e.g., closed and self-intersecting trajectories), and the +closed-loop control system is always guaranteed to be passive in different +cases. For controllers tracking the general DS, the passivity of the system +needs to be guaranteed by the energy tank. We further propose a generic vector +field decomposition strategy based on conservative stiffness, which effectively +slows down the decay rate of energy in the energy tank and improves the +stability margin of the control system. Finally, a series of simulations in +various scenarios and experiments on planar and curved motion tasks demonstrate +the validity of our theory and methodology. + +
+
+
+
+
+ + ☆ AquaFuse: Waterbody Fusion for Physics Guided View Synthesis of + Underwater Scenes + + +
+ We introduce the idea of AquaFuse, a physics-based method for synthesizing +waterbody properties in underwater imagery. We formulate a closed-form solution +for waterbody fusion that facilitates realistic data augmentation and +geometrically consistent underwater scene rendering. AquaFuse leverages the +physical characteristics of light propagation underwater to synthesize the +waterbody from one scene to the object contents of another. Unlike data-driven +style transfer, AquaFuse preserves the depth consistency and object geometry in +an input scene. We validate this unique feature by comprehensive experiments +over diverse underwater scenes. We find that the AquaFused images preserve over +94% depth consistency and 90-95% structural similarity of the input scenes. We +also demonstrate that it generates accurate 3D view synthesis by preserving +object geometry while adapting to the inherent waterbody fusion process. +AquaFuse opens up a new research direction in data augmentation by +geometry-preserving style transfer for underwater imaging and robot vision +applications. + +
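+ The closed-form solution is not given in this listing; as an assumed stand-in, the sketch
+ below uses the common underwater image-formation model I = J*exp(-b*d) + B*(1 - exp(-b*d))
+ to strip scene A's waterbody and re-render its content with scene B's waterbody parameters.
+ The parameter values and shapes are purely illustrative.
+
+   import numpy as np
+
+   def dewater(I, depth, beta, B):
+       t = np.exp(-beta * depth[..., None])                   # per-pixel transmission
+       return (I - B * (1.0 - t)) / np.clip(t, 1e-3, None)    # recover direct signal J
+
+   def rewater(J, depth, beta, B):
+       t = np.exp(-beta * depth[..., None])
+       return J * t + B * (1.0 - t)                           # apply new waterbody
+
+   rng = np.random.default_rng(0)
+   I_a = rng.uniform(0, 1, (4, 4, 3))                 # tiny RGB image of scene A
+   depth = rng.uniform(0.5, 3.0, (4, 4))              # per-pixel range (m)
+   beta_a, B_a = np.array([0.4, 0.2, 0.1]), np.array([0.1, 0.3, 0.4])
+   beta_b, B_b = np.array([0.8, 0.3, 0.1]), np.array([0.0, 0.2, 0.5])
+   fused = rewater(dewater(I_a, depth, beta_a, B_a), depth, beta_b, B_b)
+   print(fused.shape)                                 # (4, 4, 3)
+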
+
+
+
+
+ + ♻ ☆ Assigning Credit with Partial Reward Decoupling in Multi-Agent Proximal + Policy Optimization + + +
+ Multi-agent proximal policy optimization (MAPPO) has recently demonstrated +state-of-the-art performance on challenging multi-agent reinforcement learning +tasks. However, MAPPO still struggles with the credit assignment problem, +wherein the sheer difficulty in ascribing credit to individual agents' actions +scales poorly with team size. In this paper, we propose a multi-agent +reinforcement learning algorithm that adapts recent developments in credit +assignment to improve upon MAPPO. Our approach leverages partial reward +decoupling (PRD), which uses a learned attention mechanism to estimate which of +a particular agent's teammates are relevant to its learning updates. We use +this estimate to dynamically decompose large groups of agents into smaller, +more manageable subgroups. We empirically demonstrate that our approach, +PRD-MAPPO, decouples agents from teammates that do not influence their expected +future reward, thereby streamlining credit assignment. We additionally show +that PRD-MAPPO yields significantly higher data efficiency and asymptotic +performance compared to both MAPPO and other state-of-the-art methods across +several multi-agent tasks, including StarCraft II. Finally, we propose a +version of PRD-MAPPO that is applicable to \textit{shared} reward settings, +where PRD was previously not applicable, and empirically show that this also +leads to performance improvements over MAPPO. + +
+
+ comment: 20 pages, 5 figures, 12 tables, Reinforcement Learning Journal and + Reinforcement Learning Conference 2024 +
+
+
+
+
+ + ♻ ☆ Visual Whole-Body Control for Legged Loco-Manipulation + + +
+ We study the problem of mobile manipulation using legged robots equipped with
+an arm, namely legged loco-manipulation. The robot legs, while usually utilized
+for mobility, offer an opportunity to amplify the manipulation capabilities by
+conducting whole-body control. That is, the robot can control the legs and the
+arm at the same time to extend its workspace. We propose a framework that can
+conduct the whole-body control autonomously with visual observations. Our
+approach, namely Visual Whole-Body Control (VBC), is composed of a low-level
+policy using all degrees of freedom to track the body velocities along with the
+end-effector position, and a high-level policy proposing the velocities and
+end-effector position based on visual inputs. We train both levels of policies
+in simulation and perform Sim2Real transfer for real robot deployment. We
+perform extensive experiments and show significant improvements over baselines
+in picking up diverse objects in different configurations (heights, locations,
+orientations) and environments.
+
+
+ comment: CoRL 2024 Oral. Project page: https://wholebody-b1.github.io +
+
+
+
+
+ + ♻ ☆ Context-Aware Replanning with Pre-explored Semantic Map for Object + Navigation + + +
+ Pre-explored Semantic Maps, constructed through prior exploration using +visual language models (VLMs), have proven effective as foundational elements +for training-free robotic applications. However, existing approaches assume the +map's accuracy and do not provide effective mechanisms for revising decisions +based on incorrect maps. To address this, we introduce Context-Aware Replanning +(CARe), which estimates map uncertainty through confidence scores and +multi-view consistency, enabling the agent to revise erroneous decisions +stemming from inaccurate maps without requiring additional labels. We +demonstrate the effectiveness of our proposed method by integrating it with two +modern mapping backbones, VLMaps and OpenMask3D, and observe significant +performance improvements in object navigation tasks. More details can be found +on the project page: https://care-maps.github.io/ + +
+
+ comment: CoRL 2024 camera ready. The first three authors contributed equally, + and their order of authorship is interchangeable. Project page: + https://care-maps.github.io/ +
+
+
+
+
+ + ♻ ☆ Using Fiber Optic Bundles to Miniaturize Vision-Based Tactile Sensors + + +
+ Vision-based tactile sensors have recently become popular due to their
+combination of low cost, very high spatial resolution, and ease of integration
+using widely available miniature cameras. The associated field of view and
+focal length, however, are difficult to package in a human-sized finger. In
+this paper, we employ optical fiber bundles to achieve a form factor that, at
+15 mm diameter, is smaller than an average human fingertip. The electronics and
+camera are also located remotely, further reducing package size. The sensor
+achieves a spatial resolution of 0.22 mm and a minimum force resolution of 5 mN
+for normal and shear contact forces. With these attributes, the DIGIT Pinki
+sensor is suitable for applications such as robotic and teleoperated digital
+palpation. We demonstrate its utility for palpation of the prostate gland and
+show that it can achieve clinically relevant discrimination of prostate
+stiffness for phantom and ex vivo tissue.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + The CAD design files of DIGIT Pinki are available at + https://github.com/facebookresearch/digit-design +
+
+
+
+
+ + ♻ ☆ FilMBot: A High-Speed Soft Parallel Robotic Micromanipulator + + +
+ Soft robotic manipulators are generally slow despite their great +adaptability, resilience, and compliance. This limitation also extends to +current soft robotic micromanipulators. Here, we introduce FilMBot, a 3-DOF +film-based, electromagnetically actuated, soft kinematic robotic +micromanipulator achieving speeds up to 2117 $\deg$/s and 2456 $\deg$/s in +$\alpha$ and $\beta$ angular motions, with corresponding linear velocities of +1.61 m/s and 1.92 m/s using a 4-cm needle end-effector, and 1.57 m/s along the +Z axis. The robot can reach ~1.50 m/s in path-following tasks, operates at +frequencies up to 30 Hz, and remains functional up to 50 Hz. It demonstrates +high precision (~6.3 $\mu$m, or ~0.05% of its workspace) in small +path-following tasks. The novel combination of the low-stiffness soft kinematic +film structure and strong electromagnetic actuation in FilMBot opens new +avenues for soft robotics. Furthermore, its simple construction and +inexpensive, readily accessible components could broaden the application of +micromanipulators beyond current academic and professional users. + +
+
+ comment: 12 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ GPTR: Gaussian Process Trajectory Representation for Continuous-Time + Motion Estimation + + +
+ Continuous-time trajectory representation has gained significant popularity +in recent years, as it offers an elegant formulation that allows the fusion of +a larger number of sensors and sensing modalities, overcoming limitations of +traditional discrete-time frameworks. To bolster the adoption of the +continuous-time paradigm, we propose a so-called Gaussian Process Trajectory +Representation (GPTR) framework for continuous-time motion estimation (CTME) +tasks. Our approach stands out by employing a third-order random jerk model, +featuring closed-form expressions for both rotational and translational state +derivatives. This model provides smooth, continuous trajectory representations +that are crucial for precise estimation of complex motion. To support the wider +robotics and computer vision communities, we have made the source code for GPTR +available as a light-weight header-only library. This format was chosen for its +ease of integration, allowing developers to incorporate GPTR into existing +systems without needing extensive code modifications. Moreover, we also provide +a set of optimization examples with LiDAR, camera, IMU, UWB factors, and +closed-form analytical Jacobians under the proposed GP framework. Our +experiments demonstrate the efficacy and efficiency of GP-based trajectory +representation in various motion estimation tasks, and the examples can serve +as the prototype to help researchers quickly develop future applications such +as batch optimization, calibration, sensor fusion, trajectory planning, etc., +with continuous-time trajectory representation. Our project is accessible at +https://github.com/brytsknguyen/gptr . + +
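+ GPTR's actual closed-form expressions are in the paper and repository; as a generic
+ illustration only, a third-order (white-noise-on-jerk) model propagates a per-axis
+ state [position, velocity, acceleration] with the familiar transition matrix below.
+
+   import numpy as np
+
+   def jerk_transition(dt):
+       # Constant-jerk state transition for [p, v, a] over a step dt.
+       return np.array([[1.0, dt, 0.5 * dt**2],
+                        [0.0, 1.0, dt],
+                        [0.0, 0.0, 1.0]])
+
+   def propagate(state, dt, jerk=0.0):
+       # Optional constant jerk input acting over the interval.
+       B = np.array([dt**3 / 6.0, dt**2 / 2.0, dt])
+       return jerk_transition(dt) @ state + B * jerk
+
+   x = np.array([0.0, 1.0, 0.2])       # p = 0 m, v = 1 m/s, a = 0.2 m/s^2
+   print(propagate(x, dt=0.1))         # state 0.1 s later
+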
+
+ comment: The source code has been released. All feedback is welcome
+
+
+
+
+
+ + ♻ ☆ $π_0$: A Vision-Language-Action Flow Model for General Robot Control + + +
+ Robot learning holds tremendous promise to unlock the full potential of +flexible, general, and dexterous robot systems, as well as to address some of +the deepest questions in artificial intelligence. However, bringing robot +learning to the level of generality required for effective real-world systems +faces major obstacles in terms of data, generalization, and robustness. In this +paper, we discuss how generalist robot policies (i.e., robot foundation models) +can address these challenges, and how we can design effective generalist robot +policies for complex and highly dexterous tasks. We propose a novel flow +matching architecture built on top of a pre-trained vision-language model (VLM) +to inherit Internet-scale semantic knowledge. We then discuss how this model +can be trained on a large and diverse dataset from multiple dexterous robot +platforms, including single-arm robots, dual-arm robots, and mobile +manipulators. We evaluate our model in terms of its ability to perform tasks in +zero shot after pre-training, follow language instructions from people and from +a high-level VLM policy, and its ability to acquire new skills via fine-tuning. +Our results cover a wide variety of tasks, such as laundry folding, table +cleaning, and assembling boxes. + +
+
+ comment: See project website for videos: + https://physicalintelligence.company/blog/pi0 +
+
+
+
+
+ + ♻ ☆ Human-Aware Vision-and-Language Navigation: Bridging Simulation to + Reality with Dynamic Human Interactions NeurIPS 2024 + + +
+ Vision-and-Language Navigation (VLN) aims to develop embodied agents that +navigate based on human instructions. However, current VLN frameworks often +rely on static environments and optimal expert supervision, limiting their +real-world applicability. To address this, we introduce Human-Aware +Vision-and-Language Navigation (HA-VLN), extending traditional VLN by +incorporating dynamic human activities and relaxing key assumptions. We propose +the Human-Aware 3D (HA3D) simulator, which combines dynamic human activities +with the Matterport3D dataset, and the Human-Aware Room-to-Room (HA-R2R) +dataset, extending R2R with human activity descriptions. To tackle HA-VLN +challenges, we present the Expert-Supervised Cross-Modal (VLN-CM) and +Non-Expert-Supervised Decision Transformer (VLN-DT) agents, utilizing +cross-modal fusion and diverse training strategies for effective navigation in +dynamic human environments. A comprehensive evaluation, including metrics +considering human activities, and systematic analysis of HA-VLN's unique +challenges, underscores the need for further research to enhance HA-VLN agents' +real-world robustness and adaptability. Ultimately, this work provides +benchmarks and insights for future research on embodied AI and Sim2Real +transfer, paving the way for more realistic and applicable VLN systems in +human-populated environments. + +
+
+ comment: Spotlight at NeurIPS 2024 D&B Track. 32 pages, 18 figures, Project + Page: https://lpercc.github.io/HA3D_simulator/ +
+
+
+
+
+ + ♻ Robot Policy Learning with Temporal Optimal Transport Reward NeurIPS 2024 + + +
+ Reward specification is one of the trickiest problems in Reinforcement
+Learning, which usually requires tedious hand engineering in practice. One
+promising approach to tackle this challenge is to adopt existing expert video
+demonstrations for policy learning. Some recent work investigates how to learn
+robot policies from only a single or a few expert video demonstrations. For
+example, reward labeling via Optimal Transport (OT) has been shown to be an
+effective strategy to generate a proxy reward by measuring the alignment
+between the robot trajectory and the expert demonstrations. However, previous
+work mostly overlooks that the OT reward is invariant to temporal order
+information, which could bring extra noise to the reward signal. To address
+this issue, in this paper, we introduce the Temporal Optimal Transport
+(TemporalOT) reward to incorporate temporal order information for learning a
+more accurate OT-based proxy reward. Extensive experiments on the Meta-world
+benchmark tasks validate the efficacy of the proposed method. Code is available
+at: https://github.com/fuyw/TemporalOT
+
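+ A hedged sketch of the generic idea (not the paper's exact cost): add a temporal-alignment
+ penalty |i/T - j/T'| to the feature cost before computing an entropic OT plan with a
+ hand-rolled Sinkhorn loop, and use the negative transported cost per robot step as the
+ proxy reward. Feature dimensions and the weight lam are assumptions.
+
+   import numpy as np
+
+   def sinkhorn(C, reg=0.05, iters=200):
+       T, E = C.shape
+       a, b = np.full(T, 1.0 / T), np.full(E, 1.0 / E)     # uniform marginals
+       K = np.exp(-C / reg)
+       u = np.ones(T)
+       for _ in range(iters):
+           v = b / (K.T @ u)
+           u = a / (K @ v)
+       return u[:, None] * K * v[None, :]                  # transport plan (T, E)
+
+   def temporal_ot_reward(robot_feats, expert_feats, lam=1.0):
+       T, E = len(robot_feats), len(expert_feats)
+       feat_cost = np.linalg.norm(robot_feats[:, None] - expert_feats[None], axis=-1)
+       time_cost = np.abs(np.arange(T)[:, None] / T - np.arange(E)[None, :] / E)
+       C = feat_cost + lam * time_cost
+       P = sinkhorn(C / C.max())                           # normalize for stability
+       return -(P * feat_cost).sum(axis=1)                 # one proxy reward per step
+
+   rng = np.random.default_rng(0)
+   r = temporal_ot_reward(rng.normal(size=(20, 16)), rng.normal(size=(25, 16)))
+   print(r.shape)                                          # (20,)
+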
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Control and Coordinate Mixed Traffic Through Robot Vehicles + at Complex and Unsignalized Intersections + + +
+ Intersections are essential road infrastructures for traffic in modern
+metropolises. However, they can also be the bottleneck of traffic flows as a
+result of traffic incidents or the absence of traffic coordination mechanisms
+such as traffic lights. Recently, various control and coordination mechanisms
+that are beyond traditional control methods have been proposed to improve the
+efficiency of intersection traffic by leveraging the ability of autonomous
+vehicles. Amongst these methods, the control of foreseeable mixed traffic that
+consists of human-driven vehicles (HVs) and robot vehicles (RVs) has emerged.
+We propose a decentralized multi-agent reinforcement learning approach for the
+control and coordination of mixed traffic by RVs at real-world, complex
+intersections -- an open challenge to date. We design comprehensive experiments
+to evaluate the effectiveness, robustness, generalizability, and adaptability
+of our approach. In particular, our method can prevent congestion formation
+with merely 5% RVs under a real-world traffic demand of 700 vehicles per hour.
+In contrast, without RVs, congestion will form when the traffic demand reaches
+as low as 200 vehicles per hour. Moreover, when the RV penetration rate exceeds
+60%, our method starts to outperform traffic signal control in terms of the
+average waiting time of all vehicles. Our method is not only robust against
+blackout events, sudden RV percentage drops, and V2V communication errors, but
+also enjoys excellent generalizability, evidenced by its successful deployment
+in five unseen intersections. Lastly, our method performs well under various
+traffic rules, demonstrating its adaptability to diverse scenarios. Videos and
+code of our work are available at
+https://sites.google.com/view/mixedtrafficcontrol
+
+
+ comment: This paper introduces the first method to control and coordinate + mixed traffic (i.e., human-driven vehicles and robot vehicles) at + unsignalized intersections with both complicated topology and real-world + traffic demands. The International Journal of Robotics Research. 2024;0(0) +
+
+
+
+
+ + ♻ ☆ MPCGPU: Real-Time Nonlinear Model Predictive Control through + Preconditioned Conjugate Gradient on the GPU ICRA 2024 + + +
+ Nonlinear Model Predictive Control (NMPC) is a state-of-the-art approach for
+locomotion and manipulation which leverages trajectory optimization at each
+control step. While the performance of this approach is computationally
+bounded, implementations of direct trajectory optimization that use iterative
+methods to solve the underlying moderately large and sparse linear systems are
+a natural fit for parallel hardware acceleration. In this work, we introduce
+MPCGPU, a GPU-accelerated, real-time NMPC solver that leverages an accelerated
+preconditioned conjugate gradient (PCG) linear system solver at its core. We
+show that MPCGPU increases the scalability and real-time performance of NMPC,
+solving larger problems at faster rates. In particular, for tracking tasks
+using the Kuka IIWA manipulator, MPCGPU is able to scale to kilohertz control
+rates with trajectories as long as 512 knot points. This is driven by a custom
+PCG solver which outperforms state-of-the-art, CPU-based, linear system solvers
+by at least 10x for a majority of solves and 3.6x on average.
+
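+ For reference, a standard Jacobi-preconditioned conjugate gradient solver is sketched
+ below on the CPU in numpy; the paper's contribution is a GPU-parallel PCG with a tailored
+ preconditioner, which this sketch does not attempt to reproduce.
+
+   import numpy as np
+
+   def pcg(A, b, tol=1e-8, max_iter=500):
+       Minv = 1.0 / np.diag(A)               # Jacobi preconditioner
+       x = np.zeros_like(b)
+       r = b - A @ x
+       z = Minv * r
+       p = z.copy()
+       rz = r @ z
+       for _ in range(max_iter):
+           Ap = A @ p
+           alpha = rz / (p @ Ap)
+           x += alpha * p
+           r -= alpha * Ap
+           if np.linalg.norm(r) < tol:
+               break
+           z = Minv * r
+           rz_new = r @ z
+           p = z + (rz_new / rz) * p
+           rz = rz_new
+       return x
+
+   rng = np.random.default_rng(0)
+   M = rng.normal(size=(50, 50))
+   A = M @ M.T + 50 * np.eye(50)             # symmetric positive definite system
+   b = rng.normal(size=50)
+   print(np.allclose(A @ pcg(A, b), b, atol=1e-5))   # True
+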
+
+ comment: Accepted to ICRA 2024, 8 pages, 6 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 35 + +
+
+
+ + ☆ AGISim, An Open Source Airborne Gimbal Mounted IMU Signal Simulator + Considering Flight Dynamics Model + + +
+ In this work, we present more comprehensive evaluations of our airborne
+Gimbal-mounted inertial measurement unit (IMU) signal simulator, which also
+considers a flight dynamics model (FDM). A flexible IMU signal simulator is an
+enabling tool in the design, development, improvement, testing, and
+verification of aided inertial navigation systems (INS). Efforts by other
+researchers have concentrated on simulating the strapdown INS (SINS), with the
+IMU rigidly attached to the moving body frame. However, custom airborne
+surveying/mapping applications that need to point and stabilize a camera or
+any other surveying sensor require mounting the IMU beside the sensor on a
+Gimbal onboard the airframe. Hence, the proposed Gimbal-mounted IMU signal
+simulator is of interest, while itself requiring further analysis and
+verification. Extended evaluation results, in terms of both unit tests and
+functional/integration tests (using aided inertial navigation algorithms with
+variable/dynamic lever arms), verify the simulator and its applicability for
+the mentioned tasks. We have further packaged and published our MATLAB code
+for the proposed simulator as an open-source GitHub repository.
+
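+ The released simulator is MATLAB; purely for illustration, the numpy sketch below shows
+ the standard rigid-body lever-arm relation that such a Gimbal-mounted IMU model rests on:
+ the sensed acceleration at offset r picks up Euler and centripetal terms on top of the
+ body acceleration, then rotates into the gimbal/IMU frame. Values are arbitrary examples.
+
+   import numpy as np
+
+   def lever_arm_accel(a_body, omega, omega_dot, r, R_gimbal_from_body):
+       """All vectors in the body frame; R maps body frame -> gimbal/IMU frame."""
+       a_point = (a_body
+                  + np.cross(omega_dot, r)                # Euler term
+                  + np.cross(omega, np.cross(omega, r)))  # centripetal term
+       return R_gimbal_from_body @ a_point
+
+   a_body = np.array([0.0, 0.0, 9.81])    # specific force at the body origin
+   omega = np.array([0.0, 0.0, 1.0])      # 1 rad/s yaw rate
+   omega_dot = np.zeros(3)
+   r = np.array([0.5, 0.0, 0.0])          # IMU 0.5 m ahead of the body origin
+   print(lever_arm_accel(a_body, omega, omega_dot, r, np.eye(3)))
+   # -> [-0.5, 0., 9.81]: the centripetal term appears along -x
+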
+
+ comment: 10 pages, 8 figures, 4 tables, Submitted to Journal of Aerospace + Science and Technology (JAST) +
+
+
+
+
+ + ☆ Mixed Reality Teleoperation Assistance for Direct Control of Humanoids + + +
+ Teleoperation plays a crucial role in enabling robot operations in +challenging environments, yet existing limitations in effectiveness and +accuracy necessitate the development of innovative strategies for improving +teleoperated tasks. This article introduces a novel approach that utilizes +mixed reality and assistive autonomy to enhance the efficiency and precision of +humanoid robot teleoperation. By leveraging Probabilistic Movement Primitives, +object detection, and Affordance Templates, the assistance combines user motion +with autonomous capabilities, achieving task efficiency while maintaining +human-like robot motion. Experiments and feasibility studies on the Nadia robot +confirm the effectiveness of the proposed framework. + +
+
+ comment: IEEE Robotics and Automation, Volume: 9, Issue: 2 +
+
+
+
+
+ + ☆ Active Learning-augmented Intention-aware Obstacle Avoidance of + Autonomous Surface Vehicles in High-traffic Waters IROS 2024 + + +
+ This paper enhances the obstacle avoidance of Autonomous Surface Vehicles
+(ASVs) for safe navigation in high-traffic waters by actively estimating the
+passing intention of obstacles and reducing its uncertainty. We introduce a
+topological modeling of the passing intention of obstacles, which can be
+applied to varying encounter situations based on the inherent embedding of
+topological concepts in COLREGs. With a Long Short-Term Memory (LSTM) neural
+network, we classify the passing intention of obstacles. Then, to determine
+the ASV maneuver, we propose a multi-objective optimization framework that
+includes the information gain about the passing obstacle intention as well as
+safety. We validate the proposed approach under extensive Monte Carlo
+simulations (2,400 runs) with a varying number of obstacles, dynamic
+properties, encounter situations, and different behavioral patterns of
+obstacles (cooperative, non-cooperative). We also present the results from a
+real marine accident case study as well as real-world experiments on a real
+ASV with environmental disturbances, showing successful collision avoidance
+with our strategy in real time.
+
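+ A minimal sketch of an LSTM intention classifier over an obstacle's relative-state
+ sequence; the feature layout and the three example classes are assumptions for
+ illustration, not the paper's definitions.
+
+   import torch
+   import torch.nn as nn
+
+   class IntentionLSTM(nn.Module):
+       def __init__(self, n_features=6, hidden=64, n_classes=3):
+           super().__init__()
+           self.lstm = nn.LSTM(n_features, hidden, batch_first=True)
+           self.head = nn.Linear(hidden, n_classes)
+
+       def forward(self, x):              # x: (B, T, n_features)
+           _, (h, _) = self.lstm(x)       # last hidden state, shape (1, B, hidden)
+           return self.head(h[-1])        # intention class logits
+
+   model = IntentionLSTM()
+   logits = model(torch.randn(4, 30, 6))  # 4 obstacle tracks, 30 time steps
+   print(logits.softmax(dim=-1).shape)    # torch.Size([4, 3])
+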
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ☆ Enhancing Model-Based Step Adaptation for Push Recovery through + Reinforcement Learning of Step Timing and Region + + +
+ This paper introduces a new approach to enhance the robustness of humanoid +walking under strong perturbations, such as substantial pushes. Effective +recovery from external disturbances requires bipedal robots to dynamically +adjust their stepping strategies, including footstep positions and timing. +Unlike most advanced walking controllers that restrict footstep locations to a +predefined convex region, substantially limiting recoverable disturbances, our +method leverages reinforcement learning to dynamically adjust the permissible +footstep region, expanding it to a larger, effectively non-convex area and +allowing cross-over stepping, which is crucial for counteracting large lateral +pushes. Additionally, our method adapts footstep timing in real time to further +extend the range of recoverable disturbances. Based on these adjustments, +feasible footstep positions and DCM trajectory are planned by solving a QP. +Finally, we employ a DCM controller and an inverse dynamics whole-body control +framework to ensure the robot effectively follows the trajectory. + +
+
+
+
+
+ + ☆ Raspberry PhenoSet: A Phenology-based Dataset for Automated Growth + Detection and Yield Estimation + + +
+ The future of the agriculture industry is intertwined with automation. +Accurate fruit detection, yield estimation, and harvest time estimation are +crucial for optimizing agricultural practices. These tasks can be carried out +by robots to reduce labour costs and improve the efficiency of the process. To +do so, deep learning models should be trained to perform knowledge-based tasks, +which outlines the importance of contributing valuable data to the literature. +In this paper, we introduce Raspberry PhenoSet, a phenology-based dataset +designed for detecting and segmenting raspberry fruit across seven +developmental stages. To the best of our knowledge, Raspberry PhenoSet is the +first fruit dataset to integrate biology-based classification with fruit +detection tasks, offering valuable insights for yield estimation and precise +harvest timing. This dataset contains 1,853 high-resolution images, the highest +quality in the literature, captured under controlled artificial lighting in a +vertical farm. The dataset has a total of 6,907 instances of mask annotations, +manually labelled to reflect the seven phenology stages. We have also +benchmarked Raspberry PhenoSet using several state-of-the-art deep learning +models, including YOLOv8, YOLOv10, RT-DETR, and Mask R-CNN, to provide a +comprehensive evaluation of their performance on the dataset. Our results +highlight the challenges of distinguishing subtle phenology stages and +underscore the potential of Raspberry PhenoSet for both deep learning model +development and practical robotic applications in agriculture, particularly in +yield prediction and supply chain management. The dataset and the trained +models are publicly available for future studies. + +
+
+
+
+
+ + ☆ SPOT: SE(3) Pose Trajectory Diffusion for Object-Centric Manipulation + + +
+ We introduce SPOT, an object-centric imitation learning framework. The key +idea is to capture each task by an object-centric representation, specifically +the SE(3) object pose trajectory relative to the target. This approach +decouples embodiment actions from sensory inputs, facilitating learning from +various demonstration types, including both action-based and action-less human +hand demonstrations, as well as cross-embodiment generalization. Additionally, +object pose trajectories inherently capture planning constraints from +demonstrations without the need for manually crafted rules. To guide the robot +in executing the task, the object trajectory is used to condition a diffusion +policy. We show improvement compared to prior work on RLBench simulated tasks. +In real-world evaluation, using only eight demonstrations shot on an iPhone, +our approach completed all tasks while fully complying with task constraints. +Project page: https://nvlabs.github.io/object_centric_diffusion + +
+
+
+
+
+ + ☆ FG-PE: Factor-graph Approach for Multi-robot Pursuit-Evasion + + +
+ With the increasing use of robots in daily life, there is a growing need to +provide robust collaboration protocols for robots to tackle more complicated +and dynamic problems effectively. This paper presents a novel, factor +graph-based approach to address the pursuit-evasion problem, enabling accurate +estimation, planning, and tracking of an evader by multiple pursuers working +together. It is assumed that there are multiple pursuers and only one evader in +this scenario. The proposed method significantly improves the accuracy of +evader estimation and tracking, allowing pursuers to capture the evader in the +shortest possible time and distance compared to existing techniques. In +addition to these primary objectives, the proposed approach effectively +minimizes uncertainty while remaining robust, even when communication issues +lead to some messages being dropped or lost. Through a series of comprehensive +experiments, this paper demonstrates that the proposed algorithm consistently +outperforms traditional pursuit-evasion methods across several key performance +metrics, such as the time required to capture the evader and the average +distance traveled by the pursuers. Additionally, the proposed method is tested +in real-world hardware experiments, further validating its effectiveness and +applicability. + +
+
+
+
+
+ + ☆ Multi-Agent Deep Q-Network with Layer-based Communication Channel for + Autonomous Internal Logistics Vehicle Scheduling in Smart Manufacturing + + +
+ In smart manufacturing, scheduling autonomous internal logistic vehicles is +crucial for optimizing operational efficiency. This paper proposes a +multi-agent deep Q-network (MADQN) with a layer-based communication channel +(LBCC) to address this challenge. The main goals are to minimize total job +tardiness, reduce the number of tardy jobs, and lower vehicle energy +consumption. The method is evaluated against nine well-known scheduling +heuristics, demonstrating its effectiveness in handling dynamic job shop +behaviors like job arrivals and workstation unavailabilities. The approach also +proves scalable, maintaining performance across different layouts and larger +problem instances, highlighting the robustness and adaptability of MADQN with +LBCC in smart manufacturing. + +
+
+ comment: Accepted for the 5th IFAC/INSTICC INTERNATIONAL CONFERENCE ON + INNOVATIVE INTELLIGENT INDUSTRIAL PRODUCTION AND LOGISTICS +
+
+
+
+
+ + ☆ Learning to Look Around: Enhancing Teleoperation and Learning with a + Human-like Actuated Neck + + +
+ We introduce a teleoperation system that integrates a 5 DOF actuated neck, +designed to replicate natural human head movements and perception. By enabling +behaviors like peeking or tilting, the system provides operators with a more +intuitive and comprehensive view of the environment, improving task +performance, reducing cognitive load, and facilitating complex whole-body +manipulation. We demonstrate the benefits of natural perception across seven +challenging teleoperation tasks, showing how the actuated neck enhances the +scope and efficiency of remote operation. Furthermore, we investigate its role +in training autonomous policies through imitation learning. In three distinct +tasks, the actuated neck supports better spatial awareness, reduces +distribution shift, and enables adaptive task-specific adjustments compared to +a static wide-angle camera. + +
+
+
+
+
+ + ☆ Path Integral Control for Hybrid Dynamical Systems + + +
+ This work introduces a novel paradigm for solving optimal control problems +for hybrid dynamical systems under uncertainties. Robotic systems having +contact with the environment can be modeled as hybrid systems. Controller +design for hybrid systems under disturbances is complicated by the +discontinuous jump dynamics, mode changes with inconsistent state dimensions, +and variations in jumping timing and states caused by noise. We formulate this +problem into a stochastic control problem with hybrid transition constraints +and propose the Hybrid Path Integral (H-PI) framework to obtain the optimal +controller. Despite random mode changes across stochastic path samples, we show +that the ratio between hybrid path distributions with varying drift terms +remains analogous to the smooth path distributions. We then show that the +optimal controller can be obtained by evaluating a path integral with hybrid +constraints. Importance sampling for path distributions with hybrid dynamics +constraints is introduced to reduce the variance of the path integral +evaluation, where we leverage the recently developed Hybrid +iterative-Linear-Quadratic-Regulator (H-iLQR) controller to induce a hybrid +path distribution proposal with low variance. The proposed method is validated +through numerical experiments on various hybrid systems and extensive ablation +studies. All the sampling processes are conducted in parallel on a Graphics +Processing Unit (GPU). + +
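+ As background only, the sketch below shows the basic path-integral (MPPI-style) update on
+ a smooth toy double integrator: sample noisy control rollouts, score them, and take an
+ exponentially weighted average. The hybrid-jump handling and H-iLQR proposal distribution
+ described above are omitted, and all constants are illustrative.
+
+   import numpy as np
+
+   def rollout_cost(u_seq, x0, dt=0.05, target=1.0):
+       p, v, cost = x0[0], x0[1], 0.0
+       for u in u_seq:
+           v += u * dt
+           p += v * dt
+           cost += (p - target) ** 2 + 1e-3 * u ** 2
+       return cost
+
+   def path_integral_update(u_nom, x0, rng, n_samples=256, sigma=0.5, lam=1.0):
+       noise = rng.normal(0.0, sigma, size=(n_samples, len(u_nom)))
+       costs = np.array([rollout_cost(u_nom + n, x0) for n in noise])
+       w = np.exp(-(costs - costs.min()) / lam)
+       w /= w.sum()
+       return u_nom + w @ noise              # importance-weighted control update
+
+   rng = np.random.default_rng(0)
+   u = np.zeros(20)
+   x0 = np.array([0.0, 0.0])
+   for _ in range(10):
+       u = path_integral_update(u, x0, rng)
+   print(round(rollout_cost(u, x0), 3))      # cost after a few updates
+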
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ On Deep Learning for Geometric and Semantic Scene Understanding Using + On-Vehicle 3D LiDAR ECCV 2024 + + +
+ 3D LiDAR point cloud data is crucial for scene perception in computer vision, +robotics, and autonomous driving. Geometric and semantic scene understanding, +involving 3D point clouds, is essential for advancing autonomous driving +technologies. However, significant challenges remain, particularly in improving +the overall accuracy (e.g., segmentation accuracy, depth estimation accuracy, +etc.) and efficiency of these systems. To address the challenge in terms of +accuracy related to LiDAR-based tasks, we present DurLAR, the first +high-fidelity 128-channel 3D LiDAR dataset featuring panoramic ambient (near +infrared) and reflectivity imagery. To improve efficiency in 3D segmentation +while ensuring the accuracy, we propose a novel pipeline that employs a smaller +architecture, requiring fewer ground-truth annotations while achieving superior +segmentation accuracy compared to contemporary approaches. To improve the +segmentation accuracy, we introduce Range-Aware Pointwise Distance Distribution +(RAPiD) features and the associated RAPiD-Seg architecture. All contributions +have been accepted by peer-reviewed conferences, underscoring the advancements +in both accuracy and efficiency in 3D LiDAR applications for autonomous +driving. Full abstract: https://etheses.dur.ac.uk/15738/. + +
+
+ comment: PhD thesis (Durham University, Computer Science), 149 pages (the 2024 + BMVA Sullivan Doctoral Thesis Prize runner-up). Includes published content + from arXiv:2407.10159 (ECCV 2024 ORAL), arXiv:2303.11203 (CVPR 2023), and + arXiv:2406.10068 (3DV 2021), with minor revisions to the examined version: + https://etheses.dur.ac.uk/15738/ +
+
+
+
+
+ + ☆ Differentiable Physics-based System Identification for Robotic + Manipulation of Elastoplastic Materials + + +
+ Robotic manipulation of volumetric elastoplastic deformable materials, from +foods such as dough to construction materials like clay, is in its infancy, +largely due to the difficulty of modelling and perception in a high-dimensional +space. Simulating the dynamics of such materials is computationally expensive. +It tends to suffer from inaccurately estimated physics parameters of the +materials and the environment, impeding high-precision manipulation. Estimating +such parameters from raw point clouds captured by optical cameras suffers +further from heavy occlusions. To address this challenge, this work introduces +a novel Differentiable Physics-based System Identification (DPSI) framework +that enables a robot arm to infer the physics parameters of elastoplastic +materials and the environment using simple manipulation motions and incomplete +3D point clouds, aligning the simulation with the real world. Extensive +experiments show that with only a single real-world interaction, the estimated +parameters, Young's modulus, Poisson's ratio, yield stress and friction +coefficients, can accurately simulate visually and physically realistic +deformation behaviours induced by unseen and long-horizon manipulation motions. +Additionally, the DPSI framework inherently provides physically intuitive +interpretations for the parameters in contrast to black-box approaches such as +deep neural networks. + +
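+ A toy sketch of the generic differentiable-simulation identification loop (not the DPSI
+ framework itself): fit a stiffness-like parameter of a damped spring by backpropagating
+ through the rolled-out simulation until the simulated trajectory matches an observed one.
+ The paper's elastoplastic simulator and point-cloud losses are far richer than this.
+
+   import torch
+
+   def simulate(log_k, f_ext=1.0, steps=100, dt=0.01, mass=1.0, damping=0.5):
+       k = log_k.exp()                       # keep the parameter positive
+       x = torch.zeros(()); v = torch.zeros(()); xs = []
+       for _ in range(steps):
+           a = (f_ext - k * x - damping * v) / mass
+           v = v + a * dt
+           x = x + v * dt
+           xs.append(x)
+       return torch.stack(xs)                # differentiable trajectory
+
+   traj_obs = simulate(torch.tensor(2.0).log())           # ground-truth stiffness 2.0
+   log_k = torch.tensor(0.0, requires_grad=True)           # initial guess 1.0
+   opt = torch.optim.Adam([log_k], lr=0.05)
+   for _ in range(300):
+       opt.zero_grad()
+       loss = ((simulate(log_k) - traj_obs) ** 2).mean()
+       loss.backward()
+       opt.step()
+   print(round(log_k.exp().item(), 2))                     # approximately 2.0
+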
+
+ comment: Under review at the International Journal of Robotics Research
+
+
+
+
+
+ + ☆ CLIP-RT: Learning Language-Conditioned Robotic Policies from Natural + Language Supervision + + +
+ This paper explores how non-experts can teach robots desired skills in their +environments. We argue that natural language is an intuitive and accessible +interface for robot learning. To this end, we investigate two key aspects: (1) +how non-experts collect robotic data using natural language supervision and (2) +how pre-trained vision-language models learn end-to-end policies directly from +this supervision. We propose a data collection framework that collects robot +demonstrations based on natural language supervision (e.g., "move forward") and +further augments these demonstrations. Next, we introduce a model that learns +language-conditioned policies from natural language supervision called CLIP-RT. +Our model employs pre-trained CLIP models and learns to predict actions +represented in language via contrastive imitation learning. We first train +CLIP-RT on large-scale robotic data and then enable it to learn desired skills +using data collected from our framework. CLIP-RT shows strong capabilities in +acquiring novel manipulation skills, outperforming the state-of-the-art model, +OpenVLA (7B parameters), by 17% in average success rates, while using 7x fewer +parameters (1B). + +
+
+ comment: 27 pages, 27 figures +
+
+
+
+
+ + ☆ PlanScope: Learning to Plan Within Decision Scope Does Matter + + +
+ In the context of autonomous driving, learning-based methods have been +promising for the development of planning modules. During the training process +of planning modules, directly minimizing the discrepancy between expert-driving +logs and planning output is widely deployed. In general, driving logs consist +of suddenly appearing obstacles or swiftly changing traffic signals, which +typically necessitate swift and nuanced adjustments in driving maneuvers. +Concurrently, future trajectories of the vehicles exhibit their long-term +decisions, such as adhering to a reference lane or circumventing stationary +obstacles. Due to the unpredictable influence of future events in driving logs, +reasoning bias could be naturally introduced to learning based planning +modules, which leads to a possible degradation of driving performance. To +address this issue, we identify the decisions and their corresponding time +horizons, and characterize a so-called decision scope by retaining decisions +within derivable horizons only, to mitigate the effect of irrational behaviors +caused by unpredictable events. This framework employs wavelet transformation +based log preprocessing with an effective loss computation approach, rendering +the planning model only sensitive to valuable decisions at the current state. +Since frequency domain characteristics are extracted in conjunction with time +domain features by wavelets, decision information across various frequency +bands within the corresponding time horizon can be suitably captured. +Furthermore, to achieve valuable decision learning, this framework leverages a +transformer based decoder that incrementally generates the detailed profiles of +future decisions over multiple steps. Our experiments demonstrate that our +proposed method outperforms baselines in terms of driving scores with +closed-loop evaluations on the nuPlan dataset. + +
+
+
+
+
+ + ☆ ConceptFactory: Facilitate 3D Object Knowledge Annotation with Object + Conceptualization NeurIPS 2024 + + +
+ We present ConceptFactory, a novel scope to facilitate more efficient +annotation of 3D object knowledge by recognizing 3D objects through generalized +concepts (i.e. object conceptualization), aiming at promoting machine +intelligence to learn comprehensive object knowledge from both vision and +robotics aspects. This idea originates from the findings in human cognition +research that the perceptual recognition of objects can be explained as a +process of arranging generalized geometric components (e.g. cuboids and +cylinders). ConceptFactory consists of two critical parts: i) ConceptFactory +Suite, a unified toolbox that adopts Standard Concept Template Library (STL-C) +to drive a web-based platform for object conceptualization, and ii) +ConceptFactory Asset, a large collection of conceptualized objects acquired +using ConceptFactory suite. Our approach enables researchers to effortlessly +acquire or customize extensive varieties of object knowledge to comprehensively +study different object understanding tasks. We validate our idea on a wide +range of benchmark tasks from both vision and robotics aspects with +state-of-the-art algorithms, demonstrating the high quality and versatility of +annotations provided by our approach. Our website is available at +https://apeirony.github.io/ConceptFactory. + +
+
+ comment: NeurIPS 2024 Track on Datasets and Benchmarks +
+
+
+
+
+ + ☆ Expert-level protocol translation for self-driving labs NeurIPS'24 + + +
+ Recent developments in Artificial Intelligence (AI) models have propelled
+their application in scientific discovery, but the validation and exploration
+of these discoveries require subsequent empirical experimentation. The concept
+of self-driving laboratories promises to automate and thus boost the
+experimental process following AI-driven discoveries. However, the transition
+of experimental protocols, originally crafted for human comprehension, into
+formats interpretable by machines presents significant challenges, which,
+within the context of a specific expert domain, encompass the necessity for
+structured as opposed to natural language, the imperative for explicit rather
+than tacit knowledge, and the preservation of causality and consistency
+throughout protocol steps. Presently, the task of protocol translation
+predominantly requires the manual and labor-intensive involvement of domain
+experts and information technology specialists, rendering the process
+time-intensive. To address these issues, we propose a framework that automates
+the protocol translation process through a three-stage workflow, which
+incrementally constructs Protocol Dependence Graphs (PDGs) that are structured
+at the syntax level, completed at the semantics level, and linked at the
+execution level. Quantitative and qualitative evaluations have demonstrated
+its performance on par with that of human experts, underscoring its potential
+to significantly expedite and democratize the process of scientific discovery
+by elevating the automation capabilities within self-driving laboratories.
+
+
+ comment: In Advances in Neural Information Processing Systems (NeurIPS'24) +
+
+
+
+
+ + ☆ NAMR-RRT: Neural Adaptive Motion Planning for Mobile Robots in Dynamic + Environments + + +
+ Robots are increasingly deployed in dynamic and crowded environments, such as +urban areas and shopping malls, where efficient and robust navigation is +crucial. Traditional risk-based motion planning algorithms face challenges in +such scenarios due to the lack of a well-defined search region, leading to +inefficient exploration in irrelevant areas. While bi-directional and +multi-directional search strategies can improve efficiency, they still result +in significant unnecessary exploration. This article introduces the Neural +Adaptive Multi-directional Risk-based Rapidly-exploring Random Tree (NAMR-RRT) +to address these limitations. NAMR-RRT integrates neural network-generated +heuristic regions to dynamically guide the exploration process, continuously +refining the heuristic region and sampling rates during the planning process. +This adaptive feature significantly enhances performance compared to +neural-based methods with fixed heuristic regions and sampling rates. NAMR-RRT +improves planning efficiency, reduces trajectory length, and ensures higher +success by focusing the search on promising areas and continuously adjusting to +environments. The experiment results from both simulations and real-world +applications demonstrate the robustness and effectiveness of our proposed +method in navigating dynamic environments. A website about this work is +available at https://sites.google.com/view/namr-rrt. + +
+
+
+
+
+ + ☆ Closed-Loop Stability of a Lyapunov-Based Switching Attitude Controller + for Energy-Efficient Torque-Input-Selection During Flight + + +
+ We present a new Lyapunov-based switching attitude controller for
+energy-efficient real-time selection of the torque inputted to an uncrewed
+aerial vehicle (UAV) during flight. The proposed method, using quaternions to
+describe the attitude of the controlled UAV, interchanges the stability
+properties of the two fixed points (one locally asymptotically stable and the
+other unstable) of the resulting closed-loop (CL) switching dynamics of the
+system. In this approach, the switching events are triggered by the value of a
+compound energy-based function. To analyze and ensure the stability of the CL
+switching dynamics, we use classical nonlinear Lyapunov techniques, in
+combination with switching-systems theory. For this purpose, we introduce a new
+compound Lyapunov function (LF) that not only enables us to derive the
+conditions for CL asymptotic and exponential stability, but also provides us
+with an estimate of the CL system's region of attraction. This new estimate is
+considerably larger than those previously reported for systems of the type
+considered in this paper. To test and demonstrate the functionality,
+suitability, and performance of the proposed method, we present and discuss
+experimental data obtained using a 31-g quadrotor during the execution of
+high-speed yaw-tracking maneuvers. Also, we provide empirical evidence
+indicating that all the initial conditions chosen for these maneuvers, as
+estimated, lie inside the system's region of attraction. Last, experimental
+data obtained through these flight tests show that the proposed switching
+controller reduces the control effort by about 53%, on average, with respect to
+that corresponding to a commonly used benchmark control scheme, when executing
+a particular type of high-speed yaw-tracking maneuvers.
+
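+ The paper's switching is triggered by a compound energy-based function, which is not
+ reproduced here; as a generic illustration of switching between the two quaternion
+ equilibria, the sketch below uses the textbook attitude feedback with a sign switch on
+ the scalar part of the error quaternion. Gains and the example attitude are arbitrary.
+
+   import numpy as np
+
+   def quat_mul(q, p):
+       w1, x1, y1, z1 = q; w2, x2, y2, z2 = p
+       return np.array([w1*w2 - x1*x2 - y1*y2 - z1*z2,
+                        w1*x2 + x1*w2 + y1*z2 - z1*y2,
+                        w1*y2 - x1*z2 + y1*w2 + z1*x2,
+                        w1*z2 + x1*y2 - y1*x2 + z1*w2])
+
+   def switching_torque(q, q_des, omega, kq=2.0, kw=0.4):
+       # Error quaternion q_des^{-1} (x) q for unit quaternions (conjugate inverse).
+       q_err = quat_mul(np.array([q_des[0], -q_des[1], -q_des[2], -q_des[3]]), q)
+       s = 1.0 if q_err[0] >= 0.0 else -1.0   # select which equilibrium to stabilize
+       return -kq * s * q_err[1:] - kw * omega
+
+   q = np.array([0.0, 0.0, 0.0, 1.0])         # 180 deg rotation about z
+   print(switching_torque(q, np.array([1.0, 0.0, 0.0, 0.0]), np.zeros(3)))
+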
+
+ comment: 2024 IEEE International Conference on Robotics and Biomimetics + (ROBIO) +
+
+
+
+
+ + ☆ Multi-Uncertainty Aware Autonomous Cooperative Planning + + +
+ Autonomous cooperative planning (ACP) is a promising technique to improve the +efficiency and safety of multi-vehicle interactions for future intelligent +transportation systems. However, realizing robust ACP is a challenge due to the +aggregation of perception, motion, and communication uncertainties. This paper +proposes a novel multi-uncertainty aware ACP (MUACP) framework that +simultaneously accounts for multiple types of uncertainties via regularized +cooperative model predictive control (RC-MPC). The regularizers and constraints +for perception, motion, and communication are constructed according to the +confidence levels, weather conditions, and outage probabilities, respectively. +The effectiveness of the proposed method is evaluated in the Car Learning to +Act (CARLA) simulation platform. Results demonstrate that the proposed MUACP +efficiently performs cooperative formation in real time and outperforms other +benchmark approaches in various scenarios under imperfect knowledge of the +environment. + +
+
+
+
+
+ + ☆ Capability-aware Task Allocation and Team Formation Analysis for + Cooperative Exploration of Complex Environments + + +
+ To achieve autonomy in complex real-world exploration missions, we consider +deployment strategies for a team of robots with heterogeneous autonomy +capabilities. In this work, we formulate a multi-robot exploration mission and +compute an operation policy to maintain robot team productivity and maximize +mission rewards. The environment description, robot capability, and mission +outcome are modeled as a Markov decision process (MDP). We also include +constraints in real-world operation, such as sensor failures, limited +communication coverage, and mobility-stressing elements. Then, we study the +proposed operation model on a real-world scenario in the context of the DARPA +Subterranean (SubT) Challenge. The computed deployment policy is also compared +against the human-based operation strategy in the final competition of the SubT +Challenge. Finally, using the proposed model, we discuss the design trade-off +on building a multi-robot team with heterogeneous capabilities. + +
+
+
+
+
+ + ☆ An Improved Rapidly Exploring Random Tree Algorithm for Path Planning in + Configuration Spaces with Narrow Channels + + +
+ Rapidly-exploring Random Tree (RRT) algorithms have been applied successfully
+to challenging robot motion planning and under-actuated nonlinear control
+problems. However, a fundamental limitation of the RRT approach is the slow
+convergence in configuration spaces with narrow channels because of the small
+probability of generating test points inside narrow channels. This paper
+presents an improved RRT algorithm that takes advantage of narrow channels
+between the initial and goal states to find shorter paths by improving the
+exploration of narrow regions in the configuration space. The proposed
+algorithm detects the presence of a narrow channel by checking for collisions
+of neighborhood points with the infeasible set and attempts to add points
+within narrow channels with a predetermined bias. This approach is compared
+with the classical RRT and its variants on a variety of benchmark planning
+problems. Simulation results indicate that the algorithm presented in this
+paper computes a significantly shorter path in spaces with narrow channels.
+
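+ A rough sketch of the sampling idea, with an assumed detection rule and bias value (the
+ paper's are not given in this listing): a free sample whose neighborhood probes mostly
+ collide looks like it sits in a narrow channel and is accepted with a higher probability.
+
+   import numpy as np
+
+   def in_collision(p):                      # toy obstacle: a wall across x in (0.4, 0.6)
+       x, y = p                              # with a narrow gap around y = 0.5
+       return 0.4 < x < 0.6 and not (0.4 < y < 0.6)
+
+   def sample_with_narrow_bias(rng, n_probes=8, radius=0.05, bias=0.9, base=0.3):
+       while True:
+           p = rng.uniform(0.0, 1.0, 2)
+           if in_collision(p):
+               continue
+           probes = p + radius * rng.normal(size=(n_probes, 2))
+           blocked = np.mean([in_collision(q) for q in probes])
+           accept_prob = bias if blocked >= 0.5 else base   # favor narrow regions
+           if rng.uniform() < accept_prob:
+               return p
+
+   rng = np.random.default_rng(0)
+   samples = np.array([sample_with_narrow_bias(rng) for _ in range(200)])
+   in_gap = np.mean((samples[:, 0] > 0.4) & (samples[:, 0] < 0.6))
+   print(round(float(in_gap), 2))            # fraction of samples landing in the channel
+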
+
+
+
+
+ + ☆ An Untethered Bioinspired Robotic Tensegrity Dolphin with + Multi-Flexibility Design for Aquatic Locomotion + + +
+ This paper presents the first steps toward a soft dolphin robot using a
+bio-inspired approach to mimic dolphin flexibility. The current dolphin robot
+uses a minimalist approach, with only two cable-driven degrees of freedom
+actuated by a pair of motors. The actuated tail moves up and down in a
+swimming motion, but this first proof of concept does not permit controlled
+turns of the robot. While existing robotic dolphins typically use revolute
+joints to articulate rigid bodies, our design -- which will be made open
+source -- incorporates a flexible tail with tunable silicone skin and actuation
+flexibility via a cable-driven system, which mimics muscle dynamics and design
+flexibility with a tunable skeleton structure. The design is also tunable since
+the backbone can be easily printed in various geometries. The paper provides
+insights into how a few such variations affect robot motion and efficiency,
+measured by speed and cost of transport (COT). This approach demonstrates the
+potential of achieving dolphin-like motion through enhanced flexibility in
+bio-inspired robotics.
+
+
+ comment: 7 pages, 13 figures +
+
+
+
+
+ + ☆ On the Exploration of LM-Based Soft Modular Robot Design + + +
+ Recent large language models (LLMs) have demonstrated promising capabilities +in modeling real-world knowledge and enhancing knowledge-based generation +tasks. In this paper, we further explore the potential of using LLMs to aid in +the design of soft modular robots, taking into account both user instructions +and physical laws, to reduce the reliance on extensive trial-and-error +experiments typically needed to achieve robot designs that meet specific +structural or task requirements. Specifically, we formulate the robot design +process as a sequence generation task and find that LLMs are able to capture +key requirements expressed in natural language and reflect them in the +construction sequences of robots. To simplify, rather than conducting +real-world experiments to assess design quality, we utilize a simulation tool +to provide feedback to the generative model, allowing for iterative +improvements without requiring extensive human annotations. Furthermore, we +introduce five evaluation metrics to assess the quality of robot designs from +multiple angles including task completion and adherence to instructions, +supporting an automatic evaluation process. Our model performs well in +evaluations for designing soft modular robots with uni- and bi-directional +locomotion and stair-descending capabilities, highlighting the potential of +using natural language and LLMs for robot design. However, we also observe +certain limitations that suggest areas for further improvement. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Embodied AI with Two Arms: Zero-shot Learning, Safety and Modularity + + +
+ We present an embodied AI system which receives open-ended natural language
+instructions from a human, and controls two arms to collaboratively accomplish
+potentially long-horizon tasks over a large workspace. Our system is modular:
+it deploys state-of-the-art Large Language Models for task planning,
+Vision-Language models for semantic perception, and Point Cloud transformers
+for grasping. With semantic and physical safety in mind, these modules are
+interfaced with a real-time trajectory optimizer and a compliant tracking
+controller to enable human-robot proximity. We demonstrate performance for the
+following tasks: bi-arm sorting, bottle opening, and trash disposal tasks.
+These are done zero-shot where the models used have not been trained with any
+real-world data from this bi-arm robot, its scenes, or its workspace. Composing
+both learning- and non-learning-based components in a modular fashion with
+interpretable inputs and outputs allows the user to easily debug points of
+failure and fragility. One may also swap modules in place to improve the
+robustness of the overall platform, for instance with imitation-learned
+policies. Please see https://sites.google.com/corp/view/safe-robots .
+
+
+
+
+
+ + ♻ ☆ Perceptive Pedipulation with Local Obstacle Avoidance + + +
+ Pedipulation leverages the feet of legged robots for mobile manipulation, +eliminating the need for dedicated robotic arms. While previous works have +showcased blind and task-specific pedipulation skills, they fail to account for +static and dynamic obstacles in the environment. To address this limitation, we +introduce a reinforcement learning-based approach to train a whole-body +obstacle-aware policy that tracks foot position commands while simultaneously +avoiding obstacles. Despite training the policy in only five different static +scenarios in simulation, we show that it generalizes to unknown environments +with different numbers and types of obstacles. We analyze the performance of +our method through a set of simulation experiments and successfully deploy the +learned policy on the ANYmal quadruped, demonstrating its capability to follow +foot commands while navigating around static and dynamic obstacles. + +
+
+ comment: Accepted to the IEEE International Conference on Humanoid Robots 2024 +
+
+
+
+
+ + ♻ ☆ ConvBKI: Real-Time Probabilistic Semantic Mapping Network with + Quantifiable Uncertainty + + +
+ In this paper, we develop a modular neural network for real-time (> 10 Hz)
+semantic mapping in uncertain environments, which explicitly updates per-voxel
+probabilistic distributions within a neural network layer. Our approach
+combines the reliability of classical probabilistic algorithms with the
+performance and efficiency of modern neural networks. Although robotic
+perception is often divided between modern differentiable methods and
+classical explicit methods, a union of both is necessary for real-time and
+trustworthy performance. We introduce a novel Convolutional Bayesian Kernel
+Inference (ConvBKI) layer which incorporates semantic segmentation predictions
+online into a 3D map through a depthwise convolution layer by leveraging
+conjugate priors. We compare ConvBKI against state-of-the-art deep learning
+approaches and probabilistic algorithms for mapping to evaluate reliability
+and performance. We also create a Robot Operating System (ROS) package of
+ConvBKI and test it on real-world perceptually challenging off-road driving
+data.
+
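+ A rough sketch of the conjugate-prior idea behind a ConvBKI-style layer:
+per-voxel Dirichlet concentration parameters are updated by adding
+kernel-weighted semantic predictions, which can be expressed as a depthwise 3D
+convolution. The kernel shape and normalization here are simplified
+assumptions, not the paper's exact layer.
+
+    # Simplified Dirichlet update via depthwise 3D convolution (PyTorch).
+    import torch
+    import torch.nn.functional as F
+
+    def bki_style_update(alpha: torch.Tensor, sem_pred: torch.Tensor,
+                         kernel: torch.Tensor) -> torch.Tensor:
+        """alpha:    (1, C, D, H, W) current per-voxel Dirichlet concentrations
+           sem_pred: (1, C, D, H, W) per-voxel semantic probabilities from a segmenter
+           kernel:   (C, 1, k, k, k) non-negative spatial kernel, one per class (depthwise)."""
+        spread = F.conv3d(sem_pred, kernel, padding=kernel.shape[-1] // 2,
+                          groups=alpha.shape[1])
+        return alpha + spread   # conjugate update: posterior concentrations
+
+    # Expected class probabilities are then alpha / alpha.sum(dim=1, keepdim=True).
+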
+
+ comment: arXiv admin note: text overlap with arXiv:2209.10663 +
+
+
+
+
+ + ♻ ☆ Tiny Learning-Based MPC for Multirotors: Solver-Aware Learning for + Efficient Embedded Predictive Control + + +
+ Tiny aerial robots show promise for applications like environmental +monitoring and search-and-rescue but face challenges in control due to their +limited computing power and complex dynamics. Model Predictive Control (MPC) +can achieve agile trajectory tracking and handle constraints. Although current +learning-based MPC methods, such as Gaussian Process (GP) MPC, improve control +performance by learning residual dynamics, they are computationally demanding, +limiting their onboard application on tiny robots. This paper introduces Tiny +Learning-Based Model Predictive Control (LB MPC), a novel framework for +resource-constrained micro multirotor platforms. By exploiting multirotor +dynamics' structure and developing an efficient solver, our approach enables +high-rate control at 100 Hz on a Crazyflie 2.1 with a Teensy 4.0 +microcontroller. We demonstrate a 23% average improvement in tracking +performance over existing embedded MPC methods, achieving the first onboard +implementation of learning-based MPC on a tiny multirotor (53 g). + +
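+ A schematic of the learning-based MPC idea described above: a nominal model
+plus a learned residual is rolled out inside a short-horizon optimizer. The
+random-shooting optimizer, residual interface, horizon, and cost below are
+generic illustrations, not the paper's tailored embedded solver.
+
+    # Generic residual-dynamics MPC sketch (random shooting, for illustration only).
+    import numpy as np
+
+    def nominal_step(x, u, dt=0.01):
+        """Placeholder nominal dynamics (stand-in for the multirotor model)."""
+        return x + dt * u
+
+    def mpc_action(x0, goal, residual, horizon=20, n_samples=256, dt=0.01):
+        """Return the first action of the lowest-cost sampled sequence, rolling out
+        nominal dynamics corrected by a learned residual term."""
+        best_u, best_cost = None, np.inf
+        for _ in range(n_samples):
+            u_seq = np.random.uniform(-1.0, 1.0, size=(horizon, x0.size))
+            x, cost = x0.copy(), 0.0
+            for u in u_seq:
+                x = nominal_step(x, u, dt) + residual(x, u)   # learned correction
+                cost += np.sum((x - goal) ** 2)
+            if cost < best_cost:
+                best_u, best_cost = u_seq[0], cost
+        return best_u
+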
+
+
+
+
+ + ♻ ☆ Deep-Learning Estimation of Weight Distribution Using Joint Kinematics + for Lower-Limb Exoskeleton Control + + +
+ In the control of lower-limb exoskeletons with feet, the phase in the gait
+cycle can be identified by monitoring the weight distribution at the feet.
+This phase information can be used in the exoskeleton's controller to
+compensate for the dynamics of the exoskeleton and to assign impedance
+parameters. Typically, the weight distribution is calculated using data from
+sensors such as treadmill force plates or insole force sensors. However, these
+solutions increase both the setup complexity and cost. For this reason, we
+propose a deep-learning approach that uses a short time window of joint
+kinematics to predict the weight distribution of an exoskeleton in real time.
+The model was trained on treadmill walking data from six users wearing a
+four-degree-of-freedom exoskeleton and tested in real time on three different
+users wearing the same device. This test set includes two users not present in
+the training set to demonstrate the model's ability to generalize across
+individuals. Results show that the proposed method is able to fit the actual
+weight distribution with R^2 = 0.9 and is suitable for real-time control with
+prediction times of less than 1 ms. Experiments in closed-loop exoskeleton
+control show that deep-learning-based weight distribution estimation can be
+used to replace force sensors in overground and treadmill walking.
+
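+ A minimal stand-in for the estimator described above: a small
+1D-convolutional network mapping a short window of joint kinematics to a
+left/right weight distribution. The layer sizes, window length, and softmax
+output are illustrative assumptions, not the paper's architecture.
+
+    # Sketch: short kinematics window -> weight distribution (PyTorch).
+    import torch
+    import torch.nn as nn
+
+    class WeightDistNet(nn.Module):
+        def __init__(self, n_joints: int = 4, window: int = 50):
+            super().__init__()
+            self.net = nn.Sequential(
+                nn.Conv1d(2 * n_joints, 32, kernel_size=5, padding=2),  # angles + velocities
+                nn.ReLU(),
+                nn.Conv1d(32, 32, kernel_size=5, padding=2),
+                nn.ReLU(),
+                nn.Flatten(),
+                nn.Linear(32 * window, 2),   # left / right foot share
+            )
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            # x: (batch, 2 * n_joints, window)
+            return torch.softmax(self.net(x), dim=-1)
+
+    # Example: WeightDistNet()(torch.randn(1, 8, 50)) -> shape (1, 2), rows sum to 1.
+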
+
+
+
+
+ + ♻ ☆ Asymptotically Optimal Lazy Lifelong Sampling-based Algorithm for + Efficient Motion Planning in Dynamic Environments + + +
+ The paper introduces an asymptotically optimal lifelong sampling-based path +planning algorithm that combines the merits of lifelong planning algorithms and +lazy search algorithms for rapid replanning in dynamic environments where edge +evaluation is expensive. By evaluating only sub-path candidates for the optimal +solution, the algorithm saves considerable evaluation time and thereby reduces +the overall planning cost. It employs a novel informed rewiring cascade to +efficiently repair the search tree when the underlying search graph changes. +Simulation results demonstrate that the algorithm outperforms various +state-of-the-art sampling-based planners in addressing both static and dynamic +motion planning problems. + +
+
+
+
+
+ + ♻ ☆ SMART: Scalable Multi-agent Real-time Motion Generation via Next-token + Prediction NeurIPS 2024 + + +
+ Data-driven autonomous driving motion generation tasks are frequently
+impacted by the limitations of dataset size and the domain gap between
+datasets, which precludes their extensive application in real-world scenarios.
+To address this issue, we introduce SMART, a novel autonomous driving motion
+generation paradigm that models vectorized map and agent trajectory data as
+discrete sequence tokens. These tokens are then processed through a
+decoder-only transformer architecture trained on the next-token prediction
+task across spatial-temporal series. This GPT-style method allows the model to
+learn the motion distribution in real driving scenarios. SMART achieves
+state-of-the-art performance across most of the metrics on the generative Sim
+Agents challenge, ranking 1st on the leaderboards of the Waymo Open Motion
+Dataset (WOMD) while demonstrating remarkable inference speed. Moreover,
+SMART, as a generative model in the autonomous driving motion domain, exhibits
+zero-shot generalization capabilities: using only the NuPlan dataset for
+training and WOMD for validation, SMART achieved a competitive score of 0.72
+on the Sim Agents challenge. Lastly, we have collected over 1 billion motion
+tokens from multiple datasets, validating the model's scalability. These
+results suggest that SMART has begun to exhibit two important properties,
+scalability and zero-shot generalization, and preliminarily meets the needs of
+large-scale real-time simulation applications. We have released all the code
+to promote the exploration of models for motion generation in the autonomous
+driving field. The source code is available at
+https://github.com/rainmaker22/SMART.
+
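+ A compact sketch of the GPT-style formulation described above: map and
+trajectory elements are discretized into a shared token vocabulary and a
+decoder-only transformer is trained with a next-token cross-entropy loss. The
+vocabulary size, binning, and model dimensions below are placeholders, not the
+released model's configuration.
+
+    # Next-token prediction over discretized motion tokens (PyTorch).
+    import torch
+    import torch.nn as nn
+
+    VOCAB = 1024  # placeholder size of the discrete map/trajectory token vocabulary
+
+    class MotionGPT(nn.Module):
+        def __init__(self, d_model=256, n_layers=4, n_heads=8, max_len=512):
+            super().__init__()
+            self.tok = nn.Embedding(VOCAB, d_model)
+            self.pos = nn.Embedding(max_len, d_model)
+            layer = nn.TransformerEncoderLayer(d_model, n_heads, 4 * d_model, batch_first=True)
+            self.blocks = nn.TransformerEncoder(layer, n_layers)   # used with a causal mask
+            self.head = nn.Linear(d_model, VOCAB)
+
+        def forward(self, tokens: torch.Tensor) -> torch.Tensor:
+            B, T = tokens.shape
+            h = self.tok(tokens) + self.pos(torch.arange(T, device=tokens.device))
+            mask = nn.Transformer.generate_square_subsequent_mask(T).to(tokens.device)
+            return self.head(self.blocks(h, mask=mask))
+
+    def next_token_loss(model: MotionGPT, tokens: torch.Tensor) -> torch.Tensor:
+        logits = model(tokens[:, :-1])
+        return nn.functional.cross_entropy(logits.reshape(-1, VOCAB),
+                                           tokens[:, 1:].reshape(-1))
+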
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Autonomous Driving in Unstructured Environments: How Far Have We Come? + + +
+ Research on autonomous driving in unstructured outdoor environments is less
+advanced than in structured urban settings due to challenges such as
+environmental diversity and scene complexity. These environments, such as
+rural areas and rugged terrains, pose unique obstacles that are not common in
+structured urban areas. Despite these difficulties, autonomous driving in
+unstructured outdoor environments is crucial for applications in agriculture,
+mining, and military operations. Our survey reviews over 250 papers on
+autonomous driving in unstructured outdoor environments, covering offline
+mapping, pose estimation, environmental perception, path planning, end-to-end
+autonomous driving, datasets, and relevant challenges. We also discuss
+emerging trends and future research directions. This review aims to
+consolidate knowledge and encourage further research on autonomous driving in
+unstructured environments. To support ongoing work, we maintain an active
+repository with up-to-date literature and open-source projects at:
+https://github.com/chaytonmin/Survey-Autonomous-Driving-in-Unstructured-Environments.
+
+
+ comment: Survey paper; 38 pages +
+
+
+
+
+ + ♻ ☆ Caging in Time: A Framework for Robust Object Manipulation under + Uncertainties and Limited Robot Perception + + +
+ Real-world object manipulation is commonly challenged by physical
+uncertainties and perception limitations. Although caging configuration-based
+manipulation frameworks are an effective strategy and have successfully
+provided robust solutions, they are not broadly applicable due to their strict
+requirements on the availability of multiple robots, widely distributed
+contacts, or specific geometries of the robots or the objects. To this end,
+this work proposes a novel concept, termed Caging in Time, that allows caging
+configurations to be formed even if there is just one robot engaged in a task.
+This concept rests on the insight that even when a caging configuration is
+needed to constrain the motion of an object, only a small portion of the cage
+is actively manipulating the object at any given time. As such, we can switch
+the configuration of the robot strategically so that, by collapsing its
+configurations in time, a cage is formed with its necessary portion active
+whenever needed. We instantiate our Caging in Time theory on challenging
+quasistatic and dynamic manipulation tasks, showing that Caging in Time can be
+achieved in general state spaces, including geometry-based and energy-based
+spaces. With extensive experiments, we show robust and accurate manipulation,
+in an open-loop manner, without requiring detailed knowledge of the object
+geometry or physical properties, nor real-time, accurate feedback on the
+manipulation states. In addition to being an effective and robust open-loop
+manipulation solution, the proposed theory can be a supplementary strategy to
+other manipulation systems affected by uncertain or limited robot perception.
+
+
+ comment: 24 pages, 25 figures, video available at: + www.youtube.com/watch?v=Ag_jTzazuSM +
+
+
+
+
+ + ♻ ☆ 4CNet: A Diffusion Approach to Map Prediction for Decentralized + Multi-robot Exploration + + +
+ Mobile robots in unknown cluttered environments with irregularly shaped
+obstacles often face sensing, energy, and communication challenges which
+directly affect their ability to explore these environments. In this paper, we
+introduce a novel deep learning architecture, the Confidence-Aware Contrastive
+Conditional Consistency Model (4CNet), for robot map prediction during
+decentralized, resource-limited multi-robot exploration. 4CNet uniquely
+incorporates: 1) a conditional consistency model for map prediction in
+unstructured unknown regions, 2) a contrastive map-trajectory pretraining
+framework for a trajectory encoder that extracts spatial information from the
+trajectories of nearby robots during map prediction, and 3) a confidence
+network to measure the uncertainty of map prediction for effective exploration
+under resource constraints. We incorporate 4CNet within our proposed robot
+exploration with map prediction architecture, 4CNet-E. We then conduct
+extensive comparison studies with 4CNet-E and state-of-the-art heuristic and
+learning methods to investigate both map prediction and exploration
+performance in environments consisting of irregularly shaped obstacles and
+uneven terrain. Results showed that 4CNet-E obtained statistically
+significantly higher prediction accuracy and area coverage across varying
+environment sizes, numbers of robots, energy budgets, and communication
+limitations. Hardware experiments were performed and validated the
+applicability and generalizability of 4CNet-E in both unstructured indoor and
+real natural outdoor environments.
+
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Neural-Rendezvous: Provably Robust Guidance and Control to Encounter + Interstellar Objects + + +
+ Interstellar objects (ISOs) are likely representatives of primitive materials +invaluable in understanding exoplanetary star systems. Due to their poorly +constrained orbits with generally high inclinations and relative velocities, +however, exploring ISOs with conventional human-in-the-loop approaches is +significantly challenging. This paper presents Neural-Rendezvous -- a deep +learning-based guidance and control framework for encountering fast-moving +objects, including ISOs, robustly, accurately, and autonomously in real time. +It uses pointwise minimum norm tracking control on top of a guidance policy +modeled by a spectrally-normalized deep neural network, where its +hyperparameters are tuned with a loss function directly penalizing the MPC +state trajectory tracking error. We show that Neural-Rendezvous provides a high +probability exponential bound on the expected spacecraft delivery error, the +proof of which leverages stochastic incremental stability analysis. In +particular, it is used to construct a non-negative function with a +supermartingale property, explicitly accounting for the ISO state uncertainty +and the local nature of nonlinear state estimation guarantees. In numerical +simulations, Neural-Rendezvous is demonstrated to satisfy the expected error +bound for 100 ISO candidates. This performance is also empirically validated +using our spacecraft simulator and in high-conflict and distributed UAV swarm +reconfiguration with up to 20 UAVs. + +
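+ The spectrally-normalized policy mentioned above can be approximated with the
+standard spectral-norm wrapper in PyTorch, which constrains each layer's
+spectral norm and hence bounds the network's Lipschitz constant. The layer
+sizes and input layout below are illustrative assumptions, not the paper's
+guidance policy.
+
+    # Spectrally-normalized guidance-policy sketch (PyTorch).
+    import torch
+    import torch.nn as nn
+    from torch.nn.utils import spectral_norm
+
+    def sn_linear(in_f: int, out_f: int) -> nn.Module:
+        return spectral_norm(nn.Linear(in_f, out_f))   # bounds the layer's spectral norm
+
+    class GuidancePolicy(nn.Module):
+        """Maps (spacecraft state, estimated target state, time) to a commanded acceleration."""
+        def __init__(self, state_dim=6, ctrl_dim=3, hidden=128):
+            super().__init__()
+            self.net = nn.Sequential(
+                sn_linear(2 * state_dim + 1, hidden), nn.Tanh(),
+                sn_linear(hidden, hidden), nn.Tanh(),
+                sn_linear(hidden, ctrl_dim),
+            )
+
+        def forward(self, x_sc, x_target, t):
+            return self.net(torch.cat([x_sc, x_target, t], dim=-1))
+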
+
+ comment: Preprint Version, Accepted: October, 2024 (One-minute YouTube + summary: https://youtu.be/q3e0LYS2IYQ, DOI: + https://doi.org/10.2514/1.G007671) +
+
+
+
+
+ + ♻ ☆ Embodied Agent Interface: Benchmarking LLMs for Embodied Decision Making NeurIPS 2024 + + +
+ We aim to evaluate Large Language Models (LLMs) for embodied decision making. +While a significant body of work has been leveraging LLMs for decision making +in embodied environments, we still lack a systematic understanding of their +performance because they are usually applied in different domains, for +different purposes, and built based on different inputs and outputs. +Furthermore, existing evaluations tend to rely solely on a final success rate, +making it difficult to pinpoint what ability is missing in LLMs and where the +problem lies, which in turn blocks embodied agents from leveraging LLMs +effectively and selectively. To address these limitations, we propose a +generalized interface (Embodied Agent Interface) that supports the +formalization of various types of tasks and input-output specifications of +LLM-based modules. Specifically, it allows us to unify 1) a broad set of +embodied decision-making tasks involving both state and temporally extended +goals, 2) four commonly-used LLM-based modules for decision making: goal +interpretation, subgoal decomposition, action sequencing, and transition +modeling, and 3) a collection of fine-grained metrics which break down +evaluation into various types of errors, such as hallucination errors, +affordance errors, various types of planning errors, etc. Overall, our +benchmark offers a comprehensive assessment of LLMs' performance for different +subtasks, pinpointing the strengths and weaknesses in LLM-powered embodied AI +systems, and providing insights for effective and selective use of LLMs in +embodied decision making. + +
+
+ comment: Accepted for oral presentation at NeurIPS 2024 in the Datasets and + Benchmarks track. Camera-ready version +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 46 + +
+
+
+ + ♻ ☆ VascX Models: Model Ensembles for Retinal Vascular Analysis from Color + Fundus Images + + +
+ We introduce VascX models, a comprehensive set of model ensembles for
+analyzing retinal vasculature from color fundus images (CFIs). Annotated CFIs
+were aggregated from public datasets. Additional CFIs, mainly from the
+population-based Rotterdam Study, were annotated by graders for arteries and
+veins at the pixel level, resulting in a dataset that is diverse in patient
+demographics and imaging conditions. VascX models demonstrated superior
+segmentation performance across datasets, image quality levels, and anatomic
+regions when compared to existing, publicly available models, likely due to
+the increased size and variety of our training set. Important improvements
+were observed in artery-vein and disc segmentation performance, particularly
+in segmentations of these structures on CFIs of intermediate quality, which
+are common in large cohorts and clinical datasets. Importantly, these
+improvements translated into significantly more accurate vascular features
+when we compared features extracted from VascX segmentation masks with
+features extracted from segmentation masks generated by previous models. With
+VascX models we provide a robust, ready-to-use set of model ensembles and
+inference code aimed at simplifying the implementation and enhancing the
+quality of automated retinal vasculature analyses. The precise vessel
+parameters generated by the model can serve as starting points for the
+identification of disease patterns in and outside of the eye.
+
+
+
+
+
+ + ♻ ☆ DELTA: Dense Efficient Long-range 3D Tracking for any video + + +
+ Tracking dense 3D motion from monocular videos remains challenging, +particularly when aiming for pixel-level precision over long sequences. We +introduce DELTA, a novel method that efficiently tracks every pixel in 3D +space, enabling accurate motion estimation across entire videos. Our approach +leverages a joint global-local attention mechanism for reduced-resolution +tracking, followed by a transformer-based upsampler to achieve high-resolution +predictions. Unlike existing methods, which are limited by computational +inefficiency or sparse tracking, DELTA delivers dense 3D tracking at scale, +running over 8x faster than previous methods while achieving state-of-the-art +accuracy. Furthermore, we explore the impact of depth representation on +tracking performance and identify log-depth as the optimal choice. Extensive +experiments demonstrate the superiority of DELTA on multiple benchmarks, +achieving new state-of-the-art results in both 2D and 3D dense tracking tasks. +Our method provides a robust solution for applications requiring fine-grained, +long-term motion tracking in 3D space. + +
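+ The log-depth representation noted above amounts to predicting depth in
+logarithmic space, which allocates relatively finer resolution to nearby
+structure than to far background. A small helper, with the clamping threshold
+as an assumed detail:
+
+    # Log-depth encode/decode helpers (NumPy).
+    import numpy as np
+
+    def to_log_depth(depth: np.ndarray, eps: float = 1e-3) -> np.ndarray:
+        """Map metric depth to log space; eps guards against zero or invalid depth."""
+        return np.log(np.clip(depth, eps, None))
+
+    def from_log_depth(log_depth: np.ndarray) -> np.ndarray:
+        return np.exp(log_depth)
+
+    # Example: pixels at 0.5 m and 50 m differ by log(100) ~= 4.6 in log-depth,
+    # so near-range differences are not swamped by the far background.
+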
+
+ comment: Project Page: https://snap-research.github.io/DELTA/ +
+
+
+
+
+ + ♻ ☆ BehAVE: Behaviour Alignment of Video Game Encodings + + +
+ Domain randomisation enhances the transferability of vision models across +visually distinct domains with similar content. However, current methods +heavily depend on intricate simulation engines, hampering feasibility and +scalability. This paper introduces BehAVE, a video understanding framework that +utilises existing commercial video games for domain randomisation without +accessing their simulation engines. BehAVE taps into the visual diversity of +video games for randomisation and uses textual descriptions of player actions +to align videos with similar content. We evaluate BehAVE across 25 first-person +shooter (FPS) games using various video and text foundation models, +demonstrating its robustness in domain randomisation. BehAVE effectively aligns +player behavioural patterns and achieves zero-shot transfer to multiple unseen +FPS games when trained on just one game. In a more challenging scenario, BehAVE +enhances the zero-shot transferability of foundation models to unseen FPS +games, even when trained on a game of a different genre, with improvements of +up to 22%. BehAVE is available online at https://github.com/nrasajski/BehAVE. + +
+
+
+
+
+ + ♻ ☆ Aligning Motion-Blurred Images Using Contrastive Learning on + Overcomplete Pixels + + +
+ We propose a new contrastive objective for learning overcomplete pixel-level +features that are invariant to motion blur. Other invariances (e.g., pose, +illumination, or weather) can be learned by applying the corresponding +transformations on unlabeled images during self-supervised training. We +showcase that a simple U-Net trained with our objective can produce local +features useful for aligning the frames of an unseen video captured with a +moving camera under realistic and challenging conditions. Using a carefully +designed toy example, we also show that the overcomplete pixels can encode the +identity of objects in an image and the pixel coordinates relative to these +objects. + +
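+ A bare-bones version of a pixel-level contrastive objective of the kind
+described above: features of the same pixel under a clean view and a
+motion-blurred view are pulled together, with other sampled pixels acting as
+negatives (an InfoNCE-style loss). The temperature and sampling scheme are
+assumed details, not the paper's exact objective.
+
+    # Pixel-wise InfoNCE between clean and blurred views (PyTorch).
+    import torch
+    import torch.nn.functional as F
+
+    def pixel_contrastive_loss(feat_clean, feat_blur, n_pixels=1024, tau=0.07):
+        """feat_*: (B, C, H, W) dense features of the same image, clean vs. motion-blurred."""
+        B, C, H, W = feat_clean.shape
+        f1 = feat_clean.permute(0, 2, 3, 1).reshape(-1, C)
+        f2 = feat_blur.permute(0, 2, 3, 1).reshape(-1, C)
+        idx = torch.randperm(f1.shape[0], device=f1.device)[:n_pixels]  # matching locations
+        z1 = F.normalize(f1[idx], dim=-1)
+        z2 = F.normalize(f2[idx], dim=-1)
+        logits = z1 @ z2.t() / tau                     # positives lie on the diagonal
+        targets = torch.arange(z1.shape[0], device=logits.device)
+        return F.cross_entropy(logits, targets)
+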
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ GeoSplatting: Towards Geometry Guided Gaussian Splatting for + Physically-based Inverse Rendering + + +
+ We consider the problem of physically-based inverse rendering using 3D +Gaussian Splatting (3DGS) representations. While recent 3DGS methods have +achieved remarkable results in novel view synthesis (NVS), accurately capturing +high-fidelity geometry, physically interpretable materials and lighting remains +challenging, as it requires precise geometry modeling to provide accurate +surface normals, along with physically-based rendering (PBR) techniques to +ensure correct material and lighting disentanglement. Previous 3DGS methods +resort to approximating surface normals, but often struggle with noisy local +geometry, leading to inaccurate normal estimation and suboptimal +material-lighting decomposition. In this paper, we introduce GeoSplatting, a +novel hybrid representation that augments 3DGS with explicit geometric guidance +and differentiable PBR equations. Specifically, we bridge isosurface and 3DGS +together, where we first extract isosurface mesh from a scalar field, then +convert it into 3DGS points and formulate PBR equations for them in a fully +differentiable manner. In GeoSplatting, 3DGS is grounded on the mesh geometry, +enabling precise surface normal modeling, which facilitates the use of PBR +frameworks for material decomposition. This approach further maintains the +efficiency and quality of NVS from 3DGS while ensuring accurate geometry from +the isosurface. Comprehensive evaluations across diverse datasets demonstrate +the superiority of GeoSplatting, consistently outperforming existing methods +both quantitatively and qualitatively. + +
+
+ comment: Project page: https://pku-vcl-geometry.github.io/GeoSplatting/ +
+
+
+
+
+ + ♻ ☆ HENASY: Learning to Assemble Scene-Entities for Egocentric + Video-Language Model NeurIPS 2024 + + +
+ Current video-language models (VLMs) rely extensively on instance-level
+alignment between the video and language modalities, which presents two major
+limitations: (1) visual reasoning does not follow the natural first-person
+perception of humans, leading to a lack of reasoning interpretability; and (2)
+learning is limited in capturing the inherent fine-grained relationships
+between the two modalities.
+ In this paper, we take inspiration from human perception and explore a
+compositional approach for egocentric video representation. We introduce HENASY
+(Hierarchical ENtities ASsemblY), which includes a spatiotemporal token
+grouping mechanism to explicitly assemble dynamically evolving scene entities
+through time and model their relationships for video representation. By
+leveraging compositional structure understanding, HENASY possesses strong
+interpretability via visual grounding with free-form text queries. We further
+explore a suite of multi-grained contrastive losses to facilitate
+entity-centric understanding. This comprises three alignment types:
+video-narration, noun-entity, and verb-entities alignments.
+ Our method demonstrates strong interpretability in both quantitative and
+qualitative experiments, while maintaining competitive performance on five
+downstream tasks via zero-shot transfer or as video/text representation,
+including video/text retrieval, action recognition, multi-choice query,
+natural language query, and moments query.
+
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ CaptainCook4D: A Dataset for Understanding Errors in Procedural + Activities + + +
+ Following step-by-step procedures is an essential component of various
+activities carried out by individuals in their daily lives. These procedures
+serve as a guiding framework that helps to achieve goals efficiently, whether
+it is assembling furniture or preparing a recipe. However, the complexity and
+duration of procedural activities inherently increase the likelihood of making
+errors. Understanding such procedural activities from a sequence of frames is
+a challenging task that demands an accurate interpretation of visual
+information and the ability to reason about the structure of the activity. To
+this end, we collect a new egocentric 4D dataset, CaptainCook4D, comprising
+384 recordings (94.5 hours) of people performing recipes in real kitchen
+environments. This dataset consists of two distinct types of activity: one in
+which participants adhere to the provided recipe instructions and another in
+which they deviate and induce errors. We provide 5.3K step annotations and 10K
+fine-grained action annotations and benchmark the dataset for the following
+tasks: supervised error recognition, multistep localization, and procedure
+learning.
+
+
+ comment: Accepted to the 2024 Neural Information Processing Systems Datasets + and Benchmarks Track, Project Page: + https://captaincook4d.github.io/captain-cook/ +
+
+
+
+
+ + ♻ ☆ Comparing YOLO11 and YOLOv8 for instance segmentation of occluded and + non-occluded immature green fruits in complex orchard environment + + +
+ This study conducted a comprehensive performance evaluation on YOLO11 and
+YOLOv8, the latest in the "You Only Look Once" (YOLO) series, focusing on
+their instance segmentation capabilities for immature green apples in orchard
+environments. YOLO11n-seg achieved the highest mask precision across all
+categories with a notable score of 0.831, highlighting its effectiveness in
+fruit detection. YOLO11m-seg and YOLO11l-seg excelled in non-occluded and
+occluded fruitlet segmentation with scores of 0.851 and 0.829, respectively.
+Additionally, YOLO11x-seg led in mask recall for all categories, achieving a
+score of 0.815, with YOLO11m-seg performing best for non-occluded immature
+green fruitlets at 0.858 and YOLOv8x-seg leading the occluded category with
+0.800. In terms of mean average precision at a 50% intersection over union
+(mAP@50), YOLO11m-seg consistently outperformed, registering the highest
+scores for both box and mask segmentation, at 0.876 and 0.860 for the "All"
+class and 0.908 and 0.909 for non-occluded immature fruitlets, respectively.
+YOLO11l-seg and YOLOv8l-seg shared the top box mAP@50 for occluded immature
+fruitlets at 0.847, while YOLO11m-seg achieved the highest mask mAP@50 of
+0.810. Despite the advancements in YOLO11, YOLOv8n surpassed its counterparts
+in image processing speed, with an impressive inference speed of 3.3
+milliseconds, compared to the fastest YOLO11 series model at 4.8 milliseconds,
+underscoring its suitability for real-time agricultural applications related
+to complex green fruit environments.
+
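+ For readers who want to reproduce this kind of comparison, both model
+families are exposed through the ultralytics Python package. The snippet below
+assumes that package and its pretrained segmentation checkpoints are installed
+and available, and uses a placeholder image path.
+
+    # Comparing YOLO11 and YOLOv8 segmentation checkpoints with the ultralytics package.
+    from ultralytics import YOLO
+
+    for weights in ["yolo11n-seg.pt", "yolov8n-seg.pt"]:
+        model = YOLO(weights)
+        results = model.predict("orchard_image.jpg", conf=0.25)  # placeholder image path
+        masks = results[0].masks                                  # instance masks (or None)
+        print(weights, "instances:", 0 if masks is None else len(masks))
+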
+
+ comment: 16 Pages, 10 Figures, 3 Tables +
+
+
+
+
+ + ♻ ☆ Digital Twins in Additive Manufacturing: A Systematic Review + + +
+ Digital Twins (DTs) are becoming popular in Additive Manufacturing (AM) due +to their ability to create virtual replicas of physical components of AM +machines, which helps in real-time production monitoring. Advanced techniques +such as Machine Learning (ML), Augmented Reality (AR), and simulation-based +models play key roles in developing intelligent and adaptable DTs in +manufacturing processes. However, questions remain regarding scalability, the +integration of high-quality data, and the computational power required for +real-time applications in developing DTs. Understanding the current state of +DTs in AM is essential to address these challenges and fully utilize their +potential in advancing AM processes. Considering this opportunity, this work +aims to provide a comprehensive overview of DTs in AM by addressing the +following four research questions: (1) What are the key types of DTs used in AM +and their specific applications? (2) What are the recent developments and +implementations of DTs? (3) How are DTs employed in process improvement and +hybrid manufacturing? (4) How are DTs integrated with Industry 4.0 +technologies? By discussing current applications and techniques, we aim to +offer a better understanding and potential future research directions for +researchers and practitioners in AM and DTs. + +
+
+
+
+
+ + ♻ ☆ A survey on deep learning in medical image registration: new + technologies, uncertainty, evaluation metrics, and beyond + + +
+ Deep learning technologies have dramatically reshaped the field of medical +image registration over the past decade. The initial developments, such as +regression-based and U-Net-based networks, established the foundation for deep +learning in image registration. Subsequent progress has been made in various +aspects of deep learning-based registration, including similarity measures, +deformation regularizations, network architectures, and uncertainty estimation. +These advancements have not only enriched the field of image registration but +have also facilitated its application in a wide range of tasks, including atlas +construction, multi-atlas segmentation, motion estimation, and 2D-3D +registration. In this paper, we present a comprehensive overview of the most +recent advancements in deep learning-based image registration. We begin with a +concise introduction to the core concepts of deep learning-based image +registration. Then, we delve into innovative network architectures, loss +functions specific to registration, and methods for estimating registration +uncertainty. Additionally, this paper explores appropriate evaluation metrics +for assessing the performance of deep learning models in registration tasks. +Finally, we highlight the practical applications of these novel techniques in +medical imaging and discuss the future prospects of deep learning-based image +registration. + +
+
+ comment: Accepted to Medical Image Analysis ((c) MedIA). A list of + open-sourced code from the papers reviewed has been organized and is + available at https://bit.ly/3QgFJ9z +
+
+
+
+
+ + ♻ ☆ DenoiseRep: Denoising Model for Representation Learning NeurIPS 2024 + + +
+ The denoising model has been proven a powerful generative model but has seen
+little exploration in discriminative tasks. Representation learning is
+important in discriminative tasks, which is defined as "learning
+representations (or features) of the data that make it easier to extract
+useful information when building classifiers or other predictors". In this
+paper, we propose a novel Denoising Model for Representation Learning
+(DenoiseRep) to improve feature discrimination with joint feature extraction
+and denoising. DenoiseRep views each embedding layer in a backbone as a
+denoising layer, processing the cascaded embedding layers as if we were
+recursively denoising features step by step. This unifies the frameworks of
+feature extraction and denoising, where the former progressively embeds
+features from low-level to high-level, and the latter recursively denoises
+features step by step. After that, DenoiseRep fuses the parameters of the
+feature extraction and denoising layers, and theoretically demonstrates its
+equivalence before and after the fusion, thus making feature denoising
+computation-free. DenoiseRep is a label-free algorithm that incrementally
+improves features, and it is also complementary to labels when they are
+available. Experimental results on various discriminative vision tasks,
+including re-identification (Market-1501, DukeMTMC-reID, MSMT17, CUHK-03,
+vehicleID), image classification (ImageNet, UB200, Oxford-Pet, Flowers),
+object detection (COCO), and image segmentation (ADE20K) show stable and
+impressive improvements. We also validate its effectiveness on CNN (ResNet)
+and Transformer (ViT, Swin, VMamba) architectures.
+
+
+ comment: Accepted by NeurIPS 2024,oral +
+
+
+
+
+ + ♻ ☆ Return of Unconditional Generation: A Self-supervised Representation + Generation Method + + +
+ Unconditional generation -- the problem of modeling data distribution without +relying on human-annotated labels -- is a long-standing and fundamental +challenge in generative models, creating a potential of learning from +large-scale unlabeled data. In the literature, the generation quality of an +unconditional method has been much worse than that of its conditional +counterpart. This gap can be attributed to the lack of semantic information +provided by labels. In this work, we show that one can close this gap by +generating semantic representations in the representation space produced by a +self-supervised encoder. These representations can be used to condition the +image generator. This framework, called Representation-Conditioned Generation +(RCG), provides an effective solution to the unconditional generation problem +without using labels. Through comprehensive experiments, we observe that RCG +significantly improves unconditional generation quality: e.g., it achieves a +new state-of-the-art FID of 2.15 on ImageNet 256x256, largely reducing the +previous best of 5.91 by a relative 64%. Our unconditional results are situated +in the same tier as the leading class-conditional ones. We hope these +encouraging observations will attract the community's attention to the +fundamental problem of unconditional generation. Code is available at +https://github.com/LTH14/rcg. + +
+
+ comment: Neurips 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Erasing Self-Supervised Learning Backdoor by Cluster Activation Masking + + +
+ Self-Supervised Learning (SSL) is an effective paradigm for learning
+representations from unlabeled data, such as text, images, and videos.
+However, researchers have recently found that SSL is vulnerable to backdoor
+attacks. The attacker can embed hidden SSL backdoors via a few poisoned
+examples in the training dataset and maliciously manipulate the behavior of
+downstream models. To defend against SSL backdoor attacks, a feasible route is
+to detect and remove the poisonous samples in the training set. However, the
+existing SSL backdoor defense method fails to detect the poisonous samples
+precisely. In this paper, we propose to erase the SSL backdoor by cluster
+activation masking and propose a novel PoisonCAM method. After obtaining the
+threat model trained on the poisoned dataset, our method can precisely detect
+poisonous samples based on the assumption that masking the backdoor trigger
+can effectively change the activation of a downstream clustering model. In
+experiments, our PoisonCAM achieves 96% accuracy for backdoor trigger
+detection compared to 3% of the state-of-the-art method on poisoned
+ImageNet-100. Moreover, our proposed PoisonCAM significantly improves the
+performance of the trained SSL model under backdoor attacks compared to the
+state-of-the-art method. Our code, data, and trained models will be open once
+this paper is accepted.
+
+
+
+
+
+ + ♻ ☆ Autoregressive Image Generation without Vector Quantization + + +
+ Conventional wisdom holds that autoregressive models for image generation are +typically accompanied by vector-quantized tokens. We observe that while a +discrete-valued space can facilitate representing a categorical distribution, +it is not a necessity for autoregressive modeling. In this work, we propose to +model the per-token probability distribution using a diffusion procedure, which +allows us to apply autoregressive models in a continuous-valued space. Rather +than using categorical cross-entropy loss, we define a Diffusion Loss function +to model the per-token probability. This approach eliminates the need for +discrete-valued tokenizers. We evaluate its effectiveness across a wide range +of cases, including standard autoregressive models and generalized masked +autoregressive (MAR) variants. By removing vector quantization, our image +generator achieves strong results while enjoying the speed advantage of +sequence modeling. We hope this work will motivate the use of autoregressive +generation in other continuous-valued domains and applications. Code is +available at: https://github.com/LTH14/mar. + +
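+ A toy rendering of the Diffusion Loss idea above: instead of a softmax over a
+discrete codebook, a small noise-prediction MLP is trained per token,
+conditioned on the autoregressive model's output vector z. The network sizes
+and the simple DDPM-style noising schedule here are illustrative assumptions,
+not the released implementation.
+
+    # Per-token diffusion loss replacing categorical cross-entropy (PyTorch).
+    import torch
+    import torch.nn as nn
+
+    class DiffusionLoss(nn.Module):
+        def __init__(self, token_dim=16, cond_dim=256, hidden=256, T=1000):
+            super().__init__()
+            self.T = T
+            betas = torch.linspace(1e-4, 0.02, T)
+            self.register_buffer("abar", torch.cumprod(1.0 - betas, dim=0))
+            self.eps_net = nn.Sequential(                 # predicts the added noise
+                nn.Linear(token_dim + cond_dim + 1, hidden), nn.SiLU(),
+                nn.Linear(hidden, hidden), nn.SiLU(),
+                nn.Linear(hidden, token_dim),
+            )
+
+        def forward(self, x, z):
+            """x: (N, token_dim) continuous ground-truth tokens; z: (N, cond_dim) AR outputs."""
+            t = torch.randint(0, self.T, (x.shape[0],), device=x.device)
+            abar = self.abar[t].unsqueeze(-1)
+            eps = torch.randn_like(x)
+            x_t = abar.sqrt() * x + (1 - abar).sqrt() * eps   # forward noising
+            t_emb = (t.float() / self.T).unsqueeze(-1)
+            eps_hat = self.eps_net(torch.cat([x_t, z, t_emb], dim=-1))
+            return ((eps_hat - eps) ** 2).mean()
+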
+
+ comment: Neurips 2024 (Spotlight). Code: https://github.com/LTH14/mar +
+
+
+
+
+ + ♻ ☆ Disentangling spatio-temporal knowledge for weakly supervised object + detection and segmentation in surgical video + + +
+ Weakly supervised video object segmentation (WSVOS) enables the
+identification of segmentation maps without requiring an extensive training
+dataset of object masks, relying instead on coarse video labels indicating
+object presence. Current state-of-the-art methods either require multiple
+independent stages of processing that employ motion cues or, in the case of
+end-to-end trainable networks, lack segmentation accuracy, in part due to the
+difficulty of learning segmentation maps from videos with transient object
+presence. This limits the application of WSVOS for semantic annotation of
+surgical videos where multiple surgical tools frequently move in and out of
+the field of view, a problem that is more difficult than typically encountered
+in WSVOS. This paper introduces Video Spatio-Temporal Disentanglement Networks
+(VDST-Net), a framework to disentangle spatiotemporal information using
+semi-decoupled knowledge distillation to predict high-quality class activation
+maps (CAMs). A teacher network designed to resolve temporal conflicts when
+specifics about object location and timing in the video are not provided works
+with a student network that integrates information over time by leveraging
+temporal dependencies. We demonstrate the efficacy of our framework on a
+public reference dataset and on a more challenging surgical video dataset
+where objects are, on average, present in less than 60% of annotated frames.
+Our method outperforms state-of-the-art techniques and generates superior
+segmentation masks under video-level weak supervision.
+
+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) +
+
+
+
+
+ + ♻ ☆ ConvBKI: Real-Time Probabilistic Semantic Mapping Network with + Quantifiable Uncertainty + + +
+ In this paper, we develop a modular neural network for real-time (> 10 Hz)
+semantic mapping in uncertain environments, which explicitly updates per-voxel
+probabilistic distributions within a neural network layer. Our approach
+combines the reliability of classical probabilistic algorithms with the
+performance and efficiency of modern neural networks. Although robotic
+perception is often divided between modern differentiable methods and
+classical explicit methods, a union of both is necessary for real-time and
+trustworthy performance. We introduce a novel Convolutional Bayesian Kernel
+Inference (ConvBKI) layer which incorporates semantic segmentation predictions
+online into a 3D map through a depthwise convolution layer by leveraging
+conjugate priors. We compare ConvBKI against state-of-the-art deep learning
+approaches and probabilistic algorithms for mapping to evaluate reliability
+and performance. We also create a Robot Operating System (ROS) package of
+ConvBKI and test it on real-world perceptually challenging off-road driving
+data.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2209.10663 +
+
+
+
+
+ + ♻ ☆ Kuro Siwo: 33 billion $m^2$ under the water. A global multi-temporal + satellite dataset for rapid flood mapping NeurIPS 2024 + + +
+ Global floods, exacerbated by climate change, pose severe threats to human +life, infrastructure, and the environment. Recent catastrophic events in +Pakistan and New Zealand underscore the urgent need for precise flood mapping +to guide restoration efforts, understand vulnerabilities, and prepare for +future occurrences. While Synthetic Aperture Radar (SAR) remote sensing offers +day-and-night, all-weather imaging capabilities, its application in deep +learning for flood segmentation is limited by the lack of large annotated +datasets. To address this, we introduce Kuro Siwo, a manually annotated +multi-temporal dataset, spanning 43 flood events globally. Our dataset maps +more than 338 billion $m^2$ of land, with 33 billion designated as either +flooded areas or permanent water bodies. Kuro Siwo includes a highly processed +product optimized for flood mapping based on SAR Ground Range Detected, and a +primal SAR Single Look Complex product with minimal preprocessing, designed to +promote research on the exploitation of both the phase and amplitude +information and to offer maximum flexibility for downstream task preprocessing. +To leverage advances in large scale self-supervised pretraining methods for +remote sensing data, we augment Kuro Siwo with a large unlabeled set of SAR +samples. Finally, we provide an extensive benchmark, namely BlackBench, +offering strong baselines for a diverse set of flood events from Europe, +America, Africa, Asia and Australia. + +
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ On-Air Deep Learning Integrated Semantic Inference Models for Enhanced + Earth Observation Satellite Networks + + +
+ Earth Observation (EO) systems are crucial for cartography, disaster +surveillance, and resource administration. Nonetheless, they encounter +considerable obstacles in the processing and transmission of extensive data, +especially in specialized domains such as precision agriculture and real-time +disaster response. Earth observation satellites, outfitted with remote sensing +technology, gather data from onboard sensors and IoT-enabled terrestrial +objects, delivering important information remotely. Domain-adapted Large +Language Models (LLMs) provide a solution by enabling the integration of raw +and processed EO data. Through domain adaptation, LLMs improve the assimilation +and analysis of many data sources, tackling the intricacies of specialized +datasets in agriculture and disaster response. This data synthesis, directed by +LLMs, enhances the precision and pertinence of conveyed information. This study +provides a thorough examination of using semantic inference and deep learning +for sophisticated EO systems. It presents an innovative architecture for +semantic communication in EO satellite networks, designed to improve data +transmission efficiency using semantic processing methodologies. Recent +advancements in onboard processing technologies enable dependable, adaptable, +and energy-efficient data management in orbit. These improvements guarantee +reliable performance in adverse space circumstances using radiation-hardened +and reconfigurable technology. Collectively, these advancements enable +next-generation satellite missions with improved processing capabilities, +crucial for operational flexibility and real-time decision-making in 6G +satellite communication. + +
+
+ comment: 17 pages, 7 figures, Journal +
+
+
+
+
+ + ♻ ☆ Video Diffusion Models are Training-free Motion Interpreter and + Controller + + +
+ Video generation primarily aims to model authentic and customized motion
+across frames, making understanding and controlling the motion a crucial
+topic. Most diffusion-based studies on video motion focus on motion
+customization with training-based paradigms, which, however, demand
+substantial training resources and necessitate retraining for diverse models.
+Crucially, these approaches do not explore how video diffusion models encode
+cross-frame motion information in their features, lacking interpretability and
+transparency in their effectiveness. To answer this question, this paper
+introduces a novel perspective to understand, localize, and manipulate
+motion-aware features in video diffusion models. Through analysis using
+Principal Component Analysis (PCA), our work discloses that robust
+motion-aware features already exist in video diffusion models. We present a
+new MOtion FeaTure (MOFT) by eliminating content correlation information and
+filtering motion channels. MOFT provides a distinct set of benefits, including
+the ability to encode comprehensive motion information with clear
+interpretability, extraction without the need for training, and
+generalizability across diverse architectures. Leveraging MOFT, we propose a
+novel training-free video motion control framework. Our method demonstrates
+competitive performance in generating natural and faithful motion, providing
+architecture-agnostic insights and applicability in a variety of downstream
+tasks.
+
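+ A rough sketch of the analysis described above: stack diffusion features
+across frames, remove the per-location content component (the mean over
+frames), and inspect principal components of the residual, which is where
+frame-to-frame motion variation lives. The feature shapes and the use of
+torch.pca_lowrank are assumptions for illustration, not the paper's pipeline.
+
+    # PCA over cross-frame diffusion features to expose motion-aware directions (PyTorch).
+    import torch
+
+    def motion_components(feats: torch.Tensor, k: int = 8) -> torch.Tensor:
+        """feats: (T, C, H, W) intermediate diffusion features for T frames of one video."""
+        T, C, H, W = feats.shape
+        x = feats.permute(2, 3, 0, 1).reshape(H * W, T, C)   # per-location trajectories
+        x = x - x.mean(dim=1, keepdim=True)                  # strip static content per location
+        flat = x.reshape(-1, C)
+        # Principal directions of the residual capture cross-frame (motion) variation.
+        _, _, V = torch.pca_lowrank(flat, q=k)
+        return V                                             # (C, k) motion-aware basis
+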
+
+ comment: Project Page: https://xizaoqu.github.io/moft/ +
+
+
+
+
+ + ♻ ☆ Improving Generalization in Visual Reasoning via Self-Ensemble + + +
+ The cognitive faculty of visual reasoning necessitates the integration of
+multimodal perceptual processing with commonsense and external knowledge of
+the world. In recent years, a plethora of large vision-language models (LVLMs)
+have been proposed, demonstrating outstanding power and exceptional
+proficiency in commonsense reasoning across diverse domains and tasks.
+Nevertheless, training such LVLMs requires a lot of costly resources. Recent
+approaches, instead of training LVLMs from scratch on various large datasets,
+focus on exploring ways to take advantage of the capabilities of many
+different LVLMs, such as ensemble methods. In this work, we propose
+self-ensemble, a novel training-free method that improves the generalization
+and visual reasoning of the model without updating any parameters. Our key
+insight is that an LVLM can ensemble with itself, without the need for any
+other LVLMs, which helps to unlock its internal capabilities. Extensive
+experiments on various benchmarks demonstrate the effectiveness of our method
+in achieving state-of-the-art (SOTA) performance on SketchyVQA, Outside
+Knowledge VQA, and out-of-distribution VQA tasks.
+
+
+
+
+
+ + ♻ ☆ FRoundation: Are Foundation Models Ready for Face Recognition? + + +
+ Foundation models are predominantly trained in an unsupervised or +self-supervised manner on highly diverse and large-scale datasets, making them +broadly applicable to various downstream tasks. In this work, we investigate +for the first time whether such models are suitable for the specific domain of +face recognition. We further propose and demonstrate the adaptation of these +models for face recognition across different levels of data availability. +Extensive experiments are conducted on multiple foundation models and datasets +of varying scales for training and fine-tuning, with evaluation on a wide range +of benchmarks. Our results indicate that, despite their versatility, +pre-trained foundation models underperform in face recognition compared to +similar architectures trained specifically for this task. However, fine-tuning +foundation models yields promising results, often surpassing models trained +from scratch when training data is limited. Even with access to large-scale +face recognition training datasets, fine-tuned foundation models perform +comparably to models trained from scratch, but with lower training +computational costs and without relying on the assumption of extensive data +availability. Our analysis also explores bias in face recognition, with +slightly higher bias observed in some settings when using foundation models. + +
+
+
+
+
+ + ♻ ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models, +especially for long video understanding. We introduce LongVILA, a full-stack +solution for long-context visual-language models by co-designing the algorithm +and system. For model training, we upgrade existing VLMs to support long video +understanding by incorporating two additional stages, i.e., long context +extension and long video supervised fine-tuning. However, training on long +video is computationally and memory intensive. We introduce the long-context +Multi-Modal Sequence Parallelism (MM-SP) system that efficiently parallelizes +long video training and inference, enabling 2M context length training on 256 +GPUs without any gradient checkpointing. LongVILA efficiently extends the +number of video frames of VILA from 8 to 2048, improving the long video +captioning score from 2.00 to 3.26 (out of 5), achieving 99.8% accuracy in +6,000-frame (more than 1 million tokens) video needle-in-a-haystack. +LongVILA-7B demonstrates strong accuracy on the VideoMME benchmark, i.e., 61.8% +with subtitle. Besides, MM-SP is 2.1x - 5.7x faster than ring style sequence +parallelism and 1.1x - 1.4x faster than Megatron with a hybrid context and +tensor parallelism. Moreover, it seamlessly integrates with Hugging Face +Transformers. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ♻ ☆ Conditional GAN for Enhancing Diffusion Models in Efficient and + Authentic Global Gesture Generation from Audios + + +
+ Audio-driven simultaneous gesture generation is vital for human-computer
+communication, AI games, and film production. While previous research has
+shown promise, there are still limitations. Methods based on VAEs are
+accompanied by issues of local jitter and global instability, whereas methods
+based on diffusion models are hampered by low generation efficiency. This is
+because the denoising process of DDPM in the latter relies on the assumption
+that the noise added at each step is sampled from a unimodal distribution, and
+the noise values are small. DDIM borrows the idea from the Euler method for
+solving differential equations, disrupts the Markov chain process, and
+increases the noise step size to reduce the number of denoising steps, thereby
+accelerating generation. However, simply increasing the step size during the
+step-by-step denoising process causes the results to gradually deviate from
+the original data distribution, leading to a significant drop in the quality
+of the generated actions and the emergence of unnatural artifacts. In this
+paper, we break the assumptions of DDPM and achieve breakthrough progress in
+denoising speed and fidelity. Specifically, we introduce a conditional GAN to
+capture audio control signals and implicitly match the multimodal denoising
+distribution between the diffusion and denoising steps within the same
+sampling step, aiming to sample larger noise values and apply fewer denoising
+steps for high-speed generation.
+
+
+ comment: Accepted by WACV 2025 (Round 1) +
+
+
+
+
+ + ♻ ☆ RopeTP: Global Human Motion Recovery via Integrating Robust Pose + Estimation with Diffusion Trajectory Prior + + +
+ We present RopeTP, a novel framework that combines Robust pose estimation +with a diffusion Trajectory Prior to reconstruct global human motion from +videos. At the heart of RopeTP is a hierarchical attention mechanism that +significantly improves context awareness, which is essential for accurately +inferring the posture of occluded body parts. This is achieved by exploiting +the relationships with visible anatomical structures, enhancing the accuracy of +local pose estimations. The improved robustness of these local estimations +allows for the reconstruction of precise and stable global trajectories. +Additionally, RopeTP incorporates a diffusion trajectory model that predicts +realistic human motion from local pose sequences. This model ensures that the +generated trajectories are not only consistent with observed local actions but +also unfold naturally over time, thereby improving the realism and stability of +3D human motion reconstruction. Extensive experimental validation shows that +RopeTP surpasses current methods on two benchmark datasets, particularly +excelling in scenarios with occlusions. It also outperforms methods that rely +on SLAM for initial camera estimates and extensive optimization, delivering +more accurate and realistic trajectories. + +
+
+ comment: Accepted by WACV 2025 (Round 1) +
+
+
+
+
+ + ♻ ☆ Adversarial Purification and Fine-tuning for Robust UDC Image + Restoration + + +
+ This study delves into the enhancement of Under-Display Camera (UDC) image +restoration models, focusing on their robustness against adversarial attacks. +Despite its innovative approach to seamless display integration, UDC technology +faces unique image degradation challenges exacerbated by the susceptibility to +adversarial perturbations. Our research initially conducts an in-depth +robustness evaluation of deep-learning-based UDC image restoration models by +employing several white-box and black-box attacking methods. This evaluation is +pivotal in understanding the vulnerabilities of current UDC image restoration +techniques. Following the assessment, we introduce a defense framework +integrating adversarial purification with subsequent fine-tuning processes. +First, our approach employs diffusion-based adversarial purification, +effectively neutralizing adversarial perturbations. Then, we apply the +fine-tuning methodologies to refine the image restoration models further, +ensuring that the quality and fidelity of the restored images are maintained. +The effectiveness of our proposed approach is validated through extensive +experiments, showing marked improvements in resilience against typical +adversarial attacks. + +
+
+ comment: Failure to meet expectations +
+
+
+
+
+ + ♻ ☆ Posture-Informed Muscular Force Learning for Robust Hand Pressure + Estimation NeurIPS 2024 + + +
+ We present PiMForce, a novel framework that enhances hand pressure estimation +by leveraging 3D hand posture information to augment forearm surface +electromyography (sEMG) signals. Our approach utilizes detailed spatial +information from 3D hand poses in conjunction with dynamic muscle activity from +sEMG to enable accurate and robust whole-hand pressure measurements under +diverse hand-object interactions. We also developed a multimodal data +collection system that combines a pressure glove, an sEMG armband, and a +markerless finger-tracking module. We created a comprehensive dataset from 21 +participants, capturing synchronized data of hand posture, sEMG signals, and +exerted hand pressure across various hand postures and hand-object interaction +scenarios using our collection system. Our framework enables precise hand +pressure estimation in complex and natural interaction scenarios. Our approach +substantially mitigates the limitations of traditional sEMG-based or +vision-based methods by integrating 3D hand posture information with sEMG +signals. Video demos, data, and code are available online. + +
+
+ comment: Accepted to NeurIPS 2024. Project Page Link: + https://pimforce.hcitech.org/ +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors.
+Existing methods discover error slices by clustering the error-prone samples
+with similar patterns or assigning discrete attributes to each sample for
+post-hoc analysis. While these methods aim for interpretability and easier
+mitigation through reweighting or rebalancing, they may not capture the full
+complexity of error patterns due to incomplete or missing attributes. Contrary
+to the existing approaches, this paper utilizes the reasoning capabilities of
+the Large Language Model (LLM) to analyze complex error patterns and generate
+testable hypotheses. This paper proposes LADDER: Language Driven slice
+Discovery and Error Rectification. It first projects the model's
+representation into a language-aligned feature space (e.g., CLIP) to preserve
+semantics in the original model feature space. This ensures the accurate
+retrieval of sentences that highlight the model's errors. Next, the LLM
+utilizes the sentences and generates hypotheses to discover error slices.
+Finally, we mitigate the errors by fine-tuning the classification head on a
+group-balanced dataset created using the hypotheses. Our entire method does
+not require any attribute annotation, either explicitly or through external
+tagging models. We validate our method with five image classification
+datasets. The code is available at https://github.com/batmanlab/Ladder.
+
+
+
+
+
+ + ♻ ☆ Human Action Recognition (HAR) Using Skeleton-based Spatial Temporal + Relative Transformer Network: ST-RTR + + +
+ Human Action Recognition (HAR) is an active research area in
+human-computer interaction, used to monitor the activities of elderly and
+disabled individuals affected by physical and mental health conditions. In
+the recent era, skeleton-based HAR has received much attention because
+skeleton data have been shown to handle changes in striking, body size,
+camera views, and complex backgrounds. One key characteristic of ST-GCN is
+that it automatically learns spatial and temporal patterns from skeleton
+sequences; however, it captures only short-range correlations due to its
+limited receptive field, whereas understanding human action also requires
+long-range interconnections. To address this issue, we developed a
+spatial-temporal relative transformer (ST-RTR) model. The ST-RTR includes
+joint and relay nodes, which allow efficient communication and data
+transmission within the network. These nodes help to break the inherent
+spatial and temporal skeleton topologies, which enables the model to
+understand long-range human actions better. Furthermore, we combine ST-RTR
+with a fusion model for further performance improvements. To assess the
+performance of the ST-RTR method, we conducted experiments on three
+skeleton-based HAR benchmarks: NTU RGB+D 60, NTU RGB+D 120, and UAV-Human.
+It boosted CS and CV accuracy by 2.11% and 1.45% on NTU RGB+D 60, and by
+1.25% and 1.05% on NTU RGB+D 120. On the UAV-Human dataset, accuracy
+improved by 2.54%. The experimental outcomes show that the proposed ST-RTR
+model significantly improves action recognition compared with the standard
+ST-GCN method.
+
+
+
+
+
+ + ♻ ☆ DQ-DETR: DETR with Dynamic Query for Tiny Object Detection ECCV 2024 + + +
+ Despite previous DETR-like methods having performed successfully in generic +object detection, tiny object detection is still a challenging task for them +since the positional information of object queries is not customized for +detecting tiny objects, whose scale is extraordinarily smaller than general +objects. Also, DETR-like methods using a fixed number of queries make them +unsuitable for aerial datasets, which only contain tiny objects, and the +numbers of instances are imbalanced between different images. Thus, we present +a simple yet effective model, named DQ-DETR, which consists of three different +components: categorical counting module, counting-guided feature enhancement, +and dynamic query selection to solve the above-mentioned problems. DQ-DETR uses +the prediction and density maps from the categorical counting module to +dynamically adjust the number of object queries and improve the positional +information of queries. Our model DQ-DETR outperforms previous CNN-based and +DETR-like methods, achieving state-of-the-art mAP 30.2% on the AI-TOD-V2 +dataset, which mostly consists of tiny objects. Our code will be available at +https://github.com/hoiliu-0801/DQ-DETR. + +
+
+ comment: Accepted by ECCV 2024. Our code will be available at + https://github.com/hoiliu-0801/DQ-DETR +
+
+
+
+
+ + ♻ ☆ Fast Samplers for Inverse Problems in Iterative Refinement Models NeurIPS'24 + + +
+ Constructing fast samplers for unconditional diffusion and flow-matching +models has received much attention recently; however, existing methods for +solving inverse problems, such as super-resolution, inpainting, or deblurring, +still require hundreds to thousands of iterative steps to obtain high-quality +results. We propose a plug-and-play framework for constructing efficient +samplers for inverse problems, requiring only pre-trained diffusion or +flow-matching models. We present Conditional Conjugate Integrators, which +leverage the specific form of the inverse problem to project the respective +conditional diffusion/flow dynamics into a more amenable space for sampling. +Our method complements popular posterior approximation methods for solving +inverse problems using diffusion/flow models. We evaluate the proposed method's +performance on various linear image restoration tasks across multiple datasets, +employing diffusion and flow-matching models. Notably, on challenging inverse +problems like 4x super-resolution on the ImageNet dataset, our method can +generate high-quality samples in as few as 5 conditional sampling steps and +outperforms competing baselines requiring 20-1000 steps. Our code will be +publicly available at https://github.com/mandt-lab/c-pigdm + +
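+ The Conditional Conjugate Integrators themselves are specific to this
+paper, but the general pattern of steering a pre-trained diffusion sampler
+toward a linear measurement y = A(x) can be sketched with a gradient-guided
+DDIM-style step (illustrative only; `eps_model`, `A`, and `alpha_bar` are
+assumed names):
+
+```python
+import torch
+
+def guided_step(eps_model, x_t, t, t_prev, y, A, alpha_bar, guidance=1.0):
+    """One DDIM-style step plus a data-consistency gradient for y = A(x)."""
+    a_t, a_prev = alpha_bar[t], alpha_bar[t_prev]
+    x_t = x_t.detach().requires_grad_(True)
+    eps = eps_model(x_t, t)
+    x0_hat = (x_t - (1 - a_t).sqrt() * eps) / a_t.sqrt()   # predicted clean image
+    residual = ((y - A(x0_hat)) ** 2).sum()                # measurement mismatch
+    grad, = torch.autograd.grad(residual, x_t)
+    with torch.no_grad():
+        x_prev = a_prev.sqrt() * x0_hat + (1 - a_prev).sqrt() * eps
+        x_prev = x_prev - guidance * grad                  # steer toward consistency
+    return x_prev
+```
+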
+
+ comment: 43 pages, NeurIPS'24 Camera Ready +
+
+
+
+
+ + ♻ ☆ SMART: Scalable Multi-agent Real-time Motion Generation via Next-token + Prediction NeurIPS 2024 + + +
+ Data-driven autonomous driving motion generation tasks are frequently +impacted by the limitations of dataset size and the domain gap between +datasets, which precludes their extensive application in real-world scenarios. +To address this issue, we introduce SMART, a novel autonomous driving motion +generation paradigm that models vectorized map and agent trajectory data into +discrete sequence tokens. These tokens are then processed through a +decoder-only transformer architecture to train for the next token prediction +task across spatial-temporal series. This GPT-style method allows the model to +learn the motion distribution in real driving scenarios. SMART achieves +state-of-the-art performance across most of the metrics on the generative Sim +Agents challenge, ranking 1st on the leaderboards of Waymo Open Motion Dataset +(WOMD), demonstrating remarkable inference speed. Moreover, SMART represents +the generative model in the autonomous driving motion domain, exhibiting +zero-shot generalization capabilities: Using only the NuPlan dataset for +training and WOMD for validation, SMART achieved a competitive score of 0.72 on +the Sim Agents challenge. Lastly, we have collected over 1 billion motion +tokens from multiple datasets, validating the model's scalability. These +results suggest that SMART has initially emulated two important properties: +scalability and zero-shot generalization, and preliminarily meets the needs of +large-scale real-time simulation applications. We have released all the code to +promote the exploration of models for motion generation in the autonomous +driving field. The source code is available at +https://github.com/rainmaker22/SMART. + +
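+ The "GPT-style" ingredient is an ordinary decoder-only next-token model
+over the discretized map and trajectory tokens. A compact PyTorch sketch
+(class and vocabulary sizes are illustrative assumptions, not the released
+SMART code):
+
+```python
+import torch
+import torch.nn as nn
+
+class MotionGPT(nn.Module):
+    """Decoder-only transformer for next-token prediction over motion tokens."""
+    def __init__(self, vocab=1024, d_model=256, n_layers=4, n_heads=8, max_len=512):
+        super().__init__()
+        self.tok = nn.Embedding(vocab, d_model)
+        self.pos = nn.Embedding(max_len, d_model)
+        layer = nn.TransformerEncoderLayer(d_model, n_heads, 4 * d_model, batch_first=True)
+        self.blocks = nn.TransformerEncoder(layer, n_layers)
+        self.head = nn.Linear(d_model, vocab)
+
+    def forward(self, tokens):                    # tokens: (B, T) integer ids
+        T = tokens.size(1)
+        causal = nn.Transformer.generate_square_subsequent_mask(T).to(tokens.device)
+        h = self.tok(tokens) + self.pos(torch.arange(T, device=tokens.device))
+        return self.head(self.blocks(h, mask=causal))
+
+def next_token_loss(model, tokens):
+    logits = model(tokens[:, :-1])                # predict token t+1 from the prefix
+    return nn.functional.cross_entropy(
+        logits.reshape(-1, logits.size(-1)), tokens[:, 1:].reshape(-1))
+```
+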
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ProvNeRF: Modeling per Point Provenance in NeRFs as a Stochastic Field NeurIPS + 2024 + + +
+ Neural radiance fields (NeRFs) have gained popularity with multiple works
+showing promising results across various applications. However, to the best
+of our knowledge, existing works do not explicitly model the distribution of
+training camera poses, or consequently the triangulation quality, a key
+factor affecting reconstruction quality dating back to classical vision
+literature. We close this gap with ProvNeRF, an approach that models the
+provenance for each point -- i.e., the locations where it is likely visible
+-- of NeRFs as a stochastic field. We achieve this by extending implicit
+maximum likelihood estimation (IMLE) to functional space with an optimizable
+objective. We show that modeling per-point provenance during the NeRF
+optimization enriches the model with information on triangulation, leading
+to improvements in novel view synthesis and uncertainty estimation under the
+challenging sparse, unconstrained view setting against competitive
+baselines.
+
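+ The core of (finite-dimensional) IMLE is simple: for each training sample,
+draw several latent codes, keep the generated output closest to the sample,
+and shrink that distance. A generic sketch, not ProvNeRF's functional-space
+extension (`generator` and tensor shapes are assumptions):
+
+```python
+import torch
+
+def imle_loss(generator, data_batch, latent_dim=64, n_draws=8):
+    """Pull the nearest of several generated samples toward each data point."""
+    B = data_batch.size(0)
+    z = torch.randn(B * n_draws, latent_dim, device=data_batch.device)
+    gen = generator(z).reshape(B, n_draws, -1)
+    dists = ((gen - data_batch.reshape(B, 1, -1)) ** 2).sum(dim=-1)  # (B, n_draws)
+    return dists.min(dim=1).values.mean()        # only the closest draw gets gradients
+```
+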
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ♻ ☆ Adaptive Visual Scene Understanding: Incremental Scene Graph Generation + + +
+ Scene graph generation (SGG) analyzes images to extract meaningful +information about objects and their relationships. In the dynamic visual world, +it is crucial for AI systems to continuously detect new objects and establish +their relationships with existing ones. Recently, numerous studies have focused +on continual learning within the domains of object detection and image +recognition. However, a limited amount of research focuses on a more +challenging continual learning problem in SGG. This increased difficulty arises +from the intricate interactions and dynamic relationships among objects, and +their associated contexts. Thus, in continual learning, SGG models are often +required to expand, modify, retain, and reason scene graphs within the process +of adaptive visual scene understanding. To systematically explore Continual +Scene Graph Generation (CSEGG), we present a comprehensive benchmark comprising +three learning regimes: relationship incremental, scene incremental, and +relationship generalization. Moreover, we introduce a ``Replays via Analysis by +Synthesis" method named RAS. This approach leverages the scene graphs, +decomposes and re-composes them to represent different scenes, and replays the +synthesized scenes based on these compositional scene graphs. The replayed +synthesized scenes act as a means to practice and refine proficiency in SGG in +known and unknown environments. Our experimental results not only highlight the +challenges of directly combining existing continual learning methods with SGG +backbones but also demonstrate the effectiveness of our proposed approach, +enhancing CSEGG efficiency while simultaneously preserving privacy and memory +usage. All data and source code are publicly available online. + +
+
+
+
+
+ + ♻ ☆ STONE: A Submodular Optimization Framework for Active 3D Object + Detection + + +
+ 3D object detection is fundamentally important for various emerging +applications, including autonomous driving and robotics. A key requirement for +training an accurate 3D object detector is the availability of a large amount +of LiDAR-based point cloud data. Unfortunately, labeling point cloud data is +extremely challenging, as accurate 3D bounding boxes and semantic labels are +required for each potential object. This paper proposes a unified active 3D +object detection framework, for greatly reducing the labeling cost of training +3D object detectors. Our framework is based on a novel formulation of +submodular optimization, specifically tailored to the problem of active 3D +object detection. In particular, we address two fundamental challenges +associated with active 3D object detection: data imbalance and the need to +cover the distribution of the data, including LiDAR-based point cloud data of +varying difficulty levels. Extensive experiments demonstrate that our method +achieves state-of-the-art performance with high computational efficiency +compared to existing active learning methods. The code is available at +https://github.com/RuiyuM/STONE. + +
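+ The standard machinery behind such formulations is greedy maximization of
+a monotone submodular objective. A generic facility-location sketch (not the
+paper's exact objective; `similarity` is an assumed precomputed n x n
+matrix):
+
+```python
+import numpy as np
+
+def greedy_facility_location(similarity, budget):
+    """Greedily maximize f(S) = sum_i max_{j in S} similarity[i, j]."""
+    n = similarity.shape[0]
+    selected, cover = [], np.zeros(n)
+    for _ in range(budget):
+        # marginal gain of adding candidate j, computed for every j at once
+        gains = np.maximum(similarity, cover[:, None]).sum(axis=0) - cover.sum()
+        gains[selected] = -np.inf                 # never re-pick a chosen sample
+        j = int(np.argmax(gains))
+        selected.append(j)
+        cover = np.maximum(cover, similarity[:, j])
+    return selected
+```
+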
+
+
+
+
+ + ♻ ☆ Make Continual Learning Stronger via C-Flat + + +
+ Model generalization ability while incrementally acquiring dynamically
+updating knowledge from sequentially arriving tasks is crucial to tackling
+the sensitivity-stability dilemma in Continual Learning (CL). Sharpness
+minimization of the weight loss landscape, which seeks flat minima lying in
+neighborhoods with uniformly low loss or smooth gradients, has proven to be
+a strong training regime that improves model generalization compared with
+loss-minimization-based optimizers such as SGD. Yet only a few works have
+explored this training regime for CL, showing that specially designed
+zeroth-order sharpness optimizers can improve CL performance. In this work,
+we propose a Continual Flatness (C-Flat) method featuring a flatter loss
+landscape tailored for CL. C-Flat can be invoked with only one line of code
+and is plug-and-play with any CL method. This paper presents a general
+framework applying C-Flat to all CL categories, together with a thorough
+comparison against loss-minima optimizers and flat-minima-based CL
+approaches, showing that our method can boost CL performance in almost all
+cases. Code is available at https://github.com/WanNaa/C-Flat.
+
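+ C-Flat itself is the paper's contribution, but the flavor of flat-minima
+training it builds on can be seen in a generic first-order sharpness-aware
+(SAM-style) update, sketched below (assumed helper names, not the released
+C-Flat code):
+
+```python
+import torch
+
+def sharpness_aware_step(model, loss_fn, batch, base_optimizer, rho=0.05):
+    """Ascend to a worst-case nearby weight point, then descend with its gradient."""
+    loss_fn(model, batch).backward()
+    grads = [p.grad for p in model.parameters() if p.grad is not None]
+    scale = rho / (torch.norm(torch.stack([g.norm() for g in grads])) + 1e-12)
+    eps = []
+    with torch.no_grad():
+        for p in model.parameters():
+            e = None if p.grad is None else p.grad * scale
+            if e is not None:
+                p.add_(e)                          # climb toward the sharp direction
+            eps.append(e)
+    model.zero_grad()
+    loss_fn(model, batch).backward()               # gradient at the perturbed weights
+    with torch.no_grad():
+        for p, e in zip(model.parameters(), eps):
+            if e is not None:
+                p.sub_(e)                          # restore the original weights
+    base_optimizer.step()
+    model.zero_grad()
+```
+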
+
+
+
+
+ + ♻ ☆ HSIGene: A Foundation Model For Hyperspectral Image Generation + + +
+ Hyperspectral image (HSI) plays a vital role in various fields such as +agriculture and environmental monitoring. However, due to the expensive +acquisition cost, the number of hyperspectral images is limited, degenerating +the performance of downstream tasks. Although some recent studies have +attempted to employ diffusion models to synthesize HSIs, they still struggle +with the scarcity of HSIs, affecting the reliability and diversity of the +generated images. Some studies propose to incorporate multi-modal data to +enhance spatial diversity, but the spectral fidelity cannot be ensured. In +addition, existing HSI synthesis models are typically uncontrollable or only +support single-condition control, limiting their ability to generate accurate +and reliable HSIs. To alleviate these issues, we propose HSIGene, a novel HSI +generation foundation model which is based on latent diffusion and supports +multi-condition control, allowing for more precise and reliable HSI generation. +To enhance the spatial diversity of the training data while preserving spectral +fidelity, we propose a new data augmentation method based on spatial +super-resolution, in which HSIs are upscaled first, and thus abundant training +patches could be obtained by cropping the high-resolution HSIs. In addition, to +improve the perceptual quality of the augmented data, we introduce a novel +two-stage HSI super-resolution framework, which first applies RGB bands +super-resolution and then utilizes our proposed Rectangular Guided Attention +Network (RGAN) for guided HSI super-resolution. Experiments demonstrate that +the proposed model is capable of generating a vast quantity of realistic HSIs +for downstream tasks such as denoising and super-resolution. The code and +models are available at https://github.com/LiPang/HSIGene. + +
+
+
+
+
+ + ♻ ☆ GrounDiT: Grounding Diffusion Transformers via Noisy Patch + Transplantation NeurIPS 2024 + + +
+ We introduce GrounDiT, a novel training-free spatial grounding technique for +text-to-image generation using Diffusion Transformers (DiT). Spatial grounding +with bounding boxes has gained attention for its simplicity and versatility, +allowing for enhanced user control in image generation. However, prior +training-free approaches often rely on updating the noisy image during the +reverse diffusion process via backpropagation from custom loss functions, which +frequently struggle to provide precise control over individual bounding boxes. +In this work, we leverage the flexibility of the Transformer architecture, +demonstrating that DiT can generate noisy patches corresponding to each +bounding box, fully encoding the target object and allowing for fine-grained +control over each region. Our approach builds on an intriguing property of DiT, +which we refer to as semantic sharing. Due to semantic sharing, when a smaller +patch is jointly denoised alongside a generatable-size image, the two become +semantic clones. Each patch is denoised in its own branch of the generation +process and then transplanted into the corresponding region of the original +noisy image at each timestep, resulting in robust spatial grounding for each +bounding box. In our experiments on the HRS and DrawBench benchmarks, we +achieve state-of-the-art performance compared to previous training-free +approaches. + +
+
+ comment: Accepted to NeurIPS 2024. Project Page: + https://groundit-diffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ In-Context LoRA for Diffusion Transformers + + +
+ Recent research arXiv:2410.15027 has explored the use of diffusion +transformers (DiTs) for task-agnostic image generation by simply concatenating +attention tokens across images. However, despite substantial computational +resources, the fidelity of the generated images remains suboptimal. In this +study, we reevaluate and streamline this framework by hypothesizing that +text-to-image DiTs inherently possess in-context generation capabilities, +requiring only minimal tuning to activate them. Through diverse task +experiments, we qualitatively demonstrate that existing text-to-image DiTs can +effectively perform in-context generation without any tuning. Building on this +insight, we propose a remarkably simple pipeline to leverage the in-context +abilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint +captioning of multiple images, and (3) apply task-specific LoRA tuning using +small datasets (e.g., $20\sim 100$ samples) instead of full-parameter tuning +with large datasets. We name our models In-Context LoRA (IC-LoRA). This +approach requires no modifications to the original DiT models, only changes to +the training data. Remarkably, our pipeline generates high-fidelity image sets +that better adhere to prompts. While task-specific in terms of tuning data, our +framework remains task-agnostic in architecture and pipeline, offering a +powerful tool for the community and providing valuable insights for further +research on product-level task-agnostic generation systems. We release our +code, data, and models at https://github.com/ali-vilab/In-Context-LoRA + +
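+ The "LoRA tuning" ingredient adds a small trainable low-rank update next to
+each frozen projection. A minimal, generic adapter sketch (not the IC-LoRA
+release):
+
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen base layer plus trainable low-rank update: W x + (alpha/r) B A x."""
+    def __init__(self, base: nn.Linear, r=16, alpha=16):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False               # only the adapter is trained
+        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init => no-op at start
+        self.scale = alpha / r
+
+    def forward(self, x):
+        return self.base(x) + self.scale * ((x @ self.A.T) @ self.B.T)
+```
+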
+
+ comment: Tech report. Project page: + https://ali-vilab.github.io/In-Context-LoRA-Page/ +
+
+
+
+
+ + ♻ ☆ Uni-Med: A Unified Medical Generalist Foundation Model For Multi-Task + Learning Via Connector-MoE + + +
+ Multi-modal large language models (MLLMs) have shown impressive capabilities +as a general-purpose interface for various visual and linguistic tasks. +However, building a unified MLLM for multi-task learning in the medical field +remains a thorny challenge. To mitigate the tug-of-war problem of multi-modal +multi-task optimization in MLLMs, recent advances primarily focus on improving +the LLM components, while neglecting the connector that bridges the gap between +modalities. In this paper, we introduce Uni-Med, a novel medical generalist +foundation model which consists of a universal visual feature extraction +module, a connector mixture-of-experts (CMoE) module, and an LLM. Benefiting +from the proposed CMoE that leverages a well-designed router with a mixture of +projection experts at the connector, Uni-Med achieves efficient solution to the +tug-of-war problem and can perform six different medical tasks including +question answering, visual question answering, report generation, referring +expression comprehension, referring expression generation and image +classification. To the best of our knowledge, Uni-Med is the first effort to +tackle multi-task interference at the connector in MLLMs. Extensive ablation +experiments validate the effectiveness of introducing CMoE under any +configuration, with up to an average 8% performance gains. We further provide +interpretation analysis of the tug-of-war problem from the perspective of +gradient optimization and parameter statistics. Compared to previous +state-of-the-art medical MLLMs, Uni-Med achieves competitive or superior +evaluation metrics on diverse tasks. Code and resources are available at +https://github.com/tsinghua-msiip/Uni-Med. + +
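+ The connector idea can be pictured as a soft router over several projection
+experts mapping visual tokens into the LLM embedding space. A toy sketch
+(dimensions and names are assumptions, not the released Uni-Med CMoE):
+
+```python
+import torch
+import torch.nn as nn
+
+class ConnectorMoE(nn.Module):
+    """Soft mixture of projection experts bridging vision features to LLM tokens."""
+    def __init__(self, vis_dim=1024, llm_dim=4096, n_experts=4):
+        super().__init__()
+        self.router = nn.Linear(vis_dim, n_experts)
+        self.experts = nn.ModuleList(nn.Linear(vis_dim, llm_dim) for _ in range(n_experts))
+
+    def forward(self, vis_tokens):                         # (B, N, vis_dim)
+        w = self.router(vis_tokens).softmax(dim=-1)        # (B, N, E) routing weights
+        outs = torch.stack([e(vis_tokens) for e in self.experts], dim=-1)  # (B, N, llm_dim, E)
+        return (outs * w.unsqueeze(2)).sum(dim=-1)         # (B, N, llm_dim)
+```
+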
+
+
+
+
+ + ♻ ☆ DPEC: Dual-Path Error Compensation Method for Enhanced Low-Light Image + Clarity + + +
+ For the task of low-light image enhancement, deep learning-based algorithms +have demonstrated superiority and effectiveness compared to traditional +methods. However, these methods, primarily based on Retinex theory, tend to +overlook the noise and color distortions in input images, leading to +significant noise amplification and local color distortions in enhanced +results. To address these issues, we propose the Dual-Path Error Compensation +(DPEC) method, designed to improve image quality under low-light conditions by +preserving local texture details while restoring global image brightness +without amplifying noise. DPEC incorporates precise pixel-level error +estimation to capture subtle differences and an independent denoising mechanism +to prevent noise amplification. We introduce the HIS-Retinex loss to guide +DPEC's training, ensuring the brightness distribution of enhanced images +closely aligns with real-world conditions. To balance computational speed and +resource efficiency while training DPEC for a comprehensive understanding of +the global context, we integrated the VMamba architecture into its backbone. +Comprehensive quantitative and qualitative experimental results demonstrate +that our algorithm significantly outperforms state-of-the-art methods in +low-light image enhancement. The code is publicly available online at +https://github.com/wangshuang233/DPEC. + +
+
+
+
+
+ + ♻ ☆ Foodfusion: A Novel Approach for Food Image Composition via Diffusion + Models + + +
+ Food image composition requires the use of existing dish images and +background images to synthesize a natural new image, while diffusion models +have made significant advancements in image generation, enabling the +construction of end-to-end architectures that yield promising results. However, +existing diffusion models face challenges in processing and fusing information +from multiple images and lack access to high-quality publicly available +datasets, which prevents the application of diffusion models in food image +composition. In this paper, we introduce a large-scale, high-quality food image +composite dataset, FC22k, which comprises 22,000 foreground, background, and +ground truth ternary image pairs. Additionally, we propose a novel food image +composition method, Foodfusion, which leverages the capabilities of the +pre-trained diffusion models and incorporates a Fusion Module for processing +and integrating foreground and background information. This fused information +aligns the foreground features with the background structure by merging the +global structural information at the cross-attention layer of the denoising +UNet. To further enhance the content and structure of the background, we also +integrate a Content-Structure Control Module. Extensive experiments demonstrate +the effectiveness and scalability of our proposed method. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Detecting Brittle Decisions for Free: Leveraging Margin Consistency in + Deep Robust Classifiers + + +
+ Despite extensive research on adversarial training strategies to improve
+robustness, the decisions of even the most robust deep learning models can
+still be quite sensitive to imperceptible perturbations, creating serious
+risks when deploying them for high-stakes real-world applications. While
+detecting such cases may be critical, evaluating a model's vulnerability at
+a per-instance level using adversarial attacks is computationally too
+intensive and unsuitable for real-time deployment scenarios. Although the
+input space margin is the exact score for detecting non-robust samples, it
+is intractable to compute for deep neural networks. This paper introduces
+the concept of margin consistency -- a property that links the input space
+margins and the logit margins in robust models -- for efficient detection of
+vulnerable samples. First, we establish that margin consistency is a
+necessary and sufficient condition to use a model's logit margin as a score
+for identifying non-robust samples. Next, through comprehensive empirical
+analysis of various robustly trained models on the CIFAR10 and CIFAR100
+datasets, we show that they exhibit high margin consistency, with a strong
+correlation between their input space margins and logit margins. Then, we
+show that we can effectively and confidently use the logit margin to detect
+brittle decisions with such models. Finally, we address cases where the
+model is not sufficiently margin-consistent by learning a pseudo-margin from
+the feature representation. Our findings highlight the potential of
+leveraging deep representations to efficiently assess adversarial
+vulnerability in deployment scenarios.
+
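+ The detection score itself is inexpensive: under margin consistency, the
+logit margin (top-1 minus top-2 logit) ranks how brittle each decision is. A
+minimal sketch (generic, not the authors' evaluation code):
+
+```python
+import torch
+
+@torch.no_grad()
+def logit_margin(model, x):
+    """Top-1 minus top-2 logit; small values flag likely non-robust inputs."""
+    top2 = model(x).topk(2, dim=-1).values
+    return top2[:, 0] - top2[:, 1]
+
+# Example: flag the 5% of a batch with the smallest margins for closer inspection.
+# scores = logit_margin(classifier, images)
+# brittle = scores <= torch.quantile(scores, 0.05)
+```
+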
+
+ comment: 10 pages, 6 figures, 2 tables. Version Update: NeurIPS Camera
+  Ready
+
+
+
+
+ + ♻ ☆ A Framework for Real-Time Volcano-Seismic Event Recognition Based on + Multi-Station Seismograms and Semantic Segmentation Models + + +
+ In volcano monitoring, effective recognition of seismic events is essential
+for understanding volcanic activity and raising timely warning alerts.
+Traditional methods rely on manual analysis, which can be subjective and
+labor-intensive. Furthermore, current automatic approaches often tackle
+detection and classification separately, mostly rely on single-station
+information, and generally require tailored preprocessing and
+representations to perform predictions. These limitations often hinder their
+application to real-time monitoring and utilization across different volcano
+conditions. This study introduces a novel approach that utilizes Semantic
+Segmentation models to automate seismic event recognition by applying a
+straightforward transformation of multi-channel 1D signals into 2D
+representations, enabling their use as images. Our framework employs a
+data-driven, end-to-end design that integrates multi-station seismic data
+with minimal preprocessing, performing both detection and classification
+simultaneously for five seismic event classes. We evaluated four
+state-of-the-art segmentation models (UNet, UNet++, DeepLabV3+ and SwinUNet)
+on approximately 25,000 seismic events recorded at four different Chilean
+volcanoes: Nevados del Chillán Volcanic Complex, Laguna del Maule,
+Villarrica and Puyehue-Cordón Caulle. Among these models, the UNet
+architecture was identified as the most effective, achieving mean F1 and
+Intersection over Union (IoU) scores of up to 0.91 and 0.88, respectively,
+and demonstrating superior noise robustness and model flexibility to unseen
+volcano datasets.
+
+
+ comment: 10 pages, 9 figures. This is a pre-print, it is currently under + review for publication +
+
+
+
+
+ + ♻ ☆ From Question to Exploration: Test-Time Adaptation in Semantic + Segmentation? + + +
+ Test-time adaptation (TTA) aims to adapt a model, initially trained on
+training data, to test data with potential distribution shifts. Most
+existing TTA methods focus on classification problems. The pronounced
+success of classification might lead numerous newcomers and engineers to
+assume that classic TTA techniques can be directly applied to the more
+challenging task of semantic segmentation. However, this belief is still an
+open question. In this paper, we investigate the applicability of existing
+classic TTA strategies in semantic segmentation. Our comprehensive results
+have led to three key observations. First, the classic normalization
+updating strategy only brings slight performance improvement, and in some
+cases, it might even adversely affect the results. Even with the application
+of advanced distribution estimation techniques like batch renormalization,
+the problem remains unresolved. Second, although the teacher-student scheme
+does enhance the training stability for segmentation TTA in the presence of
+noisy pseudo-labels and temporal correlation, it cannot directly result in
+performance improvement compared to the original model without TTA under
+complex data distribution. Third, segmentation TTA suffers a severe
+long-tailed class-imbalance problem, which is substantially more complex
+than that in TTA for classification. This long-tailed challenge negatively
+affects segmentation TTA performance, even when the accuracy of
+pseudo-labels is high. Besides those observations, we find that visual
+prompt tuning (VisPT) is promising in segmentation TTA and propose a novel
+method named TTAP. The outstanding performance of TTAP has also been
+verified. We hope the community can give more attention to this challenging,
+yet important, segmentation TTA task in the future. The source code is
+available at: https://github.com/ycarobot/TTAP
+
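+ For reference, the "classic normalization updating strategy" examined in
+the first observation amounts to re-estimating BatchNorm statistics on
+unlabeled test batches with all weights frozen; a minimal sketch of that
+baseline (not the TTAP method):
+
+```python
+import torch
+import torch.nn as nn
+
+def adapt_bn_statistics(model, test_loader, device="cuda"):
+    """Refresh BatchNorm running statistics on test data; weights stay frozen."""
+    model.eval()
+    for m in model.modules():
+        if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d)):
+            m.reset_running_stats()
+            m.train()                              # only BN layers collect statistics
+    with torch.no_grad():
+        for images, *_ in test_loader:
+            model(images.to(device))
+    model.eval()
+    return model
+```
+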
+
+
+
+
+ + ♻ ☆ LRM-Zero: Training Large Reconstruction Models with Synthesized Data NeurIPS 2024 + + +
+ We present LRM-Zero, a Large Reconstruction Model (LRM) trained entirely on +synthesized 3D data, achieving high-quality sparse-view 3D reconstruction. The +core of LRM-Zero is our procedural 3D dataset, Zeroverse, which is +automatically synthesized from simple primitive shapes with random texturing +and augmentations (e.g., height fields, boolean differences, and wireframes). +Unlike previous 3D datasets (e.g., Objaverse) which are often captured or +crafted by humans to approximate real 3D data, Zeroverse completely ignores +realistic global semantics but is rich in complex geometric and texture details +that are locally similar to or even more intricate than real objects. We +demonstrate that our LRM-Zero, trained with our fully synthesized Zeroverse, +can achieve high visual quality in the reconstruction of real-world objects, +competitive with models trained on Objaverse. We also analyze several critical +design choices of Zeroverse that contribute to LRM-Zero's capability and +training stability. Our work demonstrates that 3D reconstruction, one of the +core tasks in 3D vision, can potentially be addressed without the semantics of +real-world objects. The Zeroverse's procedural synthesis code and interactive +visualization are available at: https://desaixie.github.io/lrm-zero/. + +
+
+ comment: 23 pages, 8 figures. Our code and interactive visualization are + available at: https://desaixie.github.io/lrm-zero/. v2: NeurIPS 2024 Camera + Ready version +
+
+
+
+
+ + ♻ ☆ DiffusionPDE: Generative PDE-Solving Under Partial Observation NeurIPS 2024 + + +
+ We introduce a general framework for solving partial differential equations +(PDEs) using generative diffusion models. In particular, we focus on the +scenarios where we do not have the full knowledge of the scene necessary to +apply classical solvers. Most existing forward or inverse PDE approaches +perform poorly when the observations on the data or the underlying coefficients +are incomplete, which is a common assumption for real-world measurements. In +this work, we propose DiffusionPDE that can simultaneously fill in the missing +information and solve a PDE by modeling the joint distribution of the solution +and coefficient spaces. We show that the learned generative priors lead to a +versatile framework for accurately solving a wide range of PDEs under partial +observation, significantly outperforming the state-of-the-art methods for both +forward and inverse directions. + +
+
+ comment: NeurIPS 2024. Project page: + https://jhhuangchloe.github.io/Diffusion-PDE/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 64 + +
+
+
+ + ☆ Tensegrity Robot Proprioceptive State Estimation with Geometric + Constraints + + +
+ Tensegrity robots, characterized by a synergistic assembly of rigid rods and +elastic cables, form robust structures that are resistant to impacts. However, +this design introduces complexities in kinematics and dynamics, complicating +control and state estimation. This work presents a novel proprioceptive state +estimator for tensegrity robots. The estimator initially uses the geometric +constraints of 3-bar prism tensegrity structures, combined with IMU and motor +encoder measurements, to reconstruct the robot's shape and orientation. It then +employs a contact-aided invariant extended Kalman filter with forward +kinematics to estimate the global position and orientation of the tensegrity +robot. The state estimator's accuracy is assessed against ground truth data in +both simulated environments and real-world tensegrity robot applications. It +achieves an average drift percentage of 4.2%, comparable to the state +estimation performance of traditional rigid robots. This state estimator +advances the state of the art in tensegrity robot state estimation and has the +potential to run in real-time using onboard sensors, paving the way for full +autonomy of tensegrity robots in unstructured environments. + +
+
+ comment: Preprint; 8 pages, 11 figures, 2 tables; Code at + https://github.com/Jonathan-Twz/tensegrity-robot-state-estimator +
+
+
+
+
+ + ☆ EgoMimic: Scaling Imitation Learning via Egocentric Video + + +
+ The scale and diversity of demonstration data required for imitation learning +is a significant challenge. We present EgoMimic, a full-stack framework which +scales manipulation via human embodiment data, specifically egocentric human +videos paired with 3D hand tracking. EgoMimic achieves this through: (1) a +system to capture human embodiment data using the ergonomic Project Aria +glasses, (2) a low-cost bimanual manipulator that minimizes the kinematic gap +to human data, (3) cross-domain data alignment techniques, and (4) an imitation +learning architecture that co-trains on human and robot data. Compared to prior +works that only extract high-level intent from human videos, our approach +treats human and robot data equally as embodied demonstration data and learns a +unified policy from both data sources. EgoMimic achieves significant +improvement on a diverse set of long-horizon, single-arm and bimanual +manipulation tasks over state-of-the-art imitation learning methods and enables +generalization to entirely new scenes. Finally, we show a favorable scaling +trend for EgoMimic, where adding 1 hour of additional hand data is +significantly more valuable than 1 hour of additional robot data. Videos and +additional information can be found at https://egomimic.github.io/ + +
+
+
+
+
+ + ☆ Teaching Embodied Reinforcement Learning Agents: Informativeness and + Diversity of Language Use + + +
+ In real-world scenarios, it is desirable for embodied agents to have the +ability to leverage human language to gain explicit or implicit knowledge for +learning tasks. Despite recent progress, most previous approaches adopt simple +low-level instructions as language inputs, which may not reflect natural human +communication. It's not clear how to incorporate rich language use to +facilitate task learning. To address this question, this paper studies +different types of language inputs in facilitating reinforcement learning (RL) +embodied agents. More specifically, we examine how different levels of language +informativeness (i.e., feedback on past behaviors and future guidance) and +diversity (i.e., variation of language expressions) impact agent learning and +inference. Our empirical results based on four RL benchmarks demonstrate that +agents trained with diverse and informative language feedback can achieve +enhanced generalization and fast adaptation to new tasks. These findings +highlight the pivotal role of language use in teaching embodied agents new +tasks in an open world. Project website: +https://github.com/sled-group/Teachable_RL + +
+
+ comment: EMNLP 2024 Main. Project website: + https://github.com/sled-group/Teachable_RL +
+
+
+
+
+ + ☆ Zonal RL-RRT: Integrated RL-RRT Path Planning with Collision Probability + and Zone Connectivity + + +
+ Path planning in high-dimensional spaces poses significant challenges, +particularly in achieving both time efficiency and a fair success rate. To +address these issues, we introduce a novel path-planning algorithm, Zonal +RL-RRT, that leverages kd-tree partitioning to segment the map into zones while +addressing zone connectivity, ensuring seamless transitions between zones. By +breaking down the complex environment into multiple zones and using Q-learning +as the high-level decision-maker, our algorithm achieves a 3x improvement in +time efficiency compared to basic sampling methods such as RRT and RRT* in +forest-like maps. Our approach outperforms heuristic-guided methods like BIT* +and Informed RRT* by 1.5x in terms of runtime while maintaining robust and +reliable success rates across 2D to 6D environments. Compared to learning-based +methods like NeuralRRT* and MPNetSMP, as well as the heuristic RRT*J, our +algorithm demonstrates, on average, 1.5x better performance in the same +environments. We also evaluate the effectiveness of our approach through +simulations of the UR10e arm manipulator in the MuJoCo environment. A key +observation of our approach lies in its use of zone partitioning and +Reinforcement Learning (RL) for adaptive high-level planning allowing the +algorithm to accommodate flexible policies across diverse environments, making +it a versatile tool for advanced path planning. + +
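+ Two of the ingredients above -- partitioning sampled free space into zones
+and learning high-level zone transitions with Q-learning -- can be sketched
+in a toy form as follows (illustrative only; the paper's exact kd-tree
+partitioning and reward design are not reproduced):
+
+```python
+import numpy as np
+from scipy.spatial import cKDTree
+
+def assign_zones(free_points, n_zones=16, seed=0):
+    """Partition sampled free-space points by nearest zone center (kd-tree queries)."""
+    rng = np.random.default_rng(seed)
+    centers = free_points[rng.choice(len(free_points), n_zones, replace=False)]
+    _, zone_id = cKDTree(centers).query(free_points)
+    return centers, zone_id
+
+def q_update(Q, zone, next_zone, reward, lr=0.1, gamma=0.95):
+    """Tabular Q-learning step where both state and action are zone indices."""
+    Q[zone, next_zone] += lr * (reward + gamma * Q[next_zone].max() - Q[zone, next_zone])
+    return Q
+```
+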
+
+
+
+
+ + ☆ DiffPano: Scalable and Consistent Text to Panorama Generation with + Spherical Epipolar-Aware Diffusion NeurIPS2024 + + +
+ Diffusion-based methods have achieved remarkable results in 2D image and 3D
+object generation; however, the generation of 3D scenes and even 360°
+images remains constrained, due to the limited number of scene datasets, the
+complexity of 3D scenes themselves, and the difficulty of generating
+consistent multi-view images. To address these issues, we first establish a
+large-scale panoramic video-text dataset containing millions of consecutive
+panoramic keyframes with corresponding panoramic depths, camera poses, and
+text descriptions. Then, we propose a novel text-driven panoramic generation
+framework, termed DiffPano, to achieve scalable, consistent, and diverse
+panoramic scene generation. Specifically, benefiting from the powerful
+generative capabilities of stable diffusion, we fine-tune a single-view
+text-to-panorama diffusion model with LoRA on the established panoramic
+video-text dataset. We further design a spherical epipolar-aware multi-view
+diffusion model to ensure the multi-view consistency of the generated
+panoramic images. Extensive experiments demonstrate that DiffPano can
+generate scalable, consistent, and diverse panoramic images given unseen
+text descriptions and camera poses.
+
+
+ comment: NeurIPS2024, Project: https://github.com/zju3dv/DiffPano; Code: + https://github.com/zju3dv/DiffPano +
+
+
+
+
+ + ☆ A Sagittal Planar Ankle-Foot Prosthesis with Powered Plantarflexion and + Socket Alignment + + +
+ Powered ankle-foot prostheses can often reduce the energy cost of walking by +assisting with push-off. However, focus on providing mechanical work may lead +to ignoring or exacerbating common issues with chronic pain, irritation, +pressure ulcer development, and eventual osteoarthritis in persons with +amputation. This paper presents the design and validation of a novel +transtibial prosthesis informed by predictive biomechanical simulations of gait +which minimize a combination of user effort and interaction loading from the +prosthesis socket. From these findings, the device was designed with a +non-biomimetic anterior-posterior translation degree of freedom with a 10 cm +range of motion which is primarily position-controlled to change the alignment +of the prosthetic foot with the residual limb. The system is both mobile and +tethered, with the batteries, actuators, and majority of electronics located in +a small backpack. Mechanical loads are transmitted through cables to the +prosthesis, minimizing the distal mass carriage required. We measured torque +and force sensing accuracy, open loop actuator performance, closed loop torque +and position control bandwidth, and torque and position tracking error during +walking. The system is capable of producing up to 160 N-m of plantarflexion +torque and 394 N of AP translation force with a closed loop control bandwidth +of about 7 Hz in both degrees of freedom. Torque tracking during walking was +accurate within about 10 N-m but position tracking was substantially affected +by phase lag, possibly due to cable slack in the bidirectional mechanism. The +prototype was capable of replicating our simulated prosthesis dynamics during +gait and offers useful insights into the advantages and the practical +considerations of using predictive biomechanical simulation as a design tool +for wearable robots. + +
+
+ comment: 9 pages, 8 figures, 1 table +
+
+
+
+
+ + ☆ DexMimicGen: Automated Data Generation for Bimanual Dexterous + Manipulation via Imitation Learning + + +
+ Imitation learning from human demonstrations is an effective means to teach
+robots manipulation skills. However, data acquisition is a major bottleneck
+in applying this paradigm more broadly, due to the cost and human effort
+involved. There has been significant interest in imitation learning for
+bimanual dexterous robots, like humanoids. Unfortunately, data collection is
+even more challenging here due to the difficulty of simultaneously
+controlling multiple arms and multi-fingered hands. Automated data
+generation in simulation is a compelling, scalable alternative to fuel this
+need for data. To this end, we introduce DexMimicGen, a large-scale
+automated data generation system that synthesizes trajectories from a
+handful of human demonstrations for humanoid robots with dexterous hands. We
+present a collection of simulation environments in the setting of bimanual
+dexterous manipulation, spanning a range of manipulation behaviors and
+different requirements for coordination between the two arms. We generate
+21K demos across these tasks from just 60 source human demos and study the
+effect of several data generation and policy learning decisions on agent
+performance. Finally, we present a real-to-sim-to-real pipeline and deploy
+it on a real-world humanoid can sorting task. Videos and more are at
+https://dexmimicgen.github.io/
+
+
+ comment: Project website: https://dexmimicgen.github.io/ +
+
+
+
+
+ + ☆ $π_0$: A Vision-Language-Action Flow Model for General Robot Control + + +
+ Robot learning holds tremendous promise to unlock the full potential of +flexible, general, and dexterous robot systems, as well as to address some of +the deepest questions in artificial intelligence. However, bringing robot +learning to the level of generality required for effective real-world systems +faces major obstacles in terms of data, generalization, and robustness. In this +paper, we discuss how generalist robot policies (i.e., robot foundation models) +can address these challenges, and how we can design effective generalist robot +policies for complex and highly dexterous tasks. We propose a novel flow +matching architecture built on top of a pre-trained vision-language model (VLM) +to inherit Internet-scale semantic knowledge. We then discuss how this model +can be trained on a large and diverse dataset from multiple dexterous robot +platforms, including single-arm robots, dual-arm robots, and mobile +manipulators. We evaluate our model in terms of its ability to perform tasks in +zero shot after pre-training, follow language instructions from people and from +a high-level VLM policy, and its ability to acquire new skills via fine-tuning. +Our results cover a wide variety of tasks, such as laundry folding, table +cleaning, and assembling boxes. + +
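+ A minimal picture of the flow-matching objective such a model optimizes:
+regress the constant velocity of a straight noise-to-action path at a random
+interpolation time (generic sketch with assumed names, not the pi_0 code):
+
+```python
+import torch
+
+def flow_matching_loss(velocity_net, actions, obs_embedding):
+    """Conditional flow matching on action chunks."""
+    noise = torch.randn_like(actions)
+    t = torch.rand(actions.size(0), 1, device=actions.device)  # one time per sample
+    x_t = (1.0 - t) * noise + t * actions                      # linear interpolation path
+    target_v = actions - noise                                 # its constant velocity
+    pred_v = velocity_net(x_t, t, obs_embedding)
+    return ((pred_v - target_v) ** 2).mean()
+```
+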
+
+ comment: See project website for videos: + https://physicalintelligence.company/blog/pi0 +
+
+
+
+
+ + ☆ Language-Driven Policy Distillation for Cooperative Driving in + Multi-Agent Reinforcement Learning + + +
+ The cooperative driving technology of Connected and Autonomous Vehicles +(CAVs) is crucial for improving the efficiency and safety of transportation +systems. Learning-based methods, such as Multi-Agent Reinforcement Learning +(MARL), have demonstrated strong capabilities in cooperative decision-making +tasks. However, existing MARL approaches still face challenges in terms of +learning efficiency and performance. In recent years, Large Language Models +(LLMs) have rapidly advanced and shown remarkable abilities in various +sequential decision-making tasks. To enhance the learning capabilities of +cooperative agents while ensuring decision-making efficiency and +cost-effectiveness, we propose LDPD, a language-driven policy distillation +method for guiding MARL exploration. In this framework, a teacher agent based +on LLM trains smaller student agents to achieve cooperative decision-making +through its own decision-making demonstrations. The teacher agent enhances the +observation information of CAVs and utilizes LLMs to perform complex +cooperative decision-making reasoning, which also leverages carefully designed +decision-making tools to achieve expert-level decisions, providing high-quality +teaching experiences. The student agent then refines the teacher's prior +knowledge into its own model through gradient policy updates. The experiments +demonstrate that the students can rapidly improve their capabilities with +minimal guidance from the teacher and eventually surpass the teacher's +performance. Extensive experiments show that our approach demonstrates better +performance and learning efficiency compared to baseline methods. + +
+
+
+
+
+ + ☆ 3D-ViTac: Learning Fine-Grained Manipulation with Visuo-Tactile Sensing + + +
+ Tactile and visual perception are both crucial for humans to perform
+fine-grained interactions with their environment. Developing similar
+multi-modal sensing capabilities for robots can significantly enhance and
+expand their manipulation skills. This paper introduces 3D-ViTac, a
+multi-modal sensing and learning system designed for dexterous bimanual
+manipulation. Our system features tactile sensors equipped with dense
+sensing units, each covering an area of 3 mm^2. These sensors are low-cost
+and flexible, providing detailed and extensive coverage of physical
+contacts, effectively complementing visual information. To integrate tactile
+and visual data, we fuse them into a unified 3D representation space that
+preserves their 3D structures and spatial relationships. The multi-modal
+representation can then be coupled with diffusion policies for imitation
+learning. Through concrete hardware experiments, we demonstrate that even
+low-cost robots can perform precise manipulations and significantly
+outperform vision-only policies, particularly in safe interactions with
+fragile items and executing long-horizon tasks involving in-hand
+manipulation. Our project page is available at
+https://binghao-huang.github.io/3D-ViTac/.
+
+
+ comment: Accepted at Conference on Robot Learning (CoRL) 2024 +
+
+
+
+
+ + ☆ Sparsh: Self-supervised touch representations for vision-based tactile + sensing + + +
+ In this work, we introduce general-purpose touch representations for the
+increasingly accessible class of vision-based tactile sensors. Such sensors
+have led to many recent advances in robot manipulation as they markedly
+complement vision, yet solutions today often rely on task- and
+sensor-specific handcrafted perception models. Collecting real data at scale
+with task-centric ground truth labels, like contact forces and slip, is a
+challenge further compounded by sensors of various form factors differing in
+aspects like lighting and gel markings. To tackle this, we turn to
+self-supervised learning (SSL), which has demonstrated remarkable
+performance in computer vision. We present Sparsh, a family of SSL models
+that can support various vision-based tactile sensors, alleviating the need
+for custom labels through pre-training on 460k+ tactile images with masking
+and self-distillation in pixel and latent spaces. We also build TacBench, to
+facilitate standardized benchmarking across sensors and models, comprising
+six tasks ranging from comprehending tactile properties to enabling physical
+perception and manipulation planning. In evaluations, we find that SSL
+pre-training for touch representation outperforms task- and sensor-specific
+end-to-end training by 95.1% on average over TacBench, and Sparsh (DINO) and
+Sparsh (IJEPA) are the most competitive, indicating the merits of learning
+in latent space for tactile images. Project page:
+https://sparsh-ssl.github.io/
+
+
+ comment: Conference on Robot Learning (CoRL), 2024 +
+
+
+
+
+ + ☆ State- and context-dependent robotic manipulation and grasping via + uncertainty-aware imitation learning + + +
+ Generating context-adaptive manipulation and grasping actions is a +challenging problem in robotics. Classical planning and control algorithms tend +to be inflexible with regard to parameterization by external variables such as +object shapes. In contrast, Learning from Demonstration (LfD) approaches, due +to their nature as function approximators, allow for introducing external +variables to modulate policies in response to the environment. In this paper, +we utilize this property by introducing an LfD approach to acquire +context-dependent grasping and manipulation strategies. We treat the problem as +a kernel-based function approximation, where the kernel inputs include generic +context variables describing task-dependent parameters such as the object +shape. We build on existing work on policy fusion with uncertainty +quantification to propose a state-dependent approach that automatically returns +to demonstrations, avoiding unpredictable behavior while smoothly adapting to +context changes. The approach is evaluated against the LASA handwriting dataset +and on a real 7-DoF robot in two scenarios: adaptation to slippage while +grasping and manipulating a deformable food item. + +
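+ The kernel-based view can be made concrete with a Nadaraya-Watson style
+regressor whose inputs concatenate state and context, plus a crude
+uncertainty proxy that grows far from the demonstrations (a sketch under
+these assumptions, not the authors' policy-fusion formulation):
+
+```python
+import numpy as np
+
+def kernel_policy(query, demo_inputs, demo_actions, lengthscale=0.2):
+    """Blend demonstrated actions by Gaussian kernel weight over [state, context]."""
+    d2 = ((demo_inputs - query) ** 2).sum(axis=1) / (2 * lengthscale ** 2)
+    w = np.exp(-d2)                               # similarity to each demonstration
+    action = (w[:, None] * demo_actions).sum(axis=0) / (w.sum() + 1e-9)
+    uncertainty = 1.0 / (1.0 + w.sum())           # large when far from all demos
+    return action, uncertainty
+```
+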
+
+
+
+
+ + ☆ GAMap: Zero-Shot Object Goal Navigation with Multi-Scale + Geometric-Affordance Guidance + + +
+ Zero-Shot Object Goal Navigation (ZS-OGN) enables robots or agents to +navigate toward objects of unseen categories without object-specific training. +Traditional approaches often leverage categorical semantic information for +navigation guidance, which struggles when only objects are partially observed +or detailed and functional representations of the environment are lacking. To +resolve the above two issues, we propose \textit{Geometric-part and Affordance +Maps} (GAMap), a novel method that integrates object parts and affordance +attributes as navigation guidance. Our method includes a multi-scale scoring +approach to capture geometric-part and affordance attributes of objects at +different scales. Comprehensive experiments conducted on HM3D and Gibson +benchmark datasets demonstrate improvements in Success Rate and Success +weighted by Path Length, underscoring the efficacy of our geometric-part and +affordance-guided navigation approach in enhancing robot autonomy and +versatility, without any additional object-specific training or fine-tuning +with the semantics of unseen objects and/or the locomotions of the robot. + +
+
+ comment: 16 pages, 8 figures, 7 tables +
+
+
+
+
+ + ☆ EmbodiedRAG: Dynamic 3D Scene Graph Retrieval for Efficient and Scalable + Robot Task Planning + + +
+ Recent advances in Large Language Models (LLMs) have helped facilitate +exciting progress for robotic planning in real, open-world environments. 3D +scene graphs (3DSGs) offer a promising environment representation for grounding +such LLM-based planners as they are compact and semantically rich. However, as +the robot's environment scales (e.g., number of entities tracked) and the +complexity of scene graph information increases (e.g., maintaining more +attributes), providing the 3DSG as-is to an LLM-based planner quickly becomes +infeasible due to input token count limits and attentional biases present in +LLMs. Inspired by the successes of Retrieval-Augmented Generation (RAG) methods +that retrieve query-relevant document chunks for LLM question and answering, we +adapt the paradigm for our embodied domain. Specifically, we propose a 3D scene +subgraph retrieval framework, called EmbodiedRAG, that we augment an LLM-based +planner with for executing natural language robotic tasks. Notably, our +retrieved subgraphs adapt to changes in the environment as well as changes in +task-relevancy as the robot executes its plan. We demonstrate EmbodiedRAG's +ability to significantly reduce input token counts (by an order of magnitude) +and planning time (up to 70% reduction in average time per planning step) while +improving success rates on AI2Thor simulated household tasks with a single-arm, +mobile manipulator. Additionally, we implement EmbodiedRAG on a quadruped with +a manipulator to highlight the performance benefits for robot deployment at the +edge in real environments. + +
+
+
+
+
+ + ☆ Exploiting Information Theory for Intuitive Robot Programming of Manual + Activities + + +
+ Observational learning is a promising approach to enable people without
+expertise in programming to transfer skills to robots in a user-friendly
+manner, since it mirrors how humans learn new behaviors by observing others.
+Many existing methods focus on instructing robots to mimic human
+trajectories, but motion-level strategies often pose challenges for skill
+generalization across diverse environments. This paper proposes a novel
+framework that allows robots to achieve a higher-level understanding of
+human-demonstrated manual tasks recorded in RGB videos. By recognizing the
+task structure and goals, robots can generalize what they observe to unseen
+scenarios. We base our task representation on Shannon's Information Theory
+(IT), which is applied here for the first time to manual tasks. IT helps
+extract the active scene elements and quantify the information shared
+between hands and objects. We exploit scene graph properties to encode the
+extracted interaction features in a compact structure and segment the
+demonstration into blocks, streamlining the generation of Behavior Trees for
+the robot to replicate the task. Experiments validated the effectiveness of
+IT for automatically generating robot execution plans from a single human
+demonstration. Additionally, we provide HANDSOME, an open-source dataset of
+HAND Skills demOnstrated by Multi-subjEcts, to promote further research and
+evaluation in this field.
+
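+ The information-theoretic quantities involved reduce to standard estimates
+such as the mutual information between discretized per-frame hand and object
+states; a tiny sketch (illustrative, not the paper's exact formulation):
+
+```python
+import numpy as np
+from sklearn.metrics import mutual_info_score
+
+def hand_object_mi(hand_states, object_states):
+    """Mutual information (nats) between discrete hand and object state sequences."""
+    return mutual_info_score(hand_states, object_states)
+
+# Example with binary activity labels per frame (1 = active, 0 = idle):
+# mi = hand_object_mi(np.array([0, 1, 1, 1, 0]), np.array([0, 1, 1, 0, 0]))
+```
+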
+
+
+
+
+ + ☆ Redundant Observer-Based Tracking Control for Object Extraction Using a + Cable Connected UAV + + +
+ A new disturbance observer based control scheme is developed for a quadrotor +under the concurrent disturbances from a lightweight elastic tether cable and a +lumped vertical disturbance. This elastic tether is unusual as it creates a +disturbance proportional to the multicopter's translational movement. This +paper takes an observer-based approach to estimate the stiffness coefficient of +the cable and uses the system model to update the estimates of the external +forces, which are then compensated in the control action. Given that the +tethered cable force affects both horizontal channels of the quadrotor and is +also coupled with the vertical channel, the proposed disturbance observer is +constructed to exploit the redundant measurements across all three channels to +jointly estimate the cable stiffness and the vertical disturbance. A +pseudo-inverse method is used to determine the observer gain functions, such +that the estimation of the two quantities is decoupled and stable. Compared to +standard disturbance observers which assume nearly constant disturbances, the +proposed approach can quickly adjust its total force estimate as the tethered +quadrotor changes its position or tautness of the tether. This is applied to +two experiments - a tracking performance test where the multicopter moves under +a constant tether strain, and an object extraction test. In the second test, +the multicopter manipulates a nonlinear mechanism mimicking the extraction of a +wedged object. In both cases, the proposed approach shows significant +improvement over standard Disturbance Observer and Extended State Observer +approaches. A video summary of the experiments can be found at +https://youtu.be/9gKr13WTj-k. + +
+
+
+
+
+ + ☆ Transformer-based Model Predictive Control: Trajectory Optimization via + Sequence Modeling + + +
+ Model predictive control (MPC) has established itself as the primary +methodology for constrained control, enabling general-purpose robot autonomy in +diverse real-world scenarios. However, for most problems of interest, MPC +relies on the recursive solution of highly non-convex trajectory optimization +problems, leading to high computational complexity and strong dependency on +initialization. In this work, we present a unified framework to combine the +main strengths of optimization-based and learning-based methods for MPC. Our +approach entails embedding high-capacity, transformer-based neural network +models within the optimization process for trajectory generation, whereby the +transformer provides a near-optimal initial guess, or target plan, to a +non-convex optimization problem. Our experiments, performed in simulation and +the real world onboard a free flyer platform, demonstrate the capabilities of +our framework to improve MPC convergence and runtime. Compared to purely +optimization-based approaches, results show that our approach can improve +trajectory generation performance by up to 75%, reduce the number of solver +iterations by up to 45%, and improve overall MPC runtime by 7x without loss in +performance. + +
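+ The interface between the two components is simple: the transformer
+proposes a trajectory that seeds the non-convex solver. A schematic sketch
+(assumed names; the actual work uses a full MPC solver rather than this
+generic optimizer):
+
+```python
+import numpy as np
+import torch
+from scipy.optimize import minimize
+
+def warm_started_plan(transformer, trajectory_cost, state, horizon, dim_u):
+    """Use a learned proposal as the initial guess for trajectory optimization."""
+    with torch.no_grad():
+        guess = transformer(torch.as_tensor(state, dtype=torch.float32)).numpy()
+    guess = guess.reshape(horizon * dim_u)         # flatten for the solver
+    result = minimize(trajectory_cost, guess, method="L-BFGS-B")
+    return result.x.reshape(horizon, dim_u)
+```
+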
+
+ comment: 8 pages, 7 figures. Datasets, videos and code available at: + https://transformermpc.github.io +
+
+
+
+
+ + ☆ Analysing the Interplay of Vision and Touch for Dexterous Insertion + Tasks + + +
+ Robotic insertion tasks remain challenging due to uncertainties in perception +and the need for precise control, particularly in unstructured environments. +While humans seamlessly combine vision and touch for such tasks, effectively +integrating these modalities in robotic systems is still an open problem. Our +work presents an extensive analysis of the interplay between visual and tactile +feedback during dexterous insertion tasks, showing that tactile sensing can +greatly enhance success rates on challenging insertions with tight tolerances +and varied hole orientations that vision alone cannot solve. These findings +provide valuable insights for designing more effective multi-modal robotic +control systems and highlight the critical role of tactile feedback in +contact-rich manipulation tasks. + +
+
+
+
+
+
+ ☆ A Comprehensive Review of Current Robot-Based Pollinators in Greenhouse
+ Farming
+
+
+
+ The decline of bee and wind-based pollination systems in greenhouses due to
+controlled environments and limited access has boosted the importance of
+finding alternative pollination methods. Robot-based pollination systems have
+emerged as a promising solution, ensuring adequate crop yield even in
+challenging pollination scenarios. This paper presents a comprehensive review
+of current robot-based pollinators employed in greenhouses. The review
+categorizes pollinator technologies into major categories such as air-jet,
+water-jet, linear actuator, ultrasonic wave, and air-liquid spray, each
+suitable for specific crop pollination requirements. However, these
+technologies are often tailored to particular crops, limiting their
+versatility. The advancement of science and technology has led to the
+integration of automated pollination technology, encompassing information
+technology, automatic perception, detection, control, and operation. This
+integration not only reduces labor costs but also fosters the ongoing progress
+of modern agriculture by refining technology, enhancing automation, and
+promoting intelligence in agricultural practices. Finally, the challenges
+encountered in the design of pollinators are addressed, and a forward-looking
+perspective is taken towards future developments, aiming to contribute to the
+sustainable advancement of this technology.
+
+
+
+ comment: 20 pages, 21 figures +
+
+
+
+
+ + ☆ Features characterizing safe aerial-aquatic robots + + +
+ This paper underscores the importance of environmental monitoring, and
+specifically of freshwater ecosystems, which play a critical role in
+sustaining life and the global economy. Despite their importance, insufficient
+data availability prevents a comprehensive understanding of these ecosystems,
+thereby impeding informed decision-making concerning their preservation.
+Aerial-aquatic robots are identified as effective tools for freshwater
+sensing, offering rapid deployment and avoiding the need for ships and manned
+teams.
+ To advance the field of aerial-aquatic robots, this paper conducts a
+comprehensive review of air-water transitions, focusing on the water entry
+strategy of existing prototypes. This analysis also highlights the safety
+risks associated with each transition and proposes a set of design
+requirements relating to robots' tasks, mission objectives, and safety
+measures. To further explore the proposed design requirements, we present a
+novel robot with VTOL capability, enabling seamless air-water transitions.
+
+
+
+ comment: Peer-reviewed and accepted in IEEE Ubiquitous Robots 2024, New York + City +
+
+
+
+
+ + ☆ Get a Grip: Multi-Finger Grasp Evaluation at Scale Enables Robust + Sim-to-Real Transfer + + +
+ This work explores conditions under which multi-finger grasping algorithms
+can attain robust sim-to-real transfer. While numerous large datasets
+facilitate learning generative models for multi-finger grasping at scale,
+reliable real-world dexterous grasping remains challenging, with most methods
+degrading when deployed on hardware. An alternate strategy is to use
+discriminative grasp evaluation models for grasp selection and refinement,
+conditioned on real-world sensor measurements. This paradigm has produced
+state-of-the-art results for vision-based parallel-jaw grasping, but remains
+unproven in the multi-finger setting. In this work, we find that existing
+datasets and methods have been insufficient for training discriminative models
+for multi-finger grasping. To train grasp evaluators at scale, datasets must
+provide on the order of millions of grasps, including both positive and
+negative examples, with corresponding visual data resembling measurements at
+inference time. To that end, we release a new, open-source dataset of 3.5M
+grasps on 4.3K objects annotated with RGB images, point clouds, and trained
+NeRFs. Leveraging this dataset, we train vision-based grasp evaluators that
+outperform both analytic and generative modeling-based baselines on extensive
+simulated and real-world trials across a diverse range of objects. We show via
+numerous ablations that the key factor for performance is indeed the evaluator,
+and that its quality degrades as the dataset shrinks, demonstrating the
+importance of our new dataset. Project website at:
+https://sites.google.com/view/get-a-grip-dataset.
+
+
+
+
+
+
+ + ☆ XRDSLAM: A Flexible and Modular Framework for Deep Learning based SLAM + + +
+ In this paper, we propose a flexible SLAM framework, XRDSLAM. It adopts a
+modular code design and a multi-process running mechanism, providing highly
+reusable foundational modules such as unified dataset management, 3D
+visualization, algorithm configuration, and metrics evaluation. It can help
+developers quickly build a complete SLAM system, flexibly combine different
+algorithm modules, and conduct standardized benchmarking for accuracy and
+efficiency comparison. Within this framework, we integrate several
+state-of-the-art SLAM algorithms of different types, including NeRF- and
+3DGS-based SLAM, as well as odometry and reconstruction algorithms,
+demonstrating the framework's flexibility and extensibility. We also conduct a
+comprehensive comparison and evaluation of these integrated algorithms,
+analyzing the characteristics of each. Finally, we contribute all the code,
+configurations, and data to the open-source community, aiming to promote the
+widespread research and development of SLAM technology within the open-source
+ecosystem.
+
+
+
+
+
+
+ + ☆ CubiXMusashi: Fusion of Wire-Driven CubiX and Musculoskeletal Humanoid + Musashi toward Unlimited Performance + + +
+ Humanoids exhibit a wide variety in terms of joint configuration, actuators, +and degrees of freedom, resulting in different achievable movements and tasks +for each type. Particularly, musculoskeletal humanoids are developed to closely +emulate human body structure and movement functions, consisting of a skeletal +framework driven by numerous muscle actuators. The redundant arrangement of +muscles relative to the skeletal degrees of freedom has been used to represent +the flexible and complex body movements observed in humans. However, due to +this flexible body and high degrees of freedom, modeling, simulation, and +control become extremely challenging, limiting the feasible movements and +tasks. In this study, we integrate the musculoskeletal humanoid Musashi with +the wire-driven robot CubiX, capable of connecting to the environment, to form +CubiXMusashi. This combination addresses the shortcomings of traditional +musculoskeletal humanoids and enables movements beyond the capabilities of +other humanoids. CubiXMusashi connects to the environment with wires and drives +by winding them, successfully achieving movements such as pull-up, rising from +a lying pose, and mid-air kicking, which are difficult for Musashi alone. This +concept demonstrates that various humanoids, not limited to musculoskeletal +humanoids, can mitigate their physical constraints and acquire new abilities by +connecting to the environment and driving through wires. + +
+
+ comment: Accepted Humanoids2024, website - + https://shin0805.github.io/cubixmusashi/, YouTube - + https://youtu.be/IvzP98-r_mo +
+
+
+
+
+ + ☆ SceneComplete: Open-World 3D Scene Completion in Complex Real World + Environments for Robot Manipulation + + +
+ Careful robot manipulation in everyday cluttered environments requires an
+accurate understanding of the 3D scene, in order to grasp and place objects
+stably and reliably and to avoid mistakenly colliding with other objects. In
+general, we must construct such a 3D interpretation of a complex scene based on
+limited input, such as a single RGB-D image. We describe SceneComplete, a
+system for constructing a complete, segmented 3D model of a scene from a
+single view. It provides a novel pipeline for composing general-purpose
+pretrained perception modules (vision-language, segmentation, image-inpainting,
+image-to-3D, and pose-estimation) to obtain high-accuracy results. We
+demonstrate its accuracy and effectiveness with respect to ground-truth models
+in a large benchmark dataset and show that its accurate whole-object
+reconstruction enables robust grasp proposal generation, including for a
+dexterous hand.
+
+
+
+
+
+
+ + ☆ SuctionPrompt: Visual-assisted Robotic Picking with a Suction Cup Using + Vision-Language Models and Facile Hardware Design + + +
+ The development of large language models and vision-language models (VLMs)
+has resulted in the increasing use of robotic systems in various fields.
+However, the effective integration of these models into real-world robotic
+tasks is a key challenge. We developed a versatile robotic system called
+SuctionPrompt that utilizes prompting techniques of VLMs combined with 3D
+detections to perform product-picking tasks in diverse and dynamic
+environments. Our method highlights the importance of integrating 3D spatial
+information with adaptive action planning to enable robots to approach and
+manipulate objects in novel environments. In the validation experiments, the
+system selected accurate suction points 75.4% of the time and achieved a 65.0%
+success rate in picking common items. This study highlights the effectiveness
+of VLMs in robotic manipulation tasks, even with simple 3D processing.
+
+
+
+ comment: 11 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ Tiny Learning-Based MPC for Multirotors: Solver-Aware Learning for + Efficient Embedded Predictive Control + + +
+ Tiny aerial robots show promise for applications like environmental +monitoring and search-and-rescue but face challenges in control due to their +limited computing power and complex dynamics. Model Predictive Control (MPC) +can achieve agile trajectory tracking and handle constraints. Although current +learning-based MPC methods, such as Gaussian Process (GP) MPC, improve control +performance by learning residual dynamics, they are computationally demanding, +limiting their onboard application on tiny robots. This paper introduces Tiny +Learning-Based Model Predictive Control (LB MPC), a novel framework for +resource-constrained micro multirotor platforms. By exploiting multirotor +dynamics' structure and developing an efficient solver, our approach enables +high-rate control at 100 Hz on a Crazyflie 2.1 with a Teensy 4.0 +microcontroller. We demonstrate a 23\% average improvement in tracking +performance over existing embedded MPC methods, achieving the first onboard +implementation of learning-based MPC on a tiny multirotor (53 g). + +
+
+
+
+
+ + ☆ Multi-Robot Pursuit in Parameterized Formation via Imitation Learning + + +
+ This paper studies the multi-robot pursuit problem of coordinating a group of
+defending robots to capture a faster attacker before it enters a protected
+area. Such an operation is challenging for the defending robots due to the
+unknown avoidance strategy and higher speed of the attacker, coupled with the
+limited communication capabilities of the defenders. To solve this problem, we
+propose a parameterized formation controller that allows defending robots to
+adapt their formation shape using five adjustable parameters. Moreover, we
+develop an imitation-learning based approach integrated with model predictive
+control to optimize these shape parameters. We make full use of these two
+techniques to enhance the capture capabilities of defending robots through
+ongoing training. Both simulations and experiments are provided to verify the
+effectiveness and robustness of our proposed controller. Simulation results
+show that defending robots can rapidly learn an effective strategy for
+capturing the attacker, and the learned strategy remains effective across
+varying numbers of defenders. Experimental results on real robot platforms
+further validate these findings.
+
+
+
+
+
+
+ + ☆ Distributed Formation Shape Control of Identity-less Robot Swarms + + +
+ Different from most of the formation strategies where robots require unique +labels to identify topological neighbors to satisfy the predefined shape +constraints, we here study the problem of identity-less distributed shape +formation in homogeneous swarms, which is rarely studied in the literature. The +absence of identities creates a unique challenge: how to design appropriate +target formations and local behaviors that are suitable for identity-less +formation shape control. To address this challenge, we propose the following +novel results. First, to avoid using unique identities, we propose a dynamic +formation description method and solve the formation consensus of robots in a +locally distributed manner. Second, to handle identity-less distributed +formations, we propose a fully distributed control law for homogeneous swarms +based on locally sensed information. While the existing methods are applicable +to simple cases where the target formation is stationary, ours can tackle more +general maneuvering formations such as translation, rotation, or even shape +deformation. Both numerical simulation and flight experiment are presented to +verify the effectiveness and robustness of our proposed formation strategy. + +
+
+
+
+
+ + ☆ Dual Agent Learning Based Aerial Trajectory Tracking + + +
+ This paper presents a novel reinforcement learning framework for trajectory +tracking of unmanned aerial vehicles in cluttered environments using a +dual-agent architecture. Traditional optimization methods for trajectory +tracking face significant computational challenges and lack robustness in +dynamic environments. Our approach employs deep reinforcement learning (RL) to +overcome these limitations, leveraging 3D pointcloud data to perceive the +environment without relying on memory-intensive obstacle representations like +occupancy grids. The proposed system features two RL agents: one for predicting +UAV velocities to follow a reference trajectory and another for managing +collision avoidance in the presence of obstacles. This architecture ensures +real-time performance and adaptability to uncertainties. We demonstrate the +efficacy of our approach through simulated and real-world experiments, +highlighting improvements over state-of-the-art RL and optimization-based +methods. Additionally, a curriculum learning paradigm is employed to scale the +algorithms to more complex environments, ensuring robust trajectory tracking +and obstacle avoidance in both static and dynamic scenarios. + +
+
+
+
+
+ + ☆ Simulating User Agents for Embodied Conversational-AI + + +
+ Embodied agents designed to assist users with tasks must engage in natural
+language interactions, interpret instructions, execute actions, and communicate
+effectively to resolve issues. However, collecting large-scale, diverse
+datasets of situated human-robot dialogues to train and evaluate such agents is
+expensive, labor-intensive, and time-consuming. To address this challenge, we
+propose building a large language model (LLM)-based user agent that can
+simulate user behavior during interactions with an embodied agent in a virtual
+environment. Given a user goal (e.g., make breakfast), at each time step, the
+user agent may "observe" the robot's actions or "speak" to either intervene
+with the robot or answer questions. Such a user agent assists in improving the
+scalability and efficiency of embodied dialogue dataset generation and is
+critical for enhancing and evaluating the robot's interaction and task
+completion ability, as well as for research in reinforcement learning using AI
+feedback. We evaluate our user agent's ability to generate human-like behaviors
+by comparing its simulated dialogues with the TEACh dataset. We perform three
+experiments: zero-shot prompting to predict dialogue acts, few-shot prompting,
+and fine-tuning on the TEACh training subset. Results show the LLM-based user
+agent achieves an F-measure of 42% with zero-shot prompting and 43.4% with
+few-shot prompting in mimicking human speaking behavior. Through fine-tuning,
+performance in deciding when to speak remained stable, while deciding what to
+say improved from 51.1% to 62.5%. These findings showcase the feasibility of
+the proposed approach for assessing and enhancing the effectiveness of robot
+task completion through natural language communication.
+
+
+
+ comment: 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ LBurst: Learning-Based Robotic Burst Feature Extraction for 3D + Reconstruction in Low Light + + +
+ Drones have revolutionized the fields of aerial imaging, mapping, and
+disaster recovery. However, the deployment of drones in low-light conditions is
+constrained by the image quality produced by their on-board cameras. In this
+paper, we present a learning architecture for improving 3D reconstructions in
+low-light conditions by finding features in a burst. Our approach enhances
+visual reconstruction by detecting and describing high-quality true features
+and fewer spurious features in low signal-to-noise-ratio images. We demonstrate
+that our method is capable of handling challenging scenes in millilux
+illumination, making it a significant step towards drones operating at night
+and in extremely low-light applications such as underground mining and search
+and rescue operations.
+
+
+
+ comment: 7 pages, 8 figures, 3 tables, for associated project page, see + https://roboticimaging.org/Projects/LBurst/ +
+
+
+
+
+ + ☆ BOMP: Bin-Optimized Motion Planning + + +
+ In logistics, the ability to quickly compute and execute pick-and-place +motions from bins is critical to increasing productivity. We present +Bin-Optimized Motion Planning (BOMP), a motion planning framework that plans +arm motions for a six-axis industrial robot with a long-nosed suction tool to +remove boxes from deep bins. BOMP considers robot arm kinematics, actuation +limits, the dimensions of a grasped box, and a varying height map of a bin +environment to rapidly generate time-optimized, jerk-limited, and +collision-free trajectories. The optimization is warm-started using a deep +neural network trained offline in simulation with 25,000 scenes and +corresponding trajectories. Experiments with 96 simulated and 15 physical +environments suggest that BOMP generates collision-free trajectories that are +up to 58 % faster than baseline sampling-based planners and up to 36 % faster +than an industry-standard Up-Over-Down algorithm, which has an extremely low 15 +% success rate in this context. BOMP also generates jerk-limited trajectories +while baselines do not. Website: https://sites.google.com/berkeley.edu/bomp. + +
+
+
+
+
+ + ☆ Pedestrian Trajectory Prediction with Missing Data: Datasets, + Imputation, and Benchmarking NeurIPS 2024 + + +
+ Pedestrian trajectory prediction is crucial for several applications such as +robotics and self-driving vehicles. Significant progress has been made in the +past decade thanks to the availability of pedestrian trajectory datasets, which +enable trajectory prediction methods to learn from pedestrians' past movements +and predict future trajectories. However, these datasets and methods typically +assume that the observed trajectory sequence is complete, ignoring real-world +issues such as sensor failure, occlusion, and limited fields of view that can +result in missing values in observed trajectories. To address this challenge, +we present TrajImpute, a pedestrian trajectory prediction dataset that +simulates missing coordinates in the observed trajectory, enhancing real-world +applicability. TrajImpute maintains a uniform distribution of missing data +within the observed trajectories. In this work, we comprehensively examine +several imputation methods to reconstruct the missing coordinates and benchmark +them for imputing pedestrian trajectories. Furthermore, we provide a thorough +analysis of recent trajectory prediction methods and evaluate the performance +of these models on the imputed trajectories. Our experimental evaluation of the +imputation and trajectory prediction methods offers several valuable insights. +Our dataset provides a foundational resource for future research on +imputation-aware pedestrian trajectory prediction, potentially accelerating the +deployment of these methods in real-world applications. Publicly accessible +links to the datasets and code files are available at +https://github.com/Pranav-chib/TrajImpute. + +
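As a rough illustration of the imputation step discussed above, the sketch below fills NaN-marked coordinates in an observed trajectory by per-axis linear interpolation. The paper benchmarks several imputation methods; this is only a simple baseline with made-up data.

```python
import numpy as np

def impute_linear(traj):
    """Fill NaN entries of a (T, 2) trajectory by per-axis linear interpolation."""
    traj = traj.copy()
    t = np.arange(len(traj))
    for axis in range(traj.shape[1]):
        missing = np.isnan(traj[:, axis])
        traj[missing, axis] = np.interp(t[missing], t[~missing], traj[~missing, axis])
    return traj

# Observed 8-step trajectory with two dropped coordinates (e.g., from occlusion).
obs = np.array([[0.0, 0.0], [0.5, 0.1], [np.nan, np.nan], [1.5, 0.3],
                [2.0, 0.4], [np.nan, np.nan], [3.0, 0.6], [3.5, 0.7]])
print(impute_linear(obs))
```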
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ Learning Low-Dimensional Strain Models of Soft Robots by Looking at the + Evolution of Their Shape with Application to Model-Based Control + + +
+ Obtaining dynamic models of continuum soft robots is central to the analysis
+and control of soft robots, and researchers have devoted much attention to the
+challenge of proposing both data-driven and first-principle solutions. Both
+avenues have, however, shown their limitations; the former lacks structure and
+performs poorly outside training data, while the latter requires significant
+simplifications and extensive expert knowledge to be used in practice. This
+paper introduces a streamlined method for learning low-dimensional,
+physics-based models that are both accurate and easy to interpret. We start
+with an algorithm that uses image data (i.e., shape evolutions) to determine
+the minimal necessary segments for describing a soft robot's movement.
+Following this, we apply a dynamic regression and strain sparsification
+algorithm to identify relevant strains and define the model's dynamics. We
+validate our approach through simulations with various planar soft
+manipulators, comparing its performance against other learning strategies,
+showing that our models are both computationally efficient and 25x more
+accurate on out-of-training-distribution inputs. Finally, because the method
+generates physically compatible models, we demonstrate that the learned models
+can be combined directly with model-based control policies.
+
+
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ☆ Cost-Aware Query Policies in Active Learning for Efficient Autonomous + Robotic Exploration + + +
+ In missions constrained by finite resources, efficient data collection is
+critical. Informative path planning, driven by automated decision-making,
+optimizes exploration by reducing the costs associated with accurate
+characterization of a target in an environment. Previous implementations of
+active learning (AL) either did not consider the action cost for regression
+problems or considered the action cost only for classification problems. This
+paper analyzes an AL algorithm for Gaussian Process regression while
+incorporating action cost. The algorithm's performance is compared on various
+regression problems, including terrain mapping on diverse simulated surfaces,
+using metrics of root mean square error, samples and distance until
+convergence, and model variance upon convergence. The cost-dependent
+acquisition policy does not, on its own, optimize information gain over
+distance. Instead, the traditional uncertainty metric with a distance
+constraint best minimizes root-mean-square error over trajectory distance.
+This study's impact is to provide insight into incorporating action cost with
+AL methods to optimize exploration under realistic mission constraints.
+
+
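A minimal sketch of the acquisition rule the study finds most effective: choose the candidate with the largest GP predictive uncertainty among those satisfying a travel-distance constraint. The kernel, budget, and terrain function are illustrative assumptions, not the paper's setup.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

def terrain(x):
    """Unknown field to be mapped (ground truth used only to simulate samples)."""
    return np.sin(3 * x[:, 0]) * np.cos(2 * x[:, 1])

rng = np.random.default_rng(0)
X = rng.uniform(0, 1, size=(5, 2))                 # initial sampled locations
y = terrain(X)
pos = X[-1]                                        # robot's current position
budget = 0.3                                       # max travel distance per query

gp = GaussianProcessRegressor(kernel=RBF(length_scale=0.2), alpha=1e-4).fit(X, y)

candidates = rng.uniform(0, 1, size=(500, 2))
reachable = np.linalg.norm(candidates - pos, axis=1) <= budget   # distance constraint
_, std = gp.predict(candidates, return_std=True)
std = np.where(reachable, std, -np.inf)            # mask unreachable candidates
next_query = candidates[np.argmax(std)]
print("next sample location:", next_query)
```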
+
+
+
+
+ + ☆ First, Learn What You Don't Know: Active Information Gathering for + Driving at the Limits of Handling + + +
+ Combining data-driven models that adapt online and model predictive control +(MPC) has enabled effective control of nonlinear systems. However, when +deployed on unstable systems, online adaptation may not be fast enough to +ensure reliable simultaneous learning and control. For example, controllers on +a vehicle executing highly dynamic maneuvers may push the tires to their +friction limits, destabilizing the vehicle and allowing modeling errors to +quickly compound and cause a loss of control. In this work, we present a +Bayesian meta-learning MPC framework. We propose an expressive vehicle dynamics +model that leverages Bayesian last-layer meta-learning to enable rapid online +adaptation. The model's uncertainty estimates are used to guide informative +data collection and quickly improve the model prior to deployment. Experiments +on a Toyota Supra show that (i) the framework enables reliable control in +dynamic drifting maneuvers, (ii) online adaptation alone may not suffice for +zero-shot control of a vehicle at the edge of stability, and (iii) active data +collection helps achieve reliable performance. + +
+
+
+
+
+ + ☆ Learning Visual Parkour from Generated Images + + +
+ Fast and accurate physics simulation is an essential component of robot
+learning, where robots can explore failure scenarios that are difficult to
+produce in the real world and learn from unlimited on-policy data. Yet, it
+remains challenging to incorporate RGB-color perception into the sim-to-real
+pipeline that matches the real world in its richness and realism. In this work,
+we train a robot dog in simulation for visual parkour. We propose a way to use
+generative models to synthesize diverse and physically accurate image sequences
+of the scene from the robot's ego-centric perspective. We present
+demonstrations of zero-shot transfer to the RGB-only observations of the real
+world on a robot equipped with a low-cost, off-the-shelf color camera.
+Website: https://lucidsim.github.io
+
+
+
+ comment: 17 pages, 19 figures +
+
+
+
+
+ + ☆ PARTNR: A Benchmark for Planning and Reasoning in Embodied Multi-agent + Tasks + + +
+ We present a benchmark for Planning And Reasoning Tasks in humaN-Robot +collaboration (PARTNR) designed to study human-robot coordination in household +activities. PARTNR tasks exhibit characteristics of everyday tasks, such as +spatial, temporal, and heterogeneous agent capability constraints. We employ a +semi-automated task generation pipeline using Large Language Models (LLMs), +incorporating simulation in the loop for grounding and verification. PARTNR +stands as the largest benchmark of its kind, comprising 100,000 natural +language tasks, spanning 60 houses and 5,819 unique objects. We analyze +state-of-the-art LLMs on PARTNR tasks, across the axes of planning, perception +and skill execution. The analysis reveals significant limitations in SoTA +models, such as poor coordination and failures in task tracking and recovery +from errors. When LLMs are paired with real humans, they require 1.5x as many +steps as two humans collaborating and 1.1x more steps than a single human, +underscoring the potential for improvement in these models. We further show +that fine-tuning smaller LLMs with planning data can achieve performance on par +with models 9 times larger, while being 8.6x faster at inference. Overall, +PARTNR highlights significant challenges facing collaborative embodied agents +and aims to drive research in this direction. + +
+
+ comment: Alphabetical author order +
+
+
+
+
+ + ♻ ☆ NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and + Benchmarking NeurIPS 2024 + + +
+ Benchmarking vision-based driving policies is challenging. On one hand, +open-loop evaluation with real data is easy, but these results do not reflect +closed-loop performance. On the other, closed-loop evaluation is possible in +simulation, but is hard to scale due to its significant computational demands. +Further, the simulators available today exhibit a large domain gap to real +data. This has resulted in an inability to draw clear conclusions from the +rapidly growing body of research on end-to-end autonomous driving. In this +paper, we present NAVSIM, a middle ground between these evaluation paradigms, +where we use large datasets in combination with a non-reactive simulator to +enable large-scale real-world benchmarking. Specifically, we gather +simulation-based metrics, such as progress and time to collision, by unrolling +bird's eye view abstractions of the test scenes for a short simulation horizon. +Our simulation is non-reactive, i.e., the evaluated policy and environment do +not influence each other. As we demonstrate empirically, this decoupling allows +open-loop metric computation while being better aligned with closed-loop +evaluations than traditional displacement errors. NAVSIM enabled a new +competition held at CVPR 2024, where 143 teams submitted 463 entries, resulting +in several new insights. On a large set of challenging scenarios, we observe +that simple methods with moderate compute requirements such as TransFuser can +match recent large-scale end-to-end driving architectures such as UniAD. Our +modular framework can potentially be extended with new datasets, data curation +strategies, and metrics, and will be continually maintained to host future +challenges. Our code is available at +https://github.com/autonomousvision/navsim. + +
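As a rough illustration of the kind of simulation-based metric mentioned above, the sketch below computes a simple time-to-collision by unrolling ego and agent positions under constant velocity and reporting the first step that comes within a collision radius. The horizon, timestep, and radius are assumptions for the example, not NAVSIM's actual metric definition.

```python
import numpy as np

def time_to_collision(ego_pos, ego_vel, agents_pos, agents_vel,
                      horizon=4.0, dt=0.1, collision_radius=2.0):
    """First time (s) at which the ego comes within collision_radius of any agent,
    assuming constant velocities over the horizon; np.inf if no collision."""
    for t in np.arange(0.0, horizon + dt, dt):
        ego = ego_pos + t * ego_vel
        agents = agents_pos + t * agents_vel
        if np.any(np.linalg.norm(agents - ego, axis=1) < collision_radius):
            return t
    return np.inf

ego_pos, ego_vel = np.array([0.0, 0.0]), np.array([10.0, 0.0])   # 10 m/s forward
agents_pos = np.array([[40.0, 0.0], [20.0, 15.0]])
agents_vel = np.array([[0.0, 0.0], [0.0, 0.0]])                  # parked vehicles
print("TTC:", time_to_collision(ego_pos, ego_vel, agents_pos, agents_vel), "s")
```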
+
+ comment: NeurIPS 2024 Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ A Joint Modeling of Vision-Language-Action for Target-oriented Grasping + in Clutter ICRA 2023 + + +
+ We focus on the task of language-conditioned grasping in clutter, in which a
+robot is supposed to grasp the target object based on a language instruction.
+Previous works separately conduct visual grounding to localize the target
+object, and generate a grasp for that object. However, these works require
+object labels or visual attributes for grounding, which calls for handcrafted
+rules in the planner and restricts the range of language instructions. In this
+paper, we propose to jointly model vision, language and action with an
+object-centric representation. Our method is applicable to more flexible
+language instructions and is not limited by visual grounding errors. Besides,
+by utilizing the powerful priors from the pre-trained multi-modal model and
+grasp model, sample efficiency is effectively improved and the sim2real
+problem is relieved without additional data for transfer. A series of
+experiments carried out in simulation and the real world indicate that our
+method achieves a higher task success rate with fewer motions under more
+flexible language instructions. Moreover, our method is capable of
+generalizing better to scenarios with unseen objects and language
+instructions. Our code is available at
+https://github.com/xukechun/Vision-Language-Grasping
+
+
+
+ comment: Accepted by ICRA 2023 +
+
+
+
+
+ + ♻ ☆ Visual place recognition for aerial imagery: A survey + + +
+ Aerial imagery and its direct application to visual localization is an
+essential problem for many Robotics and Computer Vision tasks. While Global
+Navigation Satellite Systems (GNSS) are the standard default solution for
+solving the aerial localization problem, they are subject to a number of
+limitations, such as signal instability and solution unreliability, that make
+this option less desirable. Consequently, visual geolocalization is emerging
+as a viable alternative. However, adapting the Visual Place Recognition (VPR)
+task to aerial imagery presents significant challenges, including weather
+variations and repetitive patterns. Current VPR reviews largely neglect the
+specific context of aerial data. This paper introduces a methodology tailored
+for evaluating VPR techniques specifically in the domain of aerial imagery,
+providing a comprehensive assessment of various methods and their performance.
+We not only compare various VPR methods but also demonstrate the importance of
+selecting appropriate zoom and overlap levels when constructing map tiles to
+achieve maximum efficiency of VPR algorithms in the case of aerial imagery.
+The code is available on our GitHub repository --
+https://github.com/prime-slam/aero-vloc.
+
+
+
+
+
+
+ + ♻ ☆ GPTR: Gaussian Process Trajectory Representation for Continuous-Time + Motion Estimation + + +
+ Continuous-time trajectory representation has gained significant popularity +in recent years, as it offers an elegant formulation that allows the fusion of +a larger number of sensors and sensing modalities, overcoming limitations of +traditional discrete-time frameworks. To bolster the adoption of the +continuous-time paradigm, we propose a so-called Gaussian Process Trajectory +Representation (GPTR) framework for continuous-time motion estimation (CTME) +tasks. Our approach stands out by employing a third-order random jerk model, +featuring closed-form expressions for both rotational and translational state +derivatives. This model provides smooth, continuous trajectory representations +that are crucial for precise estimation of complex motion. To support the wider +robotics and computer vision communities, we have made the source code for GPTR +available as a light-weight header-only library. This format was chosen for its +ease of integration, allowing developers to incorporate GPTR into existing +systems without needing extensive code modifications. Moreover, we also provide +a set of optimization examples with LiDAR, camera, IMU, UWB factors, and +closed-form analytical Jacobians under the proposed GP framework. Our +experiments demonstrate the efficacy and efficiency of GP-based trajectory +representation in various motion estimation tasks, and the examples can serve +as the prototype to help researchers quickly develop future applications such +as batch optimization, calibration, sensor fusion, trajectory planning, etc., +with continuous-time trajectory representation. Our project is accessible at +https://github.com/brytsknguyen/gptr . + +
+
+ comment: The source code has been released. All feedbacks are welcome +
+
+
+
+
+ + ♻ ☆ SuperFusion: Multilevel LiDAR-Camera Fusion for Long-Range HD Map + Generation ICRA 2024 + + +
+ High-definition (HD) semantic map generation of the environment is an +essential component of autonomous driving. Existing methods have achieved good +performance in this task by fusing different sensor modalities, such as LiDAR +and camera. However, current works are based on raw data or network +feature-level fusion and only consider short-range HD map generation, limiting +their deployment to realistic autonomous driving applications. In this paper, +we focus on the task of building the HD maps in both short ranges, i.e., within +30 m, and also predicting long-range HD maps up to 90 m, which is required by +downstream path planning and control tasks to improve the smoothness and safety +of autonomous driving. To this end, we propose a novel network named +SuperFusion, exploiting the fusion of LiDAR and camera data at multiple levels. +We use LiDAR depth to improve image depth estimation and use image features to +guide long-range LiDAR feature prediction. We benchmark our SuperFusion on the +nuScenes dataset and a self-recorded dataset and show that it outperforms the +state-of-the-art baseline methods with large margins on all intervals. +Additionally, we apply the generated HD map to a downstream path planning task, +demonstrating that the long-range HD maps predicted by our method can lead to +better path planning for autonomous vehicles. Our code has been released at +https://github.com/haomo-ai/SuperFusion. + +
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ♻ ☆ Embracing Events and Frames with Hierarchical Feature Refinement Network + for Object Detection ECCV 2024 + + +
+ In frame-based vision, object detection faces substantial performance +degradation under challenging conditions due to the limited sensing capability +of conventional cameras. Event cameras output sparse and asynchronous events, +providing a potential solution to solve these problems. However, effectively +fusing two heterogeneous modalities remains an open issue. In this work, we +propose a novel hierarchical feature refinement network for event-frame fusion. +The core concept is the design of the coarse-to-fine fusion module, denoted as +the cross-modality adaptive feature refinement (CAFR) module. In the initial +phase, the bidirectional cross-modality interaction (BCI) part facilitates +information bridging from two distinct sources. Subsequently, the features are +further refined by aligning the channel-level mean and variance in the two-fold +adaptive feature refinement (TAFR) part. We conducted extensive experiments on +two benchmarks: the low-resolution PKU-DDD17-Car dataset and the +high-resolution DSEC dataset. Experimental results show that our method +surpasses the state-of-the-art by an impressive margin of $\textbf{8.0}\%$ on +the DSEC dataset. Besides, our method exhibits significantly better robustness +(\textbf{69.5}\% versus \textbf{38.7}\%) when introducing 15 different +corruption types to the frame images. The code can be found at the link +(https://github.com/HuCaoFighting/FRN). + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Robotics meets Fluid Dynamics: A Characterization of the Induced Airflow + below a Quadrotor as a Turbulent Jet + + +
+ The widespread adoption of quadrotors for diverse applications, from +agriculture to public safety, necessitates an understanding of the aerodynamic +disturbances they create. This paper introduces a computationally lightweight +model for estimating the time-averaged magnitude of the induced flow below +quadrotors in hover. Unlike related approaches that rely on expensive +computational fluid dynamics (CFD) simulations or drone specific time-consuming +empirical measurements, our method leverages classical theory from turbulent +flows. By analyzing over 16 hours of flight data from drones of varying sizes +within a large motion capture system, we show for the first time that the +combined flow from all drone propellers is well-approximated by a turbulent jet +after 2.5 drone-diameters below the vehicle. Using a novel normalization and +scaling, we experimentally identify model parameters that describe a unified +mean velocity field below differently sized quadrotors. The model, which +requires only the drone's mass, propeller size, and drone size for +calculations, accurately describes the far-field airflow over a long-range in a +very large volume which is impractical to simulate using CFD. Our model offers +a practical tool for ensuring safer operations near humans, optimizing sensor +placements and drone control in multi-agent scenarios. We demonstrate the +latter by designing a controller that compensates for the downwash of another +drone, leading to a four times lower altitude deviation when passing below. + +
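The identified model parameters are not given in the abstract; as a rough illustration of such a time-averaged far-field model, the sketch below combines the momentum-theory induced velocity at the rotor disk with the classical self-similar round-jet decay law. The decay constant and the choice of length scale are illustrative assumptions, not the paper's fitted values.

```python
import numpy as np

RHO, G = 1.225, 9.81          # air density [kg/m^3], gravity [m/s^2]

def downwash_centerline(z, mass, prop_diameter, drone_diameter,
                        n_props=4, decay_const=5.0):
    """Rough time-averaged centerline velocity [m/s] a distance z below a hovering drone.

    Momentum theory gives the induced velocity at the rotor plane; beyond a few
    drone diameters the combined flow is treated as a turbulent round jet whose
    centerline velocity decays like 1/z (decay_const ~5-6 for classical jets).
    """
    thrust_per_prop = mass * G / n_props
    disk_area = np.pi * (prop_diameter / 2) ** 2
    v_induced = np.sqrt(thrust_per_prop / (2 * RHO * disk_area))
    z = np.maximum(z, 2.5 * drone_diameter)        # model meant for the far field only
    return decay_const * v_induced * drone_diameter / z

# Illustrative drone: 1.0 kg mass, 0.13 m propellers, 0.3 m frame.
for z in (1.0, 2.0, 4.0):
    print(f"z = {z:.1f} m : ~{downwash_centerline(z, 1.0, 0.13, 0.3):.1f} m/s")
```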
+
+ comment: 7+1 pages +
+
+
+
+
+ + ♻ ☆ Optimizing Structured Data Processing through Robotic Process Automation + + +
+ Robotic Process Automation (RPA) has emerged as a game-changing technology in +data extraction, revolutionizing the way organizations process and analyze +large volumes of documents such as invoices, purchase orders, and payment +advices. This study investigates the use of RPA for structured data extraction +and evaluates its advantages over manual processes. By comparing +human-performed tasks with those executed by RPA software bots, we assess +efficiency and accuracy in data extraction from invoices, focusing on the +effectiveness of the RPA system. Through four distinct scenarios involving +varying numbers of invoices, we measure efficiency in terms of time and effort +required for task completion, as well as accuracy by comparing error rates +between manual and RPA processes. Our findings highlight the significant +efficiency gains achieved by RPA, with bots completing tasks in significantly +less time compared to manual efforts across all cases. Moreover, the RPA system +consistently achieves perfect accuracy, mitigating the risk of errors and +enhancing process reliability. These results underscore the transformative +potential of RPA in optimizing operational efficiency, reducing human labor +costs, and improving overall business performance. + +
+
+
+
+
+ + ♻ ☆ GIC: Gaussian-Informed Continuum for Physical Property Identification + and Simulation NeurIPS 2024 + + +
+ This paper studies the problem of estimating physical properties (system +identification) through visual observations. To facilitate geometry-aware +guidance in physical property estimation, we introduce a novel hybrid framework +that leverages 3D Gaussian representation to not only capture explicit shapes +but also enable the simulated continuum to render object masks as 2D shape +surrogates during training. We propose a new dynamic 3D Gaussian framework +based on motion factorization to recover the object as 3D Gaussian point sets +across different time states. Furthermore, we develop a coarse-to-fine filling +strategy to generate the density fields of the object from the Gaussian +reconstruction, allowing for the extraction of object continuums along with +their surfaces and the integration of Gaussian attributes into these continuum. +In addition to the extracted object surfaces, the Gaussian-informed continuum +also enables the rendering of object masks during simulations, serving as +2D-shape guidance for physical property estimation. Extensive experimental +evaluations demonstrate that our pipeline achieves state-of-the-art performance +across multiple benchmarks and metrics. Additionally, we illustrate the +effectiveness of the proposed method through real-world demonstrations, +showcasing its practical utility. Our project page is at +https://jukgei.github.io/project/gic. + +
+
+ comment: 21 pages, 8 figures, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ An NMPC-ECBF Framework for Dynamic Motion Planning and Execution in + vision-based Human-Robot Collaboration + + +
+ To enable safe and effective human-robot collaboration (HRC) in smart +manufacturing, seamless integration of sensing, cognition, and prediction into +the robot controller is critical for real-time awareness, response, and +communication inside a heterogeneous environment (robots, humans, and +equipment). The proposed approach takes advantage of the prediction +capabilities of nonlinear model predictive control (NMPC) to execute a safe +path planning based on feedback from a vision system. In order to satisfy the +requirement of real-time path planning, an embedded solver based on a penalty +method is applied. However, due to tight sampling times NMPC solutions are +approximate, and hence the safety of the system cannot be guaranteed. To +address this we formulate a novel safety-critical paradigm with an exponential +control barrier function (ECBF) used as a safety filter. We also design a +simple human-robot collaboration scenario using V-REP to evaluate the +performance of the proposed controller and investigate whether integrating +human pose prediction can help with safe and efficient collaboration. The robot +uses OptiTrack cameras for perception and dynamically generates collision-free +trajectories to the predicted target interactive position. Results for a number +of different configurations confirm the efficiency of the proposed motion +planning and execution framework. It yields a 19.8% reduction in execution time +for the HRC task considered. + +
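A minimal sketch of the safety-filter idea on a toy system (not the paper's NMPC-ECBF pipeline): for a single-integrator robot and a circular obstacle, the first-order CBF condition is a single linear constraint on the input, so the QP that minimally modifies the nominal command has a closed-form solution. The dynamics, barrier, and gain are illustrative assumptions; the paper applies an exponential CBF on top of an NMPC plan.

```python
import numpy as np

def cbf_safety_filter(u_nom, x, x_obs, radius, alpha=1.0):
    """Minimally modify u_nom so that h(x) = ||x - x_obs||^2 - r^2 stays >= 0.

    For single-integrator dynamics x_dot = u, the CBF condition
        dh/dx @ u + alpha * h(x) >= 0   <=>   a @ u + b >= 0
    is one linear constraint, whose minimum-norm correction is closed form.
    """
    a = 2.0 * (x - x_obs)                  # gradient of h
    b = alpha * (np.dot(x - x_obs, x - x_obs) - radius ** 2)
    slack = a @ u_nom + b
    if slack >= 0.0:                       # nominal command already safe
        return u_nom
    return u_nom - a * slack / (a @ a)     # project onto the constraint boundary

x, x_obs = np.array([0.0, 0.0]), np.array([1.0, 0.0])
u_nominal = np.array([1.0, 0.0])           # heads straight for the obstacle
u_safe = cbf_safety_filter(u_nominal, x, x_obs, radius=0.5)
print("nominal:", u_nominal, "filtered:", u_safe)
```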
+
+
+
+
+ + ♻ ☆ iVideoGPT: Interactive VideoGPTs are Scalable World Models NeurIPS 2024 + + +
+ World models empower model-based agents to interactively explore, reason, and +plan within imagined environments for real-world decision-making. However, the +high demand for interactivity poses challenges in harnessing recent +advancements in video generative models for developing world models at scale. +This work introduces Interactive VideoGPT (iVideoGPT), a scalable +autoregressive transformer framework that integrates multimodal signals--visual +observations, actions, and rewards--into a sequence of tokens, facilitating an +interactive experience of agents via next-token prediction. iVideoGPT features +a novel compressive tokenization technique that efficiently discretizes +high-dimensional visual observations. Leveraging its scalable architecture, we +are able to pre-train iVideoGPT on millions of human and robotic manipulation +trajectories, establishing a versatile foundation that is adaptable to serve as +interactive world models for a wide range of downstream tasks. These include +action-conditioned video prediction, visual planning, and model-based +reinforcement learning, where iVideoGPT achieves competitive performance +compared with state-of-the-art methods. Our work advances the development of +interactive general world models, bridging the gap between generative video +models and practical model-based reinforcement learning applications. Code and +pre-trained models are available at https://thuml.github.io/iVideoGPT. + +
+
+ comment: NeurIPS 2024. Code is available at project website: + https://thuml.github.io/iVideoGPT +
+
+
+
+
+ + ♻ ☆ FM-Fusion: Instance-aware Semantic Mapping Boosted by Vision-Language + Foundation Models + + +
+ Semantic mapping based on supervised object detectors is sensitive to the
+image distribution. In real-world environments, object detection and
+segmentation performance can drop significantly, preventing the use of
+semantic mapping in a wider domain. On the other hand, the development of
+vision-language foundation models demonstrates strong zero-shot
+transferability across data distributions. It provides an opportunity to
+construct generalizable instance-aware semantic maps. Hence, this work explores
+how to boost instance-aware semantic mapping from object detections generated
+by foundation models. We propose a probabilistic label fusion method to
+predict closed-set semantic classes from open-set label measurements. An
+instance refinement module merges the over-segmented instances caused by
+inconsistent segmentation. We integrate all the modules into a unified semantic
+mapping system. Reading a sequence of RGB-D input, our work incrementally
+reconstructs an instance-aware semantic map. We evaluate the zero-shot
+performance of our method on the ScanNet and SceneNN datasets. Our method
+achieves 40.3 mean average precision (mAP) on the ScanNet semantic instance
+segmentation task. It significantly outperforms traditional semantic mapping
+methods.
+
+
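The abstract does not spell out the fusion rule; as an illustration of probabilistic label fusion from open-set detections to a closed set of classes, the sketch below accumulates per-instance log-likelihoods under an assumed likelihood table and normalizes. The class names and likelihood values are made up for the example.

```python
import numpy as np

CLOSED_SET = ["chair", "table", "sofa"]

# Assumed likelihood p(open-set label | true closed-set class); illustrative values.
LIKELIHOOD = {
    "armchair": np.array([0.80, 0.05, 0.15]),
    "desk":     np.array([0.05, 0.85, 0.10]),
    "couch":    np.array([0.10, 0.05, 0.85]),
}

def fuse_labels(open_set_detections):
    """Fuse a sequence of open-set labels into a posterior over the closed set."""
    log_post = np.zeros(len(CLOSED_SET))           # uniform prior (in log space)
    for label in open_set_detections:
        log_post += np.log(LIKELIHOOD[label])      # one Bayesian update per measurement
    post = np.exp(log_post - log_post.max())       # normalize stably
    return post / post.sum()

detections = ["armchair", "couch", "armchair", "armchair"]   # noisy detector output
print(dict(zip(CLOSED_SET, fuse_labels(detections))))        # 'chair' should dominate
```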
+
+ comment: Published in IEEE RAL +
+
+
+
+
+ + ♻ ☆ Tube RRT*: Efficient Homotopic Path Planning for Swarm Robotics + Passing-Through Large-Scale Obstacle Environments RA-L + + +
+ Recently, the concept of homotopic trajectory planning has emerged as a novel
+solution to navigation in large-scale obstacle environments for swarm
+robotics, offering a wide range of applications. However, the field still
+lacks an efficient homotopic path planning method for large-scale obstacle
+environments. This paper introduces Tube RRT*, an innovative homotopic path
+planning method that builds upon and improves the Rapidly-exploring Random
+Tree (RRT) algorithm. Tube RRT* is specifically designed to generate homotopic
+paths, strategically considering gap volume and path length to mitigate swarm
+congestion and ensure agile navigation. Through comprehensive simulations and
+experiments, the effectiveness of Tube RRT* is validated.
+
+
+
+ comment: 8 pages, 8 figures, submitted to RA-L +
+
+
+
+
+ + ♻ ☆ End-to-End Driving via Self-Supervised Imitation Learning Using Camera + and LiDAR Data + + +
+ In autonomous driving, the end-to-end (E2E) driving approach that predicts +vehicle control signals directly from sensor data is rapidly gaining attention. +To learn a safe E2E driving system, one needs an extensive amount of driving +data and human intervention. Vehicle control data is constructed by many hours +of human driving, and it is challenging to construct large vehicle control +datasets. Often, publicly available driving datasets are collected with limited +driving scenes, and collecting vehicle control data is only available by +vehicle manufacturers. To address these challenges, this letter proposes the +first fully self-supervised learning framework, self-supervised imitation +learning (SSIL), for E2E driving, based on the self-supervised regression +learning framework. The proposed SSIL framework can learn E2E driving networks +without using driving command data. To construct pseudo steering angle data, +proposed SSIL predicts a pseudo target from the vehicle's poses at the current +and previous time points that are estimated with light detection and ranging +sensors. In addition, we propose two modified E2E driving networks that predict +driving commands depending on high-level instruction. Our numerical experiments +with three different benchmark datasets demonstrate that the proposed SSIL +framework achieves very comparable E2E driving accuracy with the supervised +learning counterpart. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ DiffTORI: Differentiable Trajectory Optimization for Deep Reinforcement + and Imitation Learning NeurIPS 2024 + + +
+ This paper introduces DiffTORI, which utilizes Differentiable Trajectory +Optimization as the policy representation to generate actions for deep +Reinforcement and Imitation learning. Trajectory optimization is a powerful and +widely used algorithm in control, parameterized by a cost and a dynamics +function. The key to our approach is to leverage the recent progress in +differentiable trajectory optimization, which enables computing the gradients +of the loss with respect to the parameters of trajectory optimization. As a +result, the cost and dynamics functions of trajectory optimization can be +learned end-to-end. DiffTORI addresses the ``objective mismatch'' issue of +prior model-based RL algorithms, as the dynamics model in DiffTORI is learned +to directly maximize task performance by differentiating the policy gradient +loss through the trajectory optimization process. We further benchmark DiffTORI +for imitation learning on standard robotic manipulation task suites with +high-dimensional sensory observations and compare our method to feed-forward +policy classes as well as Energy-Based Models (EBM) and Diffusion. Across 15 +model-based RL tasks and 35 imitation learning tasks with high-dimensional +image and point cloud inputs, DiffTORI outperforms prior state-of-the-art +methods in both domains. + +
+
+ comment: NeurIPS 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Triple Regression for Camera Agnostic Sim2Real Robot Grasping and + Manipulation Tasks + + +
+ Sim2Real (Simulation to Reality) techniques have gained prominence in robotic +manipulation and motion planning due to their ability to enhance success rates +by enabling agents to test and evaluate various policies and trajectories. In +this paper, we investigate the advantages of integrating Sim2Real into robotic +frameworks. We introduce the Triple Regression Sim2Real framework, which +constructs a real-time digital twin. This twin serves as a replica of reality +to simulate and evaluate multiple plans before their execution in real-world +scenarios. Our triple regression approach addresses the reality gap by: (1) +mitigating projection errors between real and simulated camera perspectives +through the first two regression models, and (2) detecting discrepancies in +robot control using the third regression model. Experiments on 6-DoF grasp and +manipulation tasks (where the gripper can approach from any direction) +highlight the effectiveness of our framework. Remarkably, with only RGB input +images, our method achieves state-of-the-art success rates. This research +advances efficient robot training methods and sets the stage for rapid +advancements in robotics and automation. + +
+
+
+
+
+ + ♻ ☆ VLMimic: Vision Language Models are Visual Imitation Learner for + Fine-grained Actions NeurIPS 2024 + + +
+ Visual imitation learning (VIL) provides an efficient and intuitive strategy +for robotic systems to acquire novel skills. Recent advancements in Vision +Language Models (VLMs) have demonstrated remarkable performance in vision and +language reasoning capabilities for VIL tasks. Despite the progress, current +VIL methods naively employ VLMs to learn high-level plans from human videos, +relying on pre-defined motion primitives for executing physical interactions, +which remains a major bottleneck. In this work, we present VLMimic, a novel +paradigm that harnesses VLMs to directly learn even fine-grained action levels, +only given a limited number of human videos. Specifically, VLMimic first +grounds object-centric movements from human videos, and learns skills using +hierarchical constraint representations, facilitating the derivation of skills +with fine-grained action levels from limited human videos. These skills are +refined and updated through an iterative comparison strategy, enabling +efficient adaptation to unseen environments. Our extensive experiments exhibit +that our VLMimic, using only 5 human videos, yields significant improvements of +over 27% and 21% in RLBench and real-world manipulation tasks, and surpasses +baselines by over 37% in long-horizon tasks. + +
+
+ comment: accepted for publication in the 38th Conference on Neural Information + Processing Systems (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ DistillNeRF: Perceiving 3D Scenes from Single-Glance Images by + Distilling Neural Fields and Foundation Model Features NeurIPS 2024 + + +
+ We propose DistillNeRF, a self-supervised learning framework addressing the +challenge of understanding 3D environments from limited 2D observations in +outdoor autonomous driving scenes. Our method is a generalizable feedforward +model that predicts a rich neural scene representation from sparse, +single-frame multi-view camera inputs with limited view overlap, and is trained +self-supervised with differentiable rendering to reconstruct RGB, depth, or +feature images. Our first insight is to exploit per-scene optimized Neural +Radiance Fields (NeRFs) by generating dense depth and virtual camera targets +from them, which helps our model to learn enhanced 3D geometry from sparse +non-overlapping image inputs. Second, to learn a semantically rich 3D +representation, we propose distilling features from pre-trained 2D foundation +models, such as CLIP or DINOv2, thereby enabling various downstream tasks +without the need for costly 3D human annotations. To leverage these two +insights, we introduce a novel model architecture with a two-stage +lift-splat-shoot encoder and a parameterized sparse hierarchical voxel +representation. Experimental results on the NuScenes and Waymo NOTR datasets +demonstrate that DistillNeRF significantly outperforms existing comparable +state-of-the-art self-supervised methods for scene reconstruction, novel view +synthesis, and depth estimation; and it allows for competitive zero-shot 3D +semantic occupancy prediction, as well as open-world scene understanding +through distilled foundation model features. Demos and code will be available +at https://distillnerf.github.io/. + +
+
+ comment: Accepted by Advances in Neural Information Processing Systems + (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ RIs-Calib: An Open-Source Spatiotemporal Calibrator for Multiple 3D + Radars and IMUs Based on Continuous-Time Estimation + + +
+ Aided inertial navigation system (INS), typically consisting of an inertial +measurement unit (IMU) and an exteroceptive sensor, has been widely accepted as +a feasible solution for navigation. Compared with vision-aided and LiDAR-aided +INS, radar-aided INS could achieve better performance in adverse weather +conditions since the radar utilizes low-frequency measuring signals with less +attenuation effect in atmospheric gases and rain. For such a radar-aided INS, +accurate spatiotemporal transformation is a fundamental prerequisite to +achieving optimal information fusion. In this work, we present RIs-Calib: a +spatiotemporal calibrator for multiple 3D radars and IMUs based on +continuous-time estimation, which enables accurate spatiotemporal calibration +and does not require any additional artificial infrastructure or prior +knowledge. Our approach starts with a rigorous and robust procedure for state +initialization, followed by batch optimizations, where all parameters can be +refined to global optimal states steadily. We validate and evaluate RIs-Calib +on both simulated and real-world experiments, and the results demonstrate that +RIs-Calib is capable of accurate and consistent calibration. We open-source our +implementations at (https://github.com/Unsigned-Long/RIs-Calib) to benefit the +research community. + +
+
+
+
+
+ + ♻ ☆ Grasp as You Say: Language-guided Dexterous Grasp Generation NeurIPS2024 + + +
+ This paper explores a novel task, "Dexterous Grasp as You Say" (DexGYS), enabling robots to perform dexterous grasping based on human commands expressed in natural language. However, the development of this field is hindered by the lack of datasets with natural human guidance; thus, we propose a language-guided dexterous grasp dataset, named DexGYSNet, offering high-quality dexterous grasp annotations along with flexible and fine-grained human language guidance. Our dataset construction is cost-efficient, built on a carefully designed hand-object interaction retargeting strategy and an LLM-assisted language guidance annotation system. Equipped with this dataset, we introduce the DexGYSGrasp framework for generating dexterous grasps based on human language instructions, capable of producing grasps that are intent-aligned, high-quality, and diverse. To achieve this capability, our framework decomposes the complex learning process into two manageable progressive objectives and introduces two components to realize them. The first component learns the grasp distribution focusing on intention alignment and generation diversity, and the second component refines the grasp quality while maintaining intention consistency. Extensive experiments are conducted on DexGYSNet and in real-world environments for validation.
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ iKalibr-RGBD: Partially-Specialized Target-Free Visual-Inertial + Spatiotemporal Calibration For RGBDs via Continuous-Time Velocity Estimation + + +
+ Visual-inertial systems have been widely studied and applied in the last two +decades (from the early 2000s to the present), mainly due to their low cost and +power consumption, small footprint, and high availability. Such a trend +simultaneously leads to a large amount of visual-inertial calibration methods +being presented, as accurate spatiotemporal parameters between sensors are a +prerequisite for visual-inertial fusion. In our previous work, i.e., iKalibr, a +continuous-time-based visual-inertial calibration method was proposed as a part +of one-shot multi-sensor resilient spatiotemporal calibration. While requiring +no artificial target brings considerable convenience, computationally expensive +pose estimation is demanded in initialization and batch optimization, limiting +its availability. Fortunately, this could be vastly improved for the RGBDs with +additional depth information, by employing mapping-free ego-velocity estimation +instead of mapping-based pose estimation. In this paper, we present the +continuous-time ego-velocity estimation-based RGBD-inertial spatiotemporal +calibration, termed as iKalibr-RGBD, which is also targetless but +computationally efficient. The general pipeline of iKalibr-RGBD is inherited +from iKalibr, composed of a rigorous initialization procedure and several +continuous-time batch optimizations. The implementation of iKalibr-RGBD is +open-sourced at (https://github.com/Unsigned-Long/iKalibr) to benefit the +research community. + +
+
+
+
+
+ + ♻ ☆ An Active Perception Game for Robust Information Gathering + + +
+ Active perception approaches select future viewpoints by using some estimate of the information gain. An inaccurate estimate can be detrimental in critical situations, e.g., locating a person in distress. However, the true information gained can only be calculated post hoc, i.e., after the observation is realized. We present an approach for estimating the discrepancy between the estimated information gain (which is an average over putative future observations) and the true information gain. The key idea is to analyze the mathematical relationship between active perception and the estimation error of the information gain in a game-theoretic setting. Using this, we develop an online estimation approach that achieves sub-linear regret (in the number of time-steps) for the estimation of the true information gain and reduces the sub-optimality of active perception systems. We demonstrate our approach for active perception using a comprehensive set of experiments on: (a) different types of environments, including a quadrotor in a photorealistic simulation, real-world robotic data, and real-world experiments with ground robots exploring indoor and outdoor scenes; (b) different types of robotic perception data; and (c) different map representations. On average, our approach reduces information gain estimation errors by 42%, increases the information gain by 7%, PSNR by 5%, and semantic accuracy (measured as the number of objects that are localized correctly) by 6%. In real-world experiments with a Jackal ground robot, our approach demonstrated complex trajectories to explore occluded regions.
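The paper's game-theoretic estimator is not spelled out in the abstract, but the general shape of an online correction of information-gain estimates can be sketched as follows; the running-mean corrector and the simulated gain values are illustrative assumptions, not the authors' algorithm.

```python
# Toy sketch of online correction of information-gain estimates (not the paper's
# game-theoretic estimator). A running mean of the observed discrepancy between
# predicted and realized information gain is used to debias future predictions;
# the simulated gains below are made-up numbers for illustration only.
import random

class OnlineGainCorrector:
    def __init__(self):
        self.n = 0
        self.mean_error = 0.0  # running mean of (true gain - predicted gain)

    def corrected(self, predicted_gain: float) -> float:
        return predicted_gain + self.mean_error

    def update(self, predicted_gain: float, true_gain: float) -> None:
        self.n += 1
        err = true_gain - predicted_gain
        self.mean_error += (err - self.mean_error) / self.n  # incremental mean update

if __name__ == "__main__":
    random.seed(0)
    corrector = OnlineGainCorrector()
    for step in range(5):
        predicted = random.uniform(0.5, 1.0)            # estimate before acting
        true = predicted - 0.2 + random.gauss(0, 0.05)  # realized gain, observed post hoc
        print(step, round(corrector.corrected(predicted), 3), round(true, 3))
        corrector.update(predicted, true)
```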
+
+
+
+
+ + ♻ ☆ Extended Reality for Enhanced Human-Robot Collaboration: a + Human-in-the-Loop Approach + + +
+ The rise of automation has provided an opportunity to achieve higher +efficiency in manufacturing processes, yet it often compromises the flexibility +required to promptly respond to evolving market needs and meet the demand for +customization. Human-robot collaboration attempts to tackle these challenges by +combining the strength and precision of machines with human ingenuity and +perceptual understanding. In this paper, we conceptualize and propose an +implementation framework for an autonomous, machine learning-based manipulator +that incorporates human-in-the-loop principles and leverages Extended Reality +(XR) to facilitate intuitive communication and programming between humans and +robots. Furthermore, the conceptual framework foresees human involvement +directly in the robot learning process, resulting in higher adaptability and +task generalization. The paper highlights key technologies enabling the +proposed framework, emphasizing the importance of developing the digital +ecosystem as a whole. Additionally, we review the existent implementation +approaches of XR in human-robot collaboration, showcasing diverse perspectives +and methodologies. The challenges and future outlooks are discussed, delving +into the major obstacles and potential research avenues of XR for more natural +human-robot interaction and integration in the industrial landscape. + +
+
+ comment: Published in IEEE International Conference on Robot and Human + Interactive Communication (RO-MAN) 2024 +
+
+
+
+
+ + ♻ ☆ M^3RS: Multi-robot, Multi-objective, and Multi-mode Routing and + Scheduling + + +
+ The quality of task execution can significantly impact a multi-robot mission. +While higher quality is desirable, it may not always be feasible due to mission +constraints. Existing multi-robot task allocation literature generally +overlooks quality of service as a decision variable. Addressing this gap, we +introduce the multi-robot, multi-objective, and multi-mode routing and +scheduling (M^3RS) problem, designed for time-bound, multi-robot, +multi-objective missions. In M^3RS, each task offers multiple execution modes, +each with different resource requirements, execution time, and quality. M^3RS +optimizes task sequences and execution modes for each agent. The need for M^3RS +comes from multi-robot applications in which a trade-off between multiple +criteria can be achieved by varying the task level quality of service through +task execution modes. Such ability is particularly useful for service robot +applications. We use M^3RS for the application of multi-robot disinfection in +healthcare environments and other public locations. The objectives considered +for disinfection application are disinfection quality and number of tasks +completed. A mixed-integer linear programming (MIP) model is proposed for +M^3RS. Further, a clustering-based column generation (CCG) algorithm is +proposed to handle larger problem instances. Through synthetic, simulated, and +hardware case studies, we demonstrate the advantages of M^3RS, showing it +provides flexibility and strong performance across multiple metrics. Our CCG +algorithm generates solutions 2.5x faster than a baseline MIP optimizer, +maintaining competitive performance. The videos for the experiments are +available on the project website: https://sites.google.com/view/g-robot/m3rs/ + +
+
+ comment: Submitted to IEEE Systems +
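The multi-mode trade-off at the heart of M^3RS can be illustrated on a toy instance: each task has several execution modes with different durations and qualities, and the scheduler balances total quality against the number of tasks completed under a time budget. The brute-force search, task data, and weights below are invented for illustration; the paper solves a full routing MIP with a clustering-based column generation algorithm.

```python
# Tiny brute-force illustration of the multi-mode trade-off in M^3RS-style
# scheduling (the paper optimizes routing and scheduling with a MIP and a
# column generation algorithm). Tasks, modes, and the budget are invented numbers.
from itertools import product

# Each task offers several execution modes: (duration in minutes, disinfection quality).
tasks = {
    "room_A": [(10, 0.6), (20, 0.9)],
    "room_B": [(15, 0.7), (25, 0.95)],
    "room_C": [(10, 0.5), (18, 0.85)],
}
TIME_BUDGET = 45                  # single robot, single shift
W_QUALITY, W_COUNT = 1.0, 0.5     # weights on total quality and tasks completed

best = None
# A mode index of -1 means the task is skipped.
for choice in product(*[range(-1, len(m)) for m in tasks.values()]):
    duration = quality = done = 0
    for (name, modes), idx in zip(tasks.items(), choice):
        if idx >= 0:
            duration += modes[idx][0]
            quality += modes[idx][1]
            done += 1
    if duration <= TIME_BUDGET:
        score = W_QUALITY * quality + W_COUNT * done
        if best is None or score > best[0]:
            best = (score, choice, duration)

score, choice, duration = best
print("score:", round(score, 2), "minutes used:", duration)
for (name, modes), idx in zip(tasks.items(), choice):
    print(name, "-> skipped" if idx < 0 else f"-> mode {idx} {modes[idx]}")
```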
+
+
+
+
+ + ♻ ☆ LucidGrasp: Robotic Framework for Autonomous Manipulation of Laboratory + Equipment with Different Degrees of Transparency via 6D Pose Estimation + + +
+ Many modern robotic systems operate autonomously; however, they often lack the ability to accurately analyze the environment and adapt to changing external conditions, while teleoperation systems often require special operator skills. In the field of laboratory automation, the number of automated processes is growing; however, such systems are usually developed to perform specific tasks. In addition, many of the objects used in this field are transparent, making it difficult to analyze them using visual channels. The contributions of this work include the development of a robotic framework with an autonomous mode for manipulating liquid-filled objects with different degrees of transparency in complex pose combinations. The conducted experiments demonstrated the robustness of the designed visual perception system in accurately estimating object poses for autonomous manipulation, and confirmed the performance of the algorithms in dexterous operations such as liquid dispensing. The proposed robotic framework can be applied to laboratory automation, since it solves the problem of performing non-trivial manipulation tasks that require analyzing the poses of objects with varying degrees of transparency and liquid levels, with high accuracy and repeatability.
+
+ comment: Accepted to the 2024 IEEE International Conference on Robotics and + Biomimetics (IEEE ROBIO 2024), 6 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video + Generation + + +
+ Human beings are endowed with a complementary learning system, which bridges +the slow learning of general world dynamics with fast storage of episodic +memory from a new experience. Previous video generation models, however, +primarily focus on slow learning by pre-training on vast amounts of data, +overlooking the fast learning phase crucial for episodic memory storage. This +oversight leads to inconsistencies across temporally distant frames when +generating longer videos, as these frames fall beyond the model's context +window. To this end, we introduce SlowFast-VGen, a novel dual-speed learning +system for action-driven long video generation. Our approach incorporates a +masked conditional video diffusion model for the slow learning of world +dynamics, alongside an inference-time fast learning strategy based on a +temporal LoRA module. Specifically, the fast learning process updates its +temporal LoRA parameters based on local inputs and outputs, thereby efficiently +storing episodic memory in its parameters. We further propose a slow-fast +learning loop algorithm that seamlessly integrates the inner fast learning loop +into the outer slow learning loop, enabling the recall of prior multi-episode +experiences for context-aware skill learning. To facilitate the slow learning +of an approximate world model, we collect a large-scale dataset of 200k videos +with language action annotations, covering a wide range of scenarios. Extensive +experiments show that SlowFast-VGen outperforms baselines across various +metrics for action-driven video generation, achieving an FVD score of 514 +compared to 782, and maintaining consistency in longer videos, with an average +of 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm +significantly enhances performances on long-horizon planning tasks as well. +Project Website: https://slowfast-vgen.github.io + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 168 + +
+
+
+ + ☆ URAvatar: Universal Relightable Gaussian Codec Avatars SIGGRAPH + + +
+ We present a new approach to creating photorealistic and relightable head +avatars from a phone scan with unknown illumination. The reconstructed avatars +can be animated and relit in real time with the global illumination of diverse +environments. Unlike existing approaches that estimate parametric reflectance +parameters via inverse rendering, our approach directly models learnable +radiance transfer that incorporates global light transport in an efficient +manner for real-time rendering. However, learning such a complex light +transport that can generalize across identities is non-trivial. A phone scan in +a single environment lacks sufficient information to infer how the head would +appear in general environments. To address this, we build a universal +relightable avatar model represented by 3D Gaussians. We train on hundreds of +high-quality multi-view human scans with controllable point lights. +High-resolution geometric guidance further enhances the reconstruction accuracy +and generalization. Once trained, we finetune the pretrained model on a phone +scan using inverse rendering to obtain a personalized relightable avatar. Our +experiments establish the efficacy of our design, outperforming existing +approaches while retaining real-time rendering capability. + +
+
+ comment: SIGGRAPH Asia 2024. Website: + https://junxuan-li.github.io/urgca-website/ +
+
+
+
+
+ + ☆ EgoMimic: Scaling Imitation Learning via Egocentric Video + + +
+ The scale and diversity of demonstration data required for imitation learning +is a significant challenge. We present EgoMimic, a full-stack framework which +scales manipulation via human embodiment data, specifically egocentric human +videos paired with 3D hand tracking. EgoMimic achieves this through: (1) a +system to capture human embodiment data using the ergonomic Project Aria +glasses, (2) a low-cost bimanual manipulator that minimizes the kinematic gap +to human data, (3) cross-domain data alignment techniques, and (4) an imitation +learning architecture that co-trains on human and robot data. Compared to prior +works that only extract high-level intent from human videos, our approach +treats human and robot data equally as embodied demonstration data and learns a +unified policy from both data sources. EgoMimic achieves significant +improvement on a diverse set of long-horizon, single-arm and bimanual +manipulation tasks over state-of-the-art imitation learning methods and enables +generalization to entirely new scenes. Finally, we show a favorable scaling +trend for EgoMimic, where adding 1 hour of additional hand data is +significantly more valuable than 1 hour of additional robot data. Videos and +additional information can be found at https://egomimic.github.io/ + +
+
+
+
+
+ + ☆ Enhancing Motion in Text-to-Video Generation with Decomposed Encoding + and Conditioning NeurIPS 2024 + + +
+ Despite advancements in Text-to-Video (T2V) generation, producing videos with +realistic motion remains challenging. Current models often yield static or +minimally dynamic outputs, failing to capture complex motions described by +text. This issue stems from the internal biases in text encoding, which +overlooks motions, and inadequate conditioning mechanisms in T2V generation +models. To address this, we propose a novel framework called DEcomposed MOtion +(DEMO), which enhances motion synthesis in T2V generation by decomposing both +text encoding and conditioning into content and motion components. Our method +includes a content encoder for static elements and a motion encoder for +temporal dynamics, alongside separate content and motion conditioning +mechanisms. Crucially, we introduce text-motion and video-motion supervision to +improve the model's understanding and generation of motion. Evaluations on +benchmarks such as MSR-VTT, UCF-101, WebVid-10M, EvalCrafter, and VBench +demonstrate DEMO's superior ability to produce videos with enhanced motion +dynamics while maintaining high visual quality. Our approach significantly +advances T2V generation by integrating comprehensive motion understanding +directly from textual descriptions. Project page: +https://PR-Ryan.github.io/DEMO-project/ + +
+
+ comment: Accepted at NeurIPS 2024, code available at + https://github.com/PR-Ryan/DEMO +
+
+
+
+
+ + ☆ Teaching Embodied Reinforcement Learning Agents: Informativeness and + Diversity of Language Use + + +
+ In real-world scenarios, it is desirable for embodied agents to have the +ability to leverage human language to gain explicit or implicit knowledge for +learning tasks. Despite recent progress, most previous approaches adopt simple +low-level instructions as language inputs, which may not reflect natural human +communication. It's not clear how to incorporate rich language use to +facilitate task learning. To address this question, this paper studies +different types of language inputs in facilitating reinforcement learning (RL) +embodied agents. More specifically, we examine how different levels of language +informativeness (i.e., feedback on past behaviors and future guidance) and +diversity (i.e., variation of language expressions) impact agent learning and +inference. Our empirical results based on four RL benchmarks demonstrate that +agents trained with diverse and informative language feedback can achieve +enhanced generalization and fast adaptation to new tasks. These findings +highlight the pivotal role of language use in teaching embodied agents new +tasks in an open world. Project website: +https://github.com/sled-group/Teachable_RL + +
+
+ comment: EMNLP 2024 Main. Project website: + https://github.com/sled-group/Teachable_RL +
+
+
+
+
+ + ☆ ARQ: A Mixed-Precision Quantization Framework for Accurate and + Certifiably Robust DNNs + + +
+ Mixed-precision quantization has become an important technique for enabling the execution of deep neural networks (DNNs) on limited-resource computing platforms. Traditional quantization methods have primarily concentrated on maintaining neural network accuracy, either ignoring the impact of quantization on the robustness of the network, or using only empirical techniques for improving robustness. In contrast, techniques for robustness certification, which can provide strong guarantees about the robustness of DNNs, have not been used during quantization due to their high computational cost. This paper introduces ARQ, an innovative mixed-precision quantization method that not only preserves the clean accuracy of the smoothed classifiers but also maintains their certified robustness. ARQ uses reinforcement learning to find accurate and robust DNN quantization, while efficiently leveraging randomized smoothing, a popular class of statistical DNN verification algorithms, to guide the search process. We compare ARQ with multiple state-of-the-art quantization techniques on several DNN architectures commonly used in quantization studies: ResNet-20 on CIFAR-10, ResNet-50 on ImageNet, and MobileNetV2 on ImageNet. We demonstrate that ARQ consistently performs better than these baselines across all the benchmarks and input perturbation levels. In many cases, the performance of ARQ-quantized networks can reach that of the original DNN with floating-point weights, but with only 1.5% of the instructions.
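ARQ builds on randomized smoothing, whose basic prediction rule is easy to sketch: classify many Gaussian-perturbed copies of an input and take the majority vote. The stand-in base classifier, noise level, and sample count below are assumptions; ARQ's reinforcement-learning search over per-layer bit-widths is not shown.

```python
# Sketch of the randomized-smoothing primitive that ARQ leverages: the smoothed
# classifier predicts by majority vote over Gaussian-perturbed copies of the
# input. The base classifier here is a toy stand-in.
import numpy as np

def smoothed_predict(base_classifier, x: np.ndarray, sigma: float = 0.25,
                     n_samples: int = 1000, seed: int = 0) -> int:
    rng = np.random.default_rng(seed)
    noise = rng.normal(scale=sigma, size=(n_samples,) + x.shape)
    preds = np.array([base_classifier(x + eps) for eps in noise])
    counts = np.bincount(preds)        # votes per class label
    return int(counts.argmax())

if __name__ == "__main__":
    # Toy 2-class "classifier": sign of the sum of the input.
    base = lambda x: int(x.sum() > 0)
    x = np.full(8, 0.05)
    print("smoothed prediction:", smoothed_predict(base, x))
```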
+
+
+
+
+ + ☆ Learning Video Representations without Natural Videos + + +
+ In this paper, we show that useful video representations can be learned from +synthetic videos and natural images, without incorporating natural videos in +the training. We propose a progression of video datasets synthesized by simple +generative processes, that model a growing set of natural video properties +(e.g. motion, acceleration, and shape transformations). The downstream +performance of video models pre-trained on these generated datasets gradually +increases with the dataset progression. A VideoMAE model pre-trained on our +synthetic videos closes 97.2% of the performance gap on UCF101 action +classification between training from scratch and self-supervised pre-training +from natural videos, and outperforms the pre-trained model on HMDB51. +Introducing crops of static images to the pre-training stage results in similar +performance to UCF101 pre-training and outperforms the UCF101 pre-trained model +on 11 out of 14 out-of-distribution datasets of UCF101-P. Analyzing the +low-level properties of the datasets, we identify correlations between frame +diversity, frame similarity to natural data, and downstream performance. Our +approach provides a more controllable and transparent alternative to video data +curation processes for pre-training. + +
+
+ comment: Project page: https://unicorn53547.github.io/video_syn_rep/ +
+
+
+
+
+ + ☆ DELTA: Dense Efficient Long-range 3D Tracking for any video + + +
+ Tracking dense 3D motion from monocular videos remains challenging, particularly when aiming for pixel-level precision over long sequences. We introduce DELTA, a novel method that efficiently tracks every pixel in 3D space, enabling accurate motion estimation across entire videos. Our approach leverages a joint global-local attention mechanism for reduced-resolution tracking, followed by a transformer-based upsampler to achieve high-resolution predictions. Unlike existing methods, which are limited by computational inefficiency or sparse tracking, DELTA delivers dense 3D tracking at scale, running over 8x faster than previous methods while achieving state-of-the-art accuracy. Furthermore, we explore the impact of depth representation on tracking performance and identify log-depth as the optimal choice. Extensive experiments demonstrate the superiority of DELTA on multiple benchmarks, achieving new state-of-the-art results in both 2D and 3D dense tracking tasks. Our method provides a robust solution for applications requiring fine-grained, long-term motion tracking in 3D space.
+
+ comment: Project Page: https://snap-research.github.io/DELTA/ +
+
+
+
+
+ + ☆ No Pose, No Problem: Surprisingly Simple 3D Gaussian Splats from Sparse + Unposed Images + + +
+ We introduce NoPoSplat, a feed-forward model capable of reconstructing 3D +scenes parameterized by 3D Gaussians from \textit{unposed} sparse multi-view +images. Our model, trained exclusively with photometric loss, achieves +real-time 3D Gaussian reconstruction during inference. To eliminate the need +for accurate pose input during reconstruction, we anchor one input view's local +camera coordinates as the canonical space and train the network to predict +Gaussian primitives for all views within this space. This approach obviates the +need to transform Gaussian primitives from local coordinates into a global +coordinate system, thus avoiding errors associated with per-frame Gaussians and +pose estimation. To resolve scale ambiguity, we design and compare various +intrinsic embedding methods, ultimately opting to convert camera intrinsics +into a token embedding and concatenate it with image tokens as input to the +model, enabling accurate scene scale prediction. We utilize the reconstructed +3D Gaussians for novel view synthesis and pose estimation tasks and propose a +two-stage coarse-to-fine pipeline for accurate pose estimation. Experimental +results demonstrate that our pose-free approach can achieve superior novel view +synthesis quality compared to pose-required methods, particularly in scenarios +with limited input image overlap. For pose estimation, our method, trained +without ground truth depth or explicit matching loss, significantly outperforms +the state-of-the-art methods with substantial improvements. This work makes +significant advances in pose-free generalizable 3D reconstruction and +demonstrates its applicability to real-world scenarios. Code and trained models +are available at https://noposplat.github.io/. + +
+
+ comment: Project page: https://noposplat.github.io/ +
+
+
+
+
+ + ☆ GeoSplatting: Towards Geometry Guided Gaussian Splatting for + Physically-based Inverse Rendering + + +
+ We consider the problem of physically-based inverse rendering using 3D +Gaussian Splatting (3DGS) representations. While recent 3DGS methods have +achieved remarkable results in novel view synthesis (NVS), accurately capturing +high-fidelity geometry, physically interpretable materials and lighting remains +challenging, as it requires precise geometry modeling to provide accurate +surface normals, along with physically-based rendering (PBR) techniques to +ensure correct material and lighting disentanglement. Previous 3DGS methods +resort to approximating surface normals, but often struggle with noisy local +geometry, leading to inaccurate normal estimation and suboptimal +material-lighting decomposition. In this paper, we introduce GeoSplatting, a +novel hybrid representation that augments 3DGS with explicit geometric guidance +and differentiable PBR equations. Specifically, we bridge isosurface and 3DGS +together, where we first extract isosurface mesh from a scalar field, then +convert it into 3DGS points and formulate PBR equations for them in a fully +differentiable manner. In GeoSplatting, 3DGS is grounded on the mesh geometry, +enabling precise surface normal modeling, which facilitates the use of PBR +frameworks for material decomposition. This approach further maintains the +efficiency and quality of NVS from 3DGS while ensuring accurate geometry from +the isosurface. Comprehensive evaluations across diverse datasets demonstrate +the superiority of GeoSplatting, consistently outperforming existing methods +both quantitatively and qualitatively. + +
+
+
+
+
+ + ☆ DiffPano: Scalable and Consistent Text to Panorama Generation with + Spherical Epipolar-Aware Diffusion NeurIPS2024 + + +
+ Diffusion-based methods have achieved remarkable results in 2D image and 3D object generation; however, the generation of 3D scenes and even $360^{\circ}$ images remains constrained, due to the limited number of scene datasets, the complexity of 3D scenes themselves, and the difficulty of generating consistent multi-view images. To address these issues, we first establish a large-scale panoramic video-text dataset containing millions of consecutive panoramic keyframes with corresponding panoramic depths, camera poses, and text descriptions. Then, we propose a novel text-driven panoramic generation framework, termed DiffPano, to achieve scalable, consistent, and diverse panoramic scene generation. Specifically, benefiting from the powerful generative capabilities of stable diffusion, we fine-tune a single-view text-to-panorama diffusion model with LoRA on the established panoramic video-text dataset. We further design a spherical epipolar-aware multi-view diffusion model to ensure the multi-view consistency of the generated panoramic images. Extensive experiments demonstrate that DiffPano can generate scalable, consistent, and diverse panoramic images for unseen text descriptions and camera poses.
+
+ comment: NeurIPS2024, Project: https://github.com/zju3dv/DiffPano; Code: + https://github.com/zju3dv/DiffPano +
+
+
+
+
+ + ☆ Chasing Better Deep Image Priors between Over- and + Under-parameterization + + +
+ Deep Neural Networks (DNNs) are well-known to act as over-parameterized deep +image priors (DIP) that regularize various image inverse problems. Meanwhile, +researchers also proposed extremely compact, under-parameterized image priors +(e.g., deep decoder) that are strikingly competent for image restoration too, +despite a loss of accuracy. These two extremes push us to think whether there +exists a better solution in the middle: between over- and under-parameterized +image priors, can one identify "intermediate" parameterized image priors that +achieve better trade-offs between performance, efficiency, and even preserving +strong transferability? Drawing inspirations from the lottery ticket hypothesis +(LTH), we conjecture and study a novel "lottery image prior" (LIP) by +exploiting DNN inherent sparsity, stated as: given an over-parameterized +DNN-based image prior, it will contain a sparse subnetwork that can be trained +in isolation, to match the original DNN's performance when being applied as a +prior to various image inverse problems. Our results validate the superiority +of LIPs: we can successfully locate the LIP subnetworks from over-parameterized +DIPs at substantial sparsity ranges. Those LIP subnetworks significantly +outperform deep decoders under comparably compact model sizes (by often fully +preserving the effectiveness of their over-parameterized counterparts), and +they also possess high transferability across different images as well as +restoration task types. Besides, we also extend LIP to compressive sensing +image reconstruction, where a pre-trained GAN generator is used as the prior +(in contrast to untrained DIP or deep decoder), and confirm its validity in +this setting too. To our best knowledge, this is the first time that LTH is +demonstrated to be relevant in the context of inverse problems or image priors. + +
+
+ comment: Codes are available at + https://github.com/VITA-Group/Chasing-Better-DIPs +
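A lottery-ticket-style subnetwork of the kind the abstract calls a "lottery image prior" is typically obtained by magnitude pruning an over-parameterized network. The sketch below shows one plausible global magnitude-pruning step on a placeholder DIP-like network; the paper's actual pruning schedule and architectures may differ.

```python
# Sketch of extracting a sparse subnetwork mask by global magnitude pruning
# (LTH-style). The tiny conv net is a placeholder for a DIP architecture; the
# paper's iterative pruning procedure is not reproduced.
import torch
import torch.nn as nn

def global_magnitude_masks(model: nn.Module, sparsity: float):
    """Return {parameter_name: 0/1 mask} keeping the largest-|w| weights."""
    weights = [(n, p) for n, p in model.named_parameters() if p.dim() > 1]
    all_w = torch.cat([p.detach().abs().flatten() for _, p in weights])
    k = int(sparsity * all_w.numel())
    threshold = all_w.kthvalue(k).values if k > 0 else all_w.min() - 1
    return {n: (p.detach().abs() > threshold).float() for n, p in weights}

if __name__ == "__main__":
    dip = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                        nn.Conv2d(16, 3, 3, padding=1))
    masks = global_magnitude_masks(dip, sparsity=0.8)   # prune 80% of conv weights
    for name, mask in masks.items():
        print(name, f"kept {int(mask.sum())}/{mask.numel()}")
    # During re-training from the original init, multiply weights/gradients by these masks.
```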
+
+
+
+
+ + ☆ DexMimicGen: Automated Data Generation for Bimanual Dexterous + Manipulation via Imitation Learning + + +
+ Imitation learning from human demonstrations is an effective means to teach +robots manipulation skills. But data acquisition is a major bottleneck in +applying this paradigm more broadly, due to the amount of cost and human effort +involved. There has been significant interest in imitation learning for +bimanual dexterous robots, like humanoids. Unfortunately, data collection is +even more challenging here due to the challenges of simultaneously controlling +multiple arms and multi-fingered hands. Automated data generation in simulation +is a compelling, scalable alternative to fuel this need for data. To this end, +we introduce DexMimicGen, a large-scale automated data generation system that +synthesizes trajectories from a handful of human demonstrations for humanoid +robots with dexterous hands. We present a collection of simulation environments +in the setting of bimanual dexterous manipulation, spanning a range of +manipulation behaviors and different requirements for coordination among the +two arms. We generate 21K demos across these tasks from just 60 source human +demos and study the effect of several data generation and policy learning +decisions on agent performance. Finally, we present a real-to-sim-to-real +pipeline and deploy it on a real-world humanoid can sorting task. Videos and +more are at https://dexmimicgen.github.io/ + +
+
+ comment: Project website: https://dexmimicgen.github.io/ +
+
+
+
+
+ + ☆ Extended Object Tracking and Classification based on Linear Splines + + +
+ This paper introduces a framework based on linear splines for 2-dimensional extended object tracking and classification. Unlike state-of-the-art models, linear splines make it possible to represent extended objects whose contour is an arbitrarily complex curve. An exact likelihood is derived for the case in which noisy measurements can be scattered from any point on the contour of the extended object, while an approximate Monte Carlo likelihood is provided for the case wherein scattering points can lie anywhere on the object surface, i.e., inside or on the contour. Exploiting this likelihood to measure how well the observed data fit a given shape, a suitable estimator is developed. The proposed estimator models the extended object in terms of a kinematic state, providing object position and orientation, along with a shape vector characterizing the object contour and surface. The kinematic state is estimated via a nonlinear Kalman filter, while the shape vector is estimated via a Bayesian classifier, so that classification is implicitly solved during shape estimation. Numerical experiments are provided to assess the effectiveness of the proposed estimator against state-of-the-art extended object estimators.
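A closed linear-spline contour of the kind used here is a piecewise-linear curve through a set of knots, along which measurement sources can be sampled. The sketch below evaluates such a contour at given arc-length fractions; the knot layout is an arbitrary example, not a shape from the paper.

```python
# Minimal sketch of a closed linear-spline (piecewise-linear) contour for an
# extended object's shape. The knots are an arbitrary rectangle-like example.
import numpy as np

def spline_contour_points(knots: np.ndarray, s: np.ndarray) -> np.ndarray:
    """Evaluate a closed piecewise-linear contour at arc-length fractions s in [0, 1)."""
    closed = np.vstack([knots, knots[:1]])     # close the polygon
    seg = np.diff(closed, axis=0)
    seg_len = np.linalg.norm(seg, axis=1)
    cum = np.concatenate([[0.0], np.cumsum(seg_len)])
    total = cum[-1]
    d = (s % 1.0) * total                      # target arc lengths
    idx = np.searchsorted(cum, d, side="right") - 1
    idx = np.clip(idx, 0, len(seg) - 1)
    t = (d - cum[idx]) / seg_len[idx]          # fractional position within segment
    return closed[idx] + t[:, None] * seg[idx]

if __name__ == "__main__":
    knots = np.array([[0., 0.], [2., 0.], [2., 1.], [0., 1.]])
    pts = spline_contour_points(knots, np.linspace(0, 1, 8, endpoint=False))
    print(np.round(pts, 2))                    # points distributed along the contour
```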
+
+
+
+
+ + ☆ Federated Black-Box Adaptation for Semantic Segmentation + + +
+ Federated Learning (FL) is a form of distributed learning that allows multiple institutions or clients to collaboratively learn a global model to solve a task. This allows the model to utilize the information from every institute while preserving data privacy. However, recent studies show that the promise of protecting the privacy of data is not upheld by existing methods and that it is possible to recreate the training data from the different institutions. This is done by utilizing gradients transferred between the clients and the global server during training or by knowing the model architecture at the client end. In this paper, we propose a federated learning framework for semantic segmentation that neither exposes the model architecture nor transfers gradients between the client and the server, thus enabling better privacy preservation. We propose BlackFed - a black-box adaptation of neural networks that utilizes zeroth-order optimization (ZOO) to update the client model weights and first-order optimization (FOO) to update the server weights. We evaluate our approach on several computer vision and medical imaging datasets to demonstrate its effectiveness. To the best of our knowledge, this is one of the first works to employ federated learning for segmentation without any exchange of gradients or model information. Code: https://github.com/JayParanjape/blackfed/tree/master
+
+ comment: Accepted at NEURIPS 2024 +
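The zeroth-order optimization (ZOO) used for the black-box client update can be sketched with a standard two-point random-direction gradient estimator: the client only needs loss evaluations, never gradients. The quadratic toy loss stands in for a real segmentation loss, and the step size and number of directions are arbitrary choices.

```python
# Sketch of the zeroth-order optimization idea used for a black-box update: the
# gradient is approximated from loss evaluations only, with no access to the
# model's gradients or architecture. The quadratic toy loss is a stand-in.
import numpy as np

def zoo_gradient(loss_fn, w: np.ndarray, mu: float = 1e-2, n_dirs: int = 20,
                 rng=np.random.default_rng(0)) -> np.ndarray:
    """Two-point random-direction gradient estimate."""
    g = np.zeros_like(w)
    for _ in range(n_dirs):
        u = rng.normal(size=w.shape)
        g += (loss_fn(w + mu * u) - loss_fn(w - mu * u)) / (2 * mu) * u
    return g / n_dirs

if __name__ == "__main__":
    target = np.array([1.0, -2.0, 0.5])
    loss = lambda w: float(np.sum((w - target) ** 2))  # black-box loss oracle
    w = np.zeros(3)
    for step in range(200):
        w -= 0.05 * zoo_gradient(loss, w)              # gradient-free descent step
    print("recovered weights:", np.round(w, 2))        # approaches `target`
```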
+
+
+
+
+ + ☆ Redefining in Dictionary: Towards an Enhanced Semantic Understanding of Creative Generation + +
+ Creativity, both in human and diffusion models, remains an inherently +abstract concept; thus, simply adding "creative" to a prompt does not yield +reliable semantic recognition by the model. In this work, we concretize the +abstract notion of "creative" through the TP2O task, which aims to merge two +unrelated concepts, and introduce CreTok, redefining "creative" as the token +$\texttt{}$. This redefinition offers a more concrete and universally +adaptable representation for concept blending. This redefinition occurs +continuously, involving the repeated random sampling of text pairs with +different concepts and optimizing cosine similarity between target and constant +prompts. This approach enables $\texttt{}$ to learn a method for +creative concept fusion. Extensive experiments demonstrate that the creative +capability enabled by $\texttt{}$ substantially surpasses recent SOTA +diffusion models and achieves superior creative generation. CreTok exhibits +greater flexibility and reduced time overhead, as $\texttt{}$ can +function as a universal token for any concept, facilitating creative generation +without retraining. + +
+
+
+
+
+ + ☆ Scaling Concept With Text-Guided Diffusion Models + + +
+ Text-guided diffusion models have revolutionized generative tasks by +producing high-fidelity content from text descriptions. They have also enabled +an editing paradigm where concepts can be replaced through text conditioning +(e.g., a dog to a tiger). In this work, we explore a novel approach: instead of +replacing a concept, can we enhance or suppress the concept itself? Through an +empirical study, we identify a trend where concepts can be decomposed in +text-guided diffusion models. Leveraging this insight, we introduce +ScalingConcept, a simple yet effective method to scale decomposed concepts up +or down in real input without introducing new elements. To systematically +evaluate our approach, we present the WeakConcept-10 dataset, where concepts +are imperfect and need to be enhanced. More importantly, ScalingConcept enables +a variety of novel zero-shot applications across image and audio domains, +including tasks such as canonical pose generation and generative sound +highlighting or removal. + +
+
+ comment: Project page: https://wikichao.github.io/ScalingConcept/ +
+
+
+
+
+ + ☆ Exploring Vision Language Models for Facial Attribute Recognition: + Emotion, Race, Gender, and Age + + +
+ Technologies for recognizing facial attributes like race, gender, age, and +emotion have several applications, such as surveillance, advertising content, +sentiment analysis, and the study of demographic trends and social behaviors. +Analyzing demographic characteristics based on images and analyzing facial +expressions have several challenges due to the complexity of humans' facial +attributes. Traditional approaches have employed CNNs and various other deep +learning techniques, trained on extensive collections of labeled images. While +these methods demonstrated effective performance, there remains potential for +further enhancements. In this paper, we propose to utilize vision language +models (VLMs) such as generative pre-trained transformer (GPT), GEMINI, large +language and vision assistant (LLAVA), PaliGemma, and Microsoft Florence2 to +recognize facial attributes such as race, gender, age, and emotion from images +with human faces. Various datasets like FairFace, AffectNet, and UTKFace have +been utilized to evaluate the solutions. The results show that VLMs are +competitive if not superior to traditional techniques. Additionally, we propose +"FaceScanPaliGemma"--a fine-tuned PaliGemma model--for race, gender, age, and +emotion recognition. The results show an accuracy of 81.1%, 95.8%, 80%, and +59.4% for race, gender, age group, and emotion classification, respectively, +outperforming pre-trained version of PaliGemma, other VLMs, and SotA methods. +Finally, we propose "FaceScanGPT", which is a GPT-4o model to recognize the +above attributes when several individuals are present in the image using a +prompt engineered for a person with specific facial and/or physical attributes. +The results underscore the superior multitasking capability of FaceScanGPT to +detect the individual's attributes like hair cut, clothing color, postures, +etc., using only a prompt to drive the detection and recognition tasks. + +
+
+ comment: 52 pages, 13 figures +
+
+
+
+
+ + ☆ HoloChrome: Polychromatic Illumination for Speckle Reduction in + Holographic Near-Eye Displays + + +
+ Holographic displays hold the promise of providing authentic depth cues, +resulting in enhanced immersive visual experiences for near-eye applications. +However, current holographic displays are hindered by speckle noise, which +limits accurate reproduction of color and texture in displayed images. We +present HoloChrome, a polychromatic holographic display framework designed to +mitigate these limitations. HoloChrome utilizes an ultrafast, +wavelength-adjustable laser and a dual-Spatial Light Modulator (SLM) +architecture, enabling the multiplexing of a large set of discrete wavelengths +across the visible spectrum. By leveraging spatial separation in our dual-SLM +setup, we independently manipulate speckle patterns across multiple +wavelengths. This novel approach effectively reduces speckle noise through +incoherent averaging achieved by wavelength multiplexing. Our method is +complementary to existing speckle reduction techniques, offering a new pathway +to address this challenge. Furthermore, the use of polychromatic illumination +broadens the achievable color gamut compared to traditional three-color primary +holographic displays. + Our simulations and tabletop experiments validate that HoloChrome +significantly reduces speckle noise and expands the color gamut. These +advancements enhance the performance of holographic near-eye displays, moving +us closer to practical, immersive next-generation visual experiences. + +
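The speckle-reduction mechanism, incoherent averaging across wavelengths, follows textbook speckle statistics: averaging N independent speckle intensity patterns reduces the speckle contrast by roughly 1/sqrt(N). The numerical sketch below uses a generic random-phasor speckle model, not a simulation of the HoloChrome optics.

```python
# Numerical illustration of why wavelength multiplexing suppresses speckle:
# averaging N mutually incoherent speckle intensity patterns lowers the speckle
# contrast by about 1/sqrt(N). Generic fully developed speckle model only.
import numpy as np

def speckle_intensity(shape, rng):
    """Fully developed speckle: exponential intensity of a circular complex Gaussian field."""
    field = rng.normal(size=shape) + 1j * rng.normal(size=shape)
    intensity = np.abs(field) ** 2
    return intensity / intensity.mean()

def contrast(img):
    return img.std() / img.mean()

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    shape = (256, 256)
    for n_wavelengths in (1, 4, 16):
        avg = np.mean([speckle_intensity(shape, rng) for _ in range(n_wavelengths)], axis=0)
        print(f"{n_wavelengths:2d} wavelengths -> speckle contrast {contrast(avg):.2f}")
```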
+
+
+
+
+ + ☆ COSNet: A Novel Semantic Segmentation Network using Enhanced Boundaries + in Cluttered Scenes + + +
+ Automated waste recycling aims to efficiently separate the recyclable objects +from the waste by employing vision-based systems. However, the presence of +varying shaped objects having different material types makes it a challenging +problem, especially in cluttered environments. Existing segmentation methods +perform reasonably on many semantic segmentation datasets by employing +multi-contextual representations, however, their performance is degraded when +utilized for waste object segmentation in cluttered scenarios. In addition, +plastic objects further increase the complexity of the problem due to their +translucent nature. To address these limitations, we introduce an efficacious +segmentation network, named COSNet, that uses boundary cues along with +multi-contextual information to accurately segment the objects in cluttered +scenes. COSNet introduces novel components including feature sharpening block +(FSB) and boundary enhancement module (BEM) for enhancing the features and +highlighting the boundary information of irregular waste objects in cluttered +environment. Extensive experiments on three challenging datasets including +ZeroWaste-f, SpectralWaste, and ADE20K demonstrate the effectiveness of the +proposed method. Our COSNet achieves a significant gain of 1.8% on ZeroWaste-f +and 2.1% on SpectralWaste datasets respectively in terms of mIoU metric. + +
+
+ comment: Accepted at WACV 2025 +
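The exact FSB/BEM designs are not given in the abstract, but the general idea of injecting boundary cues into segmentation features can be sketched with a fixed edge filter fused back into the feature map. The Laplacian-based block below is an illustrative stand-in, not COSNet's module.

```python
# Generic sketch of adding boundary cues to segmentation features, in the spirit
# of a boundary enhancement module (this Laplacian-based variant is an assumed
# stand-in, not the COSNet BEM).
import torch
import torch.nn as nn
import torch.nn.functional as F

class BoundaryCueBlock(nn.Module):
    def __init__(self, channels: int):
        super().__init__()
        lap = torch.tensor([[0., 1., 0.], [1., -4., 1.], [0., 1., 0.]])
        # One fixed Laplacian kernel per channel (depthwise edge detector).
        self.register_buffer("lap", lap.view(1, 1, 3, 3).repeat(channels, 1, 1, 1))
        self.fuse = nn.Conv2d(2 * channels, channels, kernel_size=1)

    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        edges = F.conv2d(feats, self.lap, padding=1, groups=feats.shape[1])
        return feats + self.fuse(torch.cat([feats, edges], dim=1))  # residual fusion

if __name__ == "__main__":
    block = BoundaryCueBlock(channels=32)
    x = torch.randn(1, 32, 64, 64)
    print(block(x).shape)   # torch.Size([1, 32, 64, 64])
```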
+
+
+
+
+ + ☆ AIDOVECL: AI-generated Dataset of Outpainted Vehicles for Eye-level + Classification and Localization + + +
+ Image labeling is a critical bottleneck in the development of computer vision +technologies, often constraining the potential of machine learning models due +to the time-intensive nature of manual annotations. This work introduces a +novel approach that leverages outpainting to address the problem of annotated +data scarcity by generating artificial contexts and annotations, significantly +reducing manual labeling efforts. We apply this technique to a particularly +acute challenge in autonomous driving, urban planning, and environmental +monitoring: the lack of diverse, eye-level vehicle images in desired classes. +Our dataset comprises AI-generated vehicle images obtained by detecting and +cropping vehicles from manually selected seed images, which are then outpainted +onto larger canvases to simulate varied real-world conditions. The outpainted +images include detailed annotations, providing high-quality ground truth data. +Advanced outpainting techniques and image quality assessments ensure visual +fidelity and contextual relevance. Augmentation with outpainted vehicles +improves overall performance metrics by up to 8\% and enhances prediction of +underrepresented classes by up to 20\%. This approach, exemplifying outpainting +as a self-annotating paradigm, presents a solution that enhances dataset +versatility across multiple domains of machine learning. The code and links to +datasets used in this study are available for further research and replication +at https://github.com/amir-kazemi/aidovecl. + +
+
+ comment: 19 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Nearest Neighbor Normalization Improves Multimodal Retrieval + + +
+ Multimodal models leverage large-scale pre-training to achieve strong but +still imperfect performance on tasks such as image captioning, visual question +answering, and cross-modal retrieval. In this paper, we present a simple and +efficient method for correcting errors in trained contrastive image-text +retrieval models with no additional training, called Nearest Neighbor +Normalization (NNN). We show an improvement on retrieval metrics in both text +retrieval and image retrieval for all of the contrastive models that we tested +(CLIP, BLIP, ALBEF, SigLIP, BEiT) and for both of the datasets that we used +(MS-COCO and Flickr30k). NNN requires a reference database, but does not +require any training on this database, and can even increase the retrieval +accuracy of a model after finetuning. + +
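As described, NNN corrects retrieval scores using a reference query database without any training. A plausible reading is that each gallery item's score is debiased by its mean similarity to its nearest reference queries; the sketch below implements that reading with random placeholder embeddings, and the exact NNN weighting and choice of k may differ.

```python
# Sketch of a nearest-neighbor score correction in the spirit of NNN: each
# gallery item's retrieval score is debiased by the mean of its top-k
# similarities to a reference query set. Embeddings are random placeholders.
import numpy as np

def nn_normalized_scores(query_emb, gallery_emb, reference_emb, k=16, alpha=0.5):
    """All embeddings are L2-normalized arrays: (Q, D), (G, D), (R, D)."""
    scores = query_emb @ gallery_emb.T            # (Q, G) cosine similarities
    ref_scores = reference_emb @ gallery_emb.T    # (R, G) similarities to reference queries
    k = min(k, ref_scores.shape[0])
    topk = np.sort(ref_scores, axis=0)[-k:, :]    # top-k per gallery item
    bias = topk.mean(axis=0)                      # (G,) per-item "hubness" bias
    return scores - alpha * bias[None, :]

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    norm = lambda x: x / np.linalg.norm(x, axis=1, keepdims=True)
    queries, gallery, reference = (norm(rng.normal(size=(n, 64))) for n in (4, 100, 256))
    corrected = nn_normalized_scores(queries, gallery, reference)
    print("top match per query:", corrected.argmax(axis=1))
```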
+
+
+
+
+ + ☆ Parameter choices in HaarPSI for IQA with medical images + + +
+ When developing machine learning models, image quality assessment (IQA) +measures are a crucial component for evaluation. However, commonly used IQA +measures have been primarily developed and optimized for natural images. In +many specialized settings, such as medical images, this poses an +often-overlooked problem regarding suitability. In previous studies, the IQA +measure HaarPSI showed promising behavior for natural and medical images. +HaarPSI is based on Haar wavelet representations and the framework allows +optimization of two parameters. So far, these parameters have been aligned for +natural images. Here, we optimize these parameters for two annotated medical +data sets, a photoacoustic and a chest X-Ray data set. We observe that they are +more sensitive to the parameter choices than the employed natural images, and +on the other hand both medical data sets lead to similar parameter values when +optimized. We denote the optimized setting, which improves the performance for +the medical images notably, by HaarPSI$_{MED}$. The results suggest that +adapting common IQA measures within their frameworks for medical images can +provide a valuable, generalizable addition to the employment of more specific +task-based measures. + +
+
+ comment: 5 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Identifying Spatio-Temporal Drivers of Extreme Events NeurIPS 2024 + + +
+ The spatio-temporal relations between the impacts of extreme events and their drivers in climate data are not fully understood, and there is a need for machine learning approaches to identify such spatio-temporal relations from data. The task, however, is very challenging since there are time delays between extremes and their drivers, and the spatial response of such drivers is inhomogeneous. In this work, we propose a first approach and benchmarks to tackle this challenge. Our approach is trained end-to-end to jointly predict spatio-temporal extremes and the spatio-temporal drivers in the physical input variables. By enforcing the network to predict extremes from spatio-temporal binary masks of identified drivers, the network successfully identifies drivers that are correlated with extremes. We evaluate our approach on three newly created synthetic benchmarks, two of which are based on remote sensing or reanalysis climate data, and on two real-world reanalysis datasets. The source code and datasets are publicly available at the project page https://hakamshams.github.io/IDE.
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ Understanding Generalizability of Diffusion Models Requires Rethinking + the Hidden Gaussian Structure + + +
+ In this work, we study the generalizability of diffusion models by looking +into the hidden properties of the learned score functions, which are +essentially a series of deep denoisers trained on various noise levels. We +observe that as diffusion models transition from memorization to +generalization, their corresponding nonlinear diffusion denoisers exhibit +increasing linearity. This discovery leads us to investigate the linear +counterparts of the nonlinear diffusion models, which are a series of linear +models trained to match the function mappings of the nonlinear diffusion +denoisers. Surprisingly, these linear denoisers are approximately the optimal +denoisers for a multivariate Gaussian distribution characterized by the +empirical mean and covariance of the training dataset. This finding implies +that diffusion models have the inductive bias towards capturing and utilizing +the Gaussian structure (covariance information) of the training dataset for +data generation. We empirically demonstrate that this inductive bias is a +unique property of diffusion models in the generalization regime, which becomes +increasingly evident when the model's capacity is relatively small compared to +the training dataset size. In the case that the model is highly +overparameterized, this inductive bias emerges during the initial training +phases before the model fully memorizes its training data. Our study provides +crucial insights into understanding the notable strong generalization +phenomenon recently observed in real-world diffusion models. + +
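The "hidden Gaussian structure" observation has a concrete closed form: for data x ~ N(mu, Sigma) observed as y = x + sigma * eps, the optimal (MMSE) denoiser is the linear map E[x|y] = mu + Sigma (Sigma + sigma^2 I)^{-1} (y - mu). The sketch below evaluates this denoiser using the empirical mean and covariance of a toy training set; the toy data are not from the paper.

```python
# The optimal denoiser for a multivariate Gaussian: E[x|y] = mu + Sigma (Sigma + s^2 I)^{-1} (y - mu).
# mu and Sigma are the empirical mean/covariance of a toy "training set".
import numpy as np

rng = np.random.default_rng(0)
d, n = 8, 5000
A = rng.normal(size=(d, d))
train = rng.normal(size=(n, d)) @ A.T + 3.0        # toy correlated training data

mu = train.mean(axis=0)
Sigma = np.cov(train, rowvar=False)

def gaussian_denoiser(y: np.ndarray, sigma: float) -> np.ndarray:
    W = Sigma @ np.linalg.inv(Sigma + sigma**2 * np.eye(d))   # linear denoising map
    return mu + (y - mu) @ W.T

sigma = 1.0
x = train[rng.integers(n)]
y = x + sigma * rng.normal(size=d)
print("noisy error:   ", round(float(np.linalg.norm(y - x)), 3))
print("denoised error:", round(float(np.linalg.norm(gaussian_denoiser(y, sigma) - x)), 3))
```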
+
+
+
+
+ + ☆ Advanced Predictive Quality Assessment for Ultrasonic Additive + Manufacturing with Deep Learning Model + + +
+ Ultrasonic Additive Manufacturing (UAM) employs ultrasonic welding to bond +similar or dissimilar metal foils to a substrate, resulting in solid, +consolidated metal components. However, certain processing conditions can lead +to inter-layer defects, affecting the final product's quality. This study +develops a method to monitor in-process quality using deep learning-based +convolutional neural networks (CNNs). The CNN models were evaluated on their +ability to classify samples with and without embedded thermocouples across five +power levels (300W, 600W, 900W, 1200W, 1500W) using thermal images with +supervised labeling. Four distinct CNN classification models were created for +different scenarios including without (baseline) and with thermocouples, only +without thermocouples across power levels, only with thermocouples across power +levels, and combined without and with thermocouples across power levels. The +models achieved 98.29% accuracy on combined baseline and thermocouple images, +97.10% for baseline images across power levels, 97.43% for thermocouple images, +and 97.27% for both types across power levels. The high accuracy, above 97%, +demonstrates the system's effectiveness in identifying and classifying +conditions within the UAM process, providing a reliable tool for quality +assurance and process control in manufacturing environments. + +
+
+
+
+
+ + ☆ Deep Learning with HM-VGG: AI Strategies for Multi-modal Image Analysis + + +
+ This study introduces the Hybrid Multi-modal VGG (HM-VGG) model, a cutting-edge deep learning approach for the early diagnosis of glaucoma. The HM-VGG model utilizes an attention mechanism to process Visual Field (VF) data, enabling the extraction of key features that are vital for identifying early signs of glaucoma. Despite the common reliance on large annotated datasets, the HM-VGG model excels in scenarios with limited data, achieving remarkable results with small sample sizes. The model's performance is underscored by its high Precision, Accuracy, and F1-Score, indicating its potential for real-world application in glaucoma detection. The paper also discusses the challenges associated with ophthalmic image analysis, particularly the difficulty of obtaining large volumes of annotated data. It highlights the importance of moving beyond single-modality data, such as VF or Optical Coherence Tomography (OCT) images alone, to a multimodal approach that can provide a richer, more comprehensive dataset. This integration of different data types is shown to significantly enhance diagnostic accuracy. The HM-VGG model offers a promising tool for doctors, streamlining the diagnostic process and improving patient outcomes. Furthermore, its applicability extends to telemedicine and mobile healthcare, making diagnostic services more accessible. The research presented in this paper is a significant step forward in the field of medical image processing and has profound implications for clinical ophthalmology.
+
+
+
+
+ + ☆ TPC: Test-time Procrustes Calibration for Diffusion-based Human Image + Animation NeurIPS 2024 + + +
+ Human image animation aims to generate a human motion video from the inputs +of a reference human image and a target motion video. Current diffusion-based +image animation systems exhibit high precision in transferring human identity +into targeted motion, yet they still exhibit irregular quality in their +outputs. Their optimal precision is achieved only when the physical +compositions (i.e., scale and rotation) of the human shapes in the reference +image and target pose frame are aligned. In the absence of such alignment, +there is a noticeable decline in fidelity and consistency. Especially, in +real-world environments, this compositional misalignment commonly occurs, +posing significant challenges to the practical usage of current systems. To +this end, we propose Test-time Procrustes Calibration (TPC), which enhances the +robustness of diffusion-based image animation systems by maintaining optimal +performance even when faced with compositional misalignment, effectively +addressing real-world scenarios. The TPC provides a calibrated reference image +for the diffusion model, enhancing its capability to understand the +correspondence between human shapes in the reference and target images. Our +method is simple and can be applied to any diffusion-based image animation +system in a model-agnostic manner, improving the effectiveness at test time +without additional training. + +
+
+ comment: 24 pages, 16 figures, NeurIPS 2024 +
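The Procrustes step implied by the method's name can be sketched directly: estimate the similarity transform (scale, rotation, translation) that aligns shape keypoints in the reference image with those in the target frame, using the standard SVD-based (Umeyama) solution. How TPC extracts and uses the calibrated reference is not reproduced; the keypoints below are toy values.

```python
# Sketch of a 2D Procrustes (similarity) alignment between reference-image and
# target-frame shape keypoints. Keypoints are toy values.
import numpy as np

def procrustes_similarity(src: np.ndarray, dst: np.ndarray):
    """Return s, R, t minimizing ||s * src @ R.T + t - dst||^2 (Umeyama solution)."""
    mu_s, mu_d = src.mean(axis=0), dst.mean(axis=0)
    src_c, dst_c = src - mu_s, dst - mu_d
    cov = dst_c.T @ src_c / len(src)
    U, S, Vt = np.linalg.svd(cov)
    D = np.eye(2)
    if np.linalg.det(U @ Vt) < 0:        # avoid reflections
        D[-1, -1] = -1
    R = U @ D @ Vt
    s = np.trace(np.diag(S) @ D) / src_c.var(axis=0).sum()
    t = mu_d - s * R @ mu_s
    return s, R, t

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    ref = rng.normal(size=(5, 2))                        # keypoints in the reference image
    angle, scale, shift = 0.4, 1.8, np.array([2.0, -1.0])
    R_true = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]])
    tgt = scale * ref @ R_true.T + shift                 # same shape in the target frame
    s, R, t = procrustes_similarity(ref, tgt)
    print("recovered scale:", round(s, 3))               # ~1.8
    print("alignment error:", round(float(np.abs(s * ref @ R.T + t - tgt).max()), 6))
```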
+
+
+
+
+ + ☆ Handwriting Recognition in Historical Documents with Multimodal LLM + + +
+ There is an immense quantity of historical and cultural documentation that exists only as handwritten manuscripts. At the same time, performing OCR across scripts and different handwriting styles has proven to be an enormously difficult problem relative to the process of digitizing print. While recent Transformer-based models have achieved relatively strong performance, they rely heavily on manually transcribed training data and have difficulty generalizing across writers. Multimodal LLMs, such as GPT-4V and Gemini, have demonstrated effectiveness in performing OCR and computer vision tasks with few-shot prompting. In this paper, I evaluate the accuracy of handwritten document transcriptions generated by Gemini against the current state-of-the-art Transformer-based methods. Keywords: Optical Character Recognition, Multimodal Language Models, Cultural Preservation, Mass Digitization, Handwriting Recognition
+
+
+
+
+ + ☆ A Multi-Modal Approach for Face Anti-Spoofing in Non-Calibrated Systems + using Disparity Maps + + +
+ Face recognition technologies are increasingly used in various applications, yet they are vulnerable to face spoofing attacks. These spoofing attacks often involve unique 3D structures, such as printed papers or mobile device screens. Although stereo-depth cameras can detect such attacks effectively, their high cost limits their widespread adoption. Conversely, two-sensor systems without extrinsic calibration offer a cost-effective alternative but are unable to calculate depth using stereo techniques. In this work, we propose a method to overcome this challenge by leveraging facial attributes to derive disparity information and estimate relative depth for anti-spoofing purposes, using non-calibrated systems. We introduce a multi-modal anti-spoofing model, coined Disparity Model, that incorporates created disparity maps as a third modality alongside the two original sensor modalities. We demonstrate the effectiveness of the Disparity Model in countering various spoof attacks using a comprehensive dataset collected from the Intel RealSense ID Solution F455. Our method outperformed existing methods in the literature, achieving an Equal Error Rate (EER) of 1.71% and a False Negative Rate (FNR) of 2.77% at a False Positive Rate (FPR) of 1%. These errors are 2.45% and 7.94% lower, respectively, than those of the best comparison method. Additionally, we introduce a model ensemble that addresses 3D spoof attacks as well, achieving an EER of 2.04% and an FNR of 3.83% at an FPR of 1%. Overall, our work provides a state-of-the-art solution for the challenging task of anti-spoofing in non-calibrated systems that lack depth information.
+
+
+
+
+ + ☆ Bayesian-guided Label Mapping for Visual Reprogramming + + +
+ Visual reprogramming (VR) leverages the intrinsic capabilities of pretrained +vision models by adapting their input or output interfaces to solve downstream +tasks whose labels (i.e., downstream labels) might be totally different from +the labels associated with the pretrained models (i.e., pretrained labels). +When adapting the output interface, label mapping methods transform the +pretrained labels to downstream labels by establishing a gradient-free +one-to-one correspondence between the two sets of labels. However, in this +paper, we reveal that one-to-one mappings may overlook the complex relationship +between pretrained and downstream labels. Motivated by this observation, we +propose a Bayesian-guided Label Mapping (BLM) method. BLM constructs an +iteratively-updated probabilistic label mapping matrix, with each element +quantifying a pairwise relationship between pretrained and downstream labels. +The assignment of values to the constructed matrix is guided by Bayesian +conditional probability, considering the joint distribution of the downstream +labels and the labels predicted by the pretrained model on downstream samples. +Experiments conducted on both pretrained vision models (e.g., ResNeXt) and +vision-language models (e.g., CLIP) demonstrate the superior performance of BLM +over existing label mapping methods. The success of BLM also offers a +probabilistic lens through which to understand and analyze the effectiveness of +VR. Our code is available at https://github.com/tmlr-group/BayesianLM. + +
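A probabilistic label mapping in the spirit of BLM can be sketched as a matrix of conditional probabilities P(pretrained label | downstream label), estimated from the pretrained model's predictions on labeled downstream samples and then used to map pretrained output probabilities to downstream scores. BLM's iterative Bayesian update is richer than this single-pass, count-based estimate.

```python
# Sketch of a probabilistic label mapping matrix estimated from the joint
# distribution of downstream labels and pretrained-model predictions. This is a
# simplified, single-pass stand-in for BLM's iterative update.
import numpy as np

def estimate_mapping(pretrained_preds, downstream_labels, n_pre, n_down, eps=1.0):
    """M[c, j] ~= P(pretrained predicts j | downstream class is c), with Laplace smoothing."""
    counts = np.full((n_down, n_pre), eps)
    for j, c in zip(pretrained_preds, downstream_labels):
        counts[c, j] += 1
    return counts / counts.sum(axis=1, keepdims=True)

def map_probs(pretrained_probs, M):
    """Downstream scores for a batch: (B, n_pre) @ (n_pre, n_down) -> (B, n_down)."""
    return pretrained_probs @ M.T

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n_pre, n_down, n_samples = 10, 3, 500
    downstream_labels = rng.integers(0, n_down, size=n_samples)
    # Toy pretrained predictions correlated with downstream labels (3 pretrained ids per class).
    pretrained_preds = downstream_labels * 3 + rng.integers(0, 3, size=n_samples)
    M = estimate_mapping(pretrained_preds, downstream_labels, n_pre, n_down)
    probs = np.eye(n_pre)[pretrained_preds[:5]]          # one-hot pretrained outputs for 5 samples
    print("mapped predictions:", map_probs(probs, M).argmax(axis=1))
    print("true labels:       ", downstream_labels[:5])
```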
+
+
+
+
+ + ☆ Unveiling Synthetic Faces: How Synthetic Datasets Can Expose Real + Identities NeurIPS 2024 + + +
+ Synthetic data generation is gaining increasing popularity in different +computer vision applications. Existing state-of-the-art face recognition models +are trained using large-scale face datasets, which are crawled from the +Internet and raise privacy and ethical concerns. To address such concerns, +several works have proposed generating synthetic face datasets to train face +recognition models. However, these methods depend on generative models, which +are trained on real face images. In this work, we design a simple yet effective +membership inference attack to systematically study if any of the existing +synthetic face recognition datasets leak any information from the real data +used to train the generator model. We provide an extensive study on 6 +state-of-the-art synthetic face recognition datasets, and show that in all +these synthetic datasets, several samples from the original real dataset are +leaked. To our knowledge, this paper is the first work which shows the leakage +from training data of generator models into the generated synthetic face +recognition datasets. Our study demonstrates privacy pitfalls in synthetic face +recognition datasets and paves the way for future studies on generating +responsible synthetic face datasets. + +
+
+ comment: Accepted in NeurIPS 2024 Workshop on New Frontiers in Adversarial + Machine Learning +
+
+
+
+
+ + ☆ Re-assembling the past: The RePAIR dataset and benchmark for real world + 2D and 3D puzzle solving NeurIPS 2024 + + +
+ This paper proposes the RePAIR dataset that represents a challenging +benchmark to test modern computational and data driven methods for +puzzle-solving and reassembly tasks. Our dataset has unique properties that are +uncommon to current benchmarks for 2D and 3D puzzle solving. The fragments and +fractures are realistic, caused by a collapse of a fresco during a World War II +bombing at the Pompeii archaeological park. The fragments are also eroded and +have missing pieces with irregular shapes and different dimensions, challenging +further the reassembly algorithms. The dataset is multi-modal providing high +resolution images with characteristic pictorial elements, detailed 3D scans of +the fragments and meta-data annotated by the archaeologists. Ground truth has +been generated through several years of unceasing fieldwork, including the +excavation and cleaning of each fragment, followed by manual puzzle solving by +archaeologists of a subset of approx. 1000 pieces among the 16000 available. +After digitizing all the fragments in 3D, a benchmark was prepared to challenge +current reassembly and puzzle-solving methods that often solve more simplistic +synthetic scenarios. The tested baselines show that there clearly exists a gap +to fill in solving this computationally complex problem. + +
+
+ comment: NeurIPS 2024, Track Datasets and Benchmarks, 10 pages +
+
+
+
+
+ + ☆ DiffPAD: Denoising Diffusion-based Adversarial Patch Decontamination + + +
+ In the ever-evolving adversarial machine learning landscape, developing +effective defenses against patch attacks has become a critical challenge, +necessitating reliable solutions to safeguard real-world AI systems. Although +diffusion models have shown remarkable capacity in image synthesis and have +been recently utilized to counter $\ell_p$-norm bounded attacks, their +potential in mitigating localized patch attacks remains largely underexplored. +In this work, we propose DiffPAD, a novel framework that harnesses the power of +diffusion models for adversarial patch decontamination. DiffPAD first performs +super-resolution restoration on downsampled input images, then adopts +binarization, dynamic thresholding scheme and sliding window for effective +localization of adversarial patches. Such a design is inspired by the +theoretically derived correlation between patch size and diffusion restoration +error that is generalized across diverse patch attack scenarios. Finally, +DiffPAD applies inpainting techniques to the original input images with the +estimated patch region being masked. By integrating closed-form solutions for +super-resolution restoration and image inpainting into the conditional reverse +sampling process of a pre-trained diffusion model, DiffPAD obviates the need +for text guidance or fine-tuning. Through comprehensive experiments, we +demonstrate that DiffPAD not only achieves state-of-the-art adversarial +robustness against patch attacks but also excels in recovering naturalistic +images without patch remnants. + +
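+
+ The localization step described in the abstract (threshold a restoration-error map, then scan it with a sliding window) can be prototyped in a few lines. The snippet below is a generic illustration with an assumed square patch size and a mean-plus-two-sigma threshold, not the dynamic thresholding scheme actually used in DiffPAD.
+
+import numpy as np
+
+def localize_patch(error_map: np.ndarray, window: int):
+    """error_map: per-pixel |restored - input| magnitude, shape (H, W)."""
+    # assumed heuristic threshold: mean + 2 * std of the error map
+    mask = (error_map > error_map.mean() + 2 * error_map.std()).astype(np.float32)
+    best, best_score = (0, 0), -1.0
+    h, w = mask.shape
+    for y in range(0, h - window + 1):
+        for x in range(0, w - window + 1):
+            score = mask[y:y + window, x:x + window].sum()
+            if score > best_score:
+                best, best_score = (y, x), score
+    return best, best_score  # top-left corner of the most suspicious window
+
+err = np.random.default_rng(0).random((64, 64))
+err[20:36, 30:46] += 2.0  # synthetic high-error region standing in for a patch
+print(localize_patch(err, window=16))
+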
+
+ comment: Accepted to 2025 IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV) +
+
+
+
+
+ + ☆ Assessing the Efficacy of Classical and Deep Neuroimaging Biomarkers in + Early Alzheimer's Disease Diagnosis + + +
+ Alzheimer's disease (AD) is the leading cause of dementia, and its early +detection is crucial for effective intervention, yet current diagnostic methods +often fall short in sensitivity and specificity. This study aims to detect +significant indicators of early AD by extracting and integrating various +imaging biomarkers, including radiomics, hippocampal texture descriptors, +cortical thickness measurements, and deep learning features. We analyze +structural magnetic resonance imaging (MRI) scans from the Alzheimer's Disease +Neuroimaging Initiative (ADNI) cohorts, utilizing comprehensive image analysis +and machine learning techniques. Our results show that combining multiple +biomarkers significantly improves detection accuracy. Radiomics and texture +features emerged as the most effective predictors for early AD, achieving AUCs +of 0.88 and 0.72 for AD and MCI detection, respectively. Although deep learning +features proved to be less effective than traditional approaches, incorporating +age with other biomarkers notably enhanced MCI detection performance. +Additionally, our findings emphasize the continued importance of classical +imaging biomarkers in the face of modern deep-learning approaches, providing a +robust framework for early AD diagnosis. + +
+
+ comment: SPIE Medical Imaging (MI25) +
+
+
+
+
+ + ☆ ImOV3D: Learning Open-Vocabulary Point Clouds 3D Object Detection from + Only 2D Images NeurIPS 2024 + + +
+ Open-vocabulary 3D object detection (OV-3Det) aims to generalize beyond the +limited number of base categories labeled during the training phase. The +biggest bottleneck is the scarcity of annotated 3D data, whereas 2D image +datasets are abundant and richly annotated. Consequently, it is intuitive to +leverage the wealth of annotations in 2D images to alleviate the inherent data +scarcity in OV-3Det. In this paper, we push the task setup to its limits by +exploring the potential of using solely 2D images to learn OV-3Det. The major +challenge for this setup is the modality gap between training images and +testing point clouds, which prevents effective integration of 2D knowledge into +OV-3Det. To address this challenge, we propose a novel framework ImOV3D to +leverage pseudo multimodal representation containing both images and point +clouds (PC) to close the modality gap. The key of ImOV3D lies in flexible +modality conversion where 2D images can be lifted into 3D using monocular depth +estimation and can also be derived from 3D scenes through rendering. This +allows unifying both training images and testing point clouds into a common +image-PC representation, encompassing a wealth of 2D semantic information and +also incorporating the depth and structural characteristics of 3D spatial data. +We carefully conduct such conversion to minimize the domain gap between +training and test cases. Extensive experiments on two benchmark datasets, +SUNRGBD and ScanNet, show that ImOV3D significantly outperforms existing +methods, even in the absence of ground truth 3D training data. With the +inclusion of a minimal amount of real 3D data for fine-tuning, the performance +also significantly surpasses previous state-of-the-art. Codes and pre-trained +models are released at https://github.com/yangtiming/ImOV3D. + +
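+
+ The lifting of 2D images into 3D via monocular depth estimation reduces, at its simplest, to back-projecting each pixel through the camera intrinsics. Below is a minimal numpy sketch of that back-projection under an assumed pinhole model with placeholder intrinsics; ImOV3D's full conversion pipeline involves considerably more than this.
+
+import numpy as np
+
+def backproject_depth(depth: np.ndarray, fx: float, fy: float, cx: float, cy: float):
+    """depth: (H, W) metric depth map -> (H*W, 3) pseudo point cloud in the camera frame."""
+    h, w = depth.shape
+    u, v = np.meshgrid(np.arange(w), np.arange(h))  # pixel column and row indices
+    z = depth
+    x = (u - cx) * z / fx
+    y = (v - cy) * z / fy
+    return np.stack([x, y, z], axis=-1).reshape(-1, 3)
+
+depth = np.full((480, 640), 2.5)  # hypothetical flat depth map at 2.5 m
+points = backproject_depth(depth, fx=525.0, fy=525.0, cx=320.0, cy=240.0)
+print(points.shape)  # (307200, 3)
+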
+
+ comment: Accepted by NeurIPS 2024. Code link + https://github.com/yangtiming/ImOV3D +
+
+
+
+
+ + ☆ Localization, balance and affinity: a stronger multifaceted + collaborative salient object detector in remote sensing images + + +
+ Despite significant advancements in salient object detection (SOD) in optical +remote sensing images (ORSI), challenges persist due to the intricate edge +structures of ORSIs and the complexity of their contextual relationships. +Current deep learning approaches encounter difficulties in accurately +identifying boundary features and lack efficiency in collaboratively modeling +the foreground and background by leveraging contextual features. To address +these challenges, we propose a stronger multifaceted collaborative salient +object detector in ORSIs, termed LBA-MCNet, which incorporates aspects of +localization, balance, and affinity. The network focuses on accurately locating +targets, balancing detailed features, and modeling image-level global context +information. Specifically, we design the Edge Feature Adaptive Balancing and +Adjusting (EFABA) module for precise edge localization, using edge features to +guide attention to boundaries and preserve spatial details. Moreover, we design +the Global Distributed Affinity Learning (GDAL) module to model global context. +It captures global context by generating an affinity map from the encoder's +final layer, ensuring effective modeling of global patterns. Additionally, deep +supervision during deconvolution further enhances feature representation. +Finally, we compared with 28 state-of-the-art approaches on three publicly +available datasets. The results clearly demonstrate the superiority of our +method. + +
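+
+ An affinity map generated from the encoder's final layer is, at its core, a pairwise similarity matrix over spatial positions. The sketch below computes a plain cosine-similarity version for a generic feature tensor; it only approximates the spirit of the GDAL module described above.
+
+import torch
+import torch.nn.functional as F
+
+def affinity_map(features: torch.Tensor) -> torch.Tensor:
+    """features: (B, C, H, W) from the encoder's final layer -> (B, H*W, H*W) affinities."""
+    b, c, h, w = features.shape
+    flat = features.view(b, c, h * w).transpose(1, 2)  # (B, HW, C)
+    flat = F.normalize(flat, dim=-1)                   # unit-length descriptors
+    return flat @ flat.transpose(1, 2)                 # cosine similarity between positions
+
+feats = torch.randn(2, 256, 16, 16)
+print(affinity_map(feats).shape)  # torch.Size([2, 256, 256])
+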
+
+
+
+
+ + ☆ JEMA: A Joint Embedding Framework for Scalable Co-Learning with + Multimodal Alignment + + +
+ This work introduces JEMA (Joint Embedding with Multimodal Alignment), a +novel co-learning framework tailored for laser metal deposition (LMD), a +pivotal process in metal additive manufacturing. As Industry 5.0 gains traction +in industrial applications, efficient process monitoring becomes increasingly +crucial. However, limited data and the opaque nature of AI present challenges +for its application in an industrial setting. JEMA addresses these challenges by +leveraging multimodal data, including multi-view images and metadata such as +process parameters, to learn transferable semantic representations. By applying +a supervised contrastive loss function, JEMA enables robust learning and +subsequent process monitoring using only the primary modality, simplifying +hardware requirements and computational overhead. We investigate the +effectiveness of JEMA in LMD process monitoring, focusing specifically on its +generalization to downstream tasks such as melt pool geometry prediction, +achieved without extensive fine-tuning. Our empirical evaluation demonstrates +the high scalability and performance of JEMA, particularly when combined with +Vision Transformer models. We report an 8% increase in performance in +multimodal settings and a 1% improvement in unimodal settings compared to +supervised contrastive learning. Additionally, the learned embedding +representation enables the prediction of metadata, enhancing interpretability +and making possible the assessment of the added metadata's contributions. Our +framework lays the foundation for integrating multisensor data with metadata, +enabling diverse downstream tasks within the LMD domain and beyond. + +
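+
+ The supervised contrastive objective mentioned above pulls together embeddings that share a label (for example, the same process condition) and pushes apart the rest. A compact PyTorch sketch of a generic SupCon-style loss is given below; it is not the exact JEMA formulation.
+
+import torch
+import torch.nn.functional as F
+
+def supervised_contrastive_loss(embeddings, labels, temperature=0.1):
+    """embeddings: (N, D) from any modality/view, labels: (N,) shared condition ids."""
+    z = F.normalize(embeddings, dim=1)
+    sim = z @ z.t() / temperature                     # (N, N) similarities
+    self_mask = torch.eye(len(z), dtype=torch.bool, device=z.device)
+    sim = sim.masked_fill(self_mask, float('-inf'))   # exclude self-comparisons
+    pos_mask = (labels.unsqueeze(0) == labels.unsqueeze(1)) & ~self_mask
+    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
+    log_prob = log_prob.masked_fill(~pos_mask, 0.0)   # keep only positive pairs
+    pos_count = pos_mask.sum(dim=1).clamp(min=1)
+    loss = -log_prob.sum(dim=1) / pos_count
+    return loss.mean()
+
+emb = torch.randn(8, 128)
+lab = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
+print(supervised_contrastive_loss(emb, lab))
+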
+
+ comment: 26 pages, 14 figures +
+
+
+
+
+ + ☆ TrAct: Making First-layer Pre-Activations Trainable NeurIPS 2024 + + +
+ We consider the training of the first layer of vision models and notice the +clear relationship between pixel values and gradient update magnitudes: the +gradients arriving at the weights of a first layer are by definition directly +proportional to (normalized) input pixel values. Thus, an image with low +contrast has a smaller impact on learning than an image with higher contrast, +and a very bright or very dark image has a stronger impact on the weights than +an image with moderate brightness. In this work, we propose performing gradient +descent on the embeddings produced by the first layer of the model. However, +switching to discrete inputs with an embedding layer is not a reasonable option +for vision models. Thus, we propose the conceptual procedure of (i) a gradient +descent step on first layer activations to construct an activation proposal, +and (ii) finding the optimal weights of the first layer, i.e., those weights +which minimize the squared distance to the activation proposal. We provide a +closed form solution of the procedure and adjust it for robust stochastic +training while computing everything efficiently. Empirically, we find that +TrAct (Training Activations) speeds up training by factors between 1.25x and 4x +while requiring only a small computational overhead. We demonstrate the utility +of TrAct with different optimizers for a range of different vision models +including convolutional and transformer architectures. + +
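+
+ The two-step procedure described above ((i) take a gradient step in activation space, (ii) pick the first-layer weights closest to that activation proposal in the least-squares sense) can be written out directly. The toy numpy illustration below assumes a linear first layer and adds a small ridge term for numerical stability; the paper derives a more refined update suited to robust stochastic training.
+
+import numpy as np
+
+def tract_style_update(X, W, grad_Z, lr=0.1, ridge=1e-3):
+    """X: (B, D) inputs, W: (D, K) first-layer weights, grad_Z: (B, K) dLoss/dActivations."""
+    Z = X @ W                     # current first-layer pre-activations
+    Z_proposal = Z - lr * grad_Z  # (i) gradient step in activation space
+    # (ii) weights minimizing ||X W - Z_proposal||^2 (ridge term for stability)
+    A = X.T @ X + ridge * np.eye(X.shape[1])
+    return np.linalg.solve(A, X.T @ Z_proposal)
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(64, 27))       # e.g. flattened 3x3x3 image patches
+W = rng.normal(size=(27, 16))
+grad_Z = rng.normal(size=(64, 16))  # would come from backpropagation in practice
+print(tract_style_update(X, W, grad_Z).shape)  # (27, 16)
+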
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ☆ Image Synthesis with Class-Aware Semantic Diffusion Models for Surgical + Scene Segmentation + + +
+ Surgical scene segmentation is essential for enhancing surgical precision, +yet it is frequently compromised by the scarcity and imbalance of available +data. To address these challenges, semantic image synthesis methods based on +generative adversarial networks and diffusion models have been developed. +However, these models often yield non-diverse images and fail to capture small, +critical tissue classes, limiting their effectiveness. In response, we propose +the Class-Aware Semantic Diffusion Model (CASDM), a novel approach which +utilizes segmentation maps as conditions for image synthesis to tackle data +scarcity and imbalance. Novel class-aware mean squared error and class-aware +self-perceptual loss functions have been defined to prioritize critical, less +visible classes, thereby enhancing image quality and relevance. Furthermore, to +our knowledge, we are the first to generate multi-class segmentation maps using +text prompts in a novel fashion to specify their contents. These maps are then +used by CASDM to generate surgical scene images, enhancing datasets for +training and validating segmentation models. Our evaluation, which assesses +both image quality and downstream segmentation performance, demonstrates the +strong effectiveness and generalisability of CASDM in producing realistic +image-map pairs, significantly advancing surgical scene segmentation across +diverse and challenging datasets. + +
+
+
+
+
+ + ☆ MV-CC: Mask Enhanced Video Model for Remote Sensing Change Caption + + +
+ Remote sensing image change caption (RSICC) aims to provide natural language +descriptions for bi-temporal remote sensing images. Since the Change Caption (CC) +task requires both spatial and temporal features, previous works follow an +encoder-fusion-decoder architecture. They use an image encoder to extract +spatial features and the fusion module to integrate spatial features and +extract temporal features, which leads to increasingly complex manual design of +the fusion module. In this paper, we introduce a novel video model-based +paradigm without the need to design a fusion module and propose a Mask-enhanced Video +model for Change Caption (MV-CC). Specifically, we use the off-the-shelf video +encoder to simultaneously extract the temporal and spatial features of +bi-temporal images. Furthermore, the types of changes in the CC are set based +on specific task requirements, and to enable the model to better focus on the +regions of interest, we employ masks obtained from the Change Detection (CD) +method to explicitly guide the CC model. Experimental results demonstrate that +our proposed method can obtain better performance compared with other +state-of-the-art RSICC methods. The code is available at +https://github.com/liuruixun/MV-CC. + +
+
+
+
+
+ + ☆ Manipulating Vehicle 3D Shapes through Latent Space Editing + + +
+ Although 3D object editing has the potential to significantly influence +various industries, recent research in 3D generation and editing has primarily +focused on converting text and images into 3D models, often overlooking the +need for fine-grained control over the editing of existing 3D objects. This +paper introduces a framework that employs a pre-trained regressor, enabling +continuous, precise, attribute-specific modifications to both the stylistic and +geometric attributes of vehicle 3D models. Our method not only preserves the +inherent identity of vehicle 3D objects, but also supports multi-attribute +editing, allowing for extensive customization without compromising the model's +structural integrity. Experimental results demonstrate the efficacy of our +approach in achieving detailed edits on various vehicle 3D models. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ BitStack: Fine-Grained Size Control for Compressed Large Language Models + in Variable Memory Environments + + +
+ Large language models (LLMs) have revolutionized numerous applications, yet +their deployment remains challenged by memory constraints on local devices. +While scaling laws have enhanced LLM capabilities, the primary bottleneck has +shifted from \textit{capability} to \textit{availability}, emphasizing the need +for efficient memory management. Traditional compression methods, such as +quantization, often require predefined compression ratios and separate +compression processes for each setting, complicating deployment in variable +memory environments. In this paper, we introduce \textbf{BitStack}, a novel, +training-free weight compression approach that enables megabyte-level +trade-offs between memory usage and model performance. By leveraging weight +decomposition, BitStack can dynamically adjust the model size with minimal +transmission between running memory and storage devices. Our approach +iteratively decomposes weight matrices while considering the significance of +each parameter, resulting in an approximately 1-bit per parameter residual +block in each decomposition iteration. These blocks are sorted and stacked in +storage as basic transmission units, with different quantities loaded based on +current memory availability. Extensive experiments across a wide range of tasks +demonstrate that, despite offering fine-grained size control, BitStack +consistently matches or surpasses strong quantization baselines, particularly +at extreme compression ratios. To the best of our knowledge, this is the first +decomposition-based method that effectively bridges the gap to practical +compression techniques like quantization. Code is available at +https://github.com/xinghaow99/BitStack. + +
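+
+ To convey the stacking idea only (roughly 1 bit per parameter per residual block, with more blocks loaded as memory allows), here is a deliberately crude sign-and-scale residual decomposition. BitStack's actual decomposition is significance-aware and considerably more sophisticated; see the linked repository for the real method.
+
+import numpy as np
+
+def decompose(weight: np.ndarray, n_blocks: int):
+    """Greedy residual decomposition: each block stores signs (1 bit/param) plus one scale."""
+    residual, blocks = weight.copy(), []
+    for _ in range(n_blocks):
+        signs = np.sign(residual)
+        scale = np.abs(residual).mean()       # single scalar per block
+        blocks.append((signs.astype(np.int8), scale))
+        residual = residual - scale * signs
+    return blocks
+
+def reconstruct(blocks, k: int):
+    """Load only the first k blocks, trading accuracy for memory."""
+    return sum(scale * signs for signs, scale in blocks[:k])
+
+W = np.random.default_rng(0).normal(size=(256, 256))
+blocks = decompose(W, n_blocks=8)
+for k in (2, 4, 8):
+    err = np.linalg.norm(W - reconstruct(blocks, k)) / np.linalg.norm(W)
+    print(k, round(float(err), 3))  # reconstruction error shrinks as more blocks are loaded
+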
+
+
+
+
+ + ☆ Uncertainty Estimation for 3D Object Detection via Evidential Learning + + +
+ 3D object detection is an essential task for computer vision applications in +autonomous vehicles and robotics. However, models often struggle to quantify +detection reliability, leading to poor performance on unfamiliar scenes. We +introduce a framework for quantifying uncertainty in 3D object detection by +leveraging an evidential learning loss on Bird's Eye View representations in +the 3D detector. These uncertainty estimates require minimal computational +overhead and are generalizable across different architectures. We demonstrate +both the efficacy and importance of these uncertainty estimates on identifying +out-of-distribution scenes, poorly localized objects, and missing (false +negative) detections; our framework consistently improves over baselines by +10-20% on average. Finally, we integrate this suite of tasks into a system +where a 3D object detector auto-labels driving scenes and our uncertainty +estimates verify label correctness before the labels are used to train a second +model. Here, our uncertainty-driven verification results in a 1% improvement in +mAP and a 1-2% improvement in NDS. + +
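+
+ As background, evidential learning replaces a softmax with a Dirichlet distribution over class probabilities, from which an uncertainty score follows in closed form. The snippet below is a standard, simplified evidential classification loss in PyTorch (following the common Dirichlet MSE formulation), not the detection-specific BEV loss used in the paper.
+
+import torch
+import torch.nn.functional as F
+
+def evidential_loss_and_uncertainty(logits, targets):
+    """logits: (N, K) raw outputs, targets: (N,) class ids."""
+    evidence = F.softplus(logits)         # non-negative evidence per class
+    alpha = evidence + 1.0                # Dirichlet concentration parameters
+    strength = alpha.sum(dim=1, keepdim=True)
+    probs = alpha / strength              # expected class probabilities
+    y = F.one_hot(targets, logits.shape[1]).float()
+    # expected squared error under the Dirichlet (Sensoy et al.-style MSE form)
+    loss = ((y - probs) ** 2 + probs * (1 - probs) / (strength + 1)).sum(dim=1).mean()
+    uncertainty = logits.shape[1] / strength.squeeze(1)  # K / total evidence strength
+    return loss, uncertainty
+
+logits = torch.randn(4, 10)
+targets = torch.tensor([0, 3, 7, 7])
+print(evidential_loss_and_uncertainty(logits, targets))
+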
+
+
+
+
+ + ☆ From Web Data to Real Fields: Low-Cost Unsupervised Domain Adaptation + for Agricultural Robots + + +
+ In precision agriculture, vision models often struggle with new, unseen +fields where crops and weeds have been influenced by external factors, +resulting in compositions and appearances that differ from the learned +distribution. This paper aims to adapt to specific fields at low cost using +Unsupervised Domain Adaptation (UDA). We explore a novel domain shift from a +diverse, large pool of internet-sourced data to a small set of data collected +by a robot at specific locations, minimizing the need for extensive on-field +data collection. Additionally, we introduce a novel module -- the Multi-level +Attention-based Adversarial Discriminator (MAAD) -- which can be integrated at +the feature extractor level of any detection model. In this study, we +incorporate MAAD with CenterNet to simultaneously detect leaf, stem, and vein +instances. Our results show significant performance improvements in the +unlabeled target domain compared to baseline models, with a 7.5% increase in +object detection accuracy and a 5.1% improvement in keypoint detection. + +
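+
+ The core of a feature-level adversarial domain discriminator is a gradient reversal layer feeding a small domain classifier, trained to tell web-sourced features from on-field features while the feature extractor learns to fool it. The minimal PyTorch sketch below shows that generic building block; MAAD itself is multi-level and attention-based, which this sketch does not reproduce.
+
+import torch
+import torch.nn as nn
+
+class GradReverse(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, lam):
+        ctx.lam = lam
+        return x.view_as(x)
+    @staticmethod
+    def backward(ctx, grad_out):
+        return -ctx.lam * grad_out, None  # flip gradients flowing back to the extractor
+
+class DomainDiscriminator(nn.Module):
+    def __init__(self, in_dim, lam=1.0):
+        super().__init__()
+        self.lam = lam
+        self.net = nn.Sequential(nn.Linear(in_dim, 256), nn.ReLU(), nn.Linear(256, 1))
+    def forward(self, features):          # features: (N, in_dim) pooled detector features
+        return self.net(GradReverse.apply(features, self.lam))
+
+disc = DomainDiscriminator(in_dim=512)
+feats = torch.randn(8, 512, requires_grad=True)
+domain_logits = disc(feats)               # train with BCE: source=0 (web), target=1 (field)
+print(domain_logits.shape)                # torch.Size([8, 1])
+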
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Text-DiFuse: An Interactive Multi-Modal Image Fusion Framework based on + Text-modulated Diffusion Model NeurIPS 2024 + + +
+ Existing multi-modal image fusion methods fail to address the compound +degradations presented in source images, resulting in fusion images plagued by +noise, color bias, improper exposure, \textit{etc}. Additionally, these methods +often overlook the specificity of foreground objects, weakening the salience of +the objects of interest within the fused images. To address these challenges, +this study proposes a novel interactive multi-modal image fusion framework +based on the text-modulated diffusion model, called Text-DiFuse. First, this +framework integrates feature-level information integration into the diffusion +process, allowing adaptive degradation removal and multi-modal information +fusion. This is the first attempt to deeply and explicitly embed information +fusion within the diffusion process, effectively addressing compound +degradation in image fusion. Second, by embedding the combination of the text +and zero-shot location model into the diffusion fusion process, a +text-controlled fusion re-modulation strategy is developed. This enables +user-customized text control to improve fusion performance and highlight +foreground objects in the fused images. Extensive experiments on diverse public +datasets show that our Text-DiFuse achieves state-of-the-art fusion performance +across various scenarios with complex degradation. Moreover, the semantic +segmentation experiment validates the significant enhancement in semantic +performance achieved by our text-controlled fusion re-modulation strategy. The +code is publicly available at https://github.com/Leiii-Cao/Text-DiFuse. + +
+
+ comment: Accepted by the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ EZ-HOI: VLM Adaptation via Guided Prompt Learning for Zero-Shot HOI + Detection NeurIPS 2024 + + +
+ Detecting Human-Object Interactions (HOI) in zero-shot settings, where models +must handle unseen classes, poses significant challenges. Existing methods that +rely on aligning visual encoders with large Vision-Language Models (VLMs) to +tap into the extensive knowledge of VLMs, require large, computationally +expensive models and encounter training difficulties. Adapting VLMs with prompt +learning offers an alternative to direct alignment. However, fine-tuning on +task-specific datasets often leads to overfitting to seen classes and +suboptimal performance on unseen classes, due to the absence of unseen class +labels. To address these challenges, we introduce a novel prompt learning-based +framework for Efficient Zero-Shot HOI detection (EZ-HOI). First, we introduce +Large Language Model (LLM) and VLM guidance for learnable prompts, integrating +detailed HOI descriptions and visual semantics to adapt VLMs to HOI tasks. +However, because training datasets contain seen-class labels alone, fine-tuning +VLMs on such datasets tends to optimize learnable prompts for seen classes +instead of unseen ones. Therefore, we design prompt learning for unseen classes +using information from related seen classes, with LLMs utilized to highlight +the differences between unseen and related seen classes. Quantitative +evaluations on benchmark datasets demonstrate that our EZ-HOI achieves +state-of-the-art performance across various zero-shot settings with only 10.35% +to 33.95% of the trainable parameters compared to existing methods. Code is +available at https://github.com/ChelsieLei/EZ-HOI. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ AllClear: A Comprehensive Dataset and Benchmark for Cloud Removal in + Satellite Imagery NeurIPS 2024 + + +
+ Clouds in satellite imagery pose a significant challenge for downstream +applications. A major challenge in current cloud removal research is the +absence of a comprehensive benchmark and a sufficiently large and diverse +training dataset. To address this problem, we introduce the largest public +dataset -- $\textit{AllClear}$ for cloud removal, featuring 23,742 globally +distributed regions of interest (ROIs) with diverse land-use patterns, +comprising 4 million images in total. Each ROI includes complete temporal +captures from the year 2022, with (1) multi-spectral optical imagery from +Sentinel-2 and Landsat 8/9, (2) synthetic aperture radar (SAR) imagery from +Sentinel-1, and (3) auxiliary remote sensing products such as cloud masks and +land cover maps. We validate the effectiveness of our dataset by benchmarking +performance, demonstrating the scaling law -- the PSNR rises from $28.47$ to +$33.87$ with $30\times$ more data, and conducting ablation studies on the +temporal length and the importance of individual modalities. This dataset aims +to provide comprehensive coverage of the Earth's surface and promote better +cloud removal results. + +
+
+ comment: Accepted at NeurIPS 2024 Datasets and Benchmarks Track. Code and data + available at https://allclear.cs.cornell.edu/ +
+
+
+
+
+ + ☆ Airway Labeling Meets Clinical Applications: Reflecting Topology + Consistency and Outliers via Learnable Attentions + + +
+ Accurate airway anatomical labeling is crucial for clinicians to identify and +navigate complex bronchial structures during bronchoscopy. Automatic airway +anatomical labeling is challenging due to significant individual variability +and anatomical variations. Previous methods are prone to generate inconsistent +predictions, which is harmful for preoperative planning and intraoperative +navigation. This paper aims to address these challenges by proposing a novel +method that enhances topological consistency and improves the detection of +abnormal airway branches. + We propose a novel approach incorporating two modules: the Soft Subtree +Consistency (SSC) and the Abnormal Branch Saliency (ABS). The SSC module +constructs a soft subtree to capture clinically relevant topological +relationships, allowing for flexible feature aggregation within and across +subtrees. The ABS module facilitates the interaction between node features and +prototypes to distinguish abnormal branches, preventing the erroneous +aggregation of features between normal and abnormal nodes. + Evaluated on a challenging dataset characterized by severe airway distortion +and atrophy, our method achieves superior performance compared to +state-of-the-art approaches. Specifically, it attains a 91.4% accuracy at the +segmental level and an 83.7% accuracy at the subsegmental level, representing a +1.4% increase in subsegmental accuracy and a 3.1% increase in topological +consistency. Notably, the method demonstrates reliable performance in cases +with disease-induced airway deformities, ensuring consistent and accurate +labeling. + +
+
+
+
+
+ + ☆ Stereo-Talker: Audio-driven 3D Human Synthesis with Prior-Guided + Mixture-of-Experts + + +
+ This paper introduces Stereo-Talker, a novel one-shot audio-driven human +video synthesis system that generates 3D talking videos with precise lip +synchronization, expressive body gestures, temporally consistent +photo-realistic quality, and continuous viewpoint control. The process follows +a two-stage approach. In the first stage, the system maps audio input to +high-fidelity motion sequences, encompassing upper-body gestures and facial +expressions. To enrich motion diversity and authenticity, large language model +(LLM) priors are integrated with text-aligned semantic audio features, +leveraging LLMs' cross-modal generalization power to enhance motion quality. In +the second stage, we improve diffusion-based video generation models by +incorporating a prior-guided Mixture-of-Experts (MoE) mechanism: a view-guided +MoE focuses on view-specific attributes, while a mask-guided MoE enhances +region-based rendering stability. Additionally, a mask prediction module is +devised to derive human masks from motion data, enhancing the stability and +accuracy of masks and enabling mask guiding during inference. We also introduce +a comprehensive human video dataset with 2,203 identities, covering diverse +body gestures and detailed annotations, facilitating broad generalization. The +code, data, and pre-trained models will be released for research purposes. + +
+
+
+
+
+ + ☆ Counterfactual MRI Data Augmentation using Conditional Denoising + Diffusion Generative Models + + +
+ Deep learning (DL) models in medical imaging face challenges in +generalizability and robustness due to variations in image acquisition +parameters (IAP). In this work, we introduce a novel method using conditional +denoising diffusion generative models (cDDGMs) to generate counterfactual +magnetic resonance (MR) images that simulate different IAP without altering +patient anatomy. We demonstrate that using these counterfactual images for data +augmentation can improve segmentation accuracy, particularly in +out-of-distribution settings, enhancing the overall generalizability and +robustness of DL models across diverse imaging conditions. Our approach shows +promise in addressing domain and covariate shifts in medical imaging. The code +is publicly available at +https://github.com/pedromorao/Counterfactual-MRI-Data-Augmentation + +
+
+
+
+
+ + ☆ Denoising Diffusion Models for Anomaly Localization in Medical Images + + +
+ This chapter explores anomaly localization in medical images using denoising +diffusion models. After providing a brief methodological background of these +models, including their application to image reconstruction and their +conditioning using guidance mechanisms, we provide an overview of available +datasets and evaluation metrics suitable for their application to anomaly +localization in medical images. In this context, we discuss supervision schemes +ranging from fully supervised segmentation to semi-supervised, weakly +supervised, self-supervised, and unsupervised methods, and provide insights +into the effectiveness and limitations of these approaches. Furthermore, we +highlight open challenges in anomaly localization, including detection bias, +domain shift, computational cost, and model interpretability. Our goal is to +provide an overview of the current state of the art in the field, outline +research gaps, and highlight the potential of diffusion models for robust +anomaly localization in medical images. + +
+
+
+
+
+ + ☆ FRoundation: Are Foundation Models Ready for Face Recognition? + + +
+ Foundation models are predominantly trained in an unsupervised or +self-supervised manner on highly diverse and large-scale datasets, making them +broadly applicable to various downstream tasks. In this work, we investigate +for the first time whether such models are suitable for the specific domain of +face recognition. We further propose and demonstrate the adaptation of these +models for face recognition across different levels of data availability. +Extensive experiments are conducted on multiple foundation models and datasets +of varying scales for training and fine-tuning, with evaluation on a wide range +of benchmarks. Our results indicate that, despite their versatility, +pre-trained foundation models underperform in face recognition compared to +similar architectures trained specifically for this task. However, fine-tuning +foundation models yields promising results, often surpassing models trained +from scratch when training data is limited. Even with access to large-scale +face recognition training datasets, fine-tuned foundation models perform +comparably to models trained from scratch, but with lower training +computational costs and without relying on the assumption of extensive data +availability. Our analysis also explores bias in face recognition, with +slightly higher bias observed in some settings when using foundation models. + +
+
+
+
+
+ + ☆ Show Me What and Where has Changed? Question Answering and Grounding for + Remote Sensing Change Detection + + +
+ Remote sensing change detection aims to perceive changes occurring on the +Earth's surface from remote sensing data in different periods, and feed these +changes back to humans. However, most existing methods only focus on detecting +change regions, lacking the ability to interact with users to identify changes +that the users expect. In this paper, we introduce a new task named Change +Detection Question Answering and Grounding (CDQAG), which extends the +traditional change detection task by providing interpretable textual answers +and intuitive visual evidence. To this end, we construct the first CDQAG +benchmark dataset, termed QAG-360K, comprising over 360K triplets of questions, +textual answers, and corresponding high-quality visual masks. It encompasses 10 +essential land-cover categories and 8 comprehensive question types, which +provides a large-scale and diverse dataset for remote sensing applications. +Based on this, we present VisTA, a simple yet effective baseline method that +unifies the tasks of question answering and grounding by delivering both visual +and textual answers. Our method achieves state-of-the-art results on both the +classic CDVQA and the proposed CDQAG datasets. Extensive qualitative and +quantitative experimental results provide useful insights for the development +of better CDQAG models, and we hope that our work can inspire further research +in this important yet underexplored direction. The proposed benchmark dataset +and method are available at https://github.com/like413/VisTA. + +
+
+
+
+
+ + ☆ Parameter-Efficient Fine-Tuning Medical Multimodal Large Language Models + for Medical Visual Grounding + + +
+ Multimodal Large Language Models (MLLMs) inherit the superior text +understanding capabilities of LLMs and extend these capabilities to multimodal +scenarios. These models achieve excellent results in the general domain of +multimodal tasks. However, in the medical domain, the substantial training +costs and the requirement for extensive medical data pose challenges to the +development of medical MLLMs. Furthermore, due to the free-text form of +answers, tasks such as visual grounding that need to produce output in a +prescribed form become difficult for MLLMs. So far, there has been no medical +MLLM work in the medical visual grounding area. For the medical visual grounding +task, which involves identifying locations in medical images based on short +text descriptions, we propose Parameter-efficient Fine-tuning of medical +multimodal large language models for Medical Visual Grounding (PFMVG). To +validate the performance of the model, we evaluate it on a public benchmark +dataset for medical visual grounding, where it achieves competitive results, +significantly outperforming GPT-4V. Our code will be open-sourced after +peer review. + +
+
+
+
+
+ + ☆ Disentangling Disentangled Representations: Towards Improved Latent + Units via Diffusion Models + + +
+ Disentangled representation learning (DRL) aims to break down observed data +into core intrinsic factors for a profound understanding of the data. In +real-world scenarios, manually defining and labeling these factors are +non-trivial, making unsupervised methods attractive. Recently, there have been +limited explorations of utilizing diffusion models (DMs), which are already +mainstream in generative modeling, for unsupervised DRL. They implement their +own inductive bias to ensure that each latent unit input to the DM expresses +only one distinct factor. In this context, we design Dynamic Gaussian Anchoring +to enforce attribute-separated latent units for more interpretable DRL. This +unconventional inductive bias explicitly delineates the decision boundaries +between attributes while also promoting the independence among latent units. +Additionally, we also propose Skip Dropout technique, which easily modifies the +denoising U-Net to be more DRL-friendly, addressing its uncooperative nature +with the disentangling feature extractor. Our methods, which carefully consider +the latent unit semantics and the distinct DM structure, enhance the +practicality of DM-based disentangled representations, demonstrating +state-of-the-art disentanglement performance on both synthetic and real data, +as well as advantages in downstream tasks. + +
+
+
+
+
+ + ☆ Human Action Recognition (HAR) Using Skeleton-based Quantum Spatial + Temporal Relative Transformer Network: ST-RTR + + +
+ Quantum Human Action Recognition (HAR) is an interesting research area in +human-computer interaction used to monitor the activities of elderly and +disabled individuals affected by physical and mental health conditions. In the recent era, +skeleton-based HAR has received much attention because skeleton data has shown +that it can handle changes in striking, body size, camera views, and complex +backgrounds. One key characteristic of ST-GCN is automatically learning spatial +and temporal patterns from skeleton sequences. It has some limitations, as this +method only works for short-range correlation due to its limited receptive +field. Consequently, understanding human action requires long-range +interconnection. To address this issue, we developed a quantum spatial-temporal +relative transformer ST-RTR model. The ST-RTR includes joint and relay nodes, +which allow efficient communication and data transmission within the network. +These nodes help to break the inherent spatial and temporal skeleton +topologies, which enables the model to understand long-range human action +better. Furthermore, we combine quantum ST-RTR with a fusion model for further +performance improvements. To assess the performance of the quantum ST-RTR +method, we conducted experiments on three skeleton-based HAR benchmarks: NTU +RGB+D 60, NTU RGB+D 120, and UAV-Human. It boosted CS and CV by 2.11% and +1.45% on NTU RGB+D 60, and by 1.25% and 1.05% on NTU RGB+D 120. On the UAV-Human dataset, +accuracy improved by 2.54%. The experimental outcomes show that the proposed +ST-RTR model significantly improves action recognition compared with the +standard ST-GCN method. + +
+
+
+
+
+ + ☆ SOAR: Self-Occluded Avatar Recovery from a Single Video In the Wild + + +
+ Self-occlusion is common when capturing people in the wild, where the +performer does not follow predefined motion scripts. This challenges existing +monocular human reconstruction systems that assume full body visibility. We +introduce Self-Occluded Avatar Recovery (SOAR), a method for complete human +reconstruction from partial observations where parts of the body are entirely +unobserved. SOAR leverages a structural normal prior and a generative diffusion +prior to address this ill-posed reconstruction problem. For the structural +normal prior, we model the human with a reposable surfel model with well-defined +and easily readable shapes. For the generative diffusion prior, we perform an +initial reconstruction and refine it using score distillation. On various +benchmarks, we show that SOAR performs favorably against state-of-the-art +reconstruction and generation methods, and on par with concurrent +works. Additional video results and code are available at +https://soar-avatar.github.io/. + +
+
+
+
+
+ + ☆ EDT: An Efficient Diffusion Transformer Framework Inspired by Human-like + Sketching NeurIPS 2024 + + +
+ Transformer-based Diffusion Probabilistic Models (DPMs) have shown more +potential than CNN-based DPMs, yet their extensive computational requirements +hinder widespread practical applications. To reduce the computation budget of +transformer-based DPMs, this work proposes the Efficient Diffusion Transformer +(EDT) framework. The framework includes a lightweight-design diffusion model +architecture, and a training-free Attention Modulation Matrix and its +alternation arrangement in EDT inspired by human-like sketching. Additionally, +we propose a token relation-enhanced masking training strategy tailored +explicitly for EDT to augment its token relation learning capability. Our +extensive experiments demonstrate the efficacy of EDT. The EDT framework +reduces training and inference costs and surpasses existing transformer-based +diffusion models in image synthesis performance, thereby achieving a +significant overall enhancement. With lower FID, EDT-S, EDT-B, and EDT-XL +attained speed-ups of 3.93x, 2.84x, and 1.92x respectively in the training +phase, and 2.29x, 2.29x, and 2.22x respectively in inference, compared to the +corresponding sizes of MDTv2. The source code is released at +https://github.com/xinwangChen/EDT. + +
+
+ comment: Xinwang Chen and Ning Liu are with equal contributions. This paper + has been accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Video Token Merging for Long-form Video Understanding NeurIPS 2024 + + +
+ As the scale of data and models for video understanding rapidly expand, +handling long-form video input in transformer-based models presents a practical +challenge. Rather than resorting to input sampling or token dropping, which may +result in information loss, token merging shows promising results when used in +collaboration with transformers. However, the application of token merging for +long-form video processing is not trivial. We begin with the premise that token +merging should not rely solely on the similarity of video tokens; the saliency +of tokens should also be considered. To address this, we explore various video +token merging strategies for long-form video classification, starting with a +simple extension of image token merging, moving to region-concentrated merging, +and finally proposing a learnable video token merging (VTM) algorithm that +dynamically merges tokens based on their saliency. Extensive experimental +results show that we achieve better or comparable performances on the LVU, +COIN, and Breakfast datasets. Moreover, our approach significantly reduces +memory costs by 84% and boosts throughput by approximately 6.89 times compared +to baseline algorithms. + +
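+
+ A bare-bones version of similarity-plus-saliency token merging (repeatedly fuse the most similar token pair into a saliency-weighted average) looks like the sketch below. The paper's learnable VTM module is more elaborate than this greedy loop, so treat it purely as intuition.
+
+import torch
+import torch.nn.functional as F
+
+def merge_once(tokens, saliency):
+    """tokens: (N, D), saliency: (N,) non-negative weights; merges the most similar pair."""
+    z = F.normalize(tokens, dim=1)
+    sim = z @ z.t()
+    sim.fill_diagonal_(float('-inf'))
+    i, j = divmod(int(sim.argmax()), sim.shape[1])
+    w_i, w_j = saliency[i], saliency[j]
+    merged = (w_i * tokens[i] + w_j * tokens[j]) / (w_i + w_j + 1e-8)
+    keep = [k for k in range(tokens.shape[0]) if k not in (i, j)]
+    tokens = torch.cat([tokens[keep], merged.unsqueeze(0)], dim=0)
+    saliency = torch.cat([saliency[keep], (w_i + w_j).unsqueeze(0)], dim=0)
+    return tokens, saliency
+
+toks, sal = torch.randn(16, 64), torch.rand(16)
+for _ in range(8):     # halve the token count
+    toks, sal = merge_once(toks, sal)
+print(toks.shape)      # torch.Size([8, 64])
+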
+
+ comment: 21 pages, NeurIPS 2024 +
+
+
+
+
+ + ☆ Driving by the Rules: A Benchmark for Integrating Traffic Sign + Regulations into Vectorized HD Map + + +
+ Ensuring adherence to traffic sign regulations is essential for both human +and autonomous vehicle navigation. While current benchmark datasets concentrate +on lane perception or basic traffic sign recognition, they often overlook the +intricate task of integrating these regulations into lane operations. +Addressing this gap, we introduce MapDR, a novel dataset designed for the +extraction of Driving Rules from traffic signs and their association with +vectorized, locally perceived HD Maps. MapDR features over 10,000 annotated +video clips that capture the intricate correlation between traffic sign +regulations and lanes. We define two pivotal sub-tasks: 1) Rule Extraction from +Traffic Sign, which accurately deciphers regulatory instructions, and 2) +Rule-Lane Correspondence Reasoning, which aligns these rules with their +respective lanes. Built upon this benchmark, we provide a multimodal solution +that offers a strong baseline for advancing autonomous driving technologies. It +fills a critical gap in the integration of traffic sign rules, contributing to +the development of reliable autonomous navigation systems. + +
+
+ comment: 27 pages, 13 figures +
+
+
+
+
+ + ☆ In-Context LoRA for Diffusion Transformers + + +
+ Recent research arXiv:2410.15027 has explored the use of diffusion +transformers (DiTs) for task-agnostic image generation by simply concatenating +attention tokens across images. However, despite substantial computational +resources, the fidelity of the generated images remains suboptimal. In this +study, we reevaluate and streamline this framework by hypothesizing that +text-to-image DiTs inherently possess in-context generation capabilities, +requiring only minimal tuning to activate them. Through diverse task +experiments, we qualitatively demonstrate that existing text-to-image DiTs can +effectively perform in-context generation without any tuning. Building on this +insight, we propose a remarkably simple pipeline to leverage the in-context +abilities of DiTs: (1) concatenate images instead of tokens, (2) perform joint +captioning of multiple images, and (3) apply task-specific LoRA tuning using +small datasets (e.g., $20\sim 100$ samples) instead of full-parameter tuning +with large datasets. We name our models In-Context LoRA (IC-LoRA). This +approach requires no modifications to the original DiT models, only changes to +the training data. Remarkably, our pipeline generates high-fidelity image sets +that better adhere to prompts. While task-specific in terms of tuning data, our +framework remains task-agnostic in architecture and pipeline, offering a +powerful tool for the community and providing valuable insights for further +research on product-level task-agnostic generation systems. We release our +code, data, and models at https://github.com/ali-vilab/In-Context-LoRA + +
+
+ comment: Project page: https://ali-vilab.github.io/In-Context-Lora-Page/ +
+
+
+
+
+ + ☆ Open-Set 3D object detection in LiDAR data as an Out-of-Distribution + problem + + +
+ 3D Object Detection from LiDAR data has achieved industry-ready performance +in controlled environments through advanced deep learning methods. However, +these neural network models are limited by a finite set of inlier object +categories. Our work redefines the open-set 3D Object Detection problem in +LiDAR data as an Out-Of-Distribution (OOD) problem to detect outlier objects. +This approach brings additional information in comparison with traditional +object detection. We establish a comparative benchmark and show that two-stage +OOD methods, notably autolabelling, show promising results for 3D OOD Object +Detection. Our contributions include setting a rigorous evaluation protocol by +examining the evaluation of hyperparameters and evaluating strategies for +generating additional data to train an OOD-aware 3D object detector. This +comprehensive analysis is essential for developing robust 3D object detection +systems that can perform reliably in diverse and unpredictable real-world +scenarios. + +
+
+
+
+
+ + ☆ Reverse Attitude Statistics Based Star Map Identification Method + + +
+ The star tracker is generally affected by the atmospheric background light +and the aerodynamic environment when working in near space, which results in +missing stars or false stars. Moreover, high-speed maneuvering may cause star +trailing, which reduces the accuracy of the star position. To address the +challenges of star map identification, a reverse attitude statistics based +method is proposed to handle position noise, false stars, and missing stars. +In contrast to existing methods, which match before solving for attitude, this +method introduces attitude solving into the matching process, and obtains the +final match and the correct attitude simultaneously by frequency statistics. +Firstly, based on stable angular distance features, the initial matching is +obtained by utilizing spatial hash indexing. Then, the dual-vector attitude +determination is introduced to calculate potential attitude. Finally, the star +pairs are accurately matched by applying a frequency statistics filtering +method. In addition, Bayesian optimization is employed to find optimal +parameters under the impact of noise, which is able to enhance the algorithm +performance further. In this work, the proposed method is validated in +simulation, field tests and on-orbit experiments. Compared with the +state-of-the-art, the identification rate is improved by more than 14.3%, and +the solving time is reduced by over 28.5%. + +
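+
+ The stable angular distance features with spatial hash indexing amount to quantizing inter-star angles into buckets so that candidate catalogue pairs can be retrieved in near-constant time. The snippet below sketches that indexing step with synthetic unit vectors; the bin width is an assumed parameter, and the attitude-statistics filtering stage is not shown.
+
+import numpy as np
+from collections import defaultdict
+
+def angular_distance(u, v):
+    """Angle in radians between two unit vectors."""
+    return np.arccos(np.clip(np.dot(u, v), -1.0, 1.0))
+
+def build_angle_index(catalog, bin_width=1e-3):
+    """catalog: (N, 3) unit vectors -> dict: quantized angle -> list of star-id pairs."""
+    index = defaultdict(list)
+    n = len(catalog)
+    for i in range(n):
+        for j in range(i + 1, n):
+            key = int(angular_distance(catalog[i], catalog[j]) / bin_width)
+            index[key].append((i, j))
+    return index
+
+def candidate_pairs(index, observed_angle, bin_width=1e-3):
+    key = int(observed_angle / bin_width)
+    # also check neighbouring bins to tolerate position noise
+    return [p for k in (key - 1, key, key + 1) for p in index.get(k, [])]
+
+rng = np.random.default_rng(0)
+cat = rng.normal(size=(50, 3)); cat /= np.linalg.norm(cat, axis=1, keepdims=True)
+idx = build_angle_index(cat)
+print(len(candidate_pairs(idx, angular_distance(cat[3], cat[17]))))
+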
+
+ comment: 10 pages, 17 figures, 4 tables, 4663 words, submitted to IEEE Sensors + Journal 
+
+
+
+
+ + ☆ EXACFS -- A CIL Method to mitigate Catastrophic Forgetting + + +
+ Deep neural networks (DNNs) excel at learning from static datasets but +struggle with continual learning, where data arrives sequentially. Catastrophic +forgetting, the phenomenon of forgetting previously learned knowledge, is a +primary challenge. This paper introduces EXponentially Averaged Class-wise +Feature Significance (EXACFS) to mitigate this issue in the class incremental +learning (CIL) setting. By estimating the significance of model features for +each learned class using loss gradients, gradually aging the significance +through the incremental tasks and preserving the significant features through a +distillation loss, EXACFS effectively balances remembering old knowledge +(stability) and learning new knowledge (plasticity). Extensive experiments on +CIFAR-100 and ImageNet-100 demonstrate EXACFS's superior performance in +preserving stability while acquiring plasticity. + +
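+
+ The central bookkeeping, an exponentially averaged class-wise feature-significance score that later weights a distillation penalty, can be sketched as follows. The significance proxy (|gradient x activation|) and the penalty form below are simplifying assumptions for illustration, not the paper's exact definitions.
+
+import torch
+
+class ClasswiseSignificance:
+    """Tracks an exponential moving average of per-class feature significance."""
+    def __init__(self, n_classes, n_features, decay=0.9):
+        self.sig = torch.zeros(n_classes, n_features)
+        self.decay = decay
+
+    def update(self, features, grads, labels):
+        """features, grads: (B, F) activations and their loss gradients; labels: (B,)."""
+        score = (features * grads).abs()
+        for c in labels.unique():
+            m = labels == c
+            self.sig[c] = self.decay * self.sig[c] + (1 - self.decay) * score[m].mean(dim=0)
+
+    def distillation_penalty(self, old_feats, new_feats, labels):
+        """Penalize drift on features that were significant for each sample's class."""
+        w = self.sig[labels]                       # (B, F) per-sample weights
+        return (w * (new_feats - old_feats) ** 2).sum(dim=1).mean()
+
+tracker = ClasswiseSignificance(n_classes=100, n_features=512)
+f, g, y = torch.randn(32, 512), torch.randn(32, 512), torch.randint(0, 100, (32,))
+tracker.update(f, g, y)
+print(tracker.distillation_penalty(f, f + 0.01 * torch.randn_like(f), y))
+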
+
+
+
+
+ + ☆ EchoNarrator: Generating natural text explanations for ejection fraction + predictions + + +
+ Ejection fraction (EF) of the left ventricle (LV) is considered as one of the +most important measurements for diagnosing acute heart failure and can be +estimated during cardiac ultrasound acquisition. While recent successes in deep +learning research successfully estimate EF values, the proposed models often +lack an explanation for the prediction. However, providing clear and intuitive +explanations for clinical measurement predictions would increase the trust of +cardiologists in these models. In this paper, we explore predicting EF +measurements with Natural Language Explanation (NLE). We propose a model that +in a single forward pass combines estimation of the LV contour over multiple +frames, together with a set of modules and routines for computing various +motion and shape attributes that are associated with ejection fraction. It then +feeds the attributes into a large language model to generate text that helps to +explain the network's outcome in a human-like manner. We provide experimental +evaluation of our explanatory output, as well as EF prediction, and show that +our model can provide EF comparable to state-of-the-art together with +meaningful and accurate natural language explanation to the prediction. The +project page can be found at https://github.com/guybenyosef/EchoNarrator . + +
+
+ comment: accepted for MICCAI 2024 +
+
+
+
+
+ + ☆ Scaled Inverse Graphics: Efficiently Learning Large Sets of 3D Scenes + + +
+ While the field of inverse graphics has been witnessing continuous growth, +techniques devised thus far predominantly focus on learning individual scene +representations. In contrast, learning large sets of scenes has been a +considerable bottleneck in NeRF developments, as repeatedly applying inverse +graphics on a sequence of scenes, though essential for various applications, +remains largely prohibitive in terms of resource costs. We introduce a +framework termed "scaled inverse graphics", aimed at efficiently learning large +sets of scene representations, and propose a novel method to this end. It +operates in two stages: (i) training a compression model on a subset of scenes, +then (ii) training NeRF models on the resulting smaller representations, +thereby reducing the optimization space per new scene. In practice, we compact +the representation of scenes by learning NeRFs in a latent space to reduce the +image resolution, and sharing information across scenes to reduce NeRF +representation complexity. We experimentally show that our method presents both +the lowest training time and memory footprint in scaled inverse graphics +compared to other methods applied independently on each scene. Our codebase is +publicly available as open-source. Our project page can be found at +https://scaled-ig.github.io . + +
+
+
+
+
+ + ☆ MLLA-UNet: Mamba-like Linear Attention in an Efficient U-Shape Model for + Medical Image Segmentation + + +
+ Recent advancements in medical imaging have resulted in more complex and +diverse images, with challenges such as high anatomical variability, blurred +tissue boundaries, low organ contrast, and noise. Traditional segmentation +methods struggle to address these challenges, making deep learning approaches, +particularly U-shaped architectures, increasingly prominent. However, the +quadratic complexity of standard self-attention makes Transformers +computationally prohibitive for high-resolution images. To address these +challenges, we propose MLLA-UNet (Mamba-Like Linear Attention UNet), a novel +architecture that achieves linear computational complexity while maintaining +high segmentation accuracy through its innovative combination of linear +attention and Mamba-inspired adaptive mechanisms, complemented by an efficient +symmetric sampling structure for enhanced feature processing. Our architecture +effectively preserves essential spatial features while capturing long-range +dependencies at reduced computational complexity. Additionally, we introduce a +novel sampling strategy for multi-scale feature fusion. Experiments demonstrate +that MLLA-UNet achieves state-of-the-art performance on six challenging +datasets with 24 different segmentation tasks, including but not limited to +FLARE22, AMOS CT, and ACDC, with an average DSC of 88.32%. These results +underscore the superiority of MLLA-UNet over existing methods. Our +contributions include the novel 2D segmentation architecture and its empirical +validation. The code is available via https://github.com/csyfjiang/MLLA-UNet. + +
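+
+ Linear attention avoids the quadratic token-by-token similarity matrix by applying a positive feature map to queries and keys and re-associating the matrix products. The sketch below shows the generic O(N) form only; MLLA-UNet's Mamba-like gating and U-shaped integration are not reproduced here.
+
+import torch
+
+def linear_attention(q, k, v, eps=1e-6):
+    """q, k: (B, N, D), v: (B, N, Dv); costs O(N * D * Dv) instead of O(N^2 * D)."""
+    q = torch.nn.functional.elu(q) + 1.0     # positive feature map phi(.)
+    k = torch.nn.functional.elu(k) + 1.0
+    kv = torch.einsum('bnd,bne->bde', k, v)  # sum_n phi(k_n) v_n^T, shape (B, D, Dv)
+    z = torch.einsum('bnd,bd->bn', q, k.sum(dim=1)) + eps  # per-query normalizer
+    out = torch.einsum('bnd,bde->bne', q, kv) / z.unsqueeze(-1)
+    return out
+
+q, k, v = (torch.randn(2, 4096, 64) for _ in range(3))
+print(linear_attention(q, k, v).shape)       # torch.Size([2, 4096, 64])
+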
+
+
+
+
+ + ☆ MoTaDual: Modality-Task Dual Alignment for Enhanced Zero-shot Composed + Image Retrieval + + +
+ Composed Image Retrieval (CIR) is a challenging vision-language task, +utilizing bi-modal (image+text) queries to retrieve target images. Despite the +impressive performance of supervised CIR, the dependence on costly, +manually-labeled triplets limits its scalability and zero-shot capability. To +address this issue, zero-shot composed image retrieval (ZS-CIR) is presented +along with projection-based approaches. However, such methods face two major +problems, i.e., task discrepancy between pre-training (image $\leftrightarrow$ +text) and inference (image+text $\rightarrow$ image), and modality discrepancy. +The latter pertains to approaches based on text-only projection training due to +the necessity of feature extraction from the reference image during inference. +In this paper, we propose a two-stage framework to tackle both discrepancies. +First, to ensure efficiency and scalability, a textual inversion network is +pre-trained on large-scale caption datasets. Subsequently, we put forward +Modality-Task Dual Alignment (MoTaDual) as the second stage, where +large-language models (LLMs) generate triplet data for fine-tuning, and +additionally, prompt learning is introduced in a multi-modal context to +effectively alleviate both modality and task discrepancies. The experimental +results show that our MoTaDual achieves the state-of-the-art performance across +four widely used ZS-CIR benchmarks, while maintaining low training time and +computational cost. The code will be released soon. + +
+
+
+
+
+ + ☆ An Empirical Analysis of GPT-4V's Performance on Fashion Aesthetic + Evaluation + + +
+ Fashion aesthetic evaluation is the task of estimating how well the outfits +worn by individuals in images suit them. In this work, we examine the zero-shot +performance of GPT-4V on this task for the first time. We show that its +predictions align fairly well with human judgments on our datasets, and also +find that it struggles with ranking outfits in similar colors. The code is +available at https://github.com/st-tech/gpt4v-fashion-aesthetic-evaluation. + +
+
+
+
+
+ + ☆ GaussianMarker: Uncertainty-Aware Copyright Protection of 3D Gaussian + Splatting + + +
+ 3D Gaussian Splatting (3DGS) has become a crucial method for acquiring 3D +assets. To protect the copyright of these assets, digital watermarking +techniques can be applied to embed ownership information discreetly within 3DGS +models. However, existing watermarking methods for meshes, point clouds, and +implicit radiance fields cannot be directly applied to 3DGS models, as 3DGS +models use explicit 3D Gaussians with distinct structures and do not rely on +neural networks. Naively embedding the watermark on a pre-trained 3DGS can +cause obvious distortion in rendered images. In our work, we propose an +uncertainty-based method that constrains the perturbation of model parameters +to achieve invisible watermarking for 3DGS. At the message decoding stage, the +copyright messages can be reliably extracted from both 3D Gaussians and 2D +rendered images even under various forms of 3D and 2D distortions. We conduct +extensive experiments on the Blender, LLFF and MipNeRF-360 datasets to validate +the effectiveness of our proposed method, demonstrating state-of-the-art +performance on both message decoding accuracy and view synthesis quality. + +
+
+
+
+
+ + ☆ Aggregate-and-Adapt Natural Language Prompts for Downstream + Generalization of CLIP NeurIPS 2024 + + +
+ Large pretrained vision-language models like CLIP have shown promising +generalization capability, but may struggle in specialized domains (e.g., +satellite imagery) or fine-grained classification (e.g., car models) where the +visual concepts are unseen or under-represented during pretraining. Prompt +learning offers a parameter-efficient finetuning framework that can adapt CLIP +to downstream tasks even when limited annotation data are available. In this +paper, we improve prompt learning by distilling the textual knowledge from +natural language prompts (either human- or LLM-generated) to provide rich +priors for those under-represented concepts. We first obtain a prompt +``summary'' aligned to each input image via a learned prompt aggregator. Then +we jointly train a prompt generator, optimized to produce a prompt embedding +that stays close to the aggregated summary while minimizing task loss at the +same time. We dub such prompt embedding as Aggregate-and-Adapted Prompt +Embedding (AAPE). AAPE is shown to be able to generalize to different +downstream data distributions and tasks, including vision-language +understanding tasks (e.g., few-shot classification, VQA) and generation tasks +(image captioning) where AAPE achieves competitive performance. We also show +AAPE is particularly helpful to handle non-canonical and OOD examples. +Furthermore, AAPE learning eliminates LLM-based inference cost as required by +baselines, and scales better with data and LLM model size. + +
+
+ comment: NeurIPS 2024 +
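The joint objective described above (a prompt generator kept close to the aggregated prompt summary while minimizing the task loss) can be sketched as a simple two-term loss. The weighting, distance measure, and tensor shapes below are illustrative assumptions, not the paper's exact formulation.

import torch
import torch.nn.functional as F

def aape_style_loss(pred_logits, labels, generated_prompt, aggregated_summary,
                    distill_weight=0.5):
    # Task loss (e.g. few-shot classification) plus a distillation term that
    # keeps the generated prompt embedding close to the aggregated summary.
    task_loss = F.cross_entropy(pred_logits, labels)
    distill_loss = F.mse_loss(generated_prompt, aggregated_summary)
    return task_loss + distill_weight * distill_loss

# Toy usage with random tensors standing in for model outputs.
logits = torch.randn(8, 10, requires_grad=True)
labels = torch.randint(0, 10, (8,))
gen_prompt = torch.randn(8, 512, requires_grad=True)
summary = torch.randn(8, 512)
loss = aape_style_loss(logits, labels, gen_prompt, summary)
loss.backward()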
+
+
+
+
+ + ☆ XRDSLAM: A Flexible and Modular Framework for Deep Learning based SLAM + + +
+ In this paper, we propose a flexible SLAM framework, XRDSLAM. It adopts a +modular code design and a multi-process running mechanism, providing highly +reusable foundational modules such as unified dataset management, 3D +visualization, algorithm configuration, and metrics evaluation. It can help +developers quickly build a complete SLAM system, flexibly combine different +algorithm modules, and conduct standardized benchmarking for accuracy and +efficiency comparison. Within this framework, we integrate several +state-of-the-art SLAM algorithms of different types, including NeRF and 3DGS +based SLAM, and even odometry or reconstruction algorithms, which demonstrates +its flexibility and extensibility. We also conduct a comprehensive comparison +and evaluation of these integrated algorithms, analyzing the characteristics of +each. Finally, we contribute all the code, configuration and data to the +open-source community, which aims to promote the widespread research and +development of SLAM technology within the open-source ecosystem. + +
+
+
+
+
+ + ☆ Adversarial Attacks of Vision Tasks in the Past 10 Years: A Survey + + +
+ Adversarial attacks, which manipulate input data to undermine model +availability and integrity, pose significant security threats during machine +learning inference. With the advent of Large Vision-Language Models (LVLMs), +new attack vectors, such as cognitive bias, prompt injection, and jailbreak +techniques, have emerged. Understanding these attacks is crucial for developing +more robust systems and demystifying the inner workings of neural networks. +However, existing reviews often focus on attack classifications and lack +comprehensive, in-depth analysis. The research community currently needs: 1) +unified insights into adversariality, transferability, and generalization; 2) +detailed evaluations of existing methods; 3) motivation-driven attack +categorizations; and 4) an integrated perspective on both traditional and LVLM +attacks. This article addresses these gaps by offering a thorough summary of +traditional and LVLM adversarial attacks, emphasizing their connections and +distinctions, and providing actionable insights for future research. + +
+
+
+
+
+ + ☆ Wide Two-Layer Networks can Learn from Adversarial Perturbations NeurIPS24 + + +
+ Adversarial examples have raised several open questions, such as why they can +deceive classifiers and transfer between different models. A prevailing +hypothesis to explain these phenomena suggests that adversarial perturbations +appear as random noise but contain class-specific features. This hypothesis is +supported by the success of perturbation learning, where classifiers trained +solely on adversarial examples and the corresponding incorrect labels +generalize well to correctly labeled test data. Although this hypothesis and +perturbation learning are effective in explaining intriguing properties of +adversarial examples, they lack a solid theoretical foundation. In this +study, we theoretically explain the counterintuitive success of perturbation +learning. We assume wide two-layer networks, and the results hold for any data +distribution. We prove that adversarial perturbations contain sufficient +class-specific features for networks to generalize from them. Moreover, the +predictions of classifiers trained on mislabeled adversarial examples coincide +with those of classifiers trained on correctly labeled clean samples. The code +is available at https://github.com/s-kumano/perturbation-learning. + +
+
+ comment: NeurIPS24 +
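A toy version of the perturbation-learning setup referenced above: craft perturbations toward incorrect labels using one network, then train a fresh wide two-layer network only on those adversarial examples and incorrect labels. Everything below (random data, attack strength, layer widths, the fact that the source network is untrained) is illustrative and not taken from the paper.

import torch
import torch.nn as nn
import torch.nn.functional as F

def targeted_fgsm(model, x, y_target, eps=0.1):
    # Craft a perturbation that pushes x toward the (incorrect) target class.
    x = x.clone().requires_grad_(True)
    loss = F.cross_entropy(model(x), y_target)
    loss.backward()
    return (x - eps * x.grad.sign()).detach()

# Toy setup: a "source" net (would normally be trained), random data,
# and deliberately incorrect target labels.
d, n_cls = 20, 3
source_net = nn.Sequential(nn.Linear(d, 64), nn.ReLU(), nn.Linear(64, n_cls))
x = torch.randn(256, d)
wrong_labels = torch.randint(0, n_cls, (256,))
x_adv = targeted_fgsm(source_net, x, wrong_labels)

# Perturbation learning: train a wide two-layer net ONLY on (x_adv, wrong_labels).
student = nn.Sequential(nn.Linear(d, 2048), nn.ReLU(), nn.Linear(2048, n_cls))
opt = torch.optim.SGD(student.parameters(), lr=0.1)
for _ in range(100):
    opt.zero_grad()
    loss = F.cross_entropy(student(x_adv), wrong_labels)
    loss.backward()
    opt.step()
# The paper's claim is that such a student can still generalize to correctly
# labeled clean test data, because the perturbations carry class-specific features.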
+
+
+
+
+ + ☆ Web-Scale Visual Entity Recognition: An LLM-Driven Data Approach NeurIPS 2024 + + +
+ Web-scale visual entity recognition, the task of associating images with +their corresponding entities within vast knowledge bases like Wikipedia, +presents significant challenges due to the lack of clean, large-scale training +data. In this paper, we propose a novel methodology to curate such a dataset, +leveraging a multimodal large language model (LLM) for label verification, +metadata generation, and rationale explanation. Instead of relying on the +multimodal LLM to directly annotate data, which we found to be suboptimal, we +prompt it to reason about potential candidate entity labels by accessing +additional contextually relevant information (such as Wikipedia), resulting in +more accurate annotations. We further use the multimodal LLM to enrich the +dataset by generating question-answer pairs and a grounded finegrained textual +description (referred to as "rationale") that explains the connection between +images and their assigned entities. Experiments demonstrate that models trained +on this automatically curated data achieve state-of-the-art performance on +web-scale visual entity recognition tasks (e.g. +6.9% improvement in OVEN +entity task), underscoring the importance of high-quality training data in this +domain. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ DIP: Diffusion Learning of Inconsistency Pattern for General DeepFake + Detection + + +
+ With the advancement of deepfake generation techniques, the importance of +deepfake detection in protecting multimedia content integrity has become +increasingly obvious. Recently, temporal inconsistency clues have been explored +to improve the generalizability of deepfake video detection. According to our +observation, the temporal artifacts of forged videos in terms of motion +information usually exhibit quite distinct inconsistency patterns along +horizontal and vertical directions, which could be leveraged to improve the +generalizability of detectors. In this paper, a transformer-based framework for +Diffusion Learning of Inconsistency Pattern (DIP) is proposed, which exploits +directional inconsistencies for deepfake video detection. Specifically, DIP +begins with a spatiotemporal encoder to represent spatiotemporal information. A +directional inconsistency decoder is adopted accordingly, where direction-aware +attention and inconsistency diffusion are incorporated to explore potential +inconsistency patterns and jointly learn the inherent relationships. In +addition, the SpatioTemporal Invariant Loss (STI Loss) is introduced to +contrast spatiotemporally augmented sample pairs and prevent the model from +overfitting nonessential forgery artifacts. Extensive experiments on several +public datasets demonstrate that our method could effectively identify +directional forgery clues and achieve state-of-the-art performance. + +
+
+ comment: 13 pages, accepted by IEEE Trans. on Multimedia +
+
+
+
+
+ + ☆ GS-Blur: A 3D Scene-Based Dataset for Realistic Image Deblurring NeurIPS 2024 + + +
+ To train a deblurring network, an appropriate dataset with paired blurry and +sharp images is essential. Existing datasets collect blurry images either +synthetically by aggregating consecutive sharp frames or using sophisticated +camera systems to capture real blur. However, these methods offer limited +diversity in blur types (blur trajectories) or require extensive human effort +to reconstruct large-scale datasets, failing to fully reflect real-world blur +scenarios. To address this, we propose GS-Blur, a dataset of synthesized +realistic blurry images created using a novel approach. To this end, we first +reconstruct 3D scenes from multi-view images using 3D Gaussian Splatting +(3DGS), then render blurry images by moving the camera view along the randomly +generated motion trajectories. By adopting various camera trajectories in +reconstructing our GS-Blur, our dataset contains realistic and diverse types of +blur, offering a large-scale dataset that generalizes well to real-world blur. +Using GS-Blur with various deblurring methods, we demonstrate its ability to +generalize effectively compared to previous synthetic or real blur datasets, +showing significant improvements in deblurring performance. + +
+
+ comment: Accepted at NeurIPS 2024 Datasets & Benchmarks Track +
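The core blur-synthesis recipe, set aside from the 3DGS machinery, is to average frames rendered while the camera moves along a sampled trajectory. Below is a schematic NumPy sketch with a toy renderer standing in for 3D Gaussian Splatting; the scalar "pose" parameterization is invented purely for illustration.

import numpy as np

def synthesize_blur(render_fn, start_pose, end_pose, n_samples=32):
    # Approximate motion blur by averaging frames rendered while the camera
    # moves along a (here, linear) trajectory between two poses.
    # render_fn(pose) -> HxWx3 float image is a stand-in for a 3DGS renderer.
    alphas = np.linspace(0.0, 1.0, n_samples)
    frames = [render_fn((1 - a) * start_pose + a * end_pose) for a in alphas]
    return np.mean(frames, axis=0)

# Toy renderer: the "pose" is a horizontal pixel shift applied to a random scene.
scene = np.random.rand(64, 64, 3)
def toy_render(shift):
    return np.roll(scene, int(round(shift)), axis=1)

blurry = synthesize_blur(toy_render, start_pose=0.0, end_pose=8.0)
sharp = toy_render(4.0)  # the mid-exposure frame can serve as the paired sharp image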
+
+
+
+
+ + ☆ Novel Clinical-Grade Prostate Cancer Detection and Grading Model: + Development and Prospective Validation Using Real World Data, with + Performance Assessment on IHC Requested Cases + + +
+ Artificial intelligence may assist healthcare systems in meeting increasing +demand for pathology services while maintaining diagnostic quality and reducing +turnaround time and costs. We aimed to investigate the performance of an +institutionally developed system for prostate cancer detection, grading, and +workflow optimization and to contrast this with commercial alternatives. From +August 2021 to March 2023, we scanned 21,396 slides from 1,147 patients with +positive biopsies. We developed models for cancer detection, grading, and +screening of equivocal cases for IHC ordering. We compared a task-specific +model trained using the PANDA dataset of prostate cancer biopsies with one +built using features extracted by the general-purpose histology foundation +model, UNI, and compared their performance on an unfiltered, prospectively +collected dataset that reflects our patient population (1737 slides, 95 +patients). We evaluated the contributions of a bespoke model designed to +improve sensitivity in detecting small cancer foci and scoring of broader +patterns observed at lower resolution. We found high concordance between the +developed systems and pathologist reference in detection (AUC 98.5, sensitivity +95.0, and specificity 97.8), ISUP grading (quadratic Cohen's kappa 0.869), +grade group 3 or higher (AUC 97.5, sensitivity 94.9, specificity 96.6) and +comparable to published data from commercial systems. Screening could reduce +IHC ordering for equivocal cases by 44.5% with an overall error rate of 1.8% +(1.4% false positive, 0.4% false negative rates). Institutions like academic +medical centers that have high scanning volumes and report abstraction +capabilities can develop accurate computational pathology models for internal +use. These models have the potential to aid in a quality control role and to +improve workflow in the pathology lab to help meet future challenges in +prostate cancer diagnosis. + +
+
+
+
+
+ + ☆ Recovering Complete Actions for Cross-dataset Skeleton Action + Recognition NeurIPS 2024 + + +
+ Despite huge progress in skeleton-based action recognition, its +generalizability to different domains remains a challenging issue. In this +paper, to solve the skeleton action generalization problem, we present a +recover-and-resample augmentation framework based on a novel complete action +prior. We observe that human daily actions are confronted with temporal +mismatch across different datasets, as they are usually partial observations of +their complete action sequences. By recovering complete actions and resampling +from these full sequences, we can generate strong augmentations for unseen +domains. At the same time, we discover the nature of general action +completeness within large datasets, indicated by the per-frame diversity over +time. This allows us to exploit two assets of transferable knowledge that can +be shared across action samples and be helpful for action completion: boundary +poses for determining the action start, and linear temporal transforms for +capturing global action patterns. Therefore, we formulate the recovering stage +as a two-step stochastic action completion with boundary pose-conditioned +extrapolation followed by smooth linear transforms. Both the boundary poses and +linear transforms can be efficiently learned from the whole dataset via +clustering. We validate our approach on a cross-dataset setting with three +skeleton action datasets, outperforming other domain generalization approaches +by a considerable margin. + +
+
+ comment: accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Posture-Informed Muscular Force Learning for Robust Hand Pressure + Estimation NeurIPS 2024 + + +
+ We present PiMForce, a novel framework that enhances hand pressure estimation +by leveraging 3D hand posture information to augment forearm surface +electromyography (sEMG) signals. Our approach utilizes detailed spatial +information from 3D hand poses in conjunction with dynamic muscle activity from +sEMG to enable accurate and robust whole-hand pressure measurements under +diverse hand-object interactions. We also developed a multimodal data +collection system that combines a pressure glove, an sEMG armband, and a +markerless finger-tracking module. We created a comprehensive dataset from 21 +participants, capturing synchronized data of hand posture, sEMG signals, and +exerted hand pressure across various hand postures and hand-object interaction +scenarios using our collection system. Our framework enables precise hand +pressure estimation in complex and natural interaction scenarios. Our approach +substantially mitigates the limitations of traditional sEMG-based or +vision-based methods by integrating 3D hand posture information with sEMG +signals. Video demos, data, and code are available online. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Cycle-Constrained Adversarial Denoising Convolutional Network for PET + Image Denoising: Multi-Dimensional Validation on Large Datasets with Reader + Study and Real Low-Dose Data + + +
+ Positron emission tomography (PET) is a critical tool for diagnosing tumors +and neurological disorders but poses radiation risks to patients, particularly +to sensitive populations. While reducing injected radiation dose mitigates this +risk, it often compromises image quality. To reconstruct full-dose-quality +images from low-dose scans, we propose a Cycle-constrained Adversarial +Denoising Convolutional Network (Cycle-DCN). This model integrates a noise +predictor, two discriminators, and a consistency network, and is optimized +using a combination of supervised loss, adversarial loss, cycle consistency +loss, identity loss, and neighboring Structural Similarity Index (SSIM) loss. +Experiments were conducted on a large dataset consisting of raw PET brain data +from 1,224 patients, acquired using a Siemens Biograph Vision PET/CT scanner. +Each patient underwent a 120-second brain scan. To simulate low-dose PET +conditions, images were reconstructed from shortened scan durations of 30, 12, +and 5 seconds, corresponding to 1/4, 1/10, and 1/24 of the full-dose +acquisition, respectively, using custom-developed GPU-based image +reconstruction software. The results show that Cycle-DCN significantly improves +average Peak Signal-to-Noise Ratio (PSNR), SSIM, and Normalized Root Mean +Square Error (NRMSE) across three dose levels, with improvements of up to 56%, +35%, and 71%, respectively. Additionally, it achieves contrast-to-noise ratio +(CNR) and Edge Preservation Index (EPI) values that closely align with +full-dose images, effectively preserving image details, tumor shape, and +contrast, while resolving issues with blurred edges. The results of reader +studies indicated that the images restored by Cycle-DCN consistently received +the highest ratings from nuclear medicine physicians, highlighting their strong +clinical relevance. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
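The abstract lists five loss terms but not how they are combined. One plausible, purely illustrative weighted combination is sketched below, with a global (non-windowed) SSIM standing in for the neighboring SSIM term; none of the weights or term definitions should be read as the paper's.

import torch
import torch.nn.functional as F

def simple_ssim(x, y, c1=0.01 ** 2, c2=0.03 ** 2):
    # Global, non-windowed SSIM: a simplification of the neighboring SSIM term.
    mx, my = x.mean(), y.mean()
    vx, vy = x.var(), y.var()
    cov = ((x - mx) * (y - my)).mean()
    return ((2 * mx * my + c1) * (2 * cov + c2)) / ((mx ** 2 + my ** 2 + c1) * (vx + vy + c2))

def cycle_dcn_style_loss(denoised, full_dose, low_dose, recycled, disc_fake,
                         w_adv=0.1, w_cyc=10.0, w_idt=5.0, w_ssim=1.0):
    # Hypothetical weighted combination of the loss terms named in the abstract.
    sup = F.l1_loss(denoised, full_dose)                        # supervised term
    adv = F.binary_cross_entropy_with_logits(                   # adversarial (generator side)
        disc_fake, torch.ones_like(disc_fake))
    cyc = F.l1_loss(recycled, low_dose)                         # cycle consistency
    idt = F.l1_loss(denoised, low_dose)                         # identity-style anchor
    ssim_term = 1.0 - simple_ssim(denoised, full_dose)          # structural similarity
    return sup + w_adv * adv + w_cyc * cyc + w_idt * idt + w_ssim * ssim_term

# Smoke test with random tensors standing in for network outputs.
den = torch.rand(1, 1, 64, 64, requires_grad=True)
full, low = torch.rand(1, 1, 64, 64), torch.rand(1, 1, 64, 64)
loss = cycle_dcn_style_loss(den, full, low, recycled=low, disc_fake=torch.randn(1, 1))
loss.backward()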
+
+
+
+
+ + On Learning Multi-Modal Forgery Representation for Diffusion Generated + Video Detection + + +
+ Large numbers of synthesized videos from diffusion models pose threats to +information security and authenticity, leading to an increasing demand for +generated content detection. However, existing video-level detection algorithms +primarily focus on detecting facial forgeries and often fail to identify +diffusion-generated content with a diverse range of semantics. To advance the +field of video forensics, we propose an innovative algorithm named Multi-Modal +Detection (MM-Det) for detecting diffusion-generated videos. MM-Det utilizes the +profound perceptual and comprehensive abilities of Large Multi-modal Models +(LMMs) by generating a Multi-Modal Forgery Representation (MMFR) from the LMM's +multi-modal space, enhancing its ability to detect unseen forgery content. +Besides, MM-Det leverages an In-and-Across Frame Attention (IAFA) mechanism for +feature augmentation in the spatio-temporal domain. A dynamic fusion strategy +helps refine forgery representations for fusion. Moreover, we construct a +comprehensive diffusion video dataset, called Diffusion Video Forensics (DVF), +across a wide range of forgery videos. MM-Det achieves state-of-the-art +performance in DVF, demonstrating the effectiveness of our algorithm. Both +source code and DVF are available at https://github.com/SparkleXFantasy/MM-Det. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and + Benchmarking NeurIPS 2024 + + +
+ Benchmarking vision-based driving policies is challenging. On one hand, +open-loop evaluation with real data is easy, but these results do not reflect +closed-loop performance. On the other, closed-loop evaluation is possible in +simulation, but is hard to scale due to its significant computational demands. +Further, the simulators available today exhibit a large domain gap to real +data. This has resulted in an inability to draw clear conclusions from the +rapidly growing body of research on end-to-end autonomous driving. In this +paper, we present NAVSIM, a middle ground between these evaluation paradigms, +where we use large datasets in combination with a non-reactive simulator to +enable large-scale real-world benchmarking. Specifically, we gather +simulation-based metrics, such as progress and time to collision, by unrolling +bird's eye view abstractions of the test scenes for a short simulation horizon. +Our simulation is non-reactive, i.e., the evaluated policy and environment do +not influence each other. As we demonstrate empirically, this decoupling allows +open-loop metric computation while being better aligned with closed-loop +evaluations than traditional displacement errors. NAVSIM enabled a new +competition held at CVPR 2024, where 143 teams submitted 463 entries, resulting +in several new insights. On a large set of challenging scenarios, we observe +that simple methods with moderate compute requirements such as TransFuser can +match recent large-scale end-to-end driving architectures such as UniAD. Our +modular framework can potentially be extended with new datasets, data curation +strategies, and metrics, and will be continually maintained to host future +challenges. Our code is available at +https://github.com/autonomousvision/navsim. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Quantized neural network for complex hologram generation + + +
+ Computer-generated holography (CGH) is a promising technology for augmented +reality displays, such as head-mounted or head-up displays. However, its high +computational demand makes it impractical for implementation. Recent efforts to +integrate neural networks into CGH have successfully accelerated computing +speed, demonstrating the potential to overcome the trade-off between +computational cost and image quality. Nevertheless, deploying neural +network-based CGH algorithms on computationally limited embedded systems +requires more efficient models with lower computational cost, memory footprint, +and power consumption. In this study, we developed a lightweight model for +complex hologram generation by introducing neural network quantization. +Specifically, we built a model based on tensor holography and quantized it from +32-bit floating-point precision (FP32) to 8-bit integer precision (INT8). Our +performance evaluation shows that the proposed INT8 model achieves hologram +quality comparable to that of the FP32 model while reducing the model size by +approximately 70% and increasing the speed fourfold. Additionally, we +implemented the INT8 model on a system-on-module to demonstrate its +deployability on embedded platforms and high power efficiency. + +
+
+ comment: 11 pages, 4 figures +
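The FP32-to-INT8 step itself is standard post-training quantization; a minimal symmetric per-tensor version is sketched below (the paper's tensor-holography model and its calibration details are not reproduced here). INT8 storage alone gives roughly a 4x per-tensor memory reduction, which is consistent with the reported model-size saving of about 70%.

import numpy as np

def quantize_int8(w):
    # Symmetric per-tensor quantization of FP32 weights to INT8.
    scale = np.abs(w).max() / 127.0
    q = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
    return q, scale

def dequantize(q, scale):
    # Recover an approximate FP32 tensor from the INT8 codes.
    return q.astype(np.float32) * scale

w = np.random.randn(256, 256).astype(np.float32)
q, scale = quantize_int8(w)
w_hat = dequantize(q, scale)
print("max abs error:", np.abs(w - w_hat).max())
print("memory: %d -> %d bytes" % (w.nbytes, q.nbytes))  # ~4x smaller per tensor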
+
+
+
+
+ + ♻ ☆ ReNO: Enhancing One-step Text-to-Image Models through Reward-based Noise + Optimization NeurIPS 2024 + + +
+ Text-to-Image (T2I) models have made significant advancements in recent +years, but they still struggle to accurately capture intricate details +specified in complex compositional prompts. While fine-tuning T2I models with +reward objectives has shown promise, it suffers from "reward hacking" and may +not generalize well to unseen prompt distributions. In this work, we propose +Reward-based Noise Optimization (ReNO), a novel approach that enhances T2I +models at inference by optimizing the initial noise based on the signal from +one or multiple human preference reward models. Remarkably, solving this +optimization problem with gradient ascent for 50 iterations yields impressive +results on four different one-step models across two competitive benchmarks, +T2I-CompBench and GenEval. Within a computational budget of 20-50 seconds, +ReNO-enhanced one-step models consistently surpass the performance of all +current open-source Text-to-Image models. Extensive user studies demonstrate +that our model is preferred nearly twice as often compared to the popular SDXL +model and is on par with the proprietary Stable Diffusion 3 with 8B parameters. +Moreover, given the same computational resources, a ReNO-optimized one-step +model outperforms widely-used open-source models such as SDXL and +PixArt-$\alpha$, highlighting the efficiency and effectiveness of ReNO in +enhancing T2I model performance at inference time. Code is available at +https://github.com/ExplainableML/ReNO. + +
+
+ comment: NeurIPS 2024 +
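The core loop described above is gradient-based optimization of the initial noise against a differentiable reward. A schematic sketch follows, with toy stand-ins for the frozen one-step generator and the reward model; the optimizer, learning rate, and tensor shapes are assumptions rather than the paper's settings.

import torch

def reno_style_noise_optimization(generator, reward_model, prompt,
                                  steps=50, lr=0.05, noise_shape=(1, 4, 64, 64)):
    # Optimize the initial latent noise of a frozen one-step T2I generator so
    # that a differentiable reward model scores the generated image higher.
    noise = torch.randn(noise_shape, requires_grad=True)
    opt = torch.optim.SGD([noise], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        image = generator(noise, prompt)      # one-step generation, kept differentiable
        reward = reward_model(image, prompt)  # e.g. a human-preference score
        (-reward).backward()                  # gradient ascent on the reward
        opt.step()
    return noise.detach()

# Toy stand-ins for the frozen generator and reward model.
generator = lambda z, p: torch.tanh(z).mean(dim=1, keepdim=True)
reward_model = lambda img, p: -((img - 0.5) ** 2).mean()
z_star = reno_style_noise_optimization(generator, reward_model, prompt="a red cube")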
+
+
+
+
+ + ♻ ☆ Invisible Image Watermarks Are Provably Removable Using Generative AI NeurIPS 2024 + + +
+ Invisible watermarks safeguard images' copyrights by embedding hidden +messages only detectable by owners. They also prevent people from misusing +images, especially those generated by AI models. We propose a family of +regeneration attacks to remove these invisible watermarks. The proposed attack +method first adds random noise to an image to destroy the watermark and then +reconstructs the image. This approach is flexible and can be instantiated with +many existing image-denoising algorithms and pre-trained generative models such +as diffusion models. Through formal proofs and extensive empirical evaluations, +we demonstrate that pixel-level invisible watermarks are vulnerable to this +regeneration attack. Our results reveal that, across four different pixel-level +watermarking schemes, the proposed method consistently achieves superior +performance compared to existing attack techniques, with lower detection rates +and higher image quality. However, watermarks that keep the image semantically +similar can be an alternative defense against our attacks. Our finding +underscores the need for a shift in research/industry emphasis from invisible +watermarks to semantic-preserving watermarks. Code is available at +https://github.com/XuandongZhao/WatermarkAttacker + +
+
+ comment: NeurIPS 2024 +
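The attack recipe described above is simple enough to sketch directly: add noise to destroy the pixel-level watermark, then reconstruct the image with any denoiser or pretrained generative model. The blur-based "denoiser" below is only a stand-in for the learned denoising or diffusion models actually used.

import torch
import torch.nn.functional as F

def regeneration_attack(watermarked, denoiser, sigma=0.3):
    # Destroy a pixel-level invisible watermark by adding Gaussian noise, then
    # reconstruct the image with an off-the-shelf denoiser (or the denoising
    # step of a pretrained diffusion model).
    noisy = watermarked + sigma * torch.randn_like(watermarked)
    with torch.no_grad():
        return denoiser(noisy.clamp(0.0, 1.0))

# Toy stand-in: a depthwise box blur acting as the "denoiser".
def toy_denoiser(x):
    kernel = torch.ones(3, 1, 5, 5) / 25.0
    return F.conv2d(x, kernel, padding=2, groups=3)

img = torch.rand(1, 3, 64, 64)  # pretend this carries a hidden watermark
attacked = regeneration_attack(img, toy_denoiser)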
+
+
+
+
+ + ♻ ☆ MoVA: Adapting Mixture of Vision Experts to Multimodal Context NeurIPS 2024 + + +
+ As the key component in multimodal large language models (MLLMs), the ability +of the visual encoder greatly affects MLLM's understanding on diverse image +content. Although some large-scale pretrained vision encoders such as vision +encoders in CLIP and DINOv2 have brought promising performance, we found that +there is still no single vision encoder that can dominate various image content +understanding, e.g., the CLIP vision encoder leads to outstanding results on +general image understanding but poor performance on document or chart content. +To alleviate the bias of CLIP vision encoder, we first delve into the inherent +behavior of different pre-trained vision encoders and then propose the MoVA, a +powerful and novel MLLM, adaptively routing and fusing task-specific vision +experts with a coarse-to-fine mechanism. In the coarse-grained stage, we design +a context-aware expert routing strategy to dynamically select the most suitable +vision experts according to the user instruction, input image, and expertise of +vision experts. This benefits from the powerful model function understanding +ability of the large language model (LLM). In the fine-grained stage, we +elaborately conduct the mixture-of-vision-expert adapter (MoV-Adapter) to +extract and fuse task-specific knowledge from various experts. This +coarse-to-fine paradigm effectively leverages representations from experts +based on multimodal context and model expertise, further enhancing the +generalization ability. We conduct extensive experiments to evaluate the +effectiveness of the proposed approach. Without any bells and whistles, MoVA +can achieve significant performance gains over current state-of-the-art methods +in a wide range of challenging multimodal benchmarks. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self + Attention at the Threadblock Level NeurIPS 2024 + + +
+ Neighborhood attention reduces the cost of self attention by restricting each +token's attention span to its nearest neighbors. This restriction, +parameterized by a window size and dilation factor, draws a spectrum of +possible attention patterns between linear projection and self attention. +Neighborhood attention, and more generally sliding window attention patterns, +have long been bounded by infrastructure, particularly in higher-rank spaces +(2-D and 3-D), calling for the development of custom kernels, which have been +limited in either functionality, or performance, if not both. In this work, we +aim to massively improve upon existing infrastructure by providing two new +methods for implementing neighborhood attention. We first show that +neighborhood attention can be represented as a batched GEMM problem, similar to +standard attention, and implement it for 1-D and 2-D neighborhood attention. +These kernels on average provide 895% and 272% improvement in full precision +runtime compared to existing naive CUDA kernels for 1-D and 2-D neighborhood +attention respectively. We find that aside from being heavily bound by memory +bandwidth, certain inherent inefficiencies exist in all unfused implementations +of neighborhood attention, which in most cases undo their theoretical +efficiency gain. Motivated by the progress made into fused dot-product +attention kernels, we developed fused neighborhood attention; an adaptation of +fused dot-product attention kernels that allow fine-grained control over +attention across different spatial axes. Known for reducing the quadratic time +complexity of self attention to a linear complexity, neighborhood attention can +now enjoy a reduced and constant memory footprint, and record-breaking half +precision runtime. We observe that our fused implementation successfully +circumvents some of the unavoidable inefficiencies in unfused +implementations... + +
+
+ comment: To appear in 38th Conference on Neural Information Processing Systems + (NeurIPS 2024) +
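Functionally, 1-D neighborhood attention just restricts each query to a small window of nearby keys; the paper's contribution is implementing this efficiently as batched GEMM and fused kernels, which the naive masked sketch below does not attempt (boundary handling is also simplified here: edge tokens simply see fewer neighbors).

import torch
import torch.nn.functional as F

def neighborhood_attention_1d(q, k, v, window=7):
    # q, k, v: (batch, seq_len, dim); each token attends only to tokens whose
    # index is within window // 2 of its own.
    b, n, d = q.shape
    scores = torch.einsum("bqd,bkd->bqk", q, k) / d ** 0.5
    idx = torch.arange(n)
    mask = (idx[None, :] - idx[:, None]).abs() > window // 2
    scores = scores.masked_fill(mask, float("-inf"))
    attn = F.softmax(scores, dim=-1)
    return torch.einsum("bqk,bkd->bqd", attn, v)

x = torch.randn(2, 128, 32)
out = neighborhood_attention_1d(x, x, x, window=7)
print(out.shape)  # torch.Size([2, 128, 32])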
+
+
+
+
+ + ♻ ☆ Visual place recognition for aerial imagery: A survey + + +
+ Aerial imagery and its direct application to visual localization is an +essential problem for many Robotics and Computer Vision tasks. While Global +Navigation Satellite Systems (GNSS) are the standard default solution for +solving the aerial localization problem, they are subject to a number of +limitations, such as signal instability or solution unreliability, that make +this option less desirable. Consequently, visual geolocalization is emerging +as a viable alternative. However, adapting the Visual Place Recognition (VPR) task +to aerial imagery presents significant challenges, including weather variations +and repetitive patterns. Current VPR reviews largely neglect the specific +context of aerial data. This paper introduces a methodology tailored for +evaluating VPR techniques specifically in the domain of aerial imagery, +providing a comprehensive assessment of various methods and their performance. +Moreover, we not only compare various VPR methods, but also demonstrate the +importance of selecting appropriate zoom and overlap levels when constructing +map tiles to achieve maximum efficiency of VPR algorithms in the case of aerial +imagery. The code is available on our GitHub repository -- +https://github.com/prime-slam/aero-vloc. + +
+
+
+
+
+ + ♻ ☆ Learning Cooperative Trajectory Representations for Motion Forecasting NeurIPS 2024 + + +
+ Motion forecasting is an essential task for autonomous driving, and utilizing +information from infrastructure and other vehicles can enhance forecasting +capabilities. Existing research mainly focuses on leveraging single-frame +cooperative information to enhance the limited perception capability of the ego +vehicle, while underutilizing the motion and interaction context of traffic +participants observed from cooperative devices. In this paper, we propose a +forecasting-oriented representation paradigm to utilize motion and interaction +features from cooperative information. Specifically, we present V2X-Graph, a +representative framework to achieve interpretable and end-to-end trajectory +feature fusion for cooperative motion forecasting. V2X-Graph is evaluated on +V2X-Seq in vehicle-to-infrastructure (V2I) scenarios. To further evaluate on +vehicle-to-everything (V2X) scenario, we construct the first real-world V2X +motion forecasting dataset V2X-Traj, which contains multiple autonomous +vehicles and infrastructure in every scenario. Experimental results on both +V2X-Seq and V2X-Traj show the advantage of our method. We hope both V2X-Graph +and V2X-Traj will benefit the further development of cooperative motion +forecasting. Find the project at https://github.com/AIR-THU/V2X-Graph. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ De-Confusing Pseudo-Labels in Source-Free Domain Adaptation + + +
+ Source-free domain adaptation aims to adapt a source-trained model to an +unlabeled target domain without access to the source data. It has attracted +growing attention in recent years, where existing approaches focus on +self-training that usually includes pseudo-labeling techniques. In this paper, +we introduce a novel noise-learning approach tailored to address noise +distribution in domain adaptation settings and learn to de-confuse the +pseudo-labels. More specifically, we learn a noise transition matrix of the +pseudo-labels to capture the label corruption of each class and learn the +underlying true label distribution. Estimating the noise transition matrix +enables a better true class-posterior estimation, resulting in better +prediction accuracy. We demonstrate the effectiveness of our approach when +combined with several source-free domain adaptation methods: SHOT, SHOT++, and +AaD. We obtain state-of-the-art results on three domain adaptation datasets: +VisDA, DomainNet, and OfficeHome. + +
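The mechanics of the noise-transition-matrix idea can be sketched as follows: the model's clean class posterior is pushed through a learnable matrix T[i, j] that approximates P(pseudo-label j | true label i) and fit to the noisy pseudo-labels, while the clean posterior is used at test time. The architecture, parameterization of T, and optimizer below are illustrative assumptions, not the paper's.

import torch
import torch.nn as nn
import torch.nn.functional as F

n_cls = 5
model = nn.Linear(32, n_cls)                      # stand-in for the adapted target model
T_logits = nn.Parameter(torch.eye(n_cls) * 3.0)   # learnable noise transition matrix

def noisy_posterior(x):
    # Clean class posterior predicted by the model ...
    clean = F.softmax(model(x), dim=-1)
    # ... pushed through T, whose rows sum to one: T[i, j] = P(pseudo = j | true = i).
    T = F.softmax(T_logits, dim=-1)
    return clean @ T

# Fit to (possibly corrupted) pseudo-labels; at test time use the clean posterior.
x = torch.randn(64, 32)
pseudo = torch.randint(0, n_cls, (64,))
opt = torch.optim.Adam(list(model.parameters()) + [T_logits], lr=1e-2)
for _ in range(50):
    opt.zero_grad()
    loss = F.nll_loss(torch.log(noisy_posterior(x) + 1e-8), pseudo)
    loss.backward()
    opt.step()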
+
+
+
+
+ + ♻ ☆ Text-Aware Diffusion for Policy Learning + + +
+ Training an agent to achieve particular goals or perform desired behaviors is +often accomplished through reinforcement learning, especially in the absence of +expert demonstrations. However, supporting novel goals or behaviors through +reinforcement learning requires the ad-hoc design of appropriate reward +functions, which quickly becomes intractable. To address this challenge, we +propose Text-Aware Diffusion for Policy Learning (TADPoLe), which uses a +pretrained, frozen text-conditioned diffusion model to compute dense zero-shot +reward signals for text-aligned policy learning. We hypothesize that +large-scale pretrained generative models encode rich priors that can supervise +a policy to behave not only in a text-aligned manner, but also in alignment +with a notion of naturalness summarized from internet-scale training data. In +our experiments, we demonstrate that TADPoLe is able to learn policies for +novel goal-achievement and continuous locomotion behaviors specified by natural +language, in both Humanoid and Dog environments. The behaviors are learned +zero-shot without ground-truth rewards or expert demonstrations, and are +qualitatively more natural according to human evaluation. We further show that +TADPoLe performs competitively when applied to robotic manipulation tasks in +the Meta-World environment, without having access to any in-domain +demonstrations. + +
+
+
+
+
+ + ♻ ☆ Adversarial Score identity Distillation: Rapidly Surpassing the Teacher + in One Step + + +
+ Score identity Distillation (SiD) is a data-free method that has achieved +state-of-the-art performance in image generation by leveraging only a +pretrained diffusion model, without requiring any training data. However, the +ultimate performance of SiD is constrained by the accuracy with which the +pretrained model captures the true data scores at different stages of the +diffusion process. In this paper, we introduce SiDA (SiD with Adversarial +Loss), which not only enhances generation quality but also improves +distillation efficiency by incorporating real images and adversarial loss. SiDA +utilizes the encoder from the generator's score network as a discriminator, +boosting its ability to distinguish between real images and those generated by +SiD. The adversarial loss is batch-normalized within each GPU and then combined +with the original SiD loss. This integration effectively incorporates the +average "fakeness" per GPU batch into the pixel-based SiD loss, enabling SiDA +to distill a single-step generator either from scratch or by fine-tuning an +existing one. SiDA converges significantly faster than its predecessor when +trained from scratch, and swiftly improves upon the original model's +performance after an initial warmup period during fine-tuning from a +pre-distilled SiD generator. This one-step adversarial distillation method +establishes new benchmarks in generation performance when distilling EDM +diffusion models pretrained on CIFAR-10 (32x32) and ImageNet (64x64), achieving +FID score of 1.110 on ImageNet 64x64. It sets record-low FID scores when +distilling EDM2 models trained on ImageNet (512x512), surpassing even the +largest teacher model, EDM2-XXL. Our SiDA's results record FID scores of 2.156 +for EDM2-XS, 1.669 for EDM2-S, 1.488 for EDM2-M, and 1.465 for EDM2-L, +demonstrating significant improvements across all model sizes. Our open-source +code will be integrated into the SiD codebase. + +
+
+
+
+
+ + ♻ ☆ EraW-Net: Enhance-Refine-Align W-Net for Scene-Associated Driver + Attention Estimation + + +
+ Associating driver attention with driving scene across two fields of views +(FOVs) is a hard cross-domain perception problem, which requires comprehensive +consideration of cross-view mapping, dynamic driving scene analysis, and driver +status tracking. Previous methods typically focus on a single view or map +attention to the scene via estimated gaze, failing to exploit the implicit +connection between them. Moreover, simple fusion modules are insufficient for +modeling the complex relationships between the two views, making information +integration challenging. To address these issues, we propose a novel method for +end-to-end scene-associated driver attention estimation, called EraW-Net. This +method enhances the most discriminative dynamic cues, refines feature +representations, and facilitates semantically aligned cross-domain integration +through a W-shaped architecture, termed W-Net. Specifically, a Dynamic Adaptive +Filter Module (DAF-Module) is proposed to address the challenges of frequently +changing driving environments by extracting vital regions. It suppresses the +indiscriminately recorded dynamics and highlights crucial ones by innovative +joint frequency-spatial analysis, enhancing the model's ability to parse +complex dynamics. Additionally, to track driver states during non-fixed facial +poses, we propose a Global Context Sharing Module (GCS-Module) to construct +refined feature representations by capturing hierarchical features that adapt +to various scales of head and eye movements. Finally, W-Net achieves systematic +cross-view information integration through its "Encoding-Independent Partial +Decoding-Fusion Decoding" structure, addressing semantic misalignment in +heterogeneous data integration. Experiments demonstrate that the proposed +method robustly and accurately estimates the mapping of driver attention in +scene on large public datasets. + +
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ FairSkin: Fair Diffusion for Skin Disease Image Generation + + +
+ Image generation is a prevailing technique for clinical data augmentation for +advancing diagnostic accuracy and reducing healthcare disparities. Diffusion +Model (DM) has become a leading method in generating synthetic medical images, +but it suffers from a critical twofold bias: (1) The quality of images +generated for Caucasian individuals is significantly higher, as measured by the +Frechet Inception Distance (FID). (2) The ability of the downstream-task +learner to learn critical features from disease images varies across different +skin tones. These biases pose significant risks, particularly in skin disease +detection, where underrepresentation of certain skin tones can lead to +misdiagnosis or neglect of specific conditions. To address these challenges, we +propose FairSkin, a novel DM framework that mitigates these biases through a +three-level resampling mechanism, ensuring fairer representation across racial +and disease categories. Our approach significantly improves the diversity and +quality of generated images, contributing to more equitable skin disease +detection in clinical settings. + +
+
+
+
+
+ + ♻ ☆ Low-light Pedestrian Detection in Visible and Infrared Image Feeds: + Issues and Challenges + + +
+ Pedestrian detection has become a cornerstone for several high-level tasks, +including autonomous driving, intelligent transportation, and traffic +surveillance. There are several works focussed on pedestrian detection using +visible images, mainly in the daytime. However, this task becomes much more challenging +when the environmental conditions change to poor lighting or nighttime. +Recently, new ideas have been spurred to use alternative sources, such as Far +InfraRed (FIR) temperature sensor feeds for detecting pedestrians in low-light +conditions. This study reviews recent developments in low-light pedestrian +detection approaches. It systematically categorizes and analyses various +algorithms, from region-based to non-region-based and graph-based learning +approaches, highlighting their methodologies, implementation issues, and +challenges. It also outlines the key benchmark datasets that can be used for +research and development of advanced pedestrian detection algorithms, +particularly in low-light situations. + +
+
+
+
+
+ + ♻ ☆ UDHF2-Net: Uncertainty-diffusion-model-based High-Frequency TransFormer + Network for Remotely Sensed Imagery Interpretation + + +
+ Remotely sensed imagery interpretation (RSII) faces three major problems: +(1) objective representation of spatial distribution patterns; (2) edge +uncertainty problem caused by downsampling encoder and intrinsic edge noises +(e.g., mixed pixels and edge occlusion); and (3) false detection problem +caused by geometric registration error in change detection. To solve the +aforementioned problems, uncertainty-diffusion-model-based high-Frequency +TransFormer network (UDHF2-Net) is the first to be proposed, whose +advantages are as follows: (1) a spatially-stationary-and-non-stationary +high-frequency connection paradigm (SHCP) is proposed to enhance the +interaction of spatially frequency-wise stationary and non-stationary features +to yield high-fidelity edge extraction results. Inspired by HRFormer, SHCP +proposes a high-frequency-wise stream to replace the high-resolution-wise stream in +HRFormer through the whole encoder-decoder process with parallel frequency-wise +high-to-low streams, so it improves the edge extraction accuracy by +continuously retaining high-frequency information; (2) a +mask-and-geo-knowledge-based uncertainty diffusion module (MUDM), which is a +self-supervised learning strategy, is proposed to improve the edge accuracy of +extraction and change detection by gradually removing the simulated spectrum +noises based on geo-knowledge and the generated diffused spectrum noises; (3) a +frequency-wise semi-pseudo-Siamese UDHF2-Net is the first to be proposed to +balance accuracy and complexity for change detection. Besides the +aforementioned spectrum noises in semantic segmentation, MUDM is also a +self-supervised learning strategy to effectively reduce the edge false change +detection from the generated imagery with geometric registration error. + +
+
+
+
+
+ + ♻ ☆ SR-CACO-2: A Dataset for Confocal Fluorescence Microscopy Image + Super-Resolution NeurIPS 2024 + + +
+ Confocal fluorescence microscopy is one of the most accessible and widely +used imaging techniques for the study of biological processes at the cellular +and subcellular levels. Scanning confocal microscopy allows the capture of +high-quality images from thick three-dimensional (3D) samples, yet suffers from +well-known limitations such as photobleaching and phototoxicity of specimens +caused by intense light exposure, limiting its applications. Cellular damage +can be alleviated by changing imaging parameters to reduce light exposure, +often at the expense of image quality. Machine/deep learning methods for +single-image super-resolution (SISR) can be applied to restore image quality by +upscaling lower-resolution (LR) images to yield high-resolution images (HR). +These SISR methods have been successfully applied to photo-realistic images due +partly to the abundance of publicly available data. In contrast, the lack of +publicly available data partly limits their application and success in scanning +confocal microscopy. In this paper, we introduce a large scanning confocal +microscopy dataset named SR-CACO-2 that is comprised of low- and +high-resolution image pairs marked for three different fluorescent markers. It +allows the evaluation of performance of SISR methods on three different +upscaling levels (X2, X4, X8). SR-CACO-2 contains the human epithelial cell +line Caco-2 (ATCC HTB-37), and it is composed of 2,200 unique images, captured +with four resolutions and three markers, forming 9,937 image patches for SISR +methods. We provide benchmarking results for 16 state-of-the-art methods of the +main SISR families. Results show that these methods have limited success in +producing high-resolution textures. The dataset is freely accessible under a +Creative Commons license (CC BY-NC-SA 4.0). Our dataset, code and pretrained +weights for SISR methods are available: https://github.com/sbelharbi/sr-caco-2. + +
+
+ comment: 27 pages, 15 figures, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Eddeep: Fast eddy-current distortion correction for diffusion MRI with + deep learning + + +
+ Modern diffusion MRI sequences commonly acquire a large number of volumes +with diffusion sensitization gradients of differing strengths or directions. +Such sequences rely on echo-planar imaging (EPI) to achieve reasonable scan +duration. However, EPI is vulnerable to off-resonance effects, leading to +tissue susceptibility and eddy-current induced distortions. The latter is +particularly problematic because it causes misalignment between volumes, +disrupting downstream modelling and analysis. The essential correction of eddy +distortions is typically done post-acquisition, with image registration. +However, this is non-trivial because correspondence between volumes can be +severely disrupted due to volume-specific signal attenuations induced by +varying directions and strengths of the applied gradients. This challenge has +been successfully addressed by the popular FSL Eddy tool but at considerable +computational cost. We propose an alternative approach, leveraging recent +advances in image processing enabled by deep learning (DL). It consists of two +convolutional neural networks: 1) An image translator to restore correspondence +between images; 2) A registration model to align the translated images. Results +demonstrate comparable distortion estimates to FSL Eddy, while requiring only +modest training sample sizes. This work, to the best of our knowledge, is the +first to tackle this problem with deep learning. Together with recently +developed DL-based susceptibility correction techniques, they pave the way for +real-time preprocessing of diffusion MRI, facilitating its wider uptake in the +clinic. + +
+
+ comment: accepted in MICCAI 2024 conference +
+
+
+
+
+ + ♻ ☆ SuperFusion: Multilevel LiDAR-Camera Fusion for Long-Range HD Map + Generation ICRA 2024 + + +
+ High-definition (HD) semantic map generation of the environment is an +essential component of autonomous driving. Existing methods have achieved good +performance in this task by fusing different sensor modalities, such as LiDAR +and camera. However, current works are based on raw data or network +feature-level fusion and only consider short-range HD map generation, limiting +their deployment to realistic autonomous driving applications. In this paper, +we focus on the task of building HD maps at short range, i.e., within +30 m, and also predicting long-range HD maps up to 90 m, as required by +downstream path planning and control tasks to improve the smoothness and safety +of autonomous driving. To this end, we propose a novel network named +SuperFusion, exploiting the fusion of LiDAR and camera data at multiple levels. +We use LiDAR depth to improve image depth estimation and use image features to +guide long-range LiDAR feature prediction. We benchmark our SuperFusion on the +nuScenes dataset and a self-recorded dataset and show that it outperforms the +state-of-the-art baseline methods with large margins on all intervals. +Additionally, we apply the generated HD map to a downstream path planning task, +demonstrating that the long-range HD maps predicted by our method can lead to +better path planning for autonomous vehicles. Our code has been released at +https://github.com/haomo-ai/SuperFusion. + +
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ♻ ☆ CoMix: A Comprehensive Benchmark for Multi-Task Comic Understanding NeurIPS 2024 + + +
+ The comic domain is rapidly advancing with the development of single-page +analysis and synthesis models. However, evaluation metrics and datasets lag +behind, often limited to small-scale or single-style test sets. We introduce a +novel benchmark, CoMix, designed to evaluate the multi-task capabilities of +models in comic analysis. Unlike existing benchmarks that focus on isolated +tasks such as object detection or text recognition, CoMix addresses a broader +range of tasks including object detection, speaker identification, character +re-identification, reading order, and multi-modal reasoning tasks like +character naming and dialogue generation. Our benchmark comprises three +existing datasets with expanded annotations to support multi-task evaluation. +To mitigate the over-representation of manga-style data, we have incorporated a +new dataset of carefully selected American comic-style books, thereby enriching +the diversity of comic styles. CoMix is designed to assess pre-trained models +in zero-shot and limited fine-tuning settings, probing their transfer +capabilities across different comic styles and tasks. The validation split of +the benchmark is publicly available for research purposes, and an evaluation +server for the held-out test split is also provided. Comparative results +between human performance and state-of-the-art models reveal a significant +performance gap, highlighting substantial opportunities for advancements in +comic understanding. The dataset, baseline models, and code are accessible at +https://github.com/emanuelevivoli/CoMix-dataset. This initiative sets a new +standard for comprehensive comic analysis, providing the community with a +common benchmark for evaluation on a large and varied set. + +
+
+ comment: Accepted at NeurIPS 2024 (D&B) +
+
+
+
+
+ + ♻ ☆ SERF: Fine-Grained Interactive 3D Segmentation and Editing with Radiance + Fields + + +
+ Although significant progress has been made in the field of 2D-based +interactive editing, fine-grained 3D-based interactive editing remains +relatively unexplored. This limitation can be attributed to two main +challenges: the lack of an efficient 3D representation robust to different +modifications and the absence of an effective 3D interactive segmentation +method. In this paper, we introduce a novel fine-grained interactive 3D +segmentation and editing algorithm with radiance fields, which we refer to as +SERF. Our method entails creating a neural mesh representation by integrating +multi-view algorithms with pre-trained 2D models. Building upon this +representation, we introduce a novel surface rendering technique that preserves +local information and is robust to deformation. Moreover, this representation +forms the basis for achieving accurate and interactive 3D segmentation without +requiring 3D supervision. Harnessing this representation facilitates a range of +interactive 3D editing operations, encompassing tasks such as interactive +geometry editing and texture painting. Extensive experiments and visualization +examples of editing on both real and synthetic data demonstrate the superiority +of our method on representation quality and editing ability. + +
+
+
+
+
+ + ♻ ☆ CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable + Remote Sensing Semantic Segmentation + + +
+ The field of Remote Sensing Domain Generalization (RSDG) has emerged as a +critical and valuable research frontier, focusing on developing models that +generalize effectively across diverse scenarios. Despite the substantial domain +gaps in RS images that are characterized by variabilities such as location, +wavelength, and sensor type, research in this area remains underexplored: (1) +Current cross-domain methods primarily focus on Domain Adaptation (DA), which +adapts models to predefined domains rather than to unseen ones; (2) Few studies +targeting the RSDG issue, especially for semantic segmentation tasks, where +existing models are developed for specific unknown domains, struggling with +issues of underfitting on other unknown scenarios; (3) Existing RS foundation +models tend to prioritize in-domain performance over cross-domain +generalization. To this end, we introduce the first vision foundation model for +RSDG semantic segmentation, CrossEarth. CrossEarth demonstrates strong +cross-domain generalization through a specially designed data-level Earth-Style +Injection pipeline and a model-level Multi-Task Training pipeline. In addition, +for the semantic segmentation task, we have curated an RSDG benchmark +comprising 28 cross-domain settings across various regions, spectral bands, +platforms, and climates, providing a comprehensive framework for testing the +generalizability of future RSDG models. Extensive experiments on this benchmark +demonstrate the superiority of CrossEarth over existing state-of-the-art +methods. + +
+
+ comment: The codes and models will be available at + https://github.com/Cuzyoung/CrossEarth +
+
+
+
+
+ + ♻ ☆ CaptainCook4D: A Dataset for Understanding Errors in Procedural + Activities + + +
+ Following step-by-step procedures is an essential component of various +activities carried out by individuals in their daily lives. These procedures +serve as a guiding framework that helps to achieve goals efficiently, whether +it is assembling furniture or preparing a recipe. However, the complexity and +duration of procedural activities inherently increase the likelihood of making +errors. Understanding such procedural activities from a sequence of frames is a +challenging task that demands an accurate interpretation of visual information +and the ability to reason about the structure of the activity. To this end, we +collect a new egocentric 4D dataset, CaptainCook4D, comprising 384 recordings +(94.5 hours) of people performing recipes in real kitchen environments. This +dataset consists of two distinct types of activity: one in which participants +adhere to the provided recipe instructions and another in which they deviate +and induce errors. We provide 5.3K step annotations and 10K fine-grained action +annotations and benchmark the dataset for the following tasks: supervised error +recognition, multistep localization, and procedure learning + +
+
+ comment: Accepted to the 2024 Neural Information Processing Systems Datasets + and Benchmarks Track, Project Page: + https://captaincook4d.github.io/captain-cook/ +
+
+
+
+
+ + ♻ ☆ Embracing Events and Frames with Hierarchical Feature Refinement Network + for Object Detection ECCV 2024 + + +
+ In frame-based vision, object detection faces substantial performance +degradation under challenging conditions due to the limited sensing capability +of conventional cameras. Event cameras output sparse and asynchronous events, +providing a potential solution to these problems. However, effectively +fusing the two heterogeneous modalities remains an open issue. In this work, we +propose a novel hierarchical feature refinement network for event-frame fusion. +The core concept is the design of the coarse-to-fine fusion module, denoted as +the cross-modality adaptive feature refinement (CAFR) module. In the initial +phase, the bidirectional cross-modality interaction (BCI) part facilitates +information bridging from two distinct sources. Subsequently, the features are +further refined by aligning the channel-level mean and variance in the two-fold +adaptive feature refinement (TAFR) part. We conducted extensive experiments on +two benchmarks: the low-resolution PKU-DDD17-Car dataset and the +high-resolution DSEC dataset. Experimental results show that our method +surpasses the state-of-the-art by an impressive margin of $\textbf{8.0}\%$ on +the DSEC dataset. Besides, our method exhibits significantly better robustness +(\textbf{69.5}\% versus \textbf{38.7}\%) when introducing 15 different +corruption types to the frame images. The code can be found at the link +(https://github.com/HuCaoFighting/FRN). + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Consistency Diffusion Bridge Models NeurIPS 2024 + + +
+ Diffusion models (DMs) have become the dominant paradigm of generative
+modeling in a variety of domains by learning stochastic processes from noise to
+data. Recently, diffusion denoising bridge models (DDBMs), a new formulation of
+generative modeling that builds stochastic processes between fixed data
+endpoints based on a reference diffusion process, have achieved empirical
+success across tasks with coupled data distributions, such as image-to-image
+translation. However, DDBMs' sampling process typically requires hundreds of
+network evaluations to achieve decent performance, which may impede their
+practical deployment due to high computational demands. In this work, inspired
+by the recent advance of consistency models in DMs, we tackle this problem by
+learning the consistency function of the probability-flow ordinary differential
+equation (PF-ODE) of DDBMs, which directly predicts the solution at a starting
+step given any point on the ODE trajectory. Based on a dedicated general-form
+ODE solver, we propose two paradigms: consistency bridge distillation and
+consistency bridge training, which are flexible to apply to DDBMs with broad
+design choices. Experimental results show that our proposed method can sample
+$4\times$ to $50\times$ faster than the base DDBM and produce better visual
+quality given the same number of steps in various tasks with pixel resolution
+ranging from $64 \times 64$ to $256 \times 256$, as well as supporting
+downstream tasks such as semantic interpolation in the data space.
+
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Dessie: Disentanglement for Articulated 3D Horse Shape and Pose + Estimation from Images + + +
+ In recent years, 3D parametric animal models have been developed to aid in +estimating 3D shape and pose from images and video. While progress has been +made for humans, it's more challenging for animals due to limited annotated +data. To address this, we introduce the first method using synthetic data +generation and disentanglement to learn to regress 3D shape and pose. Focusing +on horses, we use text-based texture generation and a synthetic data pipeline +to create varied shapes, poses, and appearances, learning disentangled spaces. +Our method, Dessie, surpasses existing 3D horse reconstruction methods and +generalizes to other large animals like zebras, cows, and deer. See the project +website at: \url{https://celiali.github.io/Dessie/}. + +
+
+ comment: ACCV2024 +
+
+
+
+
+ + ♻ ☆ M3LEO: A Multi-Modal, Multi-Label Earth Observation Dataset Integrating + Interferometric SAR and Multispectral Data + + +
+ Satellite-based remote sensing has revolutionised the way we address global +challenges. Huge quantities of Earth Observation (EO) data are generated by +satellite sensors daily, but processing these large datasets for use in ML +pipelines is technically and computationally challenging. While some +preprocessed Earth observation datasets exist, their content is often limited +to optical or near-optical wavelength data, which is ineffective at night or in +adverse weather conditions. Synthetic Aperture Radar (SAR), an active sensing +technique based on microwave length radiation, offers a viable alternative. +However, the application of machine learning to SAR has been limited due to a +lack of ML-ready data and pipelines, particularly for the full diversity of SAR +data, including polarimetry, coherence and interferometry. In this work, we +introduce M3LEO, a multi-modal, multi-label Earth observation dataset that +includes polarimetric, interferometric, and coherence SAR data derived from +Sentinel-1, alongside multispectral Sentinel-2 imagery and auxiliary data +describing terrain properties such as land use. M3LEO spans approximately 17M +4x4 km data chips from six diverse geographic regions. The dataset is +complemented by a flexible PyTorch Lightning framework configured using Hydra +to accommodate its use across diverse ML applications in Earth observation. We +provide tools to process any dataset available on popular platforms such as +Google Earth Engine for seamless integration with our framework. We show that +the distribution shift in self-supervised embeddings is substantial across +geographic regions, even when controlling for terrain properties. Data: +huggingface.co/M3LEO, Code: github.com/spaceml-org/M3LEO. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Blind Inpainting with Object-aware Discrimination for Artificial Marker + Removal + + +
+ Medical images often incorporate doctor-added markers that can hinder
+AI-based diagnosis. This issue highlights the need for inpainting techniques to
+restore the corrupted visual contents. However, existing methods require manual
+mask annotation as input, limiting the application scenarios. In this paper, we
+propose a novel blind inpainting method that automatically reconstructs visual
+contents within the corrupted regions without mask input as guidance. Our model
+includes a blind reconstruction network and an object-aware discriminator for
+adversarial training. The reconstruction network contains two branches that
+predict corrupted regions in images and simultaneously restore the missing
+visual contents. Leveraging the potent recognition capability of a dense object
+detector, the object-aware discriminator ensures that markers are undetectable
+after inpainting. Thus, the restored images closely resemble the clean ones. We
+evaluate our method on three datasets of various medical imaging modalities,
+confirming better performance than other state-of-the-art methods.
+
+
+
+
+
+ + ♻ ☆ Advancing Video Anomaly Detection: A Concise Review and a New Dataset NeurIPS 2024 + + +
+ Video Anomaly Detection (VAD) finds widespread applications in security +surveillance, traffic monitoring, industrial monitoring, and healthcare. +Despite extensive research efforts, there remains a lack of concise reviews +that provide insightful guidance for researchers. Such reviews would serve as +quick references to grasp current challenges, research trends, and future +directions. In this paper, we present such a review, examining models and +datasets from various perspectives. We emphasize the critical relationship +between model and dataset, where the quality and diversity of datasets +profoundly influence model performance, and dataset development adapts to the +evolving needs of emerging approaches. Our review identifies practical issues, +including the absence of comprehensive datasets with diverse scenarios. To +address this, we introduce a new dataset, Multi-Scenario Anomaly Detection +(MSAD), comprising 14 distinct scenarios captured from various camera views. +Our dataset has diverse motion patterns and challenging variations, such as +different lighting and weather conditions, providing a robust foundation for +training superior models. We conduct an in-depth analysis of recent +representative models using MSAD and highlight its potential in addressing the +challenges of detecting anomalies across diverse and evolving surveillance +scenarios. [Project website: https://msad-dataset.github.io/] + +
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ WildGaussians: 3D Gaussian Splatting in the Wild NeurIPS 2024 + + +
+ While the field of 3D scene reconstruction is dominated by NeRFs due to their +photorealistic quality, 3D Gaussian Splatting (3DGS) has recently emerged, +offering similar quality with real-time rendering speeds. However, both methods +primarily excel with well-controlled 3D scenes, while in-the-wild data - +characterized by occlusions, dynamic objects, and varying illumination - +remains challenging. NeRFs can adapt to such conditions easily through +per-image embedding vectors, but 3DGS struggles due to its explicit +representation and lack of shared parameters. To address this, we introduce +WildGaussians, a novel approach to handle occlusions and appearance changes +with 3DGS. By leveraging robust DINO features and integrating an appearance +modeling module within 3DGS, our method achieves state-of-the-art results. We +demonstrate that WildGaussians matches the real-time rendering speed of 3DGS +while surpassing both 3DGS and NeRF baselines in handling in-the-wild data, all +within a simple architectural framework. + +
+
+ comment: NeurIPS 2024; Project page: https://wild-gaussians.github.io/ +
+
+
+
+
+ + ♻ ☆ FasterDiT: Towards Faster Diffusion Transformers Training without + Architecture Modification NeurIPS 2024 + + +
+ Diffusion Transformers (DiT) have attracted significant attention in
+research. However, they suffer from a slow convergence rate. In this paper, we
+aim to accelerate DiT training without any architectural modification. We
+identify the following issues in the training process: firstly, certain
+training strategies do not consistently perform well across different data.
+Secondly, the effectiveness of supervision at specific timesteps is limited. In
+response, we propose the following contributions: (1) We introduce a new
+perspective for interpreting the failure of these strategies. Specifically, we
+slightly extend the definition of Signal-to-Noise Ratio (SNR) and suggest
+observing the Probability Density Function (PDF) of SNR to understand the
+essence of the data robustness of the strategy. (2) We conduct numerous
+experiments and report over one hundred experimental results to empirically
+summarize a unified acceleration strategy from the perspective of the PDF. (3)
+We develop a new supervision method that further accelerates the training
+process of DiT. Building on these findings, we propose FasterDiT, an
+exceedingly simple and practicable design strategy. With a few lines of code
+modification, it achieves 2.30 FID on ImageNet at 256 resolution after 1000k
+iterations, which is comparable to DiT (2.27 FID) but 7 times faster in
+training.
+
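+
+ Illustrative sketch: one way to "observe the PDF of SNR", here for a standard
+DDPM-style cosine schedule (the paper's extended SNR definition may differ; the
+schedule and bin count are assumptions).
+
+import numpy as np
+
+def cosine_alpha_bar(t: np.ndarray, s: float = 0.008) -> np.ndarray:
+    """Cumulative signal level of the common cosine schedule, t in [0, 1)."""
+    return np.cos((t + s) / (1 + s) * np.pi / 2) ** 2
+
+# Sample timesteps uniformly and compute the log-SNR at each of them.
+t = np.random.rand(100_000)
+alpha_bar = cosine_alpha_bar(t)
+snr = alpha_bar / (1.0 - alpha_bar)
+
+# Empirical PDF of log-SNR: a density-normalized histogram.
+hist, edges = np.histogram(np.log(snr), bins=100, density=True)
+print("mode of empirical log-SNR:", edges[np.argmax(hist)])
+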
+
+ comment: NeurIPS 2024 (poster); update to camera-ready version +
+
+
+
+
+ + ♻ ☆ Rethinking Out-of-Distribution Detection on Imbalanced Data Distribution NeurIPS 2024 + + +
+ Detecting and rejecting unknown out-of-distribution (OOD) samples is critical
+for deployed neural networks to avoid unreliable predictions. In real-world
+scenarios, however, the efficacy of existing OOD detection methods is often
+impeded by the inherent imbalance of in-distribution (ID) data, which causes
+significant performance decline. Through statistical observations, we have
+identified two common challenges faced by different OOD detectors:
+misidentifying tail-class ID samples as OOD, while erroneously predicting OOD
+samples as ID head classes. To explain this phenomenon, we introduce a
+generalized statistical framework, termed ImOOD, to formulate the OOD detection
+problem on imbalanced data distributions. The theoretical analysis reveals that
+there exists a class-aware bias term between balanced and imbalanced OOD
+detection, which contributes to the performance gap. Building upon this
+finding, we present a unified training-time regularization technique to
+mitigate the bias and boost imbalanced OOD detectors across architecture
+designs. Our theoretically grounded method translates into consistent
+improvements on the representative CIFAR10-LT, CIFAR100-LT, and ImageNet-LT
+benchmarks against several state-of-the-art OOD detection approaches. Code is
+available at https://github.com/alibaba/imood.
+
+
+ comment: This paper has been accepted by NeurIPS 2024. Code is available at + https://github.com/alibaba/imood +
+
+
+
+
+ + ♻ ☆ Subsurface Scattering for 3D Gaussian Splatting + + +
+ 3D reconstruction and relighting of objects made from scattering materials +present a significant challenge due to the complex light transport beneath the +surface. 3D Gaussian Splatting introduced high-quality novel view synthesis at +real-time speeds. While 3D Gaussians efficiently approximate an object's +surface, they fail to capture the volumetric properties of subsurface +scattering. We propose a framework for optimizing an object's shape together +with the radiance transfer field given multi-view OLAT (one light at a time) +data. Our method decomposes the scene into an explicit surface represented as +3D Gaussians, with a spatially varying BRDF, and an implicit volumetric +representation of the scattering component. A learned incident light field +accounts for shadowing. We optimize all parameters jointly via ray-traced +differentiable rendering. Our approach enables material editing, relighting and +novel view synthesis at interactive rates. We show successful application on +synthetic data and introduce a newly acquired multi-view multi-light dataset of +objects in a light-stage setup. Compared to previous work we achieve comparable +or better results at a fraction of optimization and rendering time while +enabling detailed control over material attributes. Project page +https://sss.jdihlmann.com/ + +
+
+ comment: Project page: https://sss.jdihlmann.com/ +
+
+
+
+
+ + ♻ ☆ UNION: Unsupervised 3D Object Detection using Object Appearance-based + Pseudo-Classes NeurIPS'24 + + +
+ Unsupervised 3D object detection methods have emerged to leverage vast
+amounts of data without requiring manual labels for training. Recent approaches
+rely on dynamic objects for learning to detect mobile objects but penalize the
+detections of static instances during training. Multiple rounds of (self)
+training are used to add detected static instances to the set of training
+targets; this procedure to improve performance is computationally expensive. To
+address this, we propose the method UNION. We use spatial clustering and
+self-supervised scene flow to obtain a set of static and dynamic object
+proposals from LiDAR. Subsequently, the visual appearance of each object
+proposal is encoded, and static foreground instances are distinguished from the
+background by selecting those that are visually similar to dynamic objects. As
+a result, static and dynamic mobile objects are obtained together, and existing
+detectors can be trained in a single training round. In addition, we extend 3D
+object discovery to detection by using object appearance-based cluster labels
+as pseudo-class labels for training object classification. We conduct extensive
+experiments on the nuScenes dataset and increase the state-of-the-art
+performance for unsupervised 3D object discovery, i.e., UNION more than doubles
+the average precision to 38.4. The code is available at
+github.com/TedLentsch/UNION.
+
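+
+ Illustrative sketch (parameters are placeholders, not the paper's values): the
+generic "cluster LiDAR points, then split proposals by estimated motion" step.
+
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+def propose_and_split(points_xyz: np.ndarray, flow_xyz: np.ndarray,
+                      eps: float = 0.7, min_samples: int = 10,
+                      dyn_thresh: float = 0.5):
+    """Cluster (ground-removed) LiDAR points and mark each cluster static/dynamic.
+
+    points_xyz: (N, 3) points; flow_xyz: (N, 3) per-point scene flow.
+    Returns a list of (point_indices, is_dynamic) proposals.
+    """
+    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(points_xyz)
+    proposals = []
+    for cid in set(labels) - {-1}:          # -1 marks noise points
+        idx = np.where(labels == cid)[0]
+        speed = np.linalg.norm(flow_xyz[idx], axis=1).mean()
+        proposals.append((idx, bool(speed > dyn_thresh)))
+    return proposals
+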
+
+ comment: NeurIPS'24 +
+
+
+
+
+ + ♻ ☆ NM-FlowGAN: Modeling sRGB Noise without Paired Images using a Hybrid + Approach of Normalizing Flows and GAN + + +
+ Modeling and synthesizing real sRGB noise is crucial for various low-level +vision tasks, such as building datasets for training image denoising systems. +The distribution of real sRGB noise is highly complex and affected by a +multitude of factors, making its accurate modeling extremely challenging. +Therefore, recent studies have proposed methods that employ data-driven +generative models, such as Generative Adversarial Networks (GAN) and +Normalizing Flows. These studies achieve more accurate modeling of sRGB noise +compared to traditional noise modeling methods. However, there are performance +limitations due to the inherent characteristics of each generative model. To +address this issue, we propose NM-FlowGAN, a hybrid approach that exploits the +strengths of both GAN and Normalizing Flows. We combine pixel-wise noise +modeling networks based on Normalizing Flows and spatial correlation modeling +networks based on GAN. Specifically, the pixel-wise noise modeling network +leverages the high training stability of Normalizing Flows to capture noise +characteristics that are affected by a multitude of factors, and the spatial +correlation networks efficiently model pixel-to-pixel relationships. In +particular, unlike recent methods that rely on paired noisy images, our method +synthesizes noise using clean images and factors that affect noise +characteristics, such as easily obtainable parameters like camera type and ISO +settings, making it applicable to various fields where obtaining noisy-clean +image pairs is not feasible. In our experiments, our NM-FlowGAN outperforms +other baselines in the sRGB noise synthesis task. Moreover, the denoising +neural network trained with synthesized image pairs from our model shows +superior performance compared to other baselines. Our code is available at: +\url{https://github.com/YoungJooHan/NM-FlowGAN}. + +
+
+ comment: 13 pages, 10 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ PuLID: Pure and Lightning ID Customization via Contrastive Alignment NeurIPS 2024 + + +
+ We propose Pure and Lightning ID customization (PuLID), a novel tuning-free +ID customization method for text-to-image generation. By incorporating a +Lightning T2I branch with a standard diffusion one, PuLID introduces both +contrastive alignment loss and accurate ID loss, minimizing disruption to the +original model and ensuring high ID fidelity. Experiments show that PuLID +achieves superior performance in both ID fidelity and editability. Another +attractive property of PuLID is that the image elements (e.g., background, +lighting, composition, and style) before and after the ID insertion are kept as +consistent as possible. Codes and models are available at +https://github.com/ToTheBeginning/PuLID + +
+
+ comment: NeurIPS 2024. Codes and models are available at + https://github.com/ToTheBeginning/PuLID +
+
+
+
+
+ + ♻ ☆ MambaEviScrib: Mamba and Evidence-Guided Consistency Enhance CNN + Robustness for Scribble-Based Weakly Supervised Ultrasound Image Segmentation + + +
+ Segmenting anatomical structures and lesions from ultrasound images
+contributes to disease assessment. Weakly supervised learning (WSL) based on
+sparse annotation has achieved encouraging performance and demonstrated the
+potential to reduce annotation costs. This study attempts to introduce
+scribble-based WSL into ultrasound image segmentation tasks. However,
+ultrasound images often suffer from poor contrast and unclear edges, coupled
+with insufficient supervision signals for edges, posing challenges to edge
+prediction. Uncertainty modeling has been proven to facilitate models in
+dealing with these issues. Nevertheless, existing uncertainty estimation
+paradigms are not robust enough and often filter out predictions near decision
+boundaries, resulting in unstable edge predictions. Therefore, we propose
+leveraging predictions near decision boundaries effectively. Specifically, we
+introduce the Dempster-Shafer Theory (DST) of evidence to design an
+Evidence-Guided Consistency (EGC) strategy. This strategy utilizes
+high-evidence predictions, which are more likely to occur near high-density
+regions, to guide the optimization of low-evidence predictions that may appear
+near decision boundaries. Furthermore, the diverse sizes and locations of
+lesions in ultrasound images pose a challenge for CNNs with local receptive
+fields, as they struggle to model global information. Therefore, we introduce
+Visual Mamba based on structured state space sequence models, which achieves
+long-range dependency with linear computational complexity, and we construct a
+novel hybrid CNN-Mamba framework. During training, the CNN branch and the Mamba
+branch in the proposed framework learn from each other based on the EGC
+strategy. Experiments demonstrate the competitiveness of the proposed method.
+The dataset and code will be available at
+https://github.com/GtLinyer/MambaEviScrib.
+
+
+
+
+
+ + ♻ ☆ Stabilize the Latent Space for Image Autoregressive Modeling: A Unified + Perspective NeurIPS 2024 + + +
+ Latent-based image generative models, such as Latent Diffusion Models (LDMs) +and Mask Image Models (MIMs), have achieved notable success in image generation +tasks. These models typically leverage reconstructive autoencoders like VQGAN +or VAE to encode pixels into a more compact latent space and learn the data +distribution in the latent space instead of directly from pixels. However, this +practice raises a pertinent question: Is it truly the optimal choice? In +response, we begin with an intriguing observation: despite sharing the same +latent space, autoregressive models significantly lag behind LDMs and MIMs in +image generation. This finding contrasts sharply with the field of NLP, where +the autoregressive model GPT has established a commanding presence. To address +this discrepancy, we introduce a unified perspective on the relationship +between latent space and generative models, emphasizing the stability of latent +space in image generative modeling. Furthermore, we propose a simple but +effective discrete image tokenizer to stabilize the latent space for image +generative modeling by applying K-Means on the latent features of +self-supervised learning models. Experimental results show that image +autoregressive modeling with our tokenizer (DiGIT) benefits both image +understanding and image generation with the next token prediction principle, +which is inherently straightforward for GPT models but challenging for other +generative models. Remarkably, for the first time, a GPT-style autoregressive +model for images outperforms LDMs, which also exhibits substantial improvement +akin to GPT when scaling up model size. Our findings underscore the potential +of an optimized latent space and the integration of discrete tokenization in +advancing the capabilities of image generative models. The code is available at +\url{https://github.com/DAMO-NLP-SG/DiGIT}. + +
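+
+ Illustrative sketch of the "K-Means on self-supervised features as a discrete
+image tokenizer" idea; the feature source, codebook size, and shapes are toy
+assumptions.
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+# Stand-in for patch features from a frozen self-supervised encoder,
+# shape (num_patches_total, feature_dim).
+rng = np.random.default_rng(0)
+features = rng.normal(size=(10_000, 768)).astype(np.float32)
+
+# Fit the codebook once on a large feature collection.
+codebook = KMeans(n_clusters=1024, n_init=10, random_state=0).fit(features)
+
+def tokenize(patch_features: np.ndarray) -> np.ndarray:
+    """Map each patch feature to the id of its nearest K-Means centroid."""
+    return codebook.predict(patch_features)
+
+token_ids = tokenize(features[:256])  # e.g. a 16x16 grid of discrete tokens
+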
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ GIC: Gaussian-Informed Continuum for Physical Property Identification + and Simulation NeurIPS 2024 + + +
+ This paper studies the problem of estimating physical properties (system
+identification) through visual observations. To facilitate geometry-aware
+guidance in physical property estimation, we introduce a novel hybrid framework
+that leverages 3D Gaussian representation to not only capture explicit shapes
+but also enable the simulated continuum to render object masks as 2D shape
+surrogates during training. We propose a new dynamic 3D Gaussian framework
+based on motion factorization to recover the object as 3D Gaussian point sets
+across different time states. Furthermore, we develop a coarse-to-fine filling
+strategy to generate the density fields of the object from the Gaussian
+reconstruction, allowing for the extraction of object continua along with
+their surfaces and the integration of Gaussian attributes into these continua.
+In addition to the extracted object surfaces, the Gaussian-informed continuum
+also enables the rendering of object masks during simulations, serving as
+2D-shape guidance for physical property estimation. Extensive experimental
+evaluations demonstrate that our pipeline achieves state-of-the-art performance
+across multiple benchmarks and metrics. Additionally, we illustrate the
+effectiveness of the proposed method through real-world demonstrations,
+showcasing its practical utility. Our project page is at
+https://jukgei.github.io/project/gic.
+
+
+ comment: 21 pages, 8 figures, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ProbTalk3D: Non-Deterministic Emotion Controllable Speech-Driven 3D + Facial Animation Synthesis Using VQ-VAE SIGGRAPH + + +
+ Audio-driven 3D facial animation synthesis has been an active field of
+research with attention from both academia and industry. While there are
+promising results in this area, recent approaches largely focus on lip-sync and
+identity control, neglecting the role of emotions and emotion control in the
+generative process. That is mainly due to the lack of emotionally rich facial
+animation data and algorithms that can synthesize speech animations with
+emotional expressions at the same time. In addition, the majority of models are
+deterministic, meaning that given the same audio input, they produce the same
+output motion. We argue that emotions and non-determinism are crucial to
+generate diverse and emotionally rich facial animations. In this paper, we
+propose ProbTalk3D, a non-deterministic neural network approach for emotion
+controllable speech-driven 3D facial animation synthesis using a two-stage
+VQ-VAE model and an emotionally rich facial animation dataset, 3DMEAD. We
+provide an extensive comparative analysis of our model against the recent 3D
+facial animation synthesis approaches, by evaluating the results objectively,
+qualitatively, and with a perceptual user study. We highlight several objective
+metrics that are more suitable for evaluating stochastic outputs and use both
+in-the-wild and ground truth data for subjective evaluation. To our knowledge,
+this is the first non-deterministic 3D facial animation synthesis method
+incorporating a rich emotion dataset and emotion control with emotion labels
+and intensity levels. Our evaluation demonstrates that the proposed model
+achieves superior performance compared to state-of-the-art emotion-controlled,
+deterministic and non-deterministic models. We recommend watching the
+supplementary video for quality judgement. The entire codebase is publicly
+available at https://github.com/uuembodiedsocialai/ProbTalk3D/.
+
+
+ comment: 14 pages, 9 figures, 3 tables. Includes code. Accepted at ACM + SIGGRAPH MIG 2024 +
+
+
+
+
+ + ♻ ☆ Rapid Plug-in Defenders + + +
+ In the realm of daily services, the deployment of deep neural networks +underscores the paramount importance of their reliability. However, the +vulnerability of these networks to adversarial attacks, primarily +evasion-based, poses a concerning threat to their functionality. Common methods +for enhancing robustness involve heavy adversarial training or leveraging +learned knowledge from clean data, both necessitating substantial computational +resources. This inherent time-intensive nature severely limits the agility of +large foundational models to swiftly counter adversarial perturbations. To +address this challenge, this paper focuses on the Rapid Plug-in Defender +(RaPiD) problem, aiming to rapidly counter adversarial perturbations without +altering the deployed model. Drawing inspiration from the generalization and +the universal computation ability of pre-trained transformer models, we propose +a novel method termed CeTaD (Considering Pre-trained Transformers as Defenders) +for RaPiD, optimized for efficient computation. CeTaD strategically fine-tunes +the normalization layer parameters within the defender using a limited set of +clean and adversarial examples. Our evaluation centers on assessing CeTaD's +effectiveness, transferability, and the impact of different components in +scenarios involving one-shot adversarial examples. The proposed method is +capable of rapidly adapting to various attacks and different application +scenarios without altering the target model and clean training data. We also +explore the influence of varying training data conditions on CeTaD's +performance. Notably, CeTaD exhibits adaptability across differentiable service +models and proves the potential of continuous learning. + +
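+
+ Illustrative sketch of the general recipe implied above: freeze a pre-trained
+transformer and fine-tune only its normalization-layer parameters (the model,
+optimizer, and hyperparameters are placeholders, not the paper's configuration).
+
+import torch
+import torch.nn as nn
+
+def make_norm_only_trainable(model: nn.Module):
+    """Freeze everything, then re-enable gradients for LayerNorm weights/biases."""
+    for p in model.parameters():
+        p.requires_grad = False
+    for m in model.modules():
+        if isinstance(m, nn.LayerNorm):
+            for p in m.parameters():
+                p.requires_grad = True
+    return [p for p in model.parameters() if p.requires_grad]
+
+layer = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
+defender = nn.TransformerEncoder(layer, num_layers=4)
+trainable = make_norm_only_trainable(defender)
+optimizer = torch.optim.Adam(trainable, lr=1e-3)  # updates only the norm layers
+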
+
+
+
+
+ + ♻ ☆ CAT: Coordinating Anatomical-Textual Prompts for Multi-Organ and Tumor + Segmentation + + +
+ Existing promptable segmentation methods in the medical imaging field +primarily consider either textual or visual prompts to segment relevant +objects, yet they often fall short when addressing anomalies in medical images, +like tumors, which may vary greatly in shape, size, and appearance. Recognizing +the complexity of medical scenarios and the limitations of textual or visual +prompts, we propose a novel dual-prompt schema that leverages the complementary +strengths of visual and textual prompts for segmenting various organs and +tumors. Specifically, we introduce CAT, an innovative model that Coordinates +Anatomical prompts derived from 3D cropped images with Textual prompts enriched +by medical domain knowledge. The model architecture adopts a general +query-based design, where prompt queries facilitate segmentation queries for +mask prediction. To synergize two types of prompts within a unified framework, +we implement a ShareRefiner, which refines both segmentation and prompt queries +while disentangling the two types of prompts. Trained on a consortium of 10 +public CT datasets, CAT demonstrates superior performance in multiple +segmentation tasks. Further validation on a specialized in-house dataset +reveals the remarkable capacity of segmenting tumors across multiple cancer +stages. This approach confirms that coordinating multimodal prompts is a +promising avenue for addressing complex scenarios in the medical domain. + +
+
+
+
+
+ + ♻ ☆ Optical Diffusion Models for Image Generation + + +
+ Diffusion models generate new samples by progressively decreasing the noise
+from the initially provided random distribution. This inference procedure
+generally utilizes a trained neural network numerous times to obtain the final
+output, creating significant latency and energy consumption on digital
+electronic hardware such as GPUs. In this study, we demonstrate that the
+propagation of a light beam through a semi-transparent medium can be programmed
+to implement a denoising diffusion model on image samples. This framework
+projects noisy image patterns through passive diffractive optical layers, which
+collectively transmit only the predicted noise term in the image. The
+transparent optical layers, trained with an online approach that backpropagates
+the error to an analytical model of the system, are passive and remain fixed
+across the different denoising steps. Hence, this method enables high-speed
+image generation with minimal power consumption, benefiting from the bandwidth
+and energy efficiency of optical information processing.
+
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ SSL-NBV: A Self-Supervised-Learning-Based Next-Best-View algorithm for + Efficient 3D Plant Reconstruction by a Robot + + +
+ The 3D reconstruction of plants is challenging due to their complex shape
+causing many occlusions. Next-Best-View (NBV) methods address this by
+iteratively selecting new viewpoints to maximize information gain (IG).
+Deep-learning-based NBV (DL-NBV) methods demonstrate higher computational
+efficiency than classic voxel-based NBV approaches, but current methods require
+extensive training using ground-truth plant models, making them impractical for
+real-world plants. These methods, moreover, rely on offline training with
+pre-collected data, limiting adaptability in changing agricultural
+environments. This paper proposes a self-supervised learning-based NBV method
+(SSL-NBV) that uses a deep neural network to predict the IG for candidate
+viewpoints. The method allows the robot to gather its own training data during
+task execution by comparing new 3D sensor data to the earlier gathered data and
+by employing weakly-supervised learning and experience replay for efficient
+online learning. Comprehensive evaluations were conducted in simulation and
+real-world environments using cross-validation. The results showed that SSL-NBV
+required fewer views for plant reconstruction than non-NBV methods and was over
+800 times faster than a voxel-based method. SSL-NBV reduced training
+annotations by over 90% compared to a baseline DL-NBV. Furthermore, SSL-NBV
+could adapt to novel scenarios through online fine-tuning. The results with
+real plants also showed that the proposed method can learn to plan new
+viewpoints for 3D plant reconstruction effectively. Most importantly, SSL-NBV
+automates the entire network training and uses continuous online learning,
+allowing it to operate in changing agricultural environments.
+
+
+ comment: 22 pages, 11 figures, 1 table +
+
+
+
+
+ + ♻ ☆ ConDiSR: Contrastive Disentanglement and Style Regularization for Single + Domain Generalization + + +
+ Medical data often exhibits distribution shifts, which cause test-time
+performance degradation for deep learning models trained using standard
+supervised learning pipelines. This challenge is addressed in the field of
+Domain Generalization (DG), with the sub-field of Single Domain Generalization
+(SDG) being specifically interesting due to the privacy- or logistics-related
+issues often associated with medical data. Existing disentanglement-based SDG
+methods heavily rely on structural information embedded in segmentation masks;
+however, classification labels do not provide such dense information. This work
+introduces a novel SDG method aimed at medical image classification that
+leverages channel-wise contrastive disentanglement. It is further enhanced with
+reconstruction-based style regularization to ensure extraction of distinct
+style and structure feature representations. We evaluate our method on the
+complex task of multicenter histopathology image classification, comparing it
+against state-of-the-art (SOTA) SDG baselines. Results demonstrate that our
+method surpasses the SOTA by a margin of 1% in average accuracy while also
+showing more stable performance. This study highlights the importance and
+challenges of exploring SDG frameworks in the context of the classification
+task. The code is publicly available at
+https://github.com/BioMedIA-MBZUAI/ConDiSR.
+
+
+ comment: A flaw was found in the results acquisition +
+
+
+
+
+ + ♻ ☆ DF40: Toward Next-Generation Deepfake Detection + + +
+ We propose a new comprehensive benchmark to revolutionize the current +deepfake detection field to the next generation. Predominantly, existing works +identify top-notch detection algorithms and models by adhering to the common +practice: training detectors on one specific dataset (e.g., FF++) and testing +them on other prevalent deepfake datasets. This protocol is often regarded as a +"golden compass" for navigating SoTA detectors. But can these stand-out +"winners" be truly applied to tackle the myriad of realistic and diverse +deepfakes lurking in the real world? If not, what underlying factors contribute +to this gap? In this work, we found the dataset (both train and test) can be +the "primary culprit" due to: (1) forgery diversity: Deepfake techniques are +commonly referred to as both face forgery and entire image synthesis. Most +existing datasets only contain partial types of them, with limited forgery +methods implemented; (2) forgery realism: The dominated training dataset, FF++, +contains out-of-date forgery techniques from the past four years. "Honing +skills" on these forgeries makes it difficult to guarantee effective detection +generalization toward nowadays' SoTA deepfakes; (3) evaluation protocol: Most +detection works perform evaluations on one type, which hinders the development +of universal deepfake detectors. To address this dilemma, we construct a highly +diverse deepfake detection dataset called DF40, which comprises 40 distinct +deepfake techniques. We then conduct comprehensive evaluations using 4 standard +evaluation protocols and 8 representative detection methods, resulting in over +2,000 evaluations. Through these evaluations, we provide an extensive analysis +from various perspectives, leading to 7 new insightful findings. We also open +up 4 valuable yet previously underexplored research questions to inspire future +works. Our project page is https://github.com/YZY-stack/DF40. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2108.05080 by other authors +
+
+
+
+
+ + ♻ ☆ G3: An Effective and Adaptive Framework for Worldwide Geolocalization + Using Large Multi-Modality Models NeurIPS2024 + + +
+ Worldwide geolocalization aims to locate the precise location at the +coordinate level of photos taken anywhere on the Earth. It is very challenging +due to 1) the difficulty of capturing subtle location-aware visual semantics, +and 2) the heterogeneous geographical distribution of image data. As a result, +existing studies have clear limitations when scaled to a worldwide context. +They may easily confuse distant images with similar visual contents, or cannot +adapt to various locations worldwide with different amounts of relevant data. +To resolve these limitations, we propose G3, a novel framework based on +Retrieval-Augmented Generation (RAG). In particular, G3 consists of three +steps, i.e., Geo-alignment, Geo-diversification, and Geo-verification to +optimize both retrieval and generation phases of worldwide geolocalization. +During Geo-alignment, our solution jointly learns expressive multi-modal +representations for images, GPS and textual descriptions, which allows us to +capture location-aware semantics for retrieving nearby images for a given +query. During Geo-diversification, we leverage a prompt ensembling method that +is robust to inconsistent retrieval performance for different image queries. +Finally, we combine both retrieved and generated GPS candidates in +Geo-verification for location prediction. Experiments on two well-established +datasets IM2GPS3k and YFCC4k verify the superiority of G3 compared to other +state-of-the-art methods. Our code and data are available online for +reproduction. + +
+
+ comment: Accepted to NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ Improving Adversarial Robust Fairness via Anti-Bias Soft Label + Distillation NeurIPS2024 + + +
+ Adversarial Training (AT) has been widely proven to be an effective method to
+improve the adversarial robustness against adversarial examples for Deep Neural
+Networks (DNNs). As a variant of AT, Adversarial Robustness Distillation (ARD)
+has demonstrated its superior performance in improving the robustness of small
+student models with the guidance of large teacher models. However, both AT and
+ARD encounter the robust fairness problem: these models exhibit strong
+robustness when facing some classes (easy classes), but weak robustness when
+facing others (hard classes). In this paper, we give an in-depth analysis of
+the potential factors and argue that the smoothness degree of samples' soft
+labels for different classes (i.e., hard or easy classes) will affect the
+robust fairness of DNNs from both empirical observation and theoretical
+analysis. Based on the above finding, we propose an Anti-Bias Soft Label
+Distillation (ABSLD) method to mitigate the adversarial robust fairness problem
+within the framework of Knowledge Distillation (KD). Specifically, ABSLD
+adaptively reduces the student's error risk gap between different classes to
+achieve fairness by adjusting the class-wise smoothness degree of samples' soft
+labels during the training process, and the smoothness degree of soft labels is
+controlled by assigning different temperatures in KD to different classes.
+Extensive experiments demonstrate that ABSLD outperforms state-of-the-art AT,
+ARD, and robust fairness methods in the comprehensive metric (Normalized
+Standard Deviation) of robustness and fairness.
+
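+
+ Illustrative sketch of knowledge distillation with a class-wise temperature;
+the adaptive rule for choosing the temperatures is the paper's contribution and
+is not reproduced here, so the values below are fixed placeholders.
+
+import torch
+import torch.nn.functional as F
+
+def classwise_temperature_kd(student_logits, teacher_logits, targets, class_temps):
+    """KL distillation where each sample uses the temperature of its true class.
+
+    student_logits, teacher_logits: (B, C); targets: (B,); class_temps: (C,).
+    """
+    t = class_temps[targets].unsqueeze(1)            # (B, 1) per-sample temperature
+    soft_targets = F.softmax(teacher_logits / t, dim=1)
+    log_student = F.log_softmax(student_logits / t, dim=1)
+    return F.kl_div(log_student, soft_targets, reduction="batchmean")
+
+logits_s, logits_t = torch.randn(8, 10), torch.randn(8, 10)
+y = torch.randint(0, 10, (8,))
+temps = torch.full((10,), 4.0)
+temps[3] = 2.0   # e.g. a sharper soft label for one class (direction is a design choice)
+loss = classwise_temperature_kd(logits_s, logits_t, y, temps)
+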
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ iVideoGPT: Interactive VideoGPTs are Scalable World Models NeurIPS 2024 + + +
+ World models empower model-based agents to interactively explore, reason, and +plan within imagined environments for real-world decision-making. However, the +high demand for interactivity poses challenges in harnessing recent +advancements in video generative models for developing world models at scale. +This work introduces Interactive VideoGPT (iVideoGPT), a scalable +autoregressive transformer framework that integrates multimodal signals--visual +observations, actions, and rewards--into a sequence of tokens, facilitating an +interactive experience of agents via next-token prediction. iVideoGPT features +a novel compressive tokenization technique that efficiently discretizes +high-dimensional visual observations. Leveraging its scalable architecture, we +are able to pre-train iVideoGPT on millions of human and robotic manipulation +trajectories, establishing a versatile foundation that is adaptable to serve as +interactive world models for a wide range of downstream tasks. These include +action-conditioned video prediction, visual planning, and model-based +reinforcement learning, where iVideoGPT achieves competitive performance +compared with state-of-the-art methods. Our work advances the development of +interactive general world models, bridging the gap between generative video +models and practical model-based reinforcement learning applications. Code and +pre-trained models are available at https://thuml.github.io/iVideoGPT. + +
+
+ comment: NeurIPS 2024. Code is available at project website: + https://thuml.github.io/iVideoGPT +
+
+
+
+
+ + ♻ ☆ Rethinking Human Evaluation Protocol for Text-to-Video Models: Enhancing + Reliability,Reproducibility, and Practicality + + +
+ Recent text-to-video (T2V) technology advancements, as demonstrated by models +such as Gen2, Pika, and Sora, have significantly broadened its applicability +and popularity. Despite these strides, evaluating these models poses +substantial challenges. Primarily, due to the limitations inherent in automatic +metrics, manual evaluation is often considered a superior method for assessing +T2V generation. However, existing manual evaluation protocols face +reproducibility, reliability, and practicality issues. To address these +challenges, this paper introduces the Text-to-Video Human Evaluation (T2VHE) +protocol, a comprehensive and standardized protocol for T2V models. The T2VHE +protocol includes well-defined metrics, thorough annotator training, and an +effective dynamic evaluation module. Experimental results demonstrate that this +protocol not only ensures high-quality annotations but can also reduce +evaluation costs by nearly 50\%. We will open-source the entire setup of the +T2VHE protocol, including the complete protocol workflow, the dynamic +evaluation component details, and the annotation interface code. This will help +communities establish more sophisticated human assessment protocols. + +
+
+
+
+
+ + ♻ ☆ PhyRecon: Physically Plausible Neural Scene Reconstruction NeurIPS'24 + + +
+ We address the issue of physical implausibility in multi-view neural +reconstruction. While implicit representations have gained popularity in +multi-view 3D reconstruction, previous work struggles to yield physically +plausible results, limiting their utility in domains requiring rigorous +physical accuracy. This lack of plausibility stems from the absence of physics +modeling in existing methods and their inability to recover intricate +geometrical structures. In this paper, we introduce PHYRECON, the first +approach to leverage both differentiable rendering and differentiable physics +simulation to learn implicit surface representations. PHYRECON features a novel +differentiable particle-based physical simulator built on neural implicit +representations. Central to this design is an efficient transformation between +SDF-based implicit representations and explicit surface points via our proposed +Surface Points Marching Cubes (SP-MC), enabling differentiable learning with +both rendering and physical losses. Additionally, PHYRECON models both +rendering and physical uncertainty to identify and compensate for inconsistent +and inaccurate monocular geometric priors. The physical uncertainty further +facilitates physics-guided pixel sampling to enhance the learning of slender +structures. By integrating these techniques, our model supports differentiable +joint modeling of appearance, geometry, and physics. Extensive experiments +demonstrate that PHYRECON significantly improves the reconstruction quality. +Our results also exhibit superior physical stability in physical simulators, +with at least a 40% improvement across all datasets, paving the way for future +physics-based applications. + +
+
+ comment: NeurIPS'24. Project page: https://phyrecon.github.io/ +
+
+
+
+
+ + ♻ ☆ Model LEGO: Creating Models Like Disassembling and Assembling Building + Blocks + + +
+ With the rapid development of deep learning, the increasing complexity and +scale of parameters make training a new model increasingly resource-intensive. +In this paper, we start from the classic convolutional neural network (CNN) and +explore a paradigm that does not require training to obtain new models. Similar +to the birth of CNN inspired by receptive fields in the biological visual +system, we draw inspiration from the information subsystem pathways in the +biological visual system and propose Model Disassembling and Assembling (MDA). +During model disassembling, we introduce the concept of relative contribution +and propose a component locating technique to extract task-aware components +from trained CNN classifiers. For model assembling, we present the alignment +padding strategy and parameter scaling strategy to construct a new model +tailored for a specific task, utilizing the disassembled task-aware components. +The entire process is akin to playing with LEGO bricks, enabling arbitrary +assembly of new models, and providing a novel perspective for model creation +and reuse. Extensive experiments showcase that task-aware components +disassembled from CNN classifiers or new models assembled using these +components closely match or even surpass the performance of the baseline, +demonstrating its promising results for model reuse. Furthermore, MDA exhibits +diverse potential applications, with comprehensive experiments exploring model +decision route analysis, model compression, knowledge distillation, and more. +The code is available at https://github.com/jiaconghu/Model-LEGO. + +
+
+
+
+
+ + ♻ ☆ ETO:Efficient Transformer-based Local Feature Matching by Organizing + Multiple Homography Hypotheses + + +
+ We tackle the efficiency problem of learning local feature matching. Recent +advancements have given rise to purely CNN-based and transformer-based +approaches, each augmented with deep learning techniques. While CNN-based +methods often excel in matching speed, transformer-based methods tend to +provide more accurate matches. We propose an efficient transformer-based +network architecture for local feature matching. This technique is built on +constructing multiple homography hypotheses to approximate the continuous +correspondence in the real world and uni-directional cross-attention to +accelerate the refinement. On the YFCC100M dataset, our matching accuracy is +competitive with LoFTR, a state-of-the-art transformer-based architecture, +while the inference speed is boosted to 4 times, even outperforming the +CNN-based methods. Comprehensive evaluations on other open datasets such as +Megadepth, ScanNet, and HPatches demonstrate our method's efficacy, +highlighting its potential to significantly enhance a wide array of downstream +applications. + +
+
+
+
+
+ + ♻ ☆ FM-Fusion: Instance-aware Semantic Mapping Boosted by Vision-Language + Foundation Models + + +
+ Semantic mapping based on supervised object detectors is sensitive to the
+image distribution. In real-world environments, object detection and
+segmentation performance can drop significantly, preventing the use of semantic
+mapping in a wider domain. On the other hand, the development of
+vision-language foundation models demonstrates strong zero-shot
+transferability across data distributions. This provides an opportunity to
+construct generalizable instance-aware semantic maps. Hence, this work explores
+how to boost instance-aware semantic mapping from object detections generated
+by foundation models. We propose a probabilistic label fusion method to predict
+closed-set semantic classes from open-set label measurements. An instance
+refinement module merges the over-segmented instances caused by inconsistent
+segmentation. We integrate all the modules into a unified semantic mapping
+system. Reading a sequence of RGB-D inputs, our system incrementally
+reconstructs an instance-aware semantic map. We evaluate the zero-shot
+performance of our method on the ScanNet and SceneNN datasets. Our method
+achieves 40.3 mean average precision (mAP) on the ScanNet semantic instance
+segmentation task. It significantly outperforms the traditional semantic
+mapping method.
+
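+
+ Illustrative sketch of probabilistic label fusion from repeated open-set
+detections to a closed-set class posterior via Bayesian updating; the likelihood
+table and class names are made-up placeholders, not the method's learned model.
+
+import numpy as np
+
+CLOSED_SET = ["chair", "table", "sofa"]
+# LIKELIHOOD[l][k] ~ P(detector outputs open-set label l | true closed-set class k).
+LIKELIHOOD = {
+    "chair":    np.array([0.70, 0.20, 0.10]),
+    "armchair": np.array([0.45, 0.05, 0.50]),
+    "desk":     np.array([0.10, 0.80, 0.10]),
+}
+
+def fuse_labels(open_set_measurements, prior=None):
+    """Multiply per-measurement likelihoods into a posterior over closed-set classes."""
+    posterior = np.ones(len(CLOSED_SET)) if prior is None else np.array(prior, float)
+    for label in open_set_measurements:
+        posterior *= LIKELIHOOD.get(label, np.ones(len(CLOSED_SET)))
+        posterior /= posterior.sum()
+    return dict(zip(CLOSED_SET, posterior))
+
+print(fuse_labels(["chair", "armchair", "chair"]))
+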
+
+ comment: Published in IEEE RAL +
+
+
+
+
+ + ♻ ☆ UniAR: A Unified model for predicting human Attention and Responses on + visual content NeurIPS 2024 + + +
+ Progress in human behavior modeling involves understanding both implicit, +early-stage perceptual behavior, such as human attention, and explicit, +later-stage behavior, such as subjective preferences or likes. Yet most prior +research has focused on modeling implicit and explicit human behavior in +isolation; and often limited to a specific type of visual content. We propose +UniAR -- a unified model of human attention and preference behavior across +diverse visual content. UniAR leverages a multimodal transformer to predict +subjective feedback, such as satisfaction or aesthetic quality, along with the +underlying human attention or interaction heatmaps and viewing order. We train +UniAR on diverse public datasets spanning natural images, webpages, and graphic +designs, and achieve SOTA performance on multiple benchmarks across various +image domains and behavior modeling tasks. Potential applications include +providing instant feedback on the effectiveness of UIs/visual content, and +enabling designers and content-creation models to optimize their creation for +human-centric improvements. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ One Prompt to Verify Your Models: Black-Box Text-to-Image Models + Verification via Non-Transferable Adversarial Attacks + + +
+ Recently, the success of Text-to-Image (T2I) models has led to the rise of +numerous third-party platforms, which claim to provide cheaper API services and +more flexibility in model options. However, this also raises a new security +concern: Are these third-party services truly offering the models they claim? +To address this problem, we propose the first T2I model verification method +named Text-to-Image Model Verification via Non-Transferable Adversarial Attacks +(TVN). The non-transferability of adversarial examples means that these +examples are only effective on a target model and ineffective on other models, +thereby allowing for the verification of the target model. TVN utilizes the +Non-dominated Sorting Genetic Algorithm II (NSGA-II) to optimize the cosine +similarity of a prompt's text encoding, generating non-transferable adversarial +prompts. By calculating the CLIP-text scores between the non-transferable +adversarial prompts without perturbations and the images, we can verify if the +model matches the claimed target model, based on a 3-sigma threshold. The +experiments showed that TVN performed well in both closed-set and open-set +scenarios, achieving a verification accuracy of over 90\%. Moreover, the +adversarial prompts generated by TVN significantly reduced the CLIP-text scores +of the target model, while having little effect on other models. + +
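+
+ Illustrative sketch of the final 3-sigma decision implied by the abstract; the
+score statistic and the direction of the test are assumptions, and the numbers
+are toy values.
+
+import numpy as np
+
+def verify_model(candidate_scores, reference_scores, k: float = 3.0) -> bool:
+    """Accept the claim if the candidate's mean CLIP-text score falls within
+    k standard deviations of the reference model's score distribution."""
+    mu, sigma = float(np.mean(reference_scores)), float(np.std(reference_scores))
+    return abs(float(np.mean(candidate_scores)) - mu) <= k * sigma
+
+reference = np.random.normal(0.18, 0.02, size=100)   # scores from the claimed model
+candidate = np.random.normal(0.19, 0.02, size=100)   # scores from the third-party API
+print(verify_model(candidate, reference))
+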
+
+
+
+
+ + ♻ ☆ Detection of adrenal anomalous findings in spinal CT images using multi + model graph aggregation + + +
+ Low back pain is the symptom that is the second most frequently reported to
+primary care physicians, affecting 50 to 80 percent of the population in a
+lifetime, and resulting in multiple referrals of patients suffering from back
+problems to CT and MRI scans, which are then examined by radiologists. The
+radiologists examining these spinal scans naturally focus on spinal pathologies
+and might miss other types of abnormalities, and in particular, abdominal ones,
+such as malignancies. Nevertheless, the patients whose spine was scanned might
+as well have malignant and other abdominal pathologies. Thus, clinicians have
+suggested the need for computerized assistance and decision support in
+screening spinal scans for additional abnormalities. In the current study, we
+have addressed the important case of detecting suspicious lesions in the
+adrenal glands as an example of the overall methodology we have developed. A
+patient CT scan is integrated from multiple slices with an axial orientation.
+Our method determines whether a patient has an abnormal adrenal gland, and
+localises the abnormality if it exists. Our method is composed of three deep
+learning models; each model has a different task for achieving the final goal.
+We call our compound method the Multi Model Graph Aggregation (MMGA) method.
+The novelty in this study is twofold. First, the use, for an important
+screening task, of CT scans that are originally focused and tuned for imaging
+the spine, which were acquired from patients with potential spinal disorders,
+for the detection of a totally different set of abnormalities such as abdominal
+adrenal gland pathologies. Second, we have built a complex pipeline
+architecture composed of three deep learning models that can be utilized for
+other organs (such as the pancreas or the kidney), or for similar applications,
+but using other types of imaging, such as MRI.
+
+
+
+
+
+ + ♻ ☆ VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time NeurIPS 2024 + + +
+ We introduce VASA, a framework for generating lifelike talking faces with +appealing visual affective skills (VAS) given a single static image and a +speech audio clip. Our premiere model, VASA-1, is capable of not only +generating lip movements that are exquisitely synchronized with the audio, but +also producing a large spectrum of facial nuances and natural head motions that +contribute to the perception of authenticity and liveliness. The core +innovations include a holistic facial dynamics and head movement generation +model that works in a face latent space, and the development of such an +expressive and disentangled face latent space using videos. Through extensive +experiments including evaluation on a set of new metrics, we show that our +method significantly outperforms previous methods along various dimensions +comprehensively. Our method not only delivers high video quality with realistic +facial and head dynamics but also supports the online generation of 512x512 +videos at up to 40 FPS with negligible starting latency. It paves the way for +real-time engagements with lifelike avatars that emulate human conversational +behaviors. + +
+
+ comment: NeurIPS 2024 (Oral) Camera ready. Project webpage: + https://www.microsoft.com/en-us/research/project/vasa-1/ +
+
+
+
+
+ + ♻ ☆ Generalizing Alignment Paradigm of Text-to-Image Generation with + Preferences through $f$-divergence Minimization + + +
+ Direct Preference Optimization (DPO) has recently expanded its successful +application from aligning large language models (LLMs) to aligning +text-to-image models with human preferences, which has generated considerable +interest within the community. However, we have observed that these approaches +rely solely on minimizing the reverse Kullback-Leibler divergence during +alignment process between the fine-tuned model and the reference model, +neglecting the incorporation of other divergence constraints. In this study, we +focus on extending reverse Kullback-Leibler divergence in the alignment +paradigm of text-to-image models to $f$-divergence, which aims to garner better +alignment performance as well as good generation diversity. We provide the +generalized formula of the alignment paradigm under the $f$-divergence +condition and thoroughly analyze the impact of different divergence constraints +on alignment process from the perspective of gradient fields. We conduct +comprehensive evaluation on image-text alignment performance, human value +alignment performance and generation diversity performance under different +divergence constraints, and the results indicate that alignment based on +Jensen-Shannon divergence achieves the best trade-off among them. The option of +divergence employed for aligning text-to-image models significantly impacts the +trade-off between alignment performance (especially human value alignment) and +generation diversity, which highlights the necessity of selecting an +appropriate divergence for practical applications. + +
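+
+ For reference, the generic definition of the f-divergence regularized
+alignment objective that the abstract alludes to (standard textbook form, not
+necessarily the paper's exact notation):
+
+% f-divergence between the fine-tuned model p_theta and the reference p_ref,
+% with f convex and f(1) = 0:
+D_f\!\left(p_\theta \,\|\, p_{\mathrm{ref}}\right)
+  = \mathbb{E}_{x \sim p_{\mathrm{ref}}}\!\left[
+      f\!\left(\frac{p_\theta(x)}{p_{\mathrm{ref}}(x)}\right) \right]
+
+% Alignment as reward maximization under an f-divergence penalty; the reverse
+% KL case corresponds to f(u) = u log u, and Jensen-Shannon is another choice:
+\max_\theta \; \mathbb{E}_{x \sim p_\theta}\!\left[ r(x) \right]
+  \;-\; \beta \, D_f\!\left(p_\theta \,\|\, p_{\mathrm{ref}}\right)
+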
+
+ comment: 34 pages +
+
+
+
+
+ + ♻ ☆ From Explicit Rules to Implicit Reasoning in an Interpretable Violence + Monitoring System + + +
+ Recently, research based on pre-trained models has demonstrated outstanding
+performance in violence surveillance tasks. However, these black-box systems
+face challenges regarding explainability during training and inference
+processes. An important question is how to incorporate explicit knowledge into
+these implicit models, thereby designing expert-driven and interpretable
+violence surveillance systems. This paper proposes a new paradigm for weakly
+supervised violence monitoring (WSVM) called Rule base Violence monitoring
+(RuleVM). The proposed RuleVM uses a dual-branch structure with different
+designs for images and text. One of the branches is called the implicit branch,
+which uses only visual features for coarse-grained binary classification. In
+this branch, image feature extraction is divided into two channels: one
+responsible for extracting scene frames and the other focusing on extracting
+actions. The other branch is called the explicit branch, which utilizes
+language-image alignment to perform fine-grained classification. For the
+language channel design in the explicit branch, the proposed RuleCLIP uses the
+state-of-the-art YOLO-World model to detect objects and actions in video
+frames, and association rules are identified through data mining methods as
+descriptions of the video. Leveraging the dual-branch architecture, RuleVM
+achieves interpretable coarse-grained and fine-grained violence surveillance.
+Extensive experiments were conducted on two commonly used benchmarks, and the
+results show that RuleCLIP achieved the best performance in both coarse-grained
+and fine-grained detection, significantly outperforming existing
+state-of-the-art methods. Moreover, interpretability experiments uncovered some
+interesting rules, such as the observation that as the number of people
+increases, the risk level of violent behavior also rises.
+
+
+ comment: 12 pages,7 figures +
+
+
+
+
+ + ♻ ☆ A Longitudinal Analysis of Racial and Gender Bias in New York Times and + Fox News Images and Articles + + +
+ The manner in which different racial and gender groups are portrayed in news +coverage plays a large role in shaping public opinion. As such, understanding +how such groups are portrayed in news media is of notable societal value, and +has thus been a significant endeavour in both the computer and social sciences. +Yet, the literature still lacks a longitudinal study examining both the +frequency of appearance of different racial and gender groups in online news +articles, as well as the context in which such groups are discussed. To fill +this gap, we propose two machine learning classifiers to detect the race and +age of a given subject. Next, we compile a dataset of 123,337 images and +441,321 online news articles from New York Times (NYT) and Fox News (Fox), and +examine representation through two computational approaches. Firstly, we +examine the frequency and prominence of appearance of racial and gender groups +in images embedded in news articles, revealing that racial and gender +minorities are largely under-represented, and when they do appear, they are +featured less prominently compared to majority groups. Furthermore, we find +that NYT largely features more images of racial minority groups compared to +Fox. Secondly, we examine both the frequency and context with which racial +minority groups are presented in article text. This reveals the narrow scope in +which certain racial groups are covered and the frequency with which different +groups are presented as victims and/or perpetrators in a given conflict. Taken +together, our analysis contributes to the literature by providing two novel +open-source classifiers to detect race and age from images, and shedding light +on the racial and gender biases in news articles from venues on opposite ends +of the American political spectrum. + +
+
+ comment: 13 pages, and 11 figures +
+
+
+
+
+ + ♻ ☆ Mind the Context: Attention-Guided Weak-to-Strong Consistency for + Enhanced Semi-Supervised Medical Image Segmentation + + +
+ Medical image segmentation is a pivotal step in diagnostic and therapeutic +processes, relying on high-quality annotated data that is often challenging and +costly to obtain. Semi-supervised learning offers a promising approach to +enhance model performance by leveraging unlabeled data. Although weak-to-strong +consistency is a prevalent method in semi-supervised image segmentation, there +is a scarcity of research on perturbation strategies specifically tailored for +semi-supervised medical image segmentation tasks. To address this challenge, +this paper introduces a simple yet efficient semi-supervised learning framework +named Attention-Guided weak-to-strong Consistency Match (AIGCMatch). The +AIGCMatch framework incorporates attention-guided perturbation strategies at +both the image and feature levels to achieve weak-to-strong consistency +regularization. This method not only preserves the structural information of +medical images but also enhances the model's ability to process complex +semantic information. Extensive experiments conducted on the ACDC and ISIC-2017 +datasets have validated the effectiveness of AIGCMatch. Our method achieved a +90.4\% Dice score in the 7-case scenario on the ACDC dataset, surpassing the +state-of-the-art methods and demonstrating its potential and efficacy in +clinical settings. Additionally, on the ISIC-2017 dataset, we significantly +outperformed our baseline, indicating the robustness and generalizability of +AIGCMatch across different medical image segmentation tasks. + +
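+ For readers unfamiliar with weak-to-strong consistency training, a minimal generic sketch is given below: confident predictions on a weakly perturbed view supervise the strongly perturbed view. The attention-guided perturbations that distinguish AIGCMatch are not reproduced here; the function name, threshold, and toy data are illustrative.
+```python
+import numpy as np
+
+def weak_to_strong_consistency(probs_weak, probs_strong, threshold=0.95, eps=1e-12):
+    """Generic weak-to-strong consistency loss for semi-supervised segmentation.
+
+    probs_weak / probs_strong: (N, C) class probabilities for the same unlabeled
+    pixels under weak and strong perturbations (the perturbation design is where
+    methods such as AIGCMatch differ; it is left abstract here).
+    """
+    pseudo = probs_weak.argmax(axis=1)        # hard pseudo-labels from the weak view
+    conf = probs_weak.max(axis=1)
+    mask = conf >= threshold                  # keep only confident pixels
+    if not mask.any():
+        return 0.0
+    ce = -np.log(probs_strong[mask, pseudo[mask]] + eps)
+    return float(ce.mean())
+
+# toy example with random probabilities
+rng = np.random.default_rng(0)
+pw = rng.dirichlet(np.ones(4) * 0.2, size=128)
+ps = rng.dirichlet(np.ones(4), size=128)
+print(weak_to_strong_consistency(pw, ps))
+```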
+
+
+
+
+ + ♻ ☆ Epipolar-Free 3D Gaussian Splatting for Generalizable Novel View + Synthesis NeurIPS 2024 + + +
+ Generalizable 3D Gaussian splatting (3DGS) can reconstruct new scenes from +sparse-view observations in a feed-forward inference manner, eliminating the +need for scene-specific retraining required in conventional 3DGS. However, +existing methods rely heavily on epipolar priors, which can be unreliable in +complex real-world scenes, particularly in non-overlapping and occluded regions. +In this paper, we propose eFreeSplat, an efficient feed-forward 3DGS-based +model for generalizable novel view synthesis that operates independently of +epipolar line constraints. To enhance multi-view feature extraction with 3D +perception, we employ a self-supervised Vision Transformer (ViT) with cross-view +completion pre-training on large-scale datasets. Additionally, we introduce an +Iterative Cross-view Gaussians Alignment method to ensure consistent depth +scales across different views. Our eFreeSplat represents an innovative approach +for generalizable novel view synthesis. Different from the existing pure +geometry-free methods, eFreeSplat focuses more on achieving epipolar-free +feature matching and encoding by providing 3D priors through cross-view +pretraining. We evaluate eFreeSplat on wide-baseline novel view synthesis tasks +using the RealEstate10K and ACID datasets. Extensive experiments demonstrate +that eFreeSplat surpasses state-of-the-art baselines that rely on epipolar +priors, achieving superior geometry reconstruction and novel view synthesis +quality. Project page: https://tatakai1.github.io/efreesplat/. +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Perceiving Longer Sequences With Bi-Directional Cross-Attention + Transformers NeurIPS 2024 + + +
+ We present a novel bi-directional Transformer architecture (BiXT) which +scales linearly with input size in terms of computational cost and memory +consumption, but does not suffer the drop in performance or limitation to only +one input modality seen with other efficient Transformer-based approaches. BiXT +is inspired by the Perceiver architectures but replaces iterative attention +with an efficient bi-directional cross-attention module in which input tokens +and latent variables attend to each other simultaneously, leveraging a +naturally emerging attention-symmetry between the two. This approach unlocks a +key bottleneck experienced by Perceiver-like architectures and enables the +processing and interpretation of both semantics ('what') and location ('where') +to develop alongside each other over multiple layers -- allowing its direct +application to dense and instance-based tasks alike. By combining efficiency +with the generality and performance of a full Transformer architecture, BiXT +can process longer sequences like point clouds, text or images at higher +feature resolutions and achieves competitive performance across a range of +tasks like point cloud part segmentation, semantic image segmentation, image +classification, hierarchical sequence modeling and document retrieval. Our +experiments demonstrate that BiXT models outperform larger competitors by +leveraging longer sequences more efficiently on vision tasks like +classification and segmentation, and perform on par with full Transformer +variants on sequence modeling and document retrieval -- but require $28\%$ +fewer FLOPs and are up to $8.4\times$ faster. + +
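+ A minimal single-head sketch of the bi-directional cross-attention idea is shown below: one score matrix between latents and tokens is computed once, and its softmax is taken along each direction so that tokens and latents attend to each other simultaneously. The shapes, random projections, and single-head setup are illustrative assumptions, not the BiXT implementation.
+```python
+import numpy as np
+
+def softmax(x, axis=-1):
+    x = x - x.max(axis=axis, keepdims=True)
+    e = np.exp(x)
+    return e / e.sum(axis=axis, keepdims=True)
+
+def bidirectional_cross_attention(tokens, latents, d=64, seed=0):
+    """Single-head sketch: tokens and latents attend to each other through the
+    same (shared) score matrix, exploiting the attention symmetry."""
+    rng = np.random.default_rng(seed)
+    Wq, Wk, Wv_t, Wv_l = (rng.standard_normal((tokens.shape[1], d)) / np.sqrt(d)
+                          for _ in range(4))
+    q_lat, k_tok = latents @ Wq, tokens @ Wk
+    v_tok, v_lat = tokens @ Wv_t, latents @ Wv_l
+    scores = q_lat @ k_tok.T / np.sqrt(d)           # (num_latents, num_tokens), computed once
+    latents_out = softmax(scores, axis=1) @ v_tok   # latents gather from tokens
+    tokens_out = softmax(scores.T, axis=1) @ v_lat  # tokens gather from latents (transposed scores)
+    return tokens_out, latents_out
+
+tokens = np.random.randn(1024, 64)   # e.g. image patches or points (illustrative sizes)
+latents = np.random.randn(32, 64)
+t_out, l_out = bidirectional_cross_attention(tokens, latents)
+print(t_out.shape, l_out.shape)
+```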
+
+ comment: Accepted at NeurIPS 2024; Code and models will be available at + https://github.com/mrkshllr/BiXT +
+
+
+
+
+ + ♻ ☆ Multiview Scene Graph NeurIPS 2024 + + +
+ A proper scene representation is central to the pursuit of spatial +intelligence where agents can robustly reconstruct and efficiently understand +3D scenes. A scene representation is either metric, such as landmark maps in 3D +reconstruction, 3D bounding boxes in object detection, or voxel grids in +occupancy prediction, or topological, such as pose graphs with loop closures in +SLAM or visibility graphs in SfM. In this work, we propose to build Multiview +Scene Graphs (MSG) from unposed images, representing a scene topologically with +interconnected place and object nodes. The task of building MSG is challenging +for existing representation learning methods since it needs to jointly address +visual place recognition, object detection, and object association from +images with limited fields of view and potentially large viewpoint changes. To +evaluate any method tackling this task, we developed an MSG dataset and +annotation based on a public 3D dataset. We also propose an evaluation metric +based on the intersection-over-union score of MSG edges. Moreover, we develop a +novel baseline method built on mainstream pretrained vision models, combining +visual place recognition and object association into one Transformer decoder +architecture. Experiments demonstrate that our method has superior performance +compared to existing relevant baselines. +
+
+ comment: To be published in NeurIPS 2024. Website at + https://ai4ce.github.io/MSG/ +
+
+
+
+
+ + ♻ ☆ Dinomaly: The Less Is More Philosophy in Multi-Class Unsupervised + Anomaly Detection + + +
+ Recent studies have highlighted a practical setting of unsupervised anomaly +detection (UAD) that builds a unified model for multi-class images, serving as +an alternative to the conventional one-class-one-model setup. Despite various +advancements addressing this challenging task, the detection performance under +the multi-class setting still lags far behind state-of-the-art class-separated +models. Our research aims to bridge this substantial performance gap. In this +paper, we introduce a minimalistic reconstruction-based anomaly detection +framework, namely Dinomaly, which leverages pure Transformer architectures +without relying on complex designs, additional modules, or specialized tricks. +Given this powerful framework consisting of only Attentions and MLPs, we found +four simple components that are essential to multi-class anomaly detection: (1) +Foundation Transformers that extract universal and discriminative features, +(2) Noisy Bottleneck where pre-existing Dropouts do all the noise injection +tricks, (3) Linear Attention that naturally cannot focus, and (4) Loose +Reconstruction that does not force layer-to-layer and point-by-point +reconstruction. Extensive experiments are conducted across popular anomaly +detection benchmarks including MVTec-AD, VisA, and Real-IAD. Our proposed +Dinomaly achieves impressive image-level AUROC of 99.6%, 98.7%, and 89.3% on +the three datasets respectively, which is not only superior to state-of-the-art +multi-class UAD methods, but also achieves the most advanced class-separated +UAD records. +
+
+
+
+
+ + ♻ ☆ VLM Agents Generate Their Own Memories: Distilling Experience into + Embodied Programs + + +
+ Large-scale generative language and vision-language models excel in +in-context learning for decision making. However, they require high-quality +exemplar demonstrations to be included in their context window. In this work, +we ask: Can LLMs and VLMs generate their own examples from generic, sub-optimal +demonstrations? We propose In-Context Abstraction Learning (ICAL), a method +that builds a memory of multimodal experience from sub-optimal demonstrations +and human feedback. Given a task demonstration that may contain inefficiencies +or mistakes, a VLM abstracts the trajectory into a generalized program by +correcting inefficient actions and annotating cognitive abstractions: causal +relationships, object state changes, temporal subgoals, and task-relevant +visual elements. These abstractions are iteratively improved through human +feedback while the agent attempts to execute the trajectory. The resulting +examples, when used as exemplars in the prompt, significantly improve +decision-making in retrieval-augmented LLM and VLM agents. Moreover, as the +agent's library of examples grows, it becomes more efficient, relying less on +human feedback and requiring fewer environment interactions per demonstration. +Our ICAL agent surpasses the state-of-the-art in dialogue-based instruction +following in TEACh, multimodal web agents in VisualWebArena, and action +anticipation in Ego4D. In TEACh, we achieve a 12.6% improvement in +goal-condition success. In VisualWebArena, our task success rate improves over +the SOTA from 14.3% to 22.7% using GPT4V. In Ego4D action forecasting, we +improve over few-shot GPT-4V and remain competitive with supervised models. We +show finetuning our retrieval-augmented in-context agent yields additional +improvements. Our approach significantly reduces reliance on manual prompt +engineering and consistently outperforms in-context learning from action plans +that lack such abstractions. + +
+
+ comment: Project website: http://ical-learning.github.io/ +
+
+
+
+
+ + ♻ ☆ CLAP4CLIP: Continual Learning with Probabilistic Finetuning for + Vision-Language Models NeurIPS 2024 + + +
+ Continual learning (CL) aims to help deep neural networks learn new knowledge +while retaining what has been learned. Owing to their powerful +generalizability, pre-trained vision-language models such as Contrastive +Language-Image Pre-training (CLIP) have lately gained traction as practical CL +candidates. However, the domain mismatch between the pre-training and the +downstream CL tasks often calls for finetuning of the CLIP on the latter. Most +existing finetuning methods exhibit deterministic nature. This makes them +overlook the many possible interactions across the input modalities and deems +them unsafe for high-risk tasks requiring reliable uncertainty estimation. To +address these, our work proposes Continual LeArning with Probabilistic +finetuning (CLAP) - a probabilistic modeling framework over visual-guided text +features per task, thus providing more calibrated CL finetuning. Unlike recent +data-hungry anti-forgetting CL techniques, CLAP alleviates forgetting by +exploiting the rich pre-trained knowledge of CLIP for weight initialization and +distribution regularization of task-specific parameters. Cooperating with the +diverse range of existing prompting methods, CLAP can surpass the predominant +deterministic finetuning approaches for CL with CLIP. We conclude with +out-of-the-box applications of superior uncertainty estimation abilities of +CLAP including novel data detection and exemplar selection within the existing +CL setups. Our code is available at +\url{https://github.com/srvCodes/clap4clip}. + +
+
+ comment: Accepted as a poster at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ PipeFusion: Patch-level Pipeline Parallelism for Diffusion Transformers + Inference + + +
+ This paper presents PipeFusion, an innovative parallel methodology to tackle +the high latency issues associated with generating high-resolution images using +diffusion transformer (DiT) models. PipeFusion partitions images into patches +and the model layers across multiple GPUs. It employs a patch-level pipeline +parallel strategy to orchestrate communication and computation efficiently. By +capitalizing on the high similarity between inputs from successive diffusion +steps, PipeFusion reuses one-step stale feature maps to provide context for the +current pipeline step. This approach notably reduces communication costs +compared to existing DiT inference parallelism, including tensor parallel, +sequence parallel and DistriFusion. PipeFusion also exhibits superior memory +efficiency, because it can distribute model parameters across multiple devices, +making it more suitable for DiTs with large parameter sizes, such as Flux.1. +Experimental results demonstrate that PipeFusion achieves state-of-the-art +performance on 8xL40 PCIe GPUs for Pixart, Stable-Diffusion 3 and Flux.1 +models. Our source code is available at https://github.com/xdit-project/xDiT. +
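+ The following toy loop sketches the stale-feature reuse idea: at each diffusion step every "device" recomputes only its own patch while using the other patches' features from the previous step as context, avoiding a synchronous exchange within the step. The `denoise_patch` function and all sizes are placeholders, not the PipeFusion/xDiT code.
+```python
+import numpy as np
+
+def denoise_patch(patch, context, step):
+    """Placeholder for a DiT block applied to one patch with neighbor context."""
+    return 0.9 * patch + 0.1 * context.mean(axis=0) - 0.01 * step
+
+num_devices, patch_dim, steps = 4, 8, 5
+patches = [np.random.randn(patch_dim) for _ in range(num_devices)]
+stale = [p.copy() for p in patches]          # features from the previous diffusion step
+
+for step in range(steps):
+    new_patches = []
+    for dev in range(num_devices):
+        # each device uses *stale* features of the other patches as context
+        context = np.stack([stale[j] for j in range(num_devices) if j != dev])
+        new_patches.append(denoise_patch(patches[dev], context, step))
+    stale = [p.copy() for p in new_patches]  # becomes the next step's stale context
+    patches = new_patches
+
+print(np.stack(patches).shape)
+```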
+
+
+
+
+ + ♻ ☆ RACCooN: A Versatile Instructional Video Editing Framework with + Auto-Generated Narratives + + +
+ Recent video generative models primarily rely on carefully written text +prompts for specific tasks, like inpainting or style editing. They require +labor-intensive textual descriptions for input videos, hindering their +flexibility to adapt personal/raw videos to user specifications. This paper +proposes RACCooN, a versatile and user-friendly video-to-paragraph-to-video +generative framework that supports multiple video editing capabilities such as +removal, addition, and modification, through a unified pipeline. RACCooN +consists of two principal stages: Video-to-Paragraph (V2P) and +Paragraph-to-Video (P2V). In the V2P stage, we automatically describe video +scenes in well-structured natural language, capturing both the holistic context +and focused object details. Subsequently, in the P2V stage, users can +optionally refine these descriptions to guide the video diffusion model, +enabling various modifications to the input video, such as removing, changing +subjects, and/or adding new objects. The proposed approach stands out from +other methods through several significant contributions: (1) RACCooN suggests a +multi-granular spatiotemporal pooling strategy to generate well-structured +video descriptions, capturing both the broad context and object details without +requiring complex human annotations, simplifying precise video content editing +based on text for users. (2) Our video generative model incorporates +auto-generated narratives or instructions to enhance the quality and accuracy +of the generated content. (3) RACCooN also plans to imagine new objects in a +given video, so users simply prompt the model to receive a detailed video +editing plan for complex video editing. The proposed framework demonstrates +impressive versatile capabilities in video-to-paragraph generation, video +content editing, and can be incorporated into other SoTA video generative +models for further enhancement. + +
+
+ comment: The first two authors contribute equally. Project Page: + https://raccoon-mllm-gen.github.io/ +
+
+
+
+
+ + ♻ ☆ VILA$^2$: VILA Augmented VILA + + +
+ While visual language model architectures and training infrastructures +advance rapidly, data curation remains under-explored where quantity and +quality become a bottleneck. Existing work either crawls extra Internet data +with a loose guarantee of quality or distills from black-box proprietary +models, e.g., GPT-4V / Gemini that are API frequency and performance bounded. +This work enables a VLM to improve itself via data enhancement, exploiting its +generative nature. We introduce a simple yet effective VLM augmentation scheme +that includes a self-augment step and a specialist-augment step to iteratively +improve data quality and hence, model performance. In the self-augment step, +the instruction-finetuned VLM recaptions its pretraining caption datasets and +then retrains from scratch leveraging refined data. Without any expensive +human-in-the-loop annotation, we observe improvements in data quality and +downstream accuracy boosts with three self-augmentation rounds -- a viable free +lunch to the current VLM training recipe. When self-augmentation saturates, we +augment the caption diversity by leveraging specialty skills picked up from +instruction finetuning. We finetune VLM specialists from the self-augmented VLM +with domain-specific experts, including spatial, grounding, and OCR, to fuse +task-aware synthetic data into the pretraining stage. Data quality improvements +and hallucination reductions are cross-checked by VLM (GPT-4V, Gemini) and +human judges. Combining self-augmentation and specialist-augmented training, +VILA$^2$ consistently improves the accuracy on a wide range of benchmarks over +the prior art, producing a reusable pretraining dataset that is 300x more +cost-efficient than human labeling. + +
+
+
+
+
+ + ♻ ☆ Exploring Behavior-Relevant and Disentangled Neural Dynamics with + Generative Diffusion Models + + +
+ Understanding the neural basis of behavior is a fundamental goal in +neuroscience. Current research in large-scale neuro-behavioral data analysis +often relies on decoding models, which quantify behavioral information in +neural data but lack details on behavior encoding. This raises an intriguing +scientific question: ``how can we enable in-depth exploration of neural +representations in behavioral tasks, revealing interpretable neural dynamics +associated with behaviors''. However, addressing this issue is challenging due +to the varied behavioral encoding across different brain regions and mixed +selectivity at the population level. To tackle this limitation, our approach, +named ``BeNeDiff'', first identifies a fine-grained and disentangled neural +subspace using a behavior-informed latent variable model. It then employs +state-of-the-art generative diffusion models to synthesize behavior videos that +interpret the neural dynamics of each latent factor. We validate the method on +multi-session datasets containing widefield calcium imaging recordings across +the dorsal cortex. Through guiding the diffusion model to activate individual +latent factors, we verify that the neural dynamics of latent factors in the +disentangled neural subspace provide interpretable quantifications of the +behaviors of interest. At the same time, the neural subspace in BeNeDiff +demonstrates high disentanglement and neural reconstruction quality. + +
+
+
+
+
+ + ♻ ☆ Elliptical Attention NeurIPS 2024 + + +
+ Pairwise dot-product self-attention is key to the success of transformers +that achieve state-of-the-art performance across a variety of applications in +language and vision. This dot-product self-attention computes attention weights +among the input tokens using Euclidean distance, which makes the model prone to +representation collapse and vulnerable to contaminated samples. In this paper, +we propose using a Mahalanobis distance metric for computing the attention +weights to stretch the underlying feature space in directions of high +contextual relevance. In particular, we define a hyper-ellipsoidal neighborhood +around each query to increase the attention weights of the tokens lying in the +contextually important directions. We term this novel class of attention +Elliptical Attention. Our Elliptical Attention provides two benefits: 1) +reducing representation collapse and 2) enhancing the model's robustness as +Elliptical Attention pays more attention to contextually relevant information +rather than focusing on some small subset of informative features. We +empirically demonstrate the advantages of Elliptical Attention over the +baseline dot-product attention and state-of-the-art attention methods on +various practical tasks, including object classification, image segmentation, +and language modeling across different data modalities. + +
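+ Below is a small sketch of attention whose logits come from a diagonal Mahalanobis-style distance, so that contextually relevant feature directions are stretched relative to plain dot-product attention. The per-dimension relevance estimate used here (value variance across tokens) is a stand-in assumption, not the paper's estimator.
+```python
+import numpy as np
+
+def softmax(x, axis=-1):
+    x = x - x.max(axis=axis, keepdims=True)
+    e = np.exp(x)
+    return e / e.sum(axis=axis, keepdims=True)
+
+def elliptical_style_attention(Q, K, V):
+    """Attention with a diagonal Mahalanobis-like metric on query-key distances."""
+    # Per-dimension relevance weights (illustrative heuristic only)
+    w = V.var(axis=0) + 1e-6
+    w = w / w.mean()
+    # Negative squared weighted distance as the attention logit
+    diff = Q[:, None, :] - K[None, :, :]                       # (n_q, n_k, d)
+    logits = -np.einsum('qkd,d->qk', diff ** 2, w) / np.sqrt(Q.shape[1])
+    return softmax(logits, axis=-1) @ V
+
+Q = np.random.randn(16, 32)
+K = np.random.randn(16, 32)
+V = np.random.randn(16, 32)
+print(elliptical_style_attention(Q, K, V).shape)   # (16, 32)
+```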
+
+ comment: 10 pages in the main text. Published at NeurIPS 2024. The code is + available at https://github.com/stefvk/Elliptical-Attention +
+
+
+
+
+ + ♻ ☆ NASM: Neural Anisotropic Surface Meshing SIGGRAPH + + +
+ This paper introduces a new learning-based method, NASM, for anisotropic +surface meshing. Our key idea is to propose a graph neural network to embed an +input mesh into a high-dimensional (high-d) Euclidean embedding space to +preserve curvature-based anisotropic metric by using a dot product loss between +high-d edge vectors. This can dramatically reduce the computational time and +increase the scalability. Then, we propose a novel feature-sensitive remeshing +on the generated high-d embedding to automatically capture sharp geometric +features. We define a high-d normal metric, and then derive an automatic +differentiation on a high-d centroidal Voronoi tessellation (CVT) optimization +with the normal metric to simultaneously preserve geometric features and +curvature anisotropy that exhibit in the original 3D shapes. To our knowledge, +this is the first time that a deep learning framework and a large dataset are +proposed to construct a high-d Euclidean embedding space for 3D anisotropic +surface meshing. Experimental results are evaluated and compared with the +state-of-the-art in anisotropic surface meshing on a large number of surface +models from Thingi10K dataset as well as tested on extensive unseen 3D shapes +from Multi-Garment Network dataset and FAUST human dataset. + +
+
+ comment: SIGGRAPH Asia 2024 (Conference Track) +
+
+
+
+
+ + ♻ ☆ SRA: A Novel Method to Improve Feature Embedding in Self-supervised + Learning for Histopathological Images + + +
+ Self-supervised learning has become a cornerstone in various areas, +particularly histopathological image analysis. Image augmentation plays a +crucial role in self-supervised learning, as it generates variations in image +samples. However, traditional image augmentation techniques often overlook the +unique characteristics of histopathological images. In this paper, we propose a +new histopathology-specific image augmentation method called stain +reconstruction augmentation (SRA). We integrate our SRA with MoCo v3, a leading +model in self-supervised contrastive learning, along with our additional +contrastive loss terms, and call the new model SRA-MoCo v3. We demonstrate that +our SRA-MoCo v3 always outperforms the standard MoCo v3 across various +downstream tasks and achieves comparable or superior performance to other +foundation models pre-trained on significantly larger histopathology datasets. + +
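+ As context, a generic stain-space augmentation in the haematoxylin-eosin-DAB (HED) color space is sketched below using scikit-image; it perturbs stain channels rather than raw RGB, which is the general idea the abstract argues for. This is not the paper's SRA formulation, and the perturbation ranges are arbitrary.
+```python
+import numpy as np
+from skimage.color import rgb2hed, hed2rgb
+
+def stain_channel_augment(rgb, alpha_range=(0.95, 1.05), beta_range=(-0.02, 0.02), seed=None):
+    """Perturb haematoxylin/eosin/DAB channels independently, then map back to RGB."""
+    rng = np.random.default_rng(seed)
+    hed = rgb2hed(rgb)                          # (H, W, 3) stain-space image
+    alpha = rng.uniform(*alpha_range, size=3)   # per-channel scale
+    beta = rng.uniform(*beta_range, size=3)     # per-channel shift
+    hed_aug = hed * alpha + beta
+    return np.clip(hed2rgb(hed_aug), 0.0, 1.0)
+
+image = np.random.rand(64, 64, 3)               # placeholder RGB patch in [0, 1]
+augmented = stain_channel_augment(image, seed=0)
+print(augmented.shape, augmented.dtype)
+```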
+
+ comment: Hamid Manoochehri and Bodong Zhang contributed equally to this work +
+
+
+
+
+ + ♻ ☆ CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale + + +
+ Measuring biodiversity is crucial for understanding ecosystem health. While +prior works have developed machine learning models for taxonomic classification +of photographic images and DNA separately, in this work, we introduce a +multimodal approach combining both, using CLIP-style contrastive learning to +align images, barcode DNA, and text-based representations of taxonomic labels +in a unified embedding space. This allows for accurate classification of both +known and unknown insect species without task-specific fine-tuning, leveraging +contrastive learning for the first time to fuse DNA and image data. Our method +surpasses previous single-modality approaches in accuracy by over 8% on +zero-shot learning tasks, showcasing its effectiveness in biodiversity studies. + +
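+ The alignment objective described above is CLIP-style; a generic symmetric InfoNCE loss between two modality embeddings (here image and DNA-barcode placeholders) is sketched below. The encoders, the text branch, and the temperature value are omitted or assumed.
+```python
+import numpy as np
+
+def symmetric_infonce(img_emb, dna_emb, temperature=0.07, eps=1e-12):
+    """CLIP-style contrastive loss: matched (image, DNA) pairs share an index."""
+    img = img_emb / np.linalg.norm(img_emb, axis=1, keepdims=True)
+    dna = dna_emb / np.linalg.norm(dna_emb, axis=1, keepdims=True)
+    logits = img @ dna.T / temperature            # (N, N) similarity matrix
+    labels = np.arange(len(img))
+    def xent(l):
+        l = l - l.max(axis=1, keepdims=True)
+        p = np.exp(l) / np.exp(l).sum(axis=1, keepdims=True)
+        return -np.log(p[labels, labels] + eps).mean()
+    return 0.5 * (xent(logits) + xent(logits.T))  # image-to-DNA and DNA-to-image
+
+img_emb = np.random.randn(64, 128)   # placeholder image-encoder outputs
+dna_emb = np.random.randn(64, 128)   # placeholder DNA-barcode-encoder outputs
+print(symmetric_infonce(img_emb, dna_emb))
+```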
+
+ comment: 25 pages with 11 figures +
+
+
+
+
+ + ♻ ☆ Domain-Adaptive Pre-training of Self-Supervised Foundation Models for + Medical Image Classification in Gastrointestinal Endoscopy + + +
+ Video capsule endoscopy has transformed gastrointestinal endoscopy (GIE) +diagnostics by offering a non-invasive method for capturing detailed images of +the gastrointestinal tract, enabling early disease detection. However, its +potential is limited by the sheer volume of images generated during the imaging +procedure, which can take anywhere from 6-8 hours and often produce up to 1 +million images, necessitating automated analysis. Additionally, the variability +of these images, combined with the need for expert annotations and the scarcity +of large, high-quality labeled datasets, constrains the effectiveness of +current medical image analysis models. To address this, we introduce a novel +large gastrointestinal endoscopy dataset, called EndoExtend24, created by +merging and re-stratifying the train/test splits of ten existing public and +private datasets, ensuring no overlap of patient data across splits. +EndoExtend24 includes over 226,000 labeled images, as well as dynamic class +mappings, which allow unified training across datasets with differing labeling +granularity, supporting up to 123 distinct pathological findings. Further, we +propose to leverage domain adaptive pre-training of foundation models in +computer vision trained with self-supervision on generic image data, to adapt +them to the task of GIE medical diagnosis. Specifically, the EVA-02 model, +which is based on the vision transformer architecture and was trained on +ImageNet-22k with masked image modeling (using EVA-CLIP as a MIM teacher), is +pre-trained on the novel EndoExtend24 dataset to achieve domain adaptation, and +finally trained on the Capsule Endoscopy 2024 Challenge dataset. Experimental +results demonstrate strong performance with an F1 score of 0.88, an improvement +of about 39% over the baseline model's F1 score of 0.49. Additionally, the +model achieved a macro AUC score of 0.993 and a balanced accuracy of 89.3%. + +
+
+
+
+
+ + ♻ ☆ SegLLM: Multi-round Reasoning Segmentation + + +
+ We present SegLLM, a novel multi-round interactive reasoning segmentation +model that enhances LLM-based segmentation by exploiting conversational memory +of both visual and textual outputs. By leveraging a mask-aware multimodal LLM, +SegLLM re-integrates previous segmentation results into its input stream, +enabling it to reason about complex user intentions and segment objects in +relation to previously identified entities, including positional, +interactional, and hierarchical relationships, across multiple interactions. +This capability allows SegLLM to respond to visual and text queries in a +chat-like manner. Evaluated on the newly curated MRSeg benchmark, SegLLM +outperforms existing methods in multi-round interactive reasoning segmentation +by over 20%. Additionally, we observed that training on multi-round reasoning +segmentation data enhances performance on standard single-round referring +segmentation and localization tasks, resulting in a 5.5% increase in cIoU for +referring expression segmentation and a 4.5% improvement in Acc@0.5 for +referring expression localization. + +
+
+ comment: 22 pages, 10 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Tensor-Based Synchronization and the Low-Rankness of the Block Trifocal + Tensor NeurIPS 2024 + + +
+ The block tensor of trifocal tensors provides crucial geometric information +on the three-view geometry of a scene. The underlying synchronization problem +seeks to recover camera poses (locations and orientations up to a global +transformation) from the block trifocal tensor. We establish an explicit Tucker +factorization of this tensor, revealing a low multilinear rank of $(6,4,4)$ +independent of the number of cameras under appropriate scaling conditions. We +prove that this rank constraint provides sufficient information for camera +recovery in the noiseless case. The constraint motivates a synchronization +algorithm based on the higher-order singular value decomposition of the block +trifocal tensor. Experimental comparisons with state-of-the-art global +synchronization methods on real datasets demonstrate the potential of this +algorithm for significantly improving location estimation accuracy. Overall +this work suggests that higher-order interactions in synchronization problems +can be exploited to improve performance, beyond the usual pairwise-based +approaches. + +
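+ The rank-(6,4,4) projection at the heart of the described algorithm can be sketched with a truncated higher-order SVD in plain NumPy, as below; the random tensor stands in for a real block trifocal tensor, and no camera-recovery step is included.
+```python
+import numpy as np
+
+def unfold(T, mode):
+    return np.moveaxis(T, mode, 0).reshape(T.shape[mode], -1)
+
+def hosvd_truncate(T, ranks):
+    """Project a 3-way tensor onto the given multilinear rank via truncated HOSVD."""
+    U = []
+    for mode, r in enumerate(ranks):
+        u, _, _ = np.linalg.svd(unfold(T, mode), full_matrices=False)
+        U.append(u[:, :r])
+    core = T
+    for mode, u in enumerate(U):      # project onto leading mode subspaces
+        core = np.moveaxis(np.tensordot(u.T, np.moveaxis(core, mode, 0), axes=1), 0, mode)
+    approx = core
+    for mode, u in enumerate(U):      # map back to the original space
+        approx = np.moveaxis(np.tensordot(u, np.moveaxis(approx, mode, 0), axes=1), 0, mode)
+    return approx
+
+T = np.random.randn(30, 12, 12)           # placeholder for a block trifocal tensor
+T_lowrank = hosvd_truncate(T, (6, 4, 4))  # multilinear rank from the abstract
+print(np.linalg.norm(T - T_lowrank) / np.linalg.norm(T))
+```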
+
+ comment: 33 pages, 3 figures. Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Probabilistic Conceptual Explainers: Trustworthy Conceptual Explanations + for Vision Foundation Models ICML 2024 + + +
+ Vision transformers (ViTs) have emerged as a significant area of focus, +particularly for their capacity to be jointly trained with large language +models and to serve as robust vision foundation models. Yet, the development of +trustworthy explanation methods for ViTs has lagged, particularly in the +context of post-hoc interpretations of ViT predictions. Existing sub-image +selection approaches, such as feature-attribution and conceptual models, fall +short in this regard. This paper proposes five desiderata for explaining ViTs +-- faithfulness, stability, sparsity, multi-level structure, and parsimony -- +and demonstrates the inadequacy of current methods in meeting these criteria +comprehensively. We introduce a variational Bayesian explanation framework, +dubbed ProbAbilistic Concept Explainers (PACE), which models the distributions +of patch embeddings to provide trustworthy post-hoc conceptual explanations. +Our qualitative analysis reveals the distributions of patch-level concepts, +elucidating the effectiveness of ViTs by modeling the joint distribution of +patch embeddings and ViT's predictions. Moreover, these patch-level +explanations bridge the gap between image-level and dataset-level explanations, +thus completing the multi-level structure of PACE. Through extensive +experiments on both synthetic and real-world datasets, we demonstrate that PACE +surpasses state-of-the-art methods in terms of the defined desiderata. + +
+
+ comment: Proceedings of the 41st International Conference on Machine Learning + (ICML 2024) +
+
+
+
+
+ + ♻ ☆ Adaptive Aggregation Weights for Federated Segmentation of Pancreas MRI + + +
+ Federated learning (FL) enables collaborative model training across +institutions without sharing sensitive data, making it an attractive solution +for medical imaging tasks. However, traditional FL methods, such as Federated +Averaging (FedAvg), face difficulties in generalizing across domains due to +variations in imaging protocols and patient demographics across institutions. +This challenge is particularly evident in pancreas MRI segmentation, where +anatomical variability and imaging artifacts significantly impact performance. +In this paper, we conduct a comprehensive evaluation of FL algorithms for +pancreas MRI segmentation and introduce a novel approach that incorporates +adaptive aggregation weights. By dynamically adjusting the contribution of each +client during model aggregation, our method accounts for domain-specific +differences and improves generalization across heterogeneous datasets. +Experimental results demonstrate that our approach enhances segmentation +accuracy and reduces the impact of domain shift compared to conventional FL +methods while maintaining privacy-preserving capabilities. Significant +performance improvements are observed across multiple hospitals (centers). + +
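+ A minimal sketch of aggregation with adaptive client weights is given below: instead of fixed dataset-size weights as in FedAvg, each client's contribution is modulated by a reported score such as local validation loss. The softmax-style weighting rule and all numbers are illustrative assumptions, not the paper's exact scheme.
+```python
+import numpy as np
+
+def adaptive_aggregate(client_params, client_scores, temperature=1.0):
+    """Aggregate client model parameters with adaptive weights.
+
+    client_params: list of flat parameter vectors (one per client/hospital).
+    client_scores: e.g. local validation losses; clients that fit the current
+    global model poorly get *more* weight so their domain is not washed out
+    (an illustrative rule only).
+    """
+    scores = np.asarray(client_scores, dtype=float)
+    weights = np.exp(scores / temperature)   # harder domains -> larger weight
+    weights = weights / weights.sum()
+    stacked = np.stack(client_params)
+    return (weights[:, None] * stacked).sum(axis=0), weights
+
+clients = [np.random.randn(1000) for _ in range(4)]   # 4 hospitals (placeholder params)
+val_losses = [0.21, 0.35, 0.18, 0.50]
+global_params, w = adaptive_aggregate(clients, val_losses)
+print(w, global_params.shape)
+```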
+
+
+
+
+ + ♻ ☆ RGB2Point: 3D Point Cloud Generation from Single RGB Images + + +
+ We introduce RGB2Point, a Transformer-based method that generates a 3D point +cloud from a single unposed RGB image. RGB2Point takes an input image of an object +and generates a dense 3D point cloud. Contrary to prior works based on CNN +layers and diffusion denoising approaches, we use pre-trained Transformer +layers that are fast and generate high-quality point clouds with consistent +quality over available categories. Our generated point clouds demonstrate high +quality on a real-world dataset, as evidenced by improved Chamfer distance +(51.15%) and Earth Mover's distance (45.96%) metrics compared to the current +state-of-the-art. Additionally, our approach shows better quality on a +synthetic dataset, achieving better Chamfer distance (39.26%), Earth Mover's +distance (26.95%), and F-score (47.16%). Moreover, our method produces 63.1% +more consistent high-quality results across various object categories compared +to prior works. Furthermore, RGB2Point is computationally efficient, requiring +only 2.3GB of VRAM to reconstruct a 3D point cloud from a single RGB image, and +our implementation generates the results 15,133x faster than a SOTA +diffusion-based model. +
+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ♻ ☆ Continuous Spatio-Temporal Memory Networks for 4D Cardiac Cine MRI + Segmentation + + +
+ Current cardiac cine magnetic resonance image (cMR) studies focus on the end +diastole (ED) and end systole (ES) phases, while ignoring the abundant temporal +information in the whole image sequence. This is because whole sequence +segmentation is currently a tedious and inaccurate process. Conventional whole +sequence segmentation approaches first estimate the motion field between +frames, which is then used to propagate the mask along the temporal axis. +However, the mask propagation results could be prone to error, especially for +the basal and apex slices, where through-plane motion leads to significant +morphology and structural change during the cardiac cycle. Inspired by recent +advances in video object segmentation (VOS) based on spatio-temporal memory +(STM) networks, we propose a continuous STM (CSTM) network for semi-supervised +whole heart and whole sequence cMR segmentation. Our CSTM network takes full +advantage of the spatial, scale, temporal and through-plane continuity prior of +the underlying heart anatomy structures, to achieve accurate and fast 4D +segmentation. Results of extensive experiments across multiple cMR datasets +show that our method can improve the 4D cMR segmentation performance, +especially for the hard-to-segment regions. +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Multi-Object Hallucination in Vision-Language Models NeurIPS 2024 + + +
+ Large vision language models (LVLMs) often suffer from object hallucination, +producing objects not present in the given images. While current benchmarks for +object hallucination primarily concentrate on the presence of a single object +class rather than individual entities, this work systematically investigates +multi-object hallucination, examining how models misperceive (e.g., invent +nonexistent objects or become distracted) when tasked with focusing on multiple +objects simultaneously. We introduce Recognition-based Object Probing +Evaluation (ROPE), an automated evaluation protocol that considers the +distribution of object classes within a single image during testing and uses +visual referring prompts to eliminate ambiguity. With comprehensive empirical +studies and analysis of potential factors leading to multi-object +hallucination, we found that (1). LVLMs suffer more hallucinations when +focusing on multiple objects compared to a single object. (2). The tested +object class distribution affects hallucination behaviors, indicating that +LVLMs may follow shortcuts and spurious correlations. (3). Hallucinatory +behaviors are influenced by data-specific factors, salience and frequency, and +model intrinsic behaviors. We hope to enable LVLMs to recognize and reason +about multiple objects that often occur in realistic visual scenes, provide +insights, and quantify our progress towards mitigating the issues. + +
+
+ comment: Accepted to NeurIPS 2024 | Project page: + https://multi-object-hallucination.github.io/ +
+
+
+
+
+ + ♻ ☆ LucidGrasp: Robotic Framework for Autonomous Manipulation of Laboratory + Equipment with Different Degrees of Transparency via 6D Pose Estimation + + +
+ Many modern robotic systems operate autonomously, however they often lack the +ability to accurately analyze the environment and adapt to changing external +conditions, while teleoperation systems often require special operator skills. +In the field of laboratory automation, the number of automated processes is +growing, however such systems are usually developed to perform specific tasks. +In addition, many of the objects used in this field are transparent, making it +difficult to analyze them using visual channels. The contributions of this work +include the development of a robotic framework with autonomous mode for +manipulating liquid-filled objects with different degrees of transparency in +complex pose combinations. The conducted experiments demonstrated the +robustness of the designed visual perception system to accurately estimate +object poses for autonomous manipulation, and confirmed the performance of the +algorithms in dexterous operations such as liquid dispensing. The proposed +robotic framework can be applied for laboratory automation, since it allows +solving the problem of performing non-trivial manipulation tasks with the +analysis of object poses of varying degrees of transparency and liquid levels, +requiring high accuracy and repeatability. + +
+
+ comment: Accepted to the 2024 IEEE International Conference on Robotics and + Biomimetics (IEEE ROBIO 2024), 6 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video + Generation + + +
+ Human beings are endowed with a complementary learning system, which bridges +the slow learning of general world dynamics with fast storage of episodic +memory from a new experience. Previous video generation models, however, +primarily focus on slow learning by pre-training on vast amounts of data, +overlooking the fast learning phase crucial for episodic memory storage. This +oversight leads to inconsistencies across temporally distant frames when +generating longer videos, as these frames fall beyond the model's context +window. To this end, we introduce SlowFast-VGen, a novel dual-speed learning +system for action-driven long video generation. Our approach incorporates a +masked conditional video diffusion model for the slow learning of world +dynamics, alongside an inference-time fast learning strategy based on a +temporal LoRA module. Specifically, the fast learning process updates its +temporal LoRA parameters based on local inputs and outputs, thereby efficiently +storing episodic memory in its parameters. We further propose a slow-fast +learning loop algorithm that seamlessly integrates the inner fast learning loop +into the outer slow learning loop, enabling the recall of prior multi-episode +experiences for context-aware skill learning. To facilitate the slow learning +of an approximate world model, we collect a large-scale dataset of 200k videos +with language action annotations, covering a wide range of scenarios. Extensive +experiments show that SlowFast-VGen outperforms baselines across various +metrics for action-driven video generation, achieving an FVD score of 514 +compared to 782, and maintaining consistency in longer videos, with an average +of 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm +significantly enhances performances on long-horizon planning tasks as well. +Project Website: https://slowfast-vgen.github.io + +
+
+
+
+
+ + ♻ ☆ Measuring Sound Symbolism in Audio-visual Models + + +
+ Audio-visual pre-trained models have gained substantial attention recently +and demonstrated superior performance on various audio-visual tasks. This study +investigates whether pre-trained audio-visual models demonstrate non-arbitrary +associations between sounds and visual representations$\unicode{x2013}$known as +sound symbolism$\unicode{x2013}$which is also observed in humans. We developed +a specialized dataset with synthesized images and audio samples and assessed +these models using a non-parametric approach in a zero-shot setting. Our +findings reveal a significant correlation between the models' outputs and +established patterns of sound symbolism, particularly in models trained on +speech data. These results suggest that such models can capture sound-meaning +connections akin to human language processing, providing insights into both +cognitive architectures and machine learning strategies. + +
+
+ comment: Errors in the introduction part that might potentially affect the + integrity of the paper. Withdraw at the point. Will replace with an updated + version in the future +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 58 + +
+
+
+ + ☆ Bridging the Human to Robot Dexterity Gap through Object-Oriented + Rewards + + +
+ Training robots directly from human videos is an emerging area in robotics +and computer vision. While there has been notable progress with two-fingered +grippers, learning autonomous tasks for multi-fingered robot hands in this way +remains challenging. A key reason for this difficulty is that a policy trained +on human hands may not directly transfer to a robot hand due to morphology +differences. In this work, we present HuDOR, a technique that enables online +fine-tuning of policies by directly computing rewards from human videos. +Importantly, this reward function is built using object-oriented trajectories +derived from off-the-shelf point trackers, providing meaningful learning +signals despite the morphology gap and visual differences between human and +robot hands. Given a single video of a human solving a task, such as gently +opening a music box, HuDOR enables our four-fingered Allegro hand to learn the +task with just an hour of online interaction. Our experiments across four tasks +show that HuDOR achieves a 4x improvement over baselines. Code and videos are +available on our website, https://object-rewards.github.io. + +
+
+
+
+
+ + ☆ DisCo: Distributed Contact-Rich Trajectory Optimization for Forceful + Multi-Robot Collaboration + + +
+ We present DisCo, a distributed algorithm for contact-rich, multi-robot +tasks. DisCo is a distributed contact-implicit trajectory optimization +algorithm, which allows a group of robots to optimize a time sequence of forces +to objects and to their environment to accomplish tasks such as collaborative +manipulation, robot team sports, and modular robot locomotion. We build our +algorithm on a variant of the Alternating Direction Method of Multipliers +(ADMM), where each robot computes its own contact forces and contact-switching +events from a smaller single-robot, contact-implicit trajectory optimization +problem, while cooperating with other robots through dual variables, enforcing +constraints between robots. Each robot iterates between solving its local +problem, and communicating over a wireless mesh network to enforce these +consistency constraints with its neighbors, ultimately converging to a +coordinated plan for the group. The local problems solved by each robot are +significantly less challenging than a centralized problem with all robots' +contact forces and switching events, improving the computational efficiency, +while also preserving the privacy of some aspects of each robot's operation. We +demonstrate the effectiveness of our algorithm in simulations of collaborative +manipulation, multi-robot team sports scenarios, and in modular robot +locomotion, where DisCo achieves $3$x higher success rates with a 2.5x to 5x +faster computation time. Further, we provide results of hardware experiments on +a modular truss robot, with three collaborating truss nodes planning +individually while working together to produce a punctuated rolling-gate motion +of the composite structure. Videos are available on the project page: +https://disco-opt.github.io. + +
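+ The consensus structure described above can be illustrated with a tiny ADMM example in which each "robot" solves a private least-squares problem and agrees with the others only through a shared consensus variable and dual updates. All problem data is synthetic, and none of the contact-implicit or networking details are modeled.
+```python
+import numpy as np
+
+# Each robot i holds a private quadratic cost 0.5*||A_i x - b_i||^2 over a shared plan x.
+rng = np.random.default_rng(0)
+n, robots, rho, iters = 5, 3, 1.0, 100
+A = [rng.standard_normal((8, n)) for _ in range(robots)]
+b = [rng.standard_normal(8) for _ in range(robots)]
+
+x = [np.zeros(n) for _ in range(robots)]   # local copies of the plan
+u = [np.zeros(n) for _ in range(robots)]   # scaled dual variables
+z = np.zeros(n)                            # consensus variable (shared over the network)
+
+for _ in range(iters):
+    for i in range(robots):  # local solve: argmin 0.5||A_i x - b_i||^2 + rho/2 ||x - z + u_i||^2
+        x[i] = np.linalg.solve(A[i].T @ A[i] + rho * np.eye(n),
+                               A[i].T @ b[i] + rho * (z - u[i]))
+    z = np.mean([x[i] + u[i] for i in range(robots)], axis=0)   # consensus step
+    for i in range(robots):
+        u[i] += x[i] - z                   # dual update enforcing agreement
+
+print(max(np.linalg.norm(x[i] - z) for i in range(robots)))     # consensus residual ~ 0
+```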
+
+
+
+
+ + ☆ SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video + Generation + + +
+ Human beings are endowed with a complementary learning system, which bridges +the slow learning of general world dynamics with fast storage of episodic +memory from a new experience. Previous video generation models, however, +primarily focus on slow learning by pre-training on vast amounts of data, +overlooking the fast learning phase crucial for episodic memory storage. This +oversight leads to inconsistencies across temporally distant frames when +generating longer videos, as these frames fall beyond the model's context +window. To this end, we introduce SlowFast-VGen, a novel dual-speed learning +system for action-driven long video generation. Our approach incorporates a +masked conditional video diffusion model for the slow learning of world +dynamics, alongside an inference-time fast learning strategy based on a +temporal LoRA module. Specifically, the fast learning process updates its +temporal LoRA parameters based on local inputs and outputs, thereby efficiently +storing episodic memory in its parameters. We further propose a slow-fast +learning loop algorithm that seamlessly integrates the inner fast learning loop +into the outer slow learning loop, enabling the recall of prior multi-episode +experiences for context-aware skill learning. To facilitate the slow learning +of an approximate world model, we collect a large-scale dataset of 200k videos +with language action annotations, covering a wide range of scenarios. Extensive +experiments show that SlowFast-VGen outperforms baselines across various +metrics for action-driven video generation, achieving an FVD score of 514 +compared to 782, and maintaining consistency in longer videos, with an average +of 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm +significantly enhances performances on long-horizon planning tasks as well. +Project Website: https://slowfast-vgen.github.io + +
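+ The "fast learning" idea can be sketched as a frozen linear map plus trainable low-rank LoRA factors that are updated by a few gradient steps on local inputs and outputs at inference time, as below. The squared-error objective, shapes, and learning rate are illustrative and far simpler than the temporal LoRA module described above.
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+d, r, lr, steps = 32, 4, 1e-2, 200
+
+W = rng.standard_normal((d, d)) / np.sqrt(d)   # frozen "slow-learned" weights
+A = rng.standard_normal((r, d)) * 0.01         # LoRA down-projection (trainable)
+B = np.zeros((d, r))                           # LoRA up-projection (trainable)
+
+# local inputs/outputs from the current episode (synthetic placeholders)
+X = rng.standard_normal((64, d))
+Y = X @ (W + 0.1 * rng.standard_normal((d, d)) / np.sqrt(d)).T
+
+for _ in range(steps):                         # fast loop: update only A and B
+    err = X @ (W + B @ A).T - Y                # (64, d) residual
+    grad_M = err.T @ X / len(X)                # gradient of 0.5 * per-sample MSE w.r.t. B @ A
+    grad_B = grad_M @ A.T
+    grad_A = B.T @ grad_M
+    B -= lr * grad_B
+    A -= lr * grad_A
+
+final_err = X @ (W + B @ A).T - Y
+print(float((final_err ** 2).mean()))          # fitting error after the fast updates
+```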
+
+
+
+
+ + ☆ EMMA: End-to-End Multimodal Model for Autonomous Driving + + +
+ We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving. +Built on a multi-modal large language model foundation, EMMA directly maps raw +camera sensor data into various driving-specific outputs, including planner +trajectories, perception objects, and road graph elements. EMMA maximizes the +utility of world knowledge from the pre-trained large language models, by +representing all non-sensor inputs (e.g. navigation instructions and ego +vehicle status) and outputs (e.g. trajectories and 3D locations) as natural +language text. This approach allows EMMA to jointly process various driving +tasks in a unified language space, and generate the outputs for each task using +task-specific prompts. Empirically, we demonstrate EMMA's effectiveness by +achieving state-of-the-art performance in motion planning on nuScenes as well +as competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also +yields competitive results for camera-primary 3D object detection on the Waymo +Open Dataset (WOD). We show that co-training EMMA with planner trajectories, +object detection, and road graph tasks yields improvements across all three +domains, highlighting EMMA's potential as a generalist model for autonomous +driving applications. However, EMMA also exhibits certain limitations: it can +process only a small amount of image frames, does not incorporate accurate 3D +sensing modalities like LiDAR or radar and is computationally expensive. We +hope that our results will inspire further research to mitigate these issues +and to further evolve the state of the art in autonomous driving model +architectures. + +
+
+ comment: Blog post: https://waymo.com/blog/2024/10/introducing-emma/ +
+
+
+
+
+ + ☆ Keypoint Abstraction using Large Models for Object-Relative Imitation + Learning + + +
+ Generalization to novel object configurations and instances across diverse +tasks and environments is a critical challenge in robotics. Keypoint-based +representations have been proven effective as a succinct representation for +capturing essential object features, and for establishing a reference frame in +action prediction, enabling data-efficient learning of robot skills. However, +their manual design nature and reliance on additional human labels limit their +scalability. In this paper, we propose KALM, a framework that leverages large +pre-trained vision-language models (LMs) to automatically generate +task-relevant and cross-instance consistent keypoints. KALM distills robust and +consistent keypoints across views and objects by generating proposals using LMs +and verifies them against a small set of robot demonstration data. Based on the +generated keypoints, we can train keypoint-conditioned policy models that +predict actions in keypoint-centric frames, enabling robots to generalize +effectively across varying object poses, camera views, and object instances +with similar functional shapes. Our method demonstrates strong performance in +the real world, adapting to different tasks and environments from only a +handful of demonstrations while requiring no additional labels. Website: +https://kalm-il.github.io/ + +
+
+ comment: CoRL LangRob Workshop, 2024 +
+
+
+
+
+ + ☆ EMOTION: Expressive Motion Sequence Generation for Humanoid Robots with + In-Context Learning + + +
+ This paper introduces a framework, called EMOTION, for generating expressive +motion sequences in humanoid robots, enhancing their ability to engage in +humanlike non-verbal communication. Non-verbal cues such as facial expressions, +gestures, and body movements play a crucial role in effective interpersonal +interactions. Despite the advancements in robotic behaviors, existing methods +often fall short in mimicking the diversity and subtlety of human non-verbal +communication. To address this gap, our approach leverages the in-context +learning capability of large language models (LLMs) to dynamically generate +socially appropriate gesture motion sequences for human-robot interaction. We +use this framework to generate 10 different expressive gestures and conduct +online user studies comparing the naturalness and understandability of the +motions generated by EMOTION and its human-feedback version, EMOTION++, against +those by human operators. The results demonstrate that our approach either +matches or surpasses human performance in generating understandable and natural +robot motions under certain scenarios. We also provide design implications for +future research to consider a set of variables when generating expressive +robotic gestures. + +
+
+
+
+
+ + ☆ Levels of explanation -- implementation and evaluation of what and when + for different time-sensitive tasks + + +
+ In this work, we focused on constructing and evaluating levels of +explanation (LOE) that address two basic aspects of HRI: 1. What information +should be communicated to the user by the robot? 2. When should the robot +communicate this information? For constructing the LOE, we defined two terms, +verbosity and explanation patterns, each with two levels (verbosity -- high and +low, explanation patterns -- dynamic and static). Based on these parameters, +three different LOE (high, medium, and low) were constructed and evaluated in a +user study with a telepresence robot. The user study was conducted for a +simulated telerobotic healthcare task with two different conditions related to +time sensitivity, as evaluated by two different user groups -- one that +performed the task within a time limit and the other with no time limit. We +found that the high LOE was preferred in terms of adequacy of explanation, +number of collisions, number of incorrect movements, and number of +clarifications when users performed the experiment without a time limit. We +also found that both high and medium LOE did not have significant +differences in completion time, the fluency of HRI, and trust in the robot. +When users performed the experiment with a time limit, high and +medium LOE had better task performances and were preferred to the low LOE in +terms of completion time, fluency, adequacy of explanation, trust, number of +collisions, number of incorrect movements and number of clarifications. Future +directions for advancing LOE are discussed. +
+
+
+
+
+ + ☆ VisualPredicator: Learning Abstract World Models with Neuro-Symbolic + Predicates for Robot Planning + + +
+ Broadly intelligent agents should form task-specific abstractions that +selectively expose the essential elements of a task, while abstracting away the +complexity of the raw sensorimotor space. In this work, we present +Neuro-Symbolic Predicates, a first-order abstraction language that combines the +strengths of symbolic and neural knowledge representations. We outline an +online algorithm for inventing such predicates and learning abstract world +models. We compare our approach to hierarchical reinforcement learning, +vision-language model planning, and symbolic predicate invention approaches, on +both in- and out-of-distribution tasks across five simulated robotic domains. +Results show that our approach offers better sample complexity, stronger +out-of-distribution generalization, and improved interpretability. + +
+
+ comment: In submission +
+
+
+
+
+ + ☆ Leader-Follower 3D Formation for Underwater Robots + + +
+ The schooling behavior of fish is hypothesized to confer many survival +benefits, including foraging success, safety from predators, and energy savings +through hydrodynamic interactions when swimming in formation. Underwater robot +collectives may be able to achieve similar benefits in future applications, +e.g. using formation control to achieve efficient spatial sampling for +environmental monitoring. Although many theoretical algorithms exist for +multi-robot formation control, they have not been tested in the underwater +domain due to the fundamental challenges in underwater communication. Here we +introduce a leader-follower strategy for underwater formation control that +allows us to realize complex 3D formations, using purely vision-based +perception and a reactive control algorithm that is low computation. We use a +physical platform, BlueSwarm, to demonstrate for the first time an experimental +realization of inline, side-by-side, and staggered swimming 3D formations. More +complex formations are studied in a physics-based simulator, providing new +insights into the convergence and stability of formations given underwater +inertial/drag conditions. Our findings lay the groundwork for future +applications of underwater robot swarms in aquatic environments with minimal +communication. + +
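+ A toy reactive leader-follower rule in 3D is sketched below: each follower steers toward a desired offset from its perceived leader position with a saturated proportional step. The gains, offsets, and assumption of perfect leader-position perception are for illustration only, not the BlueSwarm vision-based controller.
+```python
+import numpy as np
+
+def follower_step(follower_pos, leader_pos, desired_offset, gain=0.5, max_speed=0.2):
+    """One reactive control step: move toward (leader position + desired offset)."""
+    target = leader_pos + desired_offset
+    step = gain * (target - follower_pos)
+    speed = np.linalg.norm(step)
+    if speed > max_speed:                  # saturate the commanded velocity
+        step = step / speed * max_speed
+    return follower_pos + step
+
+leader = np.array([0.0, 0.0, -1.0])
+followers = [np.array([1.0, 1.0, -1.5]), np.array([-1.0, 0.5, -0.5])]
+offsets = [np.array([-0.5, 0.3, 0.0]), np.array([-0.5, -0.3, 0.0])]   # staggered formation (illustrative)
+
+for t in range(50):
+    leader = leader + np.array([0.05, 0.0, 0.0])   # leader swims forward
+    followers = [follower_step(f, leader, o) for f, o in zip(followers, offsets)]
+
+print([np.round(f - leader, 2) for f in followers])  # approximately the desired offsets
+```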
+
+ comment: Accepted at DARS 2024 (The 17th International Symposium on + Distributed Autonomous Robotic Systems) +
+
+
+
+
+ + ☆ S3PT: Scene Semantics and Structure Guided Clustering to Boost + Self-Supervised Pre-Training for Autonomous Driving + + +
+ Recent self-supervised clustering-based pre-training techniques like DINO and
+Cribo have shown impressive results for downstream detection and segmentation
+tasks. However, real-world applications such as autonomous driving face
+challenges with imbalanced object class and size distributions and complex
+scene geometries. In this paper, we propose S3PT, a novel scene semantics and
+structure guided clustering approach that provides more scene-consistent
+objectives for self-supervised training. Specifically, our contributions are
+threefold: First, we incorporate semantic distribution consistent clustering to
+encourage better representation of rare classes such as motorcycles or animals.
+Second, we introduce object diversity consistent spatial clustering to handle
+imbalanced and diverse object sizes, ranging from large background areas to
+small objects such as pedestrians and traffic signs. Third, we propose a
+depth-guided spatial clustering to regularize learning based on geometric
+information of the scene, thus further refining region separation on the
+feature level. Our learned representations significantly improve performance in
+downstream semantic segmentation and 3D object detection tasks on the nuScenes,
+nuImages, and Cityscapes datasets and show promising domain translation
+properties.
+
+
+ comment: Accepted for WACV 2025 +
+
+
+
+
+ + ☆ FilMBot: A High-Speed Soft Parallel Robotic Micromanipulator + + +
+ Soft robotic manipulators are generally slow despite their great +adaptability, resilience, and compliance. This limitation also extends to +current soft robotic micromanipulators. Here, we introduce FilMBot, a 3-DOF +film-based, electromagnetically actuated, soft kinematic robotic +micromanipulator achieving speeds up to 2117 $\deg$/s and 2456 $\deg$/s in +$\alpha$ and $\beta$ angular motions, with corresponding linear velocities of +1.61 m/s and 1.92 m/s using a 4-cm needle end-effector, and 1.57 m/s along the +Z axis. The robot can reach ~1.50 m/s in path-following tasks, operates at +frequencies up to 30 Hz, and remains functional up to 50 Hz. It demonstrates +high precision (~6.3 $\mu$m, or ~0.05% of its workspace) in small +path-following tasks. The novel combination of the low-stiffness soft kinematic +film structure and strong electromagnetic actuation in FilMBot opens new +avenues for soft robotics. Furthermore, its simple construction and +inexpensive, readily accessible components could broaden the application of +micromanipulators beyond current academic and professional users. + +
+
+ comment: 12 pages, 15 figures +
+
+
+
+
+ + ☆ TumblerBots: Tumbling Robotic sensors for Minimally-invasive Benthic + Monitoring + + +
+ Robotic systems show significant promise for water environmental sensing
+applications such as water quality monitoring, pollution mapping, and
+biodiversity data collection.
+ Conventional deployment methods often disrupt fragile ecosystems, preventing
+depiction of the undisturbed environmental condition. In response to this
+challenge, we propose a novel framework utilizing a lightweight tumbler system
+equipped with a sensing unit, deployed via a drone. This design minimizes
+disruption to the water habitat by maintaining a slow descent. The sensing unit
+is detached once on the water surface, enabling precise and non-invasive data
+collection from the benthic zone.
+ The tumbler is designed to be lightweight and compact, enabling deployment
+via a drone. The sensing pod, which detaches from the tumbler and descends to
+the bottom of the water body, is equipped with temperature and pressure
+sensors, as well as a buoyancy system. The latter, activated upon task
+completion, utilizes a silicone membrane inflated via a chemical reaction. The
+reaction generates a pressure of 70 kPa, causing the silicone membrane to
+expand by 30\%, which exceeds the 5.7\% volume increase required for positive
+buoyancy. The tumblers, made from eco-friendly materials to minimize
+environmental impact when lost during the mission, were tested for their
+gliding ratio and descent rate. They exhibit a low descent rate, in the range
+of 0.8 to 2.5 meters per second, which minimizes disturbance to the ecosystem
+upon water landing. Additionally, the system demonstrated robustness in
+moderate to strong wind conditions during outdoor tests, validating the overall
+framework.
+
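+ The buoyancy margin quoted above can be sanity-checked with a few lines of
+Python. The pod mass and displaced volume below are purely hypothetical
+numbers chosen to illustrate the formula; only the roughly 5.7\% requirement
+comes from the abstract.
+
+  RHO_WATER = 1000.0  # kg/m^3, fresh water
+
+  def required_volume_increase(mass_kg, volume_m3):
+      """Fractional volume increase a sinking pod needs for neutral buoyancy."""
+      return mass_kg / (RHO_WATER * volume_m3) - 1.0
+
+  # Hypothetical pod: 52.8 g displacing 50 mL sinks slightly; it needs about
+  # 5.6% more volume, in line with the ~5.7% figure above, so a 30% membrane
+  # expansion leaves a comfortable margin.
+  print(required_volume_increase(0.0528, 50e-6))
+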
+
+ comment: Submitted to IEEE Robosoft 2025 +
+
+
+
+
+ + ☆ Neural Attention Field: Emerging Point Relevance in 3D Scenes for + One-Shot Dexterous Grasping + + +
+ One-shot transfer of dexterous grasps to novel scenes with object and context
+variations has been a challenging problem. While distilled feature fields from
+large vision models have enabled semantic correspondences across 3D scenes,
+their features are point-based and restricted to object surfaces, limiting
+their capability of modeling complex semantic feature distributions for
+hand-object interactions. In this work, we propose the \textit{neural attention
+field} for representing semantic-aware dense feature fields in the 3D space by
+modeling inter-point relevance instead of individual point features. Core to it
+is a transformer decoder that computes the cross-attention between any 3D query
+point and all the scene points, and provides the query point feature with an
+attention-based aggregation. We further propose a self-supervised framework for
+training the transformer decoder from only a few 3D point clouds without hand
+demonstrations. Post-training, the attention field can be applied to novel
+scenes for semantics-aware dexterous grasping from a one-shot demonstration.
+Experiments show that our method provides better optimization landscapes by
+encouraging the end-effector to focus on task-relevant scene regions, resulting
+in significant improvements in success rates on real robots compared with the
+feature-field-based methods.
+
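+ To make the attention-based aggregation concrete, here is a minimal
+single-head PyTorch sketch of querying such a field: a 3D query point attends
+over all scene points and receives a weighted feature. The layer shapes and
+names are illustrative assumptions, not the paper's architecture.
+
+  import torch
+
+  def attention_field_feature(query_xyz, scene_xyz, scene_feat, Wq, Wk, Wv):
+      """Attention-aggregated feature for one 3D query point (single head).
+
+      query_xyz : (3,)   scene_xyz : (N, 3)   scene_feat : (N, C)
+      Wq, Wk, Wv: linear maps; the query embedding uses coordinates only.
+      """
+      q = Wq(query_xyz)                                      # (D,)
+      k = Wk(torch.cat([scene_xyz, scene_feat], dim=-1))     # (N, D)
+      v = Wv(scene_feat)                                     # (N, D)
+      attn = torch.softmax(k @ q / q.shape[0] ** 0.5, dim=0) # (N,) weights
+      return attn @ v                                        # (D,) feature
+
+  N, C, D = 1024, 64, 128
+  Wq, Wk, Wv = (torch.nn.Linear(3, D), torch.nn.Linear(3 + C, D),
+                torch.nn.Linear(C, D))
+  feat = attention_field_feature(torch.rand(3), torch.rand(N, 3),
+                                 torch.rand(N, C), Wq, Wk, Wv)
+  print(feat.shape)  # torch.Size([128])
+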
+
+
+
+
+ + ☆ Exploring the Potential of Multi-modal Sensing Framework for Forest + Ecology ICRA 2024 + + +
+ Forests offer essential resources and services to humanity, yet preserving +and restoring them presents challenges, particularly due to the limited +availability of actionable data, especially in hard-to-reach areas like forest +canopies. Accessibility continues to pose a challenge for biologists collecting +data in forest environments, often requiring them to invest significant time +and energy in climbing trees to place sensors. This operation not only consumes +resources but also exposes them to danger. Efforts in robotics have been +directed towards accessing the tree canopy using robots. A swarm of drones has +showcased autonomous navigation through the canopy, maneuvering with agility +and evading tree collisions, all aimed at mapping the area and collecting data. +However, relying solely on free-flying drones has proven insufficient for data +collection. Flying drones within the canopy generates loud noise, disturbing +animals and potentially corrupting the data. Additionally, commercial drones +often have limited autonomy for dexterous tasks where aerial physical +interaction could be required, further complicating data acquisition efforts. +Aerial deployed sensor placement methods such as bio-gliders and sensor +shooting have proven effective for data collection within the lower canopy. +However, these methods face challenges related to retrieving the data and +sensors, often necessitating human intervention. + +
+
+ comment: Peer-reviewed and accepted in IEEE ICRA 2024 Workshop RUNE +
+
+
+
+
+ + ☆ Camber-changing flapping hydrofoils for efficient and environmental-safe + water propulsion system + + +
+ This research introduces a novel hydrofoil-based propulsion framework for +unmanned aquatic robots, inspired by the undulating locomotion observed in +select aquatic species. The proposed system incorporates a camber-modulating +mechanism to enhance hydrofoil propulsive force generation and eventually +efficiency. Through dynamic simulations, we validate the effectiveness of the +camber-adjusting hydrofoil compared to a symmetric counterpart. The results +demonstrate a significant improvement in horizontal thrust, emphasizing the +potential of the cambering approach to enhance propulsive performance. +Additionally, a prototype flipper design is presented, featuring individual +control of heave and pitch motions, as well as a camber-adjustment mechanism. +The integrated system not only provides efficient water-based propulsion but +also offers the capacity for generating vertical forces during take-off +maneuvers for seaplanes. The design is tailored to harness wave energy, +contributing to the exploration of alternative energy resources. This work +advances the understanding of bionic oscillatory principles for aquatic robots +and provides a foundation for future developments in environmentally safe and +agile underwater exploration. + +
+
+ comment: Peer-reviewed and accepted in Ubiquitous Robots 2024, New York City +
+
+
+
+
+ + ☆ Online Intrinsic Rewards for Decision Making Agents from Large Language + Model Feedback + + +
+ Automatically synthesizing dense rewards from natural language descriptions
+is a promising paradigm in reinforcement learning (RL), with applications to
+sparse reward problems, open-ended exploration, and hierarchical skill design.
+Recent works have made promising steps by exploiting the prior knowledge of
+large language models (LLMs). However, these approaches suffer from important
+limitations: they are either not scalable to problems requiring billions of
+environment samples; limited to reward functions expressible by compact code,
+which may require source code and have difficulty capturing nuanced semantics;
+or reliant on a diverse offline dataset, which may not exist or be impossible
+to collect. In this work, we address these limitations through a combination of
+algorithmic and systems-level contributions. We propose ONI, a distributed
+architecture that simultaneously learns an RL policy and an intrinsic reward
+function using LLM feedback. Our approach annotates the agent's collected
+experience via an asynchronous LLM server, which is then distilled into an
+intrinsic reward model. We explore a range of algorithmic choices for reward
+modeling with varying complexity, including hashing, classification, and
+ranking models. By studying their relative tradeoffs, we shed light on
+questions regarding intrinsic reward design for sparse reward problems. Our
+approach achieves state-of-the-art performance across a range of challenging,
+sparse reward tasks from the NetHack Learning Environment in a simple unified
+process, solely using the agent's gathered experience, without requiring
+external datasets or source code. We make our code available at \url{URL}
+(coming soon).
+
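+ A stripped-down sketch of the hashing-based reward-model idea mentioned
+above: LLM judgments of textual observations are cached by hash so each
+distinct message is annotated only once. The class and the toy judge are
+assumptions for illustration, not ONI's actual implementation.
+
+  import hashlib
+
+  class HashedIntrinsicReward:
+      """Caches scalar LLM judgments of textual observations (sketch).
+
+      llm_judge is any callable str -> float in [0, 1]; in a full system this
+      call would be served asynchronously by a separate LLM server.
+      """
+      def __init__(self, llm_judge):
+          self.llm_judge = llm_judge
+          self.cache = {}
+
+      def __call__(self, obs_text):
+          key = hashlib.sha1(obs_text.encode()).hexdigest()
+          if key not in self.cache:
+              self.cache[key] = self.llm_judge(obs_text)
+          return self.cache[key]
+
+  # Toy judge: reward messages that report finding an item.
+  reward_fn = HashedIntrinsicReward(
+      lambda s: 1.0 if "you find" in s.lower() else 0.0)
+  print(reward_fn("You find a silver ring."), reward_fn("You hear a noise."))
+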
+
+
+
+
+ + ☆ DexGraspNet 2.0: Learning Generative Dexterous Grasping in Large-scale + Synthetic Cluttered Scenes + + +
+ Grasping in cluttered scenes remains highly challenging for dexterous hands +due to the scarcity of data. To address this problem, we present a large-scale +synthetic benchmark, encompassing 1319 objects, 8270 scenes, and 427 million +grasps. Beyond benchmarking, we also propose a novel two-stage grasping method +that learns efficiently from data by using a diffusion model that conditions on +local geometry. Our proposed generative method outperforms all baselines in +simulation experiments. Furthermore, with the aid of test-time-depth +restoration, our method demonstrates zero-shot sim-to-real transfer, attaining +90.7% real-world dexterous grasping success rate in cluttered scenes. + +
+
+
+
+
+ + ☆ A Comparison of Prompt Engineering Techniques for Task Planning and + Execution in Service Robotics + + +
+ Recent advances in LLMs have been instrumental in autonomous robot control and
+human-robot interaction by leveraging their vast general knowledge and
+capabilities to understand and reason across a wide range of tasks and
+scenarios. Previous works have investigated various prompt engineering
+techniques for improving the performance of LLMs to accomplish tasks, while
+others have proposed methods that utilize LLMs to plan and execute tasks based
+on the available functionalities of a given robot platform. In this work, we
+consider both lines of research by comparing prompt engineering techniques and
+combinations thereof within the application of high-level task planning and
+execution in service robotics. We define a diverse set of tasks and a simple
+set of functionalities in simulation, and measure task completion accuracy and
+execution time for several state-of-the-art models.
+
+
+ comment: 6 pages, 3 figures, 2 tables, to be published in the 2024 IEEE-RAS + International Conference on Humanoid Robots, We make our code, including all + prompts, available at https://github.com/AIS-Bonn/Prompt_Engineering +
+
+
+
+
+ + ☆ PDSR: Efficient UAV Deployment for Swift and Accurate Post-Disaster + Search and Rescue + + +
+ This paper introduces a comprehensive framework for Post-Disaster Search and
+Rescue (PDSR), aiming to optimize search and rescue operations leveraging
+Unmanned Aerial Vehicles (UAVs). The primary goal is to improve the precision
+and availability of sensing capabilities, particularly in various catastrophic
+scenarios. Central to this concept is the rapid deployment of UAV swarms
+equipped with diverse sensing, communication, and intelligence capabilities,
+functioning as an integrated system that incorporates multiple technologies and
+approaches for efficient detection of individuals buried beneath rubble or
+debris following a disaster. Within this framework, we propose an architectural
+solution and address the associated challenges to ensure optimal performance in
+real-world disaster scenarios. The proposed framework aims to achieve complete
+coverage of damaged areas significantly faster than traditional methods using a
+multi-tier swarm architecture. Furthermore, integrating multi-modal sensing
+data with machine learning for data fusion could enhance detection accuracy,
+ensuring precise identification of survivors.
+
+
+ comment: This paper is currently under review at IEEE IoT Magazine +
+
+
+
+
+ + ☆ Efficient End-to-End 6-Dof Grasp Detection Framework for Edge Devices + with Hierarchical Heatmaps and Feature Propagation + + +
+ 6-DoF grasp detection is critically important for the advancement of +intelligent embodied systems, as it provides feasible robot poses for object +grasping. Various methods have been proposed to detect 6-DoF grasps through the +extraction of 3D geometric features from RGBD or point cloud data. However, +most of these approaches encounter challenges during real robot deployment due +to their significant computational demands, which can be particularly +problematic for mobile robot platforms, especially those reliant on edge +computing devices. This paper presents an Efficient End-to-End Grasp Detection +Network (E3GNet) for 6-DoF grasp detection utilizing hierarchical heatmap +representations. E3GNet effectively identifies high-quality and diverse grasps +in cluttered real-world environments. Benefiting from our end-to-end +methodology and efficient network design, our approach surpasses previous +methods in model inference efficiency and achieves real-time 6-Dof grasp +detection on edge devices. Furthermore, real-world experiments validate the +effectiveness of our method, achieving a satisfactory 94% object grasping +success rate. + +
+
+
+
+
+ + ☆ GPTR: Gaussian Process Trajectory Representation for Continuous-Time + Motion Estimation + + +
+ Continuous-time trajectory representation has gained significant popularity +in recent years, as it offers an elegant formulation that allows the fusion of +a larger number of sensors and sensing modalities, overcoming limitations of +traditional discrete-time frameworks. To bolster the adoption of the +continuous-time paradigm, we propose a so-called Gaussian Process Trajectory +Representation (GPTR) framework for continuous-time motion estimation (CTME) +tasks. Our approach stands out by employing a third-order random jerk model, +featuring closed-form expressions for both rotational and translational state +derivatives. This model provides smooth, continuous trajectory representations +that are crucial for precise estimation of complex motion. To support the wider +robotics and computer vision communities, we have made the source code for GPTR +available as a light-weight header-only library. This format was chosen for its +ease of integration, allowing developers to incorporate GPTR into existing +systems without needing extensive code modifications. Moreover, we also provide +a set of optimization examples with LiDAR, camera, IMU, UWB factors, and +closed-form analytical Jacobians under the proposed GP framework. Our +experiments demonstrate the efficacy and efficiency of GP-based trajectory +representation in various motion estimation tasks, and the examples can serve +as the prototype to help researchers quickly develop future applications such +as batch optimization, calibration, sensor fusion, trajectory planning, etc., +with continuous-time trajectory representation. Our project is accessible at +https://github.com/brytsknguyen/gptr . + +
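+ One per-axis ingredient of such a third-order (white-noise-on-jerk) motion
+prior is the constant-jerk state transition sketched below in Python. This
+covers only the translational, single-axis case and is not GPTR's full
+formulation, which also handles rotations with closed-form derivatives.
+
+  import numpy as np
+
+  def wnoj_transition(dt):
+      """State-transition matrix of a third-order motion model for one axis
+      with state [position, velocity, acceleration]; jerk enters as process
+      noise in the corresponding GP prior."""
+      return np.array([[1.0, dt, 0.5 * dt ** 2],
+                       [0.0, 1.0, dt],
+                       [0.0, 0.0, 1.0]])
+
+  # Propagate a 1-D state 0.1 s forward: 0 m, 1 m/s, 0.5 m/s^2.
+  x = np.array([0.0, 1.0, 0.5])
+  print(wnoj_transition(0.1) @ x)
+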
+
+ comment: The source code has been released. All feedbacks are welcome +
+
+
+
+
+ + ☆ An Efficient Representation of Whole-body Model Predictive Control for + Online Compliant Dual-arm Mobile Manipulation + + +
+ Dual-arm mobile manipulators can transport and manipulate large-size objects +with simple end-effectors. To interact with dynamic environments with strict +safety and compliance requirements, achieving whole-body motion planning online +while meeting various hard constraints for such highly redundant mobile +manipulators poses a significant challenge. We tackle this challenge by +presenting an efficient representation of whole-body motion trajectories within +our bilevel model-based predictive control (MPC) framework. We utilize +B\'ezier-curve parameterization to represent the optimized collision-free +trajectories of two collaborating end-effectors in the first MPC, facilitating +fast long-horizon object-oriented motion planning in SE(3) while considering +approximated feasibility constraints. This approach is further applied to +parameterize whole-body trajectories in the second MPC for whole-body motion +generation with predictive admittance control in a relatively short horizon +while satisfying whole-body hard constraints. This representation enables two +MPCs with continuous properties, thereby avoiding inaccurate model-state +transition and dense decision-variable settings in existing MPCs using the +discretization method. It strengthens the online execution of the bilevel MPC +framework in high-dimensional space and facilitates the generation of +consistent commands for our hybrid position/velocity-controlled robot. The +simulation comparisons and real-world experiments demonstrate the efficiency +and robustness of this approach in various scenarios for static and dynamic +obstacle avoidance, and compliant interaction control with the manipulated +object and external disturbances. + +
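+ The Bezier-curve parameterization mentioned above can be illustrated with a
+short Python sketch that evaluates a curve from its control points, which act
+as the MPC's decision variables. The degree, dimensions, and control-point
+values are assumptions for illustration only.
+
+  import numpy as np
+  from math import comb
+
+  def bezier(control_points, t):
+      """Evaluate an n-th order Bezier curve at t in [0, 1].
+      control_points: (n+1, d) array; returns a point in R^d."""
+      P = np.asarray(control_points, dtype=float)
+      n = len(P) - 1
+      coeffs = np.array([comb(n, i) * (1 - t) ** (n - i) * t ** i
+                         for i in range(n + 1)])
+      return coeffs @ P
+
+  # A degree-4 position trajectory in 3D; rotations would be handled
+  # separately in an SE(3) formulation.
+  ctrl = np.array([[0.0, 0.0, 0.0], [0.2, 0.1, 0.0], [0.5, 0.4, 0.1],
+                   [0.8, 0.5, 0.1], [1.0, 0.5, 0.2]])
+  print(bezier(ctrl, 0.5))
+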
+
+ comment: Under Review for IEEE Transactions on Robotics +
+
+
+
+
+ + ☆ Human-inspired Grasping Strategies of Fresh Fruits and Vegetables + Applied to Robotic Manipulation + + +
+ Robotic manipulation of fresh fruits and vegetables, including the grasping
+of multiple loose items, addresses a strong industrial need but remains a
+challenging task for robotic manipulation. This paper outlines the distinctive
+manipulation strategies used by humans to pick loose fruits and vegetables,
+with the aim of better adapting them for robotic manipulation of diverse items.
+In this work we present a first version of a robotic setup designed to pick
+different single or multiple fresh items, featuring a multi-fingered compliant
+robotic gripper. We analyse human grasping strategies from the perspective of
+industrial Key Performance Indicators (KPIs) used in the logistics sector. The
+robotic system was validated using the same KPIs, as well as taking into
+account human performance and strategies. This paper lays the foundation for
+future development of the robotic demonstrator for fresh fruit and vegetable
+intelligent manipulation, and outlines the need for generic approaches to
+handle the complexity of the task.
+
+
+ comment: *Authors contributed equally +
+
+
+
+
+ + ☆ Non-contact Dexterous Micromanipulation with Multiple Optoelectronic + Robots + + +
+ Micromanipulation systems leverage automation and robotic technologies to +improve the precision, repeatability, and efficiency of various tasks at the +microscale. However, current approaches are typically limited to specific +objects or tasks, which necessitates the use of custom tools and specialized +grasping methods. This paper proposes a novel non-contact micromanipulation +method based on optoelectronic technologies. The proposed method utilizes +repulsive dielectrophoretic forces generated in the optoelectronic field to +drive a microrobot, enabling the microrobot to push the target object in a +cluttered environment without physical contact. The non-contact feature can +minimize the risks of potential damage, contamination, or adhesion while +largely improving the flexibility of manipulation. The feature enables the use +of a general tool for indirect object manipulation, eliminating the need for +specialized tools. A series of simulation studies and real-world experiments -- +including non-contact trajectory tracking, obstacle avoidance, and reciprocal +avoidance between multiple microrobots -- are conducted to validate the +performance of the proposed method. The proposed formulation provides a general +and dexterous solution for a range of objects and tasks at the micro scale. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Grasping Force Estimation for Markerless Visuotactile Sensors + + +
+ Tactile sensors have been used for force estimation in the past; in
+particular, Vision-Based Tactile Sensors (VBTS) have recently become a new
+trend due to their high spatial resolution and low cost. In this work, we have
+designed and implemented several approaches to estimate the normal grasping
+force using different types of markerless visuotactile representations obtained
+from VBTS. Our main goal is to determine the most appropriate visuotactile
+representation, based on a performance analysis during robotic grasping tasks.
+Our proposal has been tested on the dataset generated with our DIGIT sensors
+and another one obtained using GelSight Mini sensors from another
+state-of-the-art work. We have also tested the generalization capabilities of
+our best approach, called RGBmod. The results led to two main conclusions.
+First, the RGB visuotactile representation is a better input option than the
+depth image or a combination of the two for estimating normal grasping forces.
+Second, RGBmod achieved a good performance when tested on 10 unseen everyday
+objects in real-world scenarios, achieving an average relative error of
+0.125 ± 0.153. Furthermore, we show that our proposal outperforms other works
+in the literature that use RGB and depth information for the same task.
+
+
+
+
+
+ + ☆ Enhancing Tool Manipulation of An Aerial Vehicle with A Dynamically + Displacing Center-of-Mass + + +
+ As aerial robots gain traction in industrial applications, there is growing +interest in enhancing their physical interaction capabilities. Pushing tasks +performed by aerial manipulators have been successfully demonstrated in +contact-based inspections. However, more complex industrial applications +require these systems to support higher-DoF (Degree of Freedom) manipulators +and generate larger forces while pushing (e.g., drilling, grinding). This paper +builds on our previous work, where we introduced an aerial vehicle with a +dynamically displacing CoM (Center of Mass) to improve force exertion during +interactions. We propose a novel approach to further enhance this system's +force generation by optimizing its CoM location during interactions. +Additionally, we study the case of this aerial vehicle equipped with a 2-DoF +manipulation arm to extend the system's functionality in tool-based tasks. The +effectiveness of the proposed methods is validated through simulations, +demonstrating the potential of this system for advanced aerial manipulation in +practical settings. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.01110 +
+
+
+
+
+ + ☆ SoftCTRL: Soft conservative KL-control of Transformer Reinforcement + Learning for Autonomous Driving + + +
+ In recent years, motion planning for urban self-driving cars (SDV) has become
+a popular problem due to the complex interactions among road components. To
+tackle this, many methods have relied on large-scale, human-sampled data
+processed through imitation learning (IL). Although effective, IL alone cannot
+adequately handle safety and reliability concerns. Combining IL with
+reinforcement learning (RL) by adding a KL divergence between the RL and IL
+policies to the RL loss can alleviate IL's weaknesses but suffers from
+over-conservatism caused by the covariate shift of IL. To address this
+limitation, we introduce a method that combines IL with RL using an implicit
+entropy-KL control that offers a simple way to reduce this over-conservatism.
+In particular, we validate our method on challenging simulated urban scenarios
+from an unseen dataset, indicating that although IL can perform well in
+imitation tasks, our proposed method significantly improves robustness (over
+17\% reduction in failures) and generates human-like driving behavior.
+
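+ The IL+RL coupling that the abstract contrasts against (adding a KL term
+between the RL and IL policies to the RL loss) can be sketched for discrete
+actions as below. The KL direction, loss form, and names are illustrative
+assumptions, not the paper's implicit entropy-KL formulation.
+
+  import torch
+  import torch.nn.functional as F
+
+  def kl_regularized_policy_loss(logits_rl, logits_il, advantages, actions,
+                                 beta=0.1):
+      """Policy-gradient loss plus a KL penalty toward a frozen IL policy.
+
+      logits_rl, logits_il : (B, A) action logits of the RL and IL policies.
+      advantages           : (B,) advantage estimates.
+      actions              : (B,) sampled action indices.
+      """
+      logp_rl = F.log_softmax(logits_rl, dim=-1)
+      chosen = logp_rl.gather(1, actions[:, None]).squeeze(1)
+      pg_loss = -(advantages * chosen).mean()
+      # KL(pi_RL || pi_IL), keeping the learned policy close to imitation.
+      logp_il = F.log_softmax(logits_il, dim=-1).detach()
+      kl = F.kl_div(logp_il, logp_rl, log_target=True, reduction="batchmean")
+      return pg_loss + beta * kl
+
+  B, A = 32, 5
+  loss = kl_regularized_policy_loss(torch.randn(B, A), torch.randn(B, A),
+                                    torch.randn(B), torch.randint(0, A, (B,)))
+  print(loss.item())
+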
+
+ comment: submitted to IEEE Open Journal of Intelligent Transportation Systems +
+
+
+
+
+ + ☆ Robotic State Recognition with Image-to-Text Retrieval Task of + Pre-Trained Vision-Language Model and Black-Box Optimization + + +
+ State recognition of the environment and objects, such as the open/closed
+state of doors and the on/off state of lights, is indispensable for robots that
+perform daily life support and security tasks. Until now, state recognition
+methods have been based on training neural networks from manual annotations,
+preparing special sensors for the recognition, or manual programming to
+extract features from point clouds or raw images. In contrast, we propose a
+robotic state recognition method using a pre-trained vision-language model,
+which is capable of Image-to-Text Retrieval (ITR) tasks. We prepare several
+kinds of language prompts in advance, calculate the similarity between these
+prompts and the current image by ITR, and perform state recognition. By
+applying the optimal weighting to each prompt using black-box optimization,
+state recognition can be performed with higher accuracy. Experiments show that
+this approach enables a variety of state recognitions by simply preparing
+multiple prompts, without retraining neural networks or manual programming. In
+addition, since only prompts and their weights need to be prepared for each
+recognizer, there is no need to prepare multiple models, which facilitates
+resource management. Through language alone, it is possible to recognize the
+open/closed state of transparent doors, whether water is running from a faucet,
+and even the qualitative state of whether a kitchen is clean, all of which have
+been challenging so far.
+
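+ To make the weighted prompt-similarity idea concrete, a minimal Python sketch
+is given below. It assumes embeddings from a CLIP-like model are already
+available; the function names, dimensions, and weights are illustrative
+assumptions, not the paper's code.
+
+  import numpy as np
+
+  def state_score(image_emb, prompt_embs, weights):
+      """Weighted image-to-text similarity for state recognition (sketch).
+
+      image_emb   : (D,) embedding of the current camera image.
+      prompt_embs : (K, D) embeddings of K prompts such as
+                    "the door is open" / "the door is closed".
+      weights     : (K,) per-prompt weights, tuned by black-box optimization
+                    (e.g., CMA-ES) on a handful of labeled images.
+      Thresholding the returned scalar yields the binary state.
+      """
+      sims = prompt_embs @ image_emb / (
+          np.linalg.norm(prompt_embs, axis=1) * np.linalg.norm(image_emb) + 1e-9)
+      return float(weights @ sims)
+
+  rng = np.random.default_rng(0)
+  print(state_score(rng.normal(size=512), rng.normal(size=(4, 512)),
+                    np.array([1.0, -1.0, 0.5, -0.5])))
+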
+
+ comment: Accepted at Humanoids2024 +
+
+
+
+
+ + ☆ MiniTac: An Ultra-Compact 8 mm Vision-Based Tactile Sensor for Enhanced + Palpation in Robot-Assisted Minimally Invasive Surgery RA-L + + +
+ Robot-assisted minimally invasive surgery (RAMIS) provides substantial +benefits over traditional open and laparoscopic methods. However, a significant +limitation of RAMIS is the surgeon's inability to palpate tissues, a crucial +technique for examining tissue properties and detecting abnormalities, +restricting the widespread adoption of RAMIS. To overcome this obstacle, we +introduce MiniTac, a novel vision-based tactile sensor with an ultra-compact +cross-sectional diameter of 8 mm, designed for seamless integration into +mainstream RAMIS devices, particularly the Da Vinci surgical systems. MiniTac +features a novel mechanoresponsive photonic elastomer membrane that changes +color distribution under varying contact pressures. This color change is +captured by an embedded miniature camera, allowing MiniTac to detect tumors +both on the tissue surface and in deeper layers typically obscured from +endoscopic view. MiniTac's efficacy has been rigorously tested on both phantoms +and ex-vivo tissues. By leveraging advanced mechanoresponsive photonic +materials, MiniTac represents a significant advancement in integrating tactile +sensing into RAMIS, potentially expanding its applicability to a wider array of +clinical scenarios that currently rely on traditional surgical approaches. + +
+
+ comment: accepted for publication in the IEEE Robotics and Automation Letters + (RA-L) +
+
+
+
+
+ + ☆ Multi-Task Interactive Robot Fleet Learning with Visual World Models + + +
+ Recent advancements in large-scale multi-task robot learning offer the +potential for deploying robot fleets in household and industrial settings, +enabling them to perform diverse tasks across various environments. However, +AI-enabled robots often face challenges with generalization and robustness when +exposed to real-world variability and uncertainty. We introduce Sirius-Fleet, a +multi-task interactive robot fleet learning framework to address these +challenges. Sirius-Fleet monitors robot performance during deployment and +involves humans to correct the robot's actions when necessary. We employ a +visual world model to predict the outcomes of future actions and build anomaly +predictors to predict whether they will likely result in anomalies. As the +robot autonomy improves, the anomaly predictors automatically adapt their +prediction criteria, leading to fewer requests for human intervention and +gradually reducing human workload over time. Evaluations on large-scale +benchmarks demonstrate Sirius-Fleet's effectiveness in improving multi-task +policy performance and monitoring accuracy. We demonstrate Sirius-Fleet's +performance in both RoboCasa in simulation and Mutex in the real world, two +diverse, large-scale multi-task benchmarks. More information is available on +the project website: https://ut-austin-rpl.github.io/sirius-fleet + +
+
+ comment: In Proceedings of CoRL 2024 +
+
+
+
+
+ + ☆ IM-GIV: an effective integrity monitoring scheme for tightly-coupled + GNSS/INS/Vision integration based on factor graph optimization + + +
+ Global Navigation Satellite System/Inertial Navigation System
+(GNSS/INS)/Vision integration based on factor graph optimization (FGO) has
+recently attracted extensive attention in the navigation and robotics
+communities. Integrity monitoring (IM) capability is required when an FGO-based
+integrated navigation system is used for safety-critical applications. However,
+traditional research on the IM of integrated navigation systems is mostly based
+on the Kalman filter. It is therefore urgent to develop an effective IM scheme
+for FGO-based GNSS/INS/Vision integration. In this contribution, a position
+error bounding formula to ensure the integrity of the GNSS/INS/Vision
+integration based on FGO is designed and validated for the first time. It can
+be calculated from the linearized equations of the residuals of GNSS
+pseudo-range, IMU pre-integration, and visual measurements. Specific position
+error bounds are given for the cases of GNSS, INS, and visual measurement
+faults. Field experiments were conducted to evaluate and validate the
+performance of the proposed position error bounding. Experimental results
+demonstrate that the proposed position error bounding for the GNSS/INS/Vision
+integration based on FGO can correctly fit the position error against different
+fault modes, and the availability of integrity in six fault modes is 100% after
+correct and timely fault exclusion.
+
+
+
+
+
+ + ☆ $\textbf{EMOS}$: $\textbf{E}$mbodiment-aware Heterogeneous + $\textbf{M}$ulti-robot $\textbf{O}$perating $\textbf{S}$ystem with LLM Agents + + +
+ Heterogeneous multi-robot systems (HMRS) have emerged as a powerful approach +for tackling complex tasks that single robots cannot manage alone. Current +large-language-model-based multi-agent systems (LLM-based MAS) have shown +success in areas like software development and operating systems, but applying +these systems to robot control presents unique challenges. In particular, the +capabilities of each agent in a multi-robot system are inherently tied to the +physical composition of the robots, rather than predefined roles. To address +this issue, we introduce a novel multi-agent framework designed to enable +effective collaboration among heterogeneous robots with varying embodiments and +capabilities, along with a new benchmark named Habitat-MAS. One of our key +designs is $\textit{Robot Resume}$: Instead of adopting human-designed role +play, we propose a self-prompted approach, where agents comprehend robot URDF +files and call robot kinematics tools to generate descriptions of their physics +capabilities to guide their behavior in task planning and action execution. The +Habitat-MAS benchmark is designed to assess how a multi-agent framework handles +tasks that require embodiment-aware reasoning, which includes 1) manipulation, +2) perception, 3) navigation, and 4) comprehensive multi-floor object +rearrangement. The experimental results indicate that the robot's resume and +the hierarchical design of our multi-agent system are essential for the +effective operation of the heterogeneous multi-robot system within this +intricate problem context. + +
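+ A toy sketch of the "Robot Resume" idea follows: parse a URDF and summarize
+its joints and limits into a short natural-language capability description
+that an LLM agent could consume. The URDF snippet and wording are invented for
+illustration and are not from Habitat-MAS.
+
+  import xml.etree.ElementTree as ET
+
+  URDF = """<robot name="demo_arm">
+    <link name="base"/><link name="upper_arm"/><link name="gripper"/>
+    <joint name="shoulder" type="revolute"><limit lower="-1.57" upper="1.57"/></joint>
+    <joint name="finger" type="prismatic"><limit lower="0.0" upper="0.04"/></joint>
+  </robot>"""
+
+  def robot_resume(urdf_text):
+      """Turn a URDF into a short capability summary (hypothetical format)."""
+      root = ET.fromstring(urdf_text)
+      lines = [f"Robot '{root.get('name')}' with {len(root.findall('link'))} links."]
+      for j in root.findall("joint"):
+          lim = j.find("limit")
+          span = (f" range [{lim.get('lower')}, {lim.get('upper')}]"
+                  if lim is not None else "")
+          lines.append(f"- {j.get('type')} joint '{j.get('name')}'{span}")
+      return "\n".join(lines)
+
+  print(robot_resume(URDF))
+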
+
+ comment: 10 pages of main content, 3 pages of references, 5 pages of appendix, + 7 figures in total +
+
+
+
+
+ + ☆ An Overtaking Trajectory Planning Framework Based on Spatio-temporal + Topology and Reachable Set Analysis Ensuring Time Efficiency + + +
+ Generating overtaking trajectories in high-speed scenarios presents +significant challenges and is typically addressed through hierarchical planning +methods. However, this method has two primary drawbacks. First, heuristic +algorithms can only provide a single initial solution, which may lead to local +optima and consequently diminish the quality of the solution. Second, the time +efficiency of trajectory refinement based on numerical optimization is +insufficient. To overcome these limitations, this paper proposes an overtaking +trajectory planning framework based on spatio-temporal topology and reachable +set analysis (SROP), to improve trajectory quality and time efficiency. +Specifically, this paper introduces topological classes to describe +trajectories representing different overtaking behaviors, which support the +spatio-temporal topological search method employed by the upper-layer planner +to identify diverse initial paths. This approach helps prevent getting stuck in +local optima, enhancing the overall solution quality by considering multiple +initial solutions from distinct topologies. Moreover, the reachable set method +is integrated into the lower-layer planner for parallel trajectory evaluation. +This method enhances planning efficiency by decoupling vehicle model +constraints from the optimization process, enabling parallel computation while +ensuring control feasibility. Simulation results show that the proposed method +improves the smoothness of generated trajectories by 66.8% compared to +state-of-the-art methods, highlighting its effectiveness in enhancing +trajectory quality. Additionally, this method reduces computation time by +62.9%, demonstrating its efficiency. + +
+
+
+
+
+ + ☆ NUSense: Robust Soft Optical Tactile Sensor + + +
+ While most tactile sensors rely on measuring pressure, insights from +continuum mechanics suggest that measuring shear strain provides critical +information for tactile sensing. In this work, we introduce an optical tactile +sensing principle based on shear strain detection. A silicone rubber layer, +dyed with color inks, is used to quantify the shear magnitude of the sensing +layer. This principle was validated using the NUSense camera-based tactile +sensor. The wide-angle camera captures the elongation of the soft pad under +mechanical load, a phenomenon attributed to the Poisson effect. The physical +and optical properties of the inked pad are essential and should ideally remain +stable over time. We tested the robustness of the sensor by subjecting the +outermost layer to multiple load cycles using a robot arm. Additionally, we +discussed potential applications of this sensor in force sensing and contact +localization. + +
+
+ comment: Madina Yergibay and Tleukhan Mussin contributed equally. 6 pages, 6 + figures +
+
+
+
+
+ + ☆ PACER: Preference-conditioned All-terrain Costmap Generation + + +
+ In autonomous robot navigation, terrain cost assignment is typically +performed using a semantics-based paradigm in which terrain is first labeled +using a pre-trained semantic classifier and costs are then assigned according +to a user-defined mapping between label and cost. While this approach is +rapidly adaptable to changing user preferences, only preferences over the types +of terrain that are already known by the semantic classifier can be expressed. +In this paper, we hypothesize that a machine-learning-based alternative to the +semantics-based paradigm above will allow for rapid cost assignment adaptation +to preferences expressed over new terrains at deployment time without the need +for additional training. To investigate this hypothesis, we introduce and study +PACER, a novel approach to costmap generation that accepts as input a single +birds-eye view (BEV) image of the surrounding area along with a user-specified +preference context and generates a corresponding BEV costmap that aligns with +the preference context. Using both real and synthetic data along with a +combination of proposed training tasks, we find that PACER is able to adapt +quickly to new user preferences while also exhibiting better generalization to +novel terrains compared to both semantics-based and representation-learning +approaches. + +
+
+
+
+
+ + ☆ Design and Motion Analysis of a Reconfigurable Pendulum-Based Rolling + Disk Robot with Magnetic Coupling + + +
+ Reconfigurable robots are at the forefront of robotics innovation due to +their unmatched versatility and adaptability in addressing various tasks +through collaborative operations. This paper explores the design and +implementation of a novel pendulum-based magnetic coupling system within a +reconfigurable disk robot. Diverging from traditional designs, this system +emphasizes enhancing coupling strength while maintaining the compactness of the +outer shell. We employ parametric optimization techniques, including magnetic +array simulations, to improve coupling performance. Additionally, we conduct a +comprehensive analysis of the rolling robot's motion to assess its operational +effectiveness in the coupling mechanism. This examination reveals intriguing +new motion patterns driven by frictional and sliding effects between the +rolling disk modules and the ground. Furthermore, the new setup introduces a +novel problem in the area of nonprehensile manipulation. + +
+
+ comment: Accepted to TAROS 2024 +
+
+
+
+
+ + ☆ Return Augmented Decision Transformer for Off-Dynamics Reinforcement + Learning + + +
+ We study offline off-dynamics reinforcement learning (RL) to utilize data
+from an easily accessible source domain to enhance policy learning in a target
+domain with limited data. Our approach centers on return-conditioned supervised
+learning (RCSL), particularly focusing on the decision transformer (DT), which
+can predict actions conditioned on desired return guidance and complete
+trajectory history. Previous works tackle the dynamics shift problem by
+augmenting the reward in the trajectory from the source domain to match the
+optimal trajectory in the target domain. However, this strategy cannot be
+directly applied to RCSL owing to (1) the unique form of the RCSL policy class,
+which explicitly depends on the return, and (2) the absence of a
+straightforward representation of the optimal trajectory distribution. We
+propose the Return Augmented Decision Transformer (RADT) method, where we
+augment the return in the source domain by aligning its distribution with that
+in the target domain. We provide a theoretical analysis demonstrating that the
+RCSL policy learned from RADT achieves the same level of suboptimality as would
+be obtained without a dynamics shift. We introduce two practical
+implementations, RADT-DARA and RADT-MV. Extensive experiments conducted on D4RL
+datasets reveal that our methods generally outperform dynamic-programming-based
+methods in off-dynamics RL scenarios.
+
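+ As a simple stand-in for the return-alignment step described above, the
+Python sketch below re-expresses source-domain returns on the target return
+scale by moment matching. This is only an illustration of the general idea;
+the RADT-DARA and RADT-MV variants use their own alignment estimators.
+
+  import numpy as np
+
+  def align_returns(source_returns, target_returns):
+      """Map source-domain returns onto the target return distribution by
+      matching mean and standard deviation (illustrative alignment only)."""
+      s = np.asarray(source_returns, dtype=float)
+      t = np.asarray(target_returns, dtype=float)
+      return (s - s.mean()) / (s.std() + 1e-8) * t.std() + t.mean()
+
+  src = np.array([10.0, 12.0, 15.0, 20.0])
+  tgt = np.array([100.0, 120.0, 90.0, 140.0])
+  print(align_returns(src, tgt))  # source returns on the target scale
+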
+
+ comment: 26 pages, 10 tables, 10 figures +
+
+
+
+
+ + ☆ Learning for Deformable Linear Object Insertion Leveraging Flexibility + Estimation from Visual Cues ICRA + + +
+ Manipulation of deformable linear objects (DLOs), including iron wire,
+rubber, silk, and nylon rope, is ubiquitous in daily life. These objects
+exhibit diverse physical properties, such as Young's modulus and bending
+stiffness. Such diversity poses challenges for developing generalized
+manipulation policies. However, previous research has limited its scope to
+single-material DLOs and engaged in time-consuming data collection for state
+estimation. In this paper, we propose a two-stage manipulation approach
+consisting of a material property (e.g., flexibility) estimation and policy
+learning for DLO insertion with reinforcement learning. Firstly, we design a
+flexibility estimation scheme that characterizes the properties of different
+types of DLOs. The ground truth flexibility data is collected in simulation to
+train our flexibility estimation module. During the manipulation, the robot
+interacts with the DLOs to estimate flexibility by analyzing their visual
+configurations. Secondly, we train a policy conditioned on the estimated
+flexibility to perform challenging DLO insertion tasks. Our pipeline trained
+with diverse insertion scenarios achieves an 85.6% success rate in simulation
+and 66.67% in real robot experiments. Please refer to our project page:
+https://lmeee.github.io/DLOInsert/
+
+
+ comment: 7 pages, 9 figures, 3 tables. 2024 IEEE International Conference on + Robotics and Automation (ICRA) +
+
+
+
+
+ + ☆ Estimating Neural Network Robustness via Lipschitz Constant and + Architecture Sensitivity + + +
+ Ensuring neural network robustness is essential for the safe and reliable +operation of robotic learning systems, especially in perception and +decision-making tasks within real-world environments. This paper investigates +the robustness of neural networks in perception systems, specifically examining +their sensitivity to targeted, small-scale perturbations. We identify the +Lipschitz constant as a key metric for quantifying and enhancing network +robustness. We derive an analytical expression to compute the Lipschitz +constant based on neural network architecture, providing a theoretical basis +for estimating and improving robustness. Several experiments reveal the +relationship between network design, the Lipschitz constant, and robustness, +offering practical insights for developing safer, more robust robot learning +systems. + +
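+ A classic architecture-based estimate of this kind, shown here as a hedged
+stand-in for the paper's expression, upper-bounds the Lipschitz constant of an
+MLP with 1-Lipschitz activations (ReLU, tanh) by the product of layer spectral
+norms; the layer sizes below are arbitrary.
+
+  import numpy as np
+
+  def lipschitz_upper_bound(weight_matrices):
+      """Product of layer spectral norms: an upper bound on the Lipschitz
+      constant of an MLP whose activations are themselves 1-Lipschitz."""
+      bound = 1.0
+      for W in weight_matrices:
+          bound *= np.linalg.norm(W, 2)  # largest singular value
+      return bound
+
+  rng = np.random.default_rng(0)
+  layers = [rng.normal(size=(64, 32)),   # input 32 -> hidden 64
+            rng.normal(size=(64, 64)),   # hidden 64 -> hidden 64
+            rng.normal(size=(1, 64))]    # hidden 64 -> scalar output
+  print(lipschitz_upper_bound(layers))
+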
+
+ comment: SAFE-ROL at CoRL 2024 +
+
+
+
+
+ + ☆ A Cost-Effective Thermal Imaging Safety Sensor for Industry 5.0 and + Collaborative Robotics + + +
+ The Industry 5.0 paradigm focuses on industrial operator well-being and +sustainable manufacturing practices, where humans play a central role, not only +during the repetitive and collaborative tasks of the manufacturing process, but +also in the management of the factory floor assets. Human factors, such as +ergonomics, safety, and well-being, push the human-centric smart factory to +efficiently adopt novel technologies while minimizing environmental and social +impact. As operations at the factory floor increasingly rely on collaborative +robots (CoBots) and flexible manufacturing systems, there is a growing demand +for redundant safety mechanisms (i.e., automatic human detection in the +proximity of machinery that is under operation). Fostering enhanced process +safety for human proximity detection allows for the protection against possible +incidents or accidents with the deployed industrial devices and machinery. This +paper introduces the design and implementation of a cost-effective thermal +imaging Safety Sensor that can be used in the scope of Industry 5.0 to trigger +distinct safe mode states in manufacturing processes that rely on collaborative +robotics. The proposed Safety Sensor uses a hybrid detection approach and has +been evaluated under controlled environmental conditions. The obtained results +show a 97% accuracy at low computational cost when using the developed hybrid +method to detect the presence of humans in thermal images. + +
+
+ comment: Paper accepted in Edge-IoT 2022 +
+
+
+
+
+ + ♻ ☆ Is Your LiDAR Placement Optimized for 3D Scene Understanding? NeurIPS 2024 + + +
+ The reliability of driving perception systems under unprecedented conditions +is crucial for practical usage. Latest advancements have prompted increasing +interest in multi-LiDAR perception. However, prevailing driving datasets +predominantly utilize single-LiDAR systems and collect data devoid of adverse +conditions, failing to capture the complexities of real-world environments +accurately. Addressing these gaps, we proposed Place3D, a full-cycle pipeline +that encompasses LiDAR placement optimization, data generation, and downstream +evaluations. Our framework makes three appealing contributions. 1) To identify +the most effective configurations for multi-LiDAR systems, we introduce the +Surrogate Metric of the Semantic Occupancy Grids (M-SOG) to evaluate LiDAR +placement quality. 2) Leveraging the M-SOG metric, we propose a novel +optimization strategy to refine multi-LiDAR placements. 3) Centered around the +theme of multi-condition multi-LiDAR perception, we collect a 280,000-frame +dataset from both clean and adverse conditions. Extensive experiments +demonstrate that LiDAR placements optimized using our approach outperform +various baselines. We showcase exceptional results in both LiDAR semantic +segmentation and 3D object detection tasks, under diverse weather and sensor +failure conditions. + +
+
+ comment: NeurIPS 2024 (Spotlight); 36 pages, 16 figures, 14 tables; Code at + https://github.com/ywyeli/Place3D +
+
+
+
+
+ + ♻ ☆ Cognitive Load-based Affective Workload Allocation for Multi-human + Multi-robot Teams + + +
+ The interaction and collaboration between humans and multiple robots +represent a novel field of research known as human multi-robot systems. +Adequately designed systems within this field allow teams composed of both +humans and robots to work together effectively on tasks such as monitoring, +exploration, and search and rescue operations. This paper presents a deep +reinforcement learning-based affective workload allocation controller +specifically for multi-human multi-robot teams. The proposed controller can +dynamically reallocate workloads based on the performance of the operators +during collaborative missions with multi-robot systems. The operators' +performances are evaluated through the scores of a self-reported questionnaire +(i.e., subjective measurement) and the results of a deep learning-based +cognitive workload prediction algorithm that uses physiological and behavioral +data (i.e., objective measurement). To evaluate the effectiveness of the +proposed controller, we use a multi-human multi-robot CCTV monitoring task as +an example and carry out comprehensive real-world experiments with 32 human +subjects for both quantitative measurement and qualitative analysis. Our +results demonstrate the performance and effectiveness of the proposed +controller and highlight the importance of incorporating both subjective and +objective measurements of the operators' cognitive workload as well as seeking +consent for workload transitions, to enhance the performance of multi-human +multi-robot teams. + +
+
+ comment: This paper is submitted and accepted to IEEE Transactions on + Human-Machine Systems +
+
+
+
+
+ + ♻ FusionPortableV2: A Unified Multi-Sensor Dataset for Generalized SLAM + Across Diverse Platforms and Scalable Environments IJRR + + +
+ Simultaneous Localization and Mapping (SLAM) technology has been widely
+applied in various robotic scenarios, from rescue operations to autonomous
+driving. However, the generalization of SLAM algorithms remains a significant
+challenge, as current datasets often lack scalability in terms of platforms and
+environments. To address this limitation, we present FusionPortableV2, a
+multi-sensor SLAM dataset featuring sensor diversity, varied motion patterns,
+and a wide range of environmental scenarios. Our dataset comprises $27$
+sequences, spanning over $2.5$ hours and collected from four distinct
+platforms: a handheld suite, a legged robot, an unmanned ground vehicle (UGV),
+and a vehicle. These sequences cover diverse settings, including buildings,
+campuses, and urban areas, with a total length of $38.7km$. Additionally, the
+dataset includes ground-truth (GT) trajectories and RGB point cloud maps
+covering approximately $0.3km^2$. To validate the utility of our dataset in
+advancing SLAM research, we assess several state-of-the-art (SOTA) SLAM
+algorithms. Furthermore, we demonstrate the dataset's broad application beyond
+traditional SLAM tasks by investigating its potential for monocular depth
+estimation. The complete dataset, including sensor data, GT, and calibration
+details, is accessible at
+https://fusionportable.github.io/dataset/fusionportable_v2.
+
+
+ comment: 21 pages, 17 figures, 7 tables. Accepted by International Journal of + Robotics Research (IJRR) +
+
+
+
+
+ + ♻ ☆ LLM2Swarm: Robot Swarms that Responsively Reason, Plan, and Collaborate + through LLMs NeurIPS 2024 + + +
+ Robot swarms are composed of many simple robots that communicate and +collaborate to fulfill complex tasks. Robot controllers usually need to be +specified by experts on a case-by-case basis via programming code. This process +is time-consuming, prone to errors, and unable to take into account all +situations that may be encountered during deployment. On the other hand, recent +Large Language Models (LLMs) have demonstrated reasoning and planning +capabilities, introduced new ways to interact with and program machines, and +incorporate both domain-specific and commonsense knowledge. Hence, we propose +to address the aforementioned challenges by integrating LLMs with robot swarms +and show the potential in proofs of concept (showcases). For this integration, +we explore two approaches. The first approach is 'indirect integration,' where +LLMs are used to synthesize and validate the robot controllers. This approach +may reduce development time and human error before deployment. Moreover, during +deployment, it could be used for on-the-fly creation of new robot behaviors. +The second approach is 'direct integration,' where each robot locally executes +a separate LLM instance during deployment for robot-robot collaboration and +human-swarm interaction. These local LLM instances enable each robot to reason, +plan, and collaborate using natural language, as demonstrated in our showcases +where the robots are able to detect a variety of anomalies, without prior +information about the nature of these anomalies. To enable further research on +our mainly conceptual contribution, we release the software and videos for our +LLM2Swarm system: https://github.com/Pold87/LLM2Swarm. + +
+
+ comment: Accepted at NeurIPS 2024 Workshop on Open-World Agents. Code: + https://github.com/Pold87/LLM2Swarm/ +
+
+
+
+
+ + ♻ ☆ Integrating One-Shot View Planning with a Single Next-Best View via + Long-Tail Multiview Sampling + + +
+ Existing view planning systems either adopt an iterative paradigm using +next-best views (NBV) or a one-shot pipeline relying on the set-covering +view-planning (SCVP) network. However, neither of these methods can +concurrently guarantee both high-quality and high-efficiency reconstruction of +3D unknown objects. To tackle this challenge, we introduce a crucial +hypothesis: with the availability of more information about the unknown object, +the prediction quality of the SCVP network improves. There are two ways to +provide extra information: (1) leveraging perception data obtained from NBVs, +and (2) training on an expanded dataset of multiview inputs. In this work, we +introduce a novel combined pipeline that incorporates a single NBV before +activating the proposed multiview-activated (MA-)SCVP network. The MA-SCVP is +trained on a multiview dataset generated by our long-tail sampling method, +which addresses the issue of unbalanced multiview inputs and enhances the +network performance. Extensive simulated experiments substantiate that our +system demonstrates a significant surface coverage increase and a substantial +45% reduction in movement cost compared to state-of-the-art systems. Real-world +experiments justify the capability of our system for generalization and +deployment. + +
+
+ comment: Accepted to IEEE Transactions on Robotics. Full appendices version +
+
+
+
+
+ + ♻ ☆ Long-Term Human Trajectory Prediction using 3D Dynamic Scene Graphs RA-L + + +
+ We present a novel approach for long-term human trajectory prediction in +indoor human-centric environments, which is essential for long-horizon robot +planning in these environments. State-of-the-art human trajectory prediction +methods are limited by their focus on collision avoidance and short-term +planning, and their inability to model complex interactions of humans with the +environment. In contrast, our approach overcomes these limitations by +predicting sequences of human interactions with the environment and using this +information to guide trajectory predictions over a horizon of up to 60s. We +leverage Large Language Models (LLMs) to predict interactions with the +environment by conditioning the LLM prediction on rich contextual information +about the scene. This information is given as a 3D Dynamic Scene Graph that +encodes the geometry, semantics, and traversability of the environment into a +hierarchical representation. We then ground these interaction sequences into +multi-modal spatio-temporal distributions over human positions using a +probabilistic approach based on continuous-time Markov Chains. To evaluate our +approach, we introduce a new semi-synthetic dataset of long-term human +trajectories in complex indoor environments, which also includes annotations of +human-object interactions. We show in thorough experimental evaluations that +our approach achieves a 54% lower average negative log-likelihood and a 26.5% +lower Best-of-20 displacement error compared to the best non-privileged (i.e., +evaluated in a zero-shot fashion on the dataset) baselines for a time horizon +of 60s. + +
+
+ comment: 8 pages, 6 figures. Accepted at IEEE Robotics and Automation Letters + (RA-L). Code released at: https://github.com/MIT-SPARK/LP2 +
+
+
+
+
+ + ♻ ☆ ES-Gaussian: Gaussian Splatting Mapping via Error Space-Based Gaussian + Completion + + +
+ Accurate and affordable indoor 3D reconstruction is critical for effective +robot navigation and interaction. Traditional LiDAR-based mapping provides high +precision but is costly, heavy, and power-intensive, with limited ability for +novel view rendering. Vision-based mapping, while cost-effective and capable of +capturing visual data, often struggles with high-quality 3D reconstruction due +to sparse point clouds. We propose ES-Gaussian, an end-to-end system using a +low-altitude camera and single-line LiDAR for high-quality 3D indoor +reconstruction. Our system features Visual Error Construction (VEC) to enhance +sparse point clouds by identifying and correcting areas with insufficient +geometric detail from 2D error maps. Additionally, we introduce a novel 3DGS +initialization method guided by single-line LiDAR, overcoming the limitations +of traditional multi-view setups and enabling effective reconstruction in +resource-constrained environments. Extensive experimental results on our new +Dreame-SR dataset and a publicly available dataset demonstrate that ES-Gaussian +outperforms existing methods, particularly in challenging scenarios. The +project page is available at https://chenlu-china.github.io/ES-Gaussian/. + +
+
+ comment: This preprint has been withdrawn due to concerns regarding the + originality of certain technical elements, as well as its basis in a company + project report that was intended solely for internal discussions. To avoid + any potential misunderstandings, we have decided to withdraw this submission + from public access. We apologize for any confusion this may have caused +
+
+
+
+
+ + ♻ ☆ MPPI-IPDDP: Hybrid Method of Collision-Free Smooth Trajectory Generation + for Autonomous Robots + + +
+ This paper presents a hybrid trajectory optimization method designed to +generate collision-free, smooth trajectories for autonomous mobile robots. By +combining sampling-based Model Predictive Path Integral (MPPI) control with +gradient-based Interior-Point Differential Dynamic Programming (IPDDP), we +leverage their respective strengths in exploration and smoothing. The proposed +method, MPPI-IPDDP, involves three steps: First, MPPI control is used to +generate a coarse trajectory. Second, a collision-free convex corridor is +constructed. Third, IPDDP is applied to smooth the coarse trajectory, utilizing +the collision-free corridor from the second step. To demonstrate the +effectiveness of our approach, we apply the proposed algorithm to trajectory +optimization for differential-drive wheeled mobile robots and point-mass +quadrotors. In comparisons with other MPPI variants and continuous +optimization-based solvers, our method shows superior performance in terms of +computational robustness and trajectory smoothness. + Code: https://github.com/i-ASL/mppi-ipddp Video: https://youtu.be/-oUAt5sd9Bk + +
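As a rough illustration of the first stage only, the sketch below implements a generic MPPI update on a unicycle model with a soft obstacle cost; the dynamics, cost terms, and hyperparameters are assumptions, and the convex-corridor construction and IPDDP smoothing stages are omitted.

```python
# Generic MPPI step: sample control perturbations, roll out, weight by cost.
import numpy as np

def rollout(x0, U, dt=0.1):
    """Unicycle rollout: state x = (px, py, theta), control u = (v, omega)."""
    X, x = [np.array(x0, float)], np.array(x0, float)
    for v, w in U:
        x = x + dt * np.array([v * np.cos(x[2]), v * np.sin(x[2]), w])
        X.append(x.copy())
    return np.array(X)

def cost(X, goal, obstacles, r=0.5):
    c = np.sum(np.linalg.norm(X[:, :2] - goal, axis=1))
    for o in obstacles:                                   # soft collision penalty
        d = np.linalg.norm(X[:, :2] - np.asarray(o), axis=1)
        c += 1e3 * np.sum(np.maximum(0.0, r - d))
    return c

def mppi_step(x0, U_nom, goal, obstacles, K=256, sigma=0.4, lam=1.0, rng=None):
    rng = rng or np.random.default_rng()
    noise = rng.normal(0.0, sigma, size=(K,) + U_nom.shape)
    costs = np.array([cost(rollout(x0, U_nom + noise[k]), goal, obstacles) for k in range(K)])
    w = np.exp(-(costs - costs.min()) / lam)              # importance weights
    w /= w.sum()
    return U_nom + np.tensordot(w, noise, axes=1)         # weighted perturbation update

U = np.zeros((30, 2))                                     # 3 s horizon of (v, omega) controls
for _ in range(20):
    U = mppi_step([0.0, 0.0, 0.0], U, goal=np.array([3.0, 2.0]), obstacles=[(1.5, 1.0)])
coarse = rollout([0.0, 0.0, 0.0], U)                      # handed to corridor + IPDDP smoothing
```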
+
+
+
+
+ + ♻ ☆ Robust Control Barrier Functions using Uncertainty Estimation with + Application to Mobile Robots + + +
+ This paper proposes a safety-critical control design approach for nonlinear +control affine systems in the presence of matched and unmatched uncertainties. +Our constructive framework couples control barrier function (CBF) theory with a +new uncertainty estimator to ensure robust safety. The estimated uncertainty +with a derived upper bound on the estimation error is used for synthesizing +CBFs and safety-critical controllers via a quadratic program-based feedback +control law that rigorously ensures robust safety while improving disturbance +rejection performance. The method is extended to higher-order CBFs (HOCBFs) to +achieve safety under unmatched uncertainty, which may cause relative degree +differences with respect to control input and disturbances. We assume the +relative degree difference is at most one, resulting in a second-order cone +constraint. The proposed robust HOCBF method is demonstrated via a simulation +of an uncertain elastic actuator control problem. Finally, we experimentally +demonstrated the efficacy of our robust CBF framework on a tracked robot with +slope-induced matched and unmatched perturbations. + +
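For readers unfamiliar with CBF safety filters, here is a minimal sketch of the generic building block the paper extends: a single-constraint CBF quadratic program with a robustness margin standing in for the estimated uncertainty bound. The Lie-derivative terms are assumed given, and neither the paper's estimator nor its HOCBF/second-order-cone formulation is reproduced.

```python
# Generic CBF-QP safety filter (closed form for a single affine constraint).
import numpy as np

def cbf_filter(u_nom, Lf_h, Lg_h, h, alpha=1.0, robust_margin=0.0):
    """Solve min ||u - u_nom||^2 s.t. Lf_h + Lg_h @ u + alpha*h >= robust_margin,
    where robust_margin upper-bounds the effect of the estimated uncertainty."""
    u_nom = np.asarray(u_nom, dtype=float)
    a = np.asarray(Lg_h, dtype=float)
    b = robust_margin - alpha * h - Lf_h          # constraint: a @ u >= b
    slack = b - a @ u_nom
    if slack <= 0.0:                              # nominal control already safe
        return u_nom
    return u_nom + slack / (a @ a) * a            # minimal-norm correction

# Example: keep h(x) >= 0 (e.g. squared clearance to an obstacle) for a single integrator.
u_safe = cbf_filter(u_nom=np.array([1.0, 0.0]), Lf_h=0.0,
                    Lg_h=np.array([0.4, -0.8]), h=0.05, alpha=2.0, robust_margin=0.1)
```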
+
+
+
+
+ + ♻ ☆ Robots Pre-train Robots: Manipulation-Centric Robotic Representation + from Large-Scale Robot Datasets + + +
+ The pre-training of visual representations has enhanced the efficiency of +robot learning. Due to the lack of large-scale in-domain robotic datasets, +prior works utilize in-the-wild human videos to pre-train robotic visual +representation. Despite their promising results, representations from human +videos are inevitably subject to distribution shifts and lack the dynamics +information crucial for task completion. We first evaluate various pre-trained +representations in terms of their correlation to the downstream robotic +manipulation tasks (i.e., manipulation centricity). Interestingly, we find that +the "manipulation centricity" is a strong indicator of success rates when +applied to downstream tasks. Drawing from these findings, we propose +Manipulation Centric Representation (MCR), a foundation representation learning +framework capturing both visual features and the dynamics information such as +actions and proprioceptions of manipulation tasks to improve manipulation +centricity. Specifically, we pre-train a visual encoder on the DROID robotic +dataset and leverage motion-relevant data such as robot proprioceptive states +and actions. We introduce a novel contrastive loss that aligns visual +observations with the robot's proprioceptive state-action dynamics, combined +with a behavior cloning (BC)-like actor loss to predict actions during +pre-training, along with a time contrastive loss. Empirical results across 4 +simulation domains with 20 tasks verify that MCR outperforms the strongest +baseline method by 14.8%. Moreover, MCR boosts the performance of +data-efficient learning with a UR5e arm on 3 real-world tasks by 76.9%. Project +website: https://robots-pretrain-robots.github.io/. + +
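The contrastive alignment term can be pictured with a short sketch: a symmetric InfoNCE loss between visual embeddings and embeddings of the matching proprioceptive state-action chunk, alongside a behavior-cloning-style actor term. The temperature, pairing scheme, and loss weighting are assumptions, not the released MCR code.

```python
# Sketch of an InfoNCE-style visual/dynamics alignment loss plus a BC-like actor term.
import torch
import torch.nn.functional as F

def dynamics_alignment_loss(z_img, z_dyn, tau=0.1):
    """z_img: (B, D) visual features; z_dyn: (B, D) embeddings of the matching
    state-action chunk. Matching batch indices are the positives."""
    z_img = F.normalize(z_img, dim=-1)
    z_dyn = F.normalize(z_dyn, dim=-1)
    logits = z_img @ z_dyn.t() / tau                      # (B, B) similarity matrix
    targets = torch.arange(z_img.size(0), device=z_img.device)
    return 0.5 * (F.cross_entropy(logits, targets) +      # image -> dynamics
                  F.cross_entropy(logits.t(), targets))   # dynamics -> image

def bc_like_actor_loss(pred_actions, expert_actions):
    return F.mse_loss(pred_actions, expert_actions)

# total = dynamics_alignment_loss(z_img, z_dyn) + bc_like_actor_loss(policy(z_img), actions)
#         + a time-contrastive term (omitted here)
```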
+
+
+
+
+ + ♻ ☆ CooHOI: Learning Cooperative Human-Object Interaction with Manipulated + Object Dynamics NeurIPS + 2024 + + +
+ Enabling humanoid robots to clean rooms has long been a pursued dream within +humanoid research communities. However, many tasks require multi-humanoid +collaboration, such as carrying large and heavy furniture together. Given the +scarcity of motion capture data on multi-humanoid collaboration and the +efficiency challenges associated with multi-agent learning, these tasks cannot +be straightforwardly addressed using training paradigms designed for +single-agent scenarios. In this paper, we introduce Cooperative Human-Object +Interaction (CooHOI), a framework designed to tackle the challenge of +multi-humanoid object transportation through a two-phase learning +paradigm: individual skill learning and subsequent policy transfer. First, a +single humanoid character learns to interact with objects through imitation +learning from human motion priors. Then, the humanoid learns to collaborate +with others by considering the shared dynamics of the manipulated object using +centralized training and decentralized execution (CTDE) multi-agent RL +algorithms. When one agent interacts with the object, resulting in specific +object dynamics changes, the other agents learn to respond appropriately, +thereby achieving implicit communication and coordination between teammates. +Unlike previous approaches that relied on tracking-based methods for +multi-humanoid HOI, CooHOI is inherently efficient, does not depend on motion +capture data of multi-humanoid interactions, and can be seamlessly extended to +include more participants and a wide range of object types. + +
+
+ comment: Project website: https://gao-jiawei.com/Research/CooHOI/. NeurIPS + 2024 Spotlight +
+
+
+
+
+ + ♻ ☆ Survey on Large Language Model-Enhanced Reinforcement Learning: Concept, + Taxonomy, and Methods + + +
+ With extensive pre-trained knowledge and high-level general capabilities, +large language models (LLMs) emerge as a promising avenue to augment +reinforcement learning (RL) in aspects such as multi-task learning, sample +efficiency, and high-level task planning. In this survey, we provide a +comprehensive review of the existing literature in LLM-enhanced RL and +summarize its characteristics compared to conventional RL methods, aiming to +clarify the research scope and directions for future studies. Utilizing the +classical agent-environment interaction paradigm, we propose a structured +taxonomy to systematically categorize LLMs' functionalities in RL, including +four roles: information processor, reward designer, decision-maker, and +generator. For each role, we summarize the methodologies, analyze the specific +RL challenges that are mitigated, and provide insights into future directions. +Lastly, a comparative analysis of each role, potential applications, +prospective opportunities, and challenges of LLM-enhanced RL are discussed. +By proposing this taxonomy, we aim to provide a framework for researchers to +effectively leverage LLMs in the RL field, potentially accelerating RL +applications in complex domains such as robotics, autonomous driving, and +energy systems. + +
+
+ comment: 22 pages (including bibliography), 6 figures +
+
+
+
+
+ + ♻ ☆ Enhancing Safety and Robustness of Vision-Based Controllers via + Reachability Analysis + + +
+ Autonomous systems, such as self-driving cars and drones, have made +significant strides in recent years by leveraging visual inputs and machine +learning for decision-making and control. Despite their impressive performance, +these vision-based controllers can make erroneous predictions when faced with +novel or out-of-distribution inputs. Such errors can cascade into catastrophic +system failures and compromise system safety. In this work, we compute Neural +Reachable Tubes, which act as parameterized approximations of Backward +Reachable Tubes to stress-test the vision-based controllers and mine their +failure modes. The identified failures are then used to enhance the system +safety through both offline and online methods. The online approach involves +training a classifier as a run-time failure monitor to detect closed-loop, +system-level failures, subsequently triggering a fallback controller that +robustly handles these detected failures to preserve system safety. For the +offline approach, we improve the original controller via incremental training +using a carefully augmented failure dataset, resulting in a more robust +controller that is resistant to the known failure modes. In either approach, +the system is safeguarded against shortcomings that transcend the vision-based +controller and pertain to the closed-loop safety of the overall system. We +validate the proposed approaches on an autonomous aircraft taxiing task that +involves using a vision-based controller to guide the aircraft towards the +centerline of the runway. Our results show the efficacy of the proposed +algorithms in identifying and handling system-level failures, outperforming +methods that rely on controller prediction error or uncertainty quantification +for identifying system failures. + +
+
+
+
+
+ + ♻ ☆ Subwords as Skills: Tokenization for Sparse-Reward Reinforcement + Learning NeurIPS 2024 + + +
+ Exploration in sparse-reward reinforcement learning is difficult due to the +requirement of long, coordinated sequences of actions in order to achieve any +reward. Moreover, in continuous action spaces there are an infinite number of +possible actions, which only increases the difficulty of exploration. One class +of methods designed to address these issues forms temporally extended actions, +often called skills, from interaction data collected in the same domain, and +optimizes a policy on top of this new action space. Typically such methods +require a lengthy pretraining phase, especially in continuous action spaces, in +order to form the skills before reinforcement learning can begin. Given prior +evidence that the full range of the continuous action space is not required in +such tasks, we propose a novel approach to skill-generation with two +components. First we discretize the action space through clustering, and second +we leverage a tokenization technique borrowed from natural language processing +to generate temporally extended actions. Such a method outperforms baselines +for skill-generation in several challenging sparse-reward domains, and requires +orders-of-magnitude less computation in skill-generation and online rollouts. +Our code is available at https://github.com/dyunis/subwords_as_skills. + +
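The two components map naturally onto a few lines of code. The sketch below (an illustration under assumptions, not the released implementation) discretizes continuous actions with k-means and then runs plain byte-pair-encoding merges over the token stream, so each merged symbol is a temporally extended action.

```python
# Action discretization via k-means, then BPE-style merges to form "skills".
from collections import Counter
import numpy as np
from sklearn.cluster import KMeans

def discretize_actions(actions, k=16, seed=0):
    """actions: (N, action_dim) continuous actions -> (N,) integer tokens."""
    km = KMeans(n_clusters=k, n_init=10, random_state=seed).fit(actions)
    return km.labels_, km.cluster_centers_

def bpe_skills(token_seq, num_merges=20):
    """Repeatedly merge the most frequent adjacent symbol pair; each merged symbol
    is a tuple of primitive action tokens, i.e. a temporally extended skill."""
    seq, skills = [(t,) for t in token_seq], []
    for _ in range(num_merges):
        pairs = Counter(zip(seq, seq[1:]))
        if not pairs:
            break
        (a, b), _ = pairs.most_common(1)[0]
        merged, new_seq, i = a + b, [], 0
        while i < len(seq):
            if i + 1 < len(seq) and seq[i] == a and seq[i + 1] == b:
                new_seq.append(merged); i += 2
            else:
                new_seq.append(seq[i]); i += 1
        seq = new_seq
        skills.append(merged)
    return skills

tokens, centers = discretize_actions(np.random.randn(5000, 6))
skills = bpe_skills(tokens.tolist())   # each skill decodes to a sequence of cluster centers
```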
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ GenRL: Multimodal-foundation world models for generalization in embodied + agents NeurIPS 2024 + + +
+ Learning generalist embodied agents, able to solve multitudes of tasks in +different domains is a long-standing problem. Reinforcement learning (RL) is +hard to scale up as it requires a complex reward design for each task. In +contrast, language can specify tasks in a more natural way. Current foundation +vision-language models (VLMs) generally require fine-tuning or other +adaptations to be adopted in embodied contexts, due to the significant domain +gap. However, the lack of multimodal data in such domains represents an +obstacle to developing foundation models for embodied applications. In this +work, we overcome these problems by presenting multimodal-foundation world +models, able to connect and align the representation of foundation VLMs with +the latent space of generative world models for RL, without any language +annotations. The resulting agent learning framework, GenRL, allows one to +specify tasks through vision and/or language prompts, ground them in the +embodied domain's dynamics, and learn the corresponding behaviors in +imagination. As assessed through large-scale multi-task benchmarking in +locomotion and manipulation domains, GenRL enables multi-task generalization +from language and visual prompts. Furthermore, by introducing a data-free +policy learning strategy, our approach lays the groundwork for foundational +policy learning using generative world models. Website, code and data: +https://mazpie.github.io/genrl/ + +
+
+ comment: Presented at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Next Best Sense: Guiding Vision and Touch with FisherRF for 3D Gaussian + Splatting + + +
+ We propose a framework for active next best view and touch selection for +robotic manipulators using 3D Gaussian Splatting (3DGS). 3DGS is emerging as a +useful explicit 3D scene representation for robotics, as it has the ability to +represent scenes in a both photorealistic and geometrically accurate manner. +However, in real-world, online robotic scenes where the number of views is +limited given efficiency requirements, random view selection for 3DGS becomes +impractical as views are often overlapping and redundant. We address this issue +by proposing an end-to-end online training and active view selection pipeline, +which enhances the performance of 3DGS in few-view robotics settings. We first +elevate the performance of few-shot 3DGS with a novel semantic depth alignment +method using Segment Anything Model 2 (SAM2) that we supplement with Pearson +depth and surface normal loss to improve color and depth reconstruction of +real-world scenes. We then extend FisherRF, a next-best-view selection method +for 3DGS, to select views and touch poses based on depth uncertainty. We +perform online view selection on a real robot system during live 3DGS training. +We motivate our improvements to few-shot GS scenes, and extend depth-based +FisherRF to them, where we demonstrate both qualitative and quantitative +improvements on challenging robot scenes. For more information, please see our +project page at https://arm.stanford.edu/next-best-sense. + +
+
+
+
+
+ + ♻ ☆ SCALER: Versatile Multi-Limbed Robot for Free-Climbing in Extreme + Terrains + + +
+ This paper presents SCALER, a versatile free-climbing multi-limbed robot that +is designed to achieve tightly coupled simultaneous locomotion and dexterous +grasping. Although existing quadruped-limbed robots have shown impressive +dexterous skills such as object manipulation, it is essential to balance +power-intensive locomotion and dexterous grasping capabilities. We design a +torso linkage and a parallel-serial limb to meet such conflicting skills that +pose unique challenges in the hardware designs. SCALER employs underactuated +two-fingered GOAT grippers that can mechanically adapt and offer 7 modes of +grasping, enabling SCALER to traverse extreme terrains with multi-modal +grasping strategies. We study the whole-body approach, where SCALER uses its +body and limbs to generate additional forces for stable grasping with +environments, further enhancing versatility. Furthermore, we improve the GOAT +gripper actuation speed to realize more dynamic climbing in a closed-loop +control fashion. With these proposed technologies, SCALER can traverse +vertical, overhang, upside-down, slippery terrains, and bouldering walls with +non-convex-shaped climbing holds under the Earth's gravity. + +
+
+
+
+
+ + ♻ ☆ DynaMo: In-Domain Dynamics Pretraining for Visuo-Motor Control + + +
+ Imitation learning has proven to be a powerful tool for training complex +visuomotor policies. However, current methods often require hundreds to +thousands of expert demonstrations to handle high-dimensional visual +observations. A key reason for this poor data efficiency is that visual +representations are predominantly either pretrained on out-of-domain data or +trained directly through a behavior cloning objective. In this work, we present +DynaMo, a new in-domain, self-supervised method for learning visual +representations. Given a set of expert demonstrations, we jointly learn a +latent inverse dynamics model and a forward dynamics model over a sequence of +image embeddings, predicting the next frame in latent space, without +augmentations, contrastive sampling, or access to ground truth actions. +Importantly, DynaMo does not require any out-of-domain data such as Internet +datasets or cross-embodied datasets. On a suite of six simulated and real +environments, we show that representations learned with DynaMo significantly +improve downstream imitation learning performance over prior self-supervised +learning objectives, and pretrained representations. Gains from using DynaMo +hold across policy classes such as Behavior Transformer, Diffusion Policy, MLP, +and nearest neighbors. Finally, we ablate over key components of DynaMo and +measure its impact on downstream policy performance. Robot videos are best +viewed at https://dynamo-ssl.github.io + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ ReferEverything: Towards Segmenting Everything We Can Speak of in Videos + + +
+ We present REM, a framework for segmenting a wide range of concepts in video +that can be described through natural language. Our method capitalizes on +visual-language representations learned by video diffusion models on +Internet-scale datasets. A key insight of our approach is preserving as much of +the generative model's original representation as possible, while fine-tuning +it on narrow-domain Referral Object Segmentation datasets. As a result, our +framework can accurately segment and track rare and unseen objects, despite +being trained on object masks from a limited set of categories. Additionally, +it can generalize to non-object dynamic concepts, such as waves crashing in the +ocean, as demonstrated in our newly introduced benchmark for Referral Video +Process Segmentation (Ref-VPS). Our experiments show that REM performs on par +with state-of-the-art approaches on in-domain datasets, like Ref-DAVIS, while +outperforming them by up to twelve points in terms of region similarity on +out-of-domain data, leveraging the power of Internet-scale pre-training. + +
+
+ comment: Project page at + https://miccooper9.github.io/projects/ReferEverything/ +
+
+
+
+
+ + ☆ RelationBooth: Towards Relation-Aware Customized Object Generation + + +
+ Customized image generation is crucial for delivering personalized content +based on user-provided image prompts, aligning large-scale text-to-image +diffusion models with individual needs. However, existing models often overlook +the relationships between customized objects in generated images. Instead, this +work addresses that gap by focusing on relation-aware customized image +generation, which aims to preserve the identities from image prompts while +maintaining the predicate relations described in text prompts. Specifically, we +introduce RelationBooth, a framework that disentangles identity and relation +learning through a well-curated dataset. Our training data consists of +relation-specific images, independent object images containing identity +information, and text prompts to guide relation generation. Then, we propose +two key modules to tackle the two main challenges: generating accurate and +natural relations, especially when significant pose adjustments are required, +and avoiding object confusion in cases of overlap. First, we introduce a +keypoint matching loss that effectively guides the model in adjusting object +poses closely tied to their relationships. Second, we incorporate local +features from the image prompts to better distinguish between objects, +preventing confusion in overlapping cases. Extensive results on three +benchmarks demonstrate the superiority of RelationBooth in generating precise +relations while preserving object identities across a diverse set of objects +and relations. The source code and trained models will be made available to the +public. + +
+
+
+
+
+ + ☆ OpenSatMap: A Fine-grained High-resolution Satellite Dataset for + Large-scale Map Construction NeurIPS 2024 + + +
+ In this paper, we propose OpenSatMap, a fine-grained, high-resolution +satellite dataset for large-scale map construction. Map construction is one of +the foundations of the transportation industry, such as navigation and +autonomous driving. Extracting road structures from satellite images is an +efficient way to construct large-scale maps. However, existing satellite +datasets provide only coarse semantic-level labels with a relatively low +resolution (up to level 19), impeding the advancement of this field. In +contrast, the proposed OpenSatMap (1) has fine-grained instance-level +annotations; (2) consists of high-resolution images (level 20); (3) is +currently the largest one of its kind; (4) collects data with high diversity. +Moreover, OpenSatMap covers and aligns with the popular nuScenes dataset and +Argoverse 2 dataset to potentially advance autonomous driving technologies. By +publishing and maintaining the dataset, we provide a high-quality benchmark for +satellite-based map construction and downstream tasks like autonomous driving. + +
+
+ comment: NeurIPS 2024 D&B Track. Project Page:https://opensatmap.github.io/ +
+
+
+
+
+ + ☆ SlowFast-VGen: Slow-Fast Learning for Action-Driven Long Video + Generation + + +
+ Human beings are endowed with a complementary learning system, which bridges +the slow learning of general world dynamics with fast storage of episodic +memory from a new experience. Previous video generation models, however, +primarily focus on slow learning by pre-training on vast amounts of data, +overlooking the fast learning phase crucial for episodic memory storage. This +oversight leads to inconsistencies across temporally distant frames when +generating longer videos, as these frames fall beyond the model's context +window. To this end, we introduce SlowFast-VGen, a novel dual-speed learning +system for action-driven long video generation. Our approach incorporates a +masked conditional video diffusion model for the slow learning of world +dynamics, alongside an inference-time fast learning strategy based on a +temporal LoRA module. Specifically, the fast learning process updates its +temporal LoRA parameters based on local inputs and outputs, thereby efficiently +storing episodic memory in its parameters. We further propose a slow-fast +learning loop algorithm that seamlessly integrates the inner fast learning loop +into the outer slow learning loop, enabling the recall of prior multi-episode +experiences for context-aware skill learning. To facilitate the slow learning +of an approximate world model, we collect a large-scale dataset of 200k videos +with language action annotations, covering a wide range of scenarios. Extensive +experiments show that SlowFast-VGen outperforms baselines across various +metrics for action-driven video generation, achieving an FVD score of 514 +compared to 782, and maintaining consistency in longer videos, with an average +of 0.37 scene cuts versus 0.89. The slow-fast learning loop algorithm +significantly enhances performances on long-horizon planning tasks as well. +Project Website: https://slowfast-vgen.github.io + +
+
+
+
+
+ + ☆ Multi-student Diffusion Distillation for Better One-step Generators + + +
+ Diffusion models achieve high-quality sample generation at the cost of a +lengthy multistep inference procedure. To overcome this, diffusion distillation +techniques produce student generators capable of matching or surpassing the +teacher in a single step. However, the student model's inference speed is +limited by the size of the teacher architecture, preventing real-time +generation for computationally heavy applications. In this work, we introduce +Multi-Student Distillation (MSD), a framework to distill a conditional teacher +diffusion model into multiple single-step generators. Each student generator is +responsible for a subset of the conditioning data, thereby obtaining higher +generation quality for the same capacity. MSD trains multiple distilled +students, allowing smaller sizes and, therefore, faster inference. Also, MSD +offers a lightweight quality boost over single-student distillation with the +same architecture. We demonstrate MSD is effective by training multiple +same-sized or smaller students on single-step distillation using distribution +matching and adversarial distillation techniques. With smaller students, MSD +gets competitive results with faster inference for single-step generation. +Using 4 same-sized students, MSD sets a new state-of-the-art for one-step image +generation: FID 1.20 on ImageNet-64x64 and 8.20 on zero-shot COCO2014. + +
+
+ comment: Project page: https://research.nvidia.com/labs/toronto-ai/MSD/ +
+
+
+
+
+ + ☆ TOMATO: Assessing Visual Temporal Reasoning Capabilities in Multimodal + Foundation Models + + +
+ Existing benchmarks often highlight the remarkable performance achieved by +state-of-the-art Multimodal Foundation Models (MFMs) in leveraging temporal +context for video understanding. However, how well do the models truly perform +visual temporal reasoning? Our study of existing benchmarks shows that this +capability of MFMs is likely overestimated as many questions can be solved by +using a single, few, or out-of-order frames. To systematically examine current +visual temporal reasoning tasks, we propose three principles with corresponding +metrics: (1) Multi-Frame Gain, (2) Frame Order Sensitivity, and (3) Frame +Information Disparity. Following these principles, we introduce TOMATO, +Temporal Reasoning Multimodal Evaluation, a novel benchmark crafted to +rigorously assess MFMs' temporal reasoning capabilities in video understanding. +TOMATO comprises 1,484 carefully curated, human-annotated questions spanning +six tasks (i.e., action count, direction, rotation, shape & trend, velocity & +frequency, and visual cues), applied to 1,417 videos, including 805 +self-recorded and -generated videos, that encompass human-centric, real-world, +and simulated scenarios. Our comprehensive evaluation reveals a human-model +performance gap of 57.3% with the best-performing model. Moreover, our in-depth +analysis uncovers more fundamental limitations beyond this gap in current MFMs. +While they can accurately recognize events in isolated frames, they fail to +interpret these frames as a continuous sequence. We believe TOMATO will serve +as a crucial testbed for evaluating the next-generation MFMs and as a call to +the community to develop AI systems capable of comprehending human world +dynamics through the video modality. + +
+
+
+
+
+ + ☆ EMMA: End-to-End Multimodal Model for Autonomous Driving + + +
+ We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving. +Built on a multi-modal large language model foundation, EMMA directly maps raw +camera sensor data into various driving-specific outputs, including planner +trajectories, perception objects, and road graph elements. EMMA maximizes the +utility of world knowledge from the pre-trained large language models, by +representing all non-sensor inputs (e.g. navigation instructions and ego +vehicle status) and outputs (e.g. trajectories and 3D locations) as natural +language text. This approach allows EMMA to jointly process various driving +tasks in a unified language space, and generate the outputs for each task using +task-specific prompts. Empirically, we demonstrate EMMA's effectiveness by +achieving state-of-the-art performance in motion planning on nuScenes as well +as competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also +yields competitive results for camera-primary 3D object detection on the Waymo +Open Dataset (WOD). We show that co-training EMMA with planner trajectories, +object detection, and road graph tasks yields improvements across all three +domains, highlighting EMMA's potential as a generalist model for autonomous +driving applications. However, EMMA also exhibits certain limitations: it can +process only a small amount of image frames, does not incorporate accurate 3D +sensing modalities like LiDAR or radar and is computationally expensive. We +hope that our results will inspire further research to mitigate these issues +and to further evolve the state of the art in autonomous driving model +architectures. + +
+
+ comment: Blog post: https://waymo.com/blog/2024/10/introducing-emma/ +
+
+
+
+
+ + ☆ Keypoint Abstraction using Large Models for Object-Relative Imitation + Learning + + +
+ Generalization to novel object configurations and instances across diverse +tasks and environments is a critical challenge in robotics. Keypoint-based +representations have been proven effective as a succinct representation for +capturing essential object features, and for establishing a reference frame in +action prediction, enabling data-efficient learning of robot skills. However, +their manual design nature and reliance on additional human labels limit their +scalability. In this paper, we propose KALM, a framework that leverages large +pre-trained vision-language models (LMs) to automatically generate +task-relevant and cross-instance consistent keypoints. KALM distills robust and +consistent keypoints across views and objects by generating proposals using LMs +and verifies them against a small set of robot demonstration data. Based on the +generated keypoints, we can train keypoint-conditioned policy models that +predict actions in keypoint-centric frames, enabling robots to generalize +effectively across varying object poses, camera views, and object instances +with similar functional shapes. Our method demonstrates strong performance in +the real world, adapting to different tasks and environments from only a +handful of demonstrations while requiring no additional labels. Website: +https://kalm-il.github.io/ + +
+
+ comment: CoRL LangRob Workshop, 2024 +
+
+
+
+
+ + ☆ bit2bit: 1-bit quanta video reconstruction via self-supervised photon + prediction NeurIPS 2024 + + +
+ Quanta image sensors, such as SPAD arrays, are an emerging sensor technology, +producing 1-bit arrays representing photon detection events over exposures as +short as a few nanoseconds. In practice, raw data are post-processed using +heavy spatiotemporal binning to create more useful and interpretable images at +the cost of degrading spatiotemporal resolution. In this work, we propose +bit2bit, a new method for reconstructing high-quality image stacks at the +original spatiotemporal resolution from sparse binary quanta image data. +Inspired by recent work on Poisson denoising, we developed an algorithm that +creates a dense image sequence from sparse binary photon data by predicting the +photon arrival location probability distribution. However, due to the binary +nature of the data, we show that the assumption of a Poisson distribution is +inadequate. Instead, we model the process with a Bernoulli lattice process from +the truncated Poisson. This leads to the proposal of a novel self-supervised +solution based on a masked loss function. We evaluate our method using both +simulated and real data. On simulated data from a conventional video, we +achieve 34.35 mean PSNR with extremely photon-sparse binary input (<0.06 +photons per pixel per frame). We also present a novel dataset containing a wide +range of real SPAD high-speed videos under various challenging imaging +conditions. The scenes cover strong/weak ambient light, strong motion, +ultra-fast events, etc., which will be made available to the community, on +which we demonstrate the promise of our approach. Both reconstruction quality +and throughput substantially surpass the state-of-the-art methods (e.g., Quanta +Burst Photography (QBP)). Our approach significantly enhances the visualization +and usability of the data, enabling the application of existing analysis +techniques. + +
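A minimal version of the described self-supervised objective can be sketched as follows: random pixels are masked out of the binary input, the network predicts a nonnegative rate that is mapped to a Bernoulli probability via p = 1 - exp(-rate) (the truncated-Poisson view), and binary cross-entropy is evaluated only at the masked locations. The network, masking ratio, and parameterization are assumptions, not the released code.

```python
# Masked Bernoulli loss for binary photon data (illustrative sketch).
import torch
import torch.nn.functional as F

def masked_bernoulli_loss(model, binary_frames, mask_ratio=0.5):
    """binary_frames: (B, T, H, W) float tensor of 0/1 photon detections."""
    mask = (torch.rand_like(binary_frames) < mask_ratio).float()
    masked_input = binary_frames * (1.0 - mask)        # hide the pixels to be predicted
    rate = F.softplus(model(masked_input))             # nonnegative predicted photon rate
    p = 1.0 - torch.exp(-rate)                         # truncated Poisson -> Bernoulli prob.
    bce = F.binary_cross_entropy(p.clamp(1e-6, 1 - 1e-6), binary_frames, reduction="none")
    return (bce * mask).sum() / mask.sum().clamp(min=1.0)
```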
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ PointRecon: Online Point-based 3D Reconstruction via Ray-based 2D-3D + Matching + + +
+ We propose a novel online, point-based 3D reconstruction method from posed +monocular RGB videos. Our model maintains a global point cloud representation +of the scene, continuously updating the features and 3D locations of points as +new images are observed. It expands the point cloud with newly detected points +while carefully removing redundancies. The point cloud updates and depth +predictions for new points are achieved through a novel ray-based 2D-3D feature +matching technique, which is robust against errors in previous point position +predictions. In contrast to offline methods, our approach processes +infinite-length sequences and provides real-time updates. Additionally, the +point cloud imposes no pre-defined resolution or scene size constraints, and +its unified global representation ensures view consistency across perspectives. +Experiments on the ScanNet dataset show that our method achieves +state-of-the-art quality among online MVS approaches. Project page: +https://arthurhero.github.io/projects/pointrecon + +
+
+
+
+
+ + ☆ LGU-SLAM: Learnable Gaussian Uncertainty Matching with Deformable + Correlation Sampling for Deep Visual SLAM + + +
+ Deep visual Simultaneous Localization and Mapping (SLAM) techniques, e.g., +DROID, have made significant advancements by leveraging deep visual odometry on +dense flow fields. In general, they heavily rely on global visual similarity +matching. However, the ambiguous similarity interference in uncertain regions +could often lead to excessive noise in correspondences, ultimately misleading +SLAM in geometric modeling. To address this issue, we propose a Learnable +Gaussian Uncertainty (LGU) matching. It mainly focuses on precise +correspondence construction. In our scheme, a learnable 2D Gaussian uncertainty +model is designed to associate matching-frame pairs. It could generate +input-dependent Gaussian distributions for each correspondence map. +Additionally, a multi-scale deformable correlation sampling strategy is devised +to adaptively fine-tune the sampling of each direction by a priori look-up +ranges, enabling reliable correlation construction. Furthermore, a KAN-bias GRU +component is adopted to improve a temporal iterative enhancement for +accomplishing sophisticated spatio-temporal modeling with limited parameters. +The extensive experiments on real-world and synthetic datasets are conducted to +validate the effectiveness and superiority of our method. + +
+
+
+
+
+ + ☆ Aligning Audio-Visual Joint Representations with an Agentic Workflow + + +
+ Visual content and accompanied audio signals naturally formulate a joint +representation to improve audio-visual (AV) related applications. While studies +develop various AV representation learning frameworks, the importance of AV +data alignment is usually overlooked when pursuing high-quality representations. +We observe that an audio signal may contain background noise interference. +Also, non-synchronization may appear between audio and video streams. Such +non-strict data alignment limits representation quality and degrades +application performance. In this paper, we propose to improve AV joint +representations from a data-centric perspective by aligning audio signals to +visual data. Our alignment is conducted in an agentic workflow controlled by an +LLM-based assistant named AVAgent. For each input AV data pair, our AVAgent +uses a multi-modal LLM to convert audio and visual data into language +descriptions separately (i.e., tool use). Then, AVAgent reasons whether this +paired data is aligned well and plans to edit the audio signal if needed (i.e., +planning). The audio editing is executed by predefined actions that filter +noise or augment data. Moreover, we use a VLM to evaluate how modified audio +signals match the visual content and provide feedback to AVAgent (i.e., +reflection). The tool use, planning, and reflection steps operate cyclically to +become an agentic workflow where audio signals are gradually aligned to visual +content. As a result, existing methods can directly leverage the aligned AV +data via our agentic workflow to improve AV joint representations. The +experimental results comprehensively demonstrate the state-of-the-art +performance of the proposed approach against previous baselines in diverse +downstream tasks. + +
+
+
+
+
+ + ☆ DiaMond: Dementia Diagnosis with Multi-Modal Vision Transformers Using + MRI and PET + + +
+ Diagnosing dementia, particularly for Alzheimer's Disease (AD) and +frontotemporal dementia (FTD), is complex due to overlapping symptoms. While +magnetic resonance imaging (MRI) and positron emission tomography (PET) data +are critical for the diagnosis, integrating these modalities in deep learning +faces challenges, often resulting in suboptimal performance compared to using +single modalities. Moreover, the potential of multi-modal approaches in +differential diagnosis, which holds significant clinical importance, remains +largely unexplored. We propose a novel framework, DiaMond, to address these +issues with vision Transformers to effectively integrate MRI and PET. DiaMond +is equipped with self-attention and a novel bi-attention mechanism that +synergistically combine MRI and PET, alongside a multi-modal normalization to +reduce redundant dependency, thereby boosting the performance. DiaMond +significantly outperforms existing multi-modal methods across various datasets, +achieving a balanced accuracy of 92.4% in AD diagnosis, 65.2% for AD-MCI-CN +classification, and 76.5% in differential diagnosis of AD and FTD. We also +validated the robustness of DiaMond in a comprehensive ablation study. The code +is available at https://github.com/ai-med/DiaMond. + +
+
+ comment: Accepted by IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ☆ OS-ATLAS: A Foundation Action Model for Generalist GUI Agents + + +
+ Existing efforts in building GUI agents heavily rely on the availability of +robust commercial Vision-Language Models (VLMs) such as GPT-4o and +GeminiProVision. Practitioners are often reluctant to use open-source VLMs due +to their significant performance lag compared to their closed-source +counterparts, particularly in GUI grounding and Out-Of-Distribution (OOD) +scenarios. To facilitate future research in this area, we developed OS-Atlas - +a foundational GUI action model that excels at GUI grounding and OOD agentic +tasks through innovations in both data and modeling. We have invested +significant engineering effort in developing an open-source toolkit for +synthesizing GUI grounding data across multiple platforms, including Windows, +Linux, MacOS, Android, and the web. Leveraging this toolkit, we are releasing +the largest open-source cross-platform GUI grounding corpus to date, which +contains over 13 million GUI elements. This dataset, combined with innovations +in model training, provides a solid foundation for OS-Atlas to understand GUI +screenshots and generalize to unseen interfaces. Through extensive evaluation +across six benchmarks spanning three different platforms (mobile, desktop, and +web), OS-Atlas demonstrates significant performance improvements over previous +state-of-the-art models. Our evaluation also uncovers valuable insights into +continuously improving and scaling the agentic capabilities of open-source +VLMs. + +
+
+
+
+
+ + ☆ ELMGS: Enhancing memory and computation scaLability through coMpression + for 3D Gaussian Splatting + + +
+ 3D models have recently been popularized by the potential of end-to-end +training offered first by Neural Radiance Fields and most recently by 3D +Gaussian Splatting models. The latter has the major advantage of naturally +providing fast training convergence and high editability. However, as +research on these models is still in its infancy, a gap remains in the +literature regarding the model's scalability. In this work, we propose an +approach enabling both memory and computation scalability of such models. More +specifically, we propose an iterative pruning strategy that removes redundant +information encoded in the model. We also enhance compressibility for the model +by including in the optimization strategy a differentiable quantization and +entropy coding estimator. Our results on popular benchmarks showcase the +effectiveness of the proposed approach and pave the way for broad +deployment of such solutions even on resource-constrained devices. + +
+
+
+
+
+ + ☆ HEX: Hierarchical Emergence Exploitation in Self-Supervised Algorithms + + +
+ In this paper, we propose an algorithm that can be used on top of a wide +variety of self-supervised (SSL) approaches to take advantage of hierarchical +structures that emerge during training. SSL approaches typically work through +some invariance term to ensure consistency between similar samples and a +regularization term to prevent global dimensional collapse. Dimensional +collapse refers to data representations spanning a lower-dimensional subspace. +Recent work has demonstrated that the representation space of these algorithms +gradually reflects a semantic hierarchical structure as training progresses. +Data samples of the same hierarchical grouping tend to exhibit greater +dimensional collapse locally compared to the dataset as a whole due to sharing +features in common with each other. Ideally, SSL algorithms would take +advantage of this hierarchical emergence to have an additional regularization +term to account for this local dimensional collapse effect. However, the +construction of existing SSL algorithms does not account for this property. To +address this, we propose an adaptive algorithm that performs a weighted +decomposition of the denominator of the InfoNCE loss into two terms: local +hierarchical and global collapse regularization, respectively. This +decomposition is based on an adaptive threshold that gradually lowers to +reflect the emerging hierarchical structure of the representation space +throughout training. It is based on an analysis of the cosine similarity +distribution of samples in a batch. We demonstrate that this hierarchical +emergence exploitation (HEX) approach can be integrated across a wide variety +of SSL algorithms. Empirically, we show relative performance improvements of up to 5.6% +over baseline SSL approaches in classification accuracy on +ImageNet with 100 epochs of training. + +
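One plausible reading of the described decomposition is sketched below: the InfoNCE denominator is split into negatives above and below an adaptive cosine-similarity threshold (drawn from the batch similarity distribution and lowered as training progresses), with separate weights on the two terms. The weights and quantile schedule are guesses for illustration, not the official HEX implementation.

```python
# InfoNCE with the denominator split by an adaptive similarity threshold.
import torch
import torch.nn.functional as F

def hex_info_nce(z1, z2, tau=0.2, threshold=0.5, w_local=1.0, w_global=1.0):
    """z1, z2: (B, D) embeddings of two views; z2[i] is the positive for z1[i]."""
    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
    sim = z1 @ z2.t()                                   # (B, B) cosine similarities
    pos = torch.diag(sim)
    off_diag = ~torch.eye(sim.size(0), dtype=torch.bool, device=sim.device)
    local = (sim > threshold) & off_diag                # negatives in the same emergent group
    global_ = (sim <= threshold) & off_diag
    exp = torch.exp(sim / tau)
    denom = (torch.exp(pos / tau)
             + w_local * (exp * local).sum(dim=1)       # local hierarchical term
             + w_global * (exp * global_).sum(dim=1))   # global collapse term
    return -(pos / tau - torch.log(denom)).mean()

def adaptive_threshold(sim, q_start=0.9, q_end=0.5, progress=0.0):
    """Quantile of the batch similarity distribution, lowered over training."""
    q = q_start + (q_end - q_start) * progress
    return torch.quantile(sim.flatten(), q).item()
```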
+
+
+
+
+ + ☆ Continuous Spatio-Temporal Memory Networks for 4D Cardiac Cine MRI + Segmentation + + +
+ Current cardiac cine magnetic resonance image (cMR) studies focus on the end +diastole (ED) and end systole (ES) phases, while ignoring the abundant temporal +information in the whole image sequence. This is because whole-sequence +segmentation is currently a tedious and inaccurate process. Conventional whole +sequence segmentation approaches first estimate the motion field between +frames, which is then used to propagate the mask along the temporal axis. +However, the mask propagation results could be prone to error, especially for +the basal and apex slices, where through-plane motion leads to significant +morphology and structural change during the cardiac cycle. Inspired by recent +advances in video object segmentation (VOS), based on spatio-temporal memory +(STM) networks, we propose a continuous STM (CSTM) network for semi-supervised +whole heart and whole sequence cMR segmentation. Our CSTM network takes full +advantage of the spatial, scale, temporal and through-plane continuity prior of +the underlying heart anatomy structures, to achieve accurate and fast 4D +segmentation. Results of extensive experiments across multiple cMR datasets +show that our method can improve the 4D cMR segmentation performance, +especially for the hard-to-segment regions. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ☆ Fourier Amplitude and Correlation Loss: Beyond Using L2 Loss for + Skillful Precipitation Nowcasting NeurIPS 2024 + + +
+ Deep learning approaches have been widely adopted for precipitation +nowcasting in recent years. Previous studies mainly focus on proposing new +model architectures to improve pixel-wise metrics. However, they frequently +result in blurry predictions which provide limited utility to forecasting +operations. In this work, we propose a new Fourier Amplitude and Correlation +Loss (FACL) which consists of two novel loss terms: Fourier Amplitude Loss +(FAL) and Fourier Correlation Loss (FCL). FAL regularizes the Fourier amplitude +of the model prediction and FCL complements the missing phase information. The +two loss terms work together to replace the traditional $L_2$ losses such as +MSE and weighted MSE for the spatiotemporal prediction problem on signal-based +data. Our method is generic, parameter-free and efficient. Extensive +experiments using one synthetic dataset and three radar echo datasets +demonstrate that our method improves perceptual metrics and meteorology skill +scores, with a small trade-off to pixel-wise accuracy and structural +similarity. Moreover, to improve the error margin in meteorological skill +scores such as Critical Success Index (CSI) and Fractions Skill Score (FSS), we +propose and adopt the Regional Histogram Divergence (RHD), a distance metric +that considers the patch-wise similarity between signal-based imagery patterns +with tolerance to local transforms. Code is available at +https://github.com/argenycw/FACL + +
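To make the two loss terms concrete, here is one illustrative instantiation: an amplitude term comparing |FFT| spectra and a correlation term supplying the complementary alignment information. The exact formulations in the paper may differ; treat this as a sketch rather than the reference implementation.

```python
# Illustrative Fourier amplitude + correlation loss terms for (B, T, H, W) frames.
import torch

def fourier_amplitude_loss(pred, target):
    """Compare spatial Fourier amplitude spectra (last two dims)."""
    return torch.mean(torch.abs(torch.abs(torch.fft.rfft2(pred)) -
                                torch.abs(torch.fft.rfft2(target))))

def fourier_correlation_loss(pred, target, eps=1e-8):
    """1 - normalized cross-correlation per sample; complements the amplitude term
    with alignment (phase-related) information."""
    p = pred.flatten(1) - pred.flatten(1).mean(dim=1, keepdim=True)
    t = target.flatten(1) - target.flatten(1).mean(dim=1, keepdim=True)
    corr = (p * t).sum(dim=1) / (p.norm(dim=1) * t.norm(dim=1) + eps)
    return (1.0 - corr).mean()

def facl(pred, target):
    return fourier_amplitude_loss(pred, target) + fourier_correlation_loss(pred, target)
```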
+
+ comment: Accepted by NeurIPS 2024. Camera-ready submission +
+
+
+
+
+ + ☆ VisualPredicator: Learning Abstract World Models with Neuro-Symbolic + Predicates for Robot Planning + + +
+ Broadly intelligent agents should form task-specific abstractions that +selectively expose the essential elements of a task, while abstracting away the +complexity of the raw sensorimotor space. In this work, we present +Neuro-Symbolic Predicates, a first-order abstraction language that combines the +strengths of symbolic and neural knowledge representations. We outline an +online algorithm for inventing such predicates and learning abstract world +models. We compare our approach to hierarchical reinforcement learning, +vision-language model planning, and symbolic predicate invention approaches, on +both in- and out-of-distribution tasks across five simulated robotic domains. +Results show that our approach offers better sample complexity, stronger +out-of-distribution generalization, and improved interpretability. + +
+
+ comment: In submission +
+
+
+
+
+ + ☆ Nested ResNet: A Vision-Based Method for Detecting the Sensing Area of a + Drop-in Gamma Probe + + +
+ Purpose: Drop-in gamma probes are widely used in robotic-assisted minimally +invasive surgery (RAMIS) for lymph node detection. However, these devices only +provide audio feedback on signal intensity, lacking the visual feedback +necessary for precise localisation. Previous work attempted to predict the +sensing area location using laparoscopic images, but the prediction accuracy +was unsatisfactory. Improvements are needed in the deep learning-based +regression approach. + Methods: We introduce a three-branch deep learning framework to predict the +sensing area of the probe. Specifically, we utilise the stereo laparoscopic +images as input for the main branch and develop a Nested ResNet architecture. +The framework also incorporates depth estimation via transfer learning and +orientation guidance through probe axis sampling. The combined features from +each branch enhanced the accuracy of the prediction. + Results: Our approach has been evaluated on a publicly available dataset, +demonstrating superior performance over previous methods. In particular, our +method resulted in a 22.10% decrease in 2D mean error and a 41.67% reduction +in 3D mean error. Additionally, qualitative comparisons further demonstrated +the improved precision of our approach. + Conclusion: With extensive evaluation, our solution significantly enhances +the accuracy and reliability of sensing area predictions. This advancement +enables visual feedback during the use of the drop-in gamma probe in surgery, +providing surgeons with more accurate and reliable localisation. + +
+
+
+
+
+ + ☆ FAIR-TAT: Improving Model Fairness Using Targeted Adversarial Training + + +
+ Deep neural networks are susceptible to adversarial attacks and common +corruptions, which undermine their robustness. In order to enhance model +resilience against such challenges, Adversarial Training (AT) has emerged as a +prominent solution. Nevertheless, adversarial robustness is often attained at +the expense of model fairness during AT, i.e., disparity in class-wise +robustness of the model. While distinctive classes become more robust towards +such adversaries, hard-to-detect classes suffer. Recently, research has focused +on improving model fairness specifically for perturbed images, overlooking the +accuracy on the more likely non-perturbed data. Additionally, despite their +robustness against the adversaries encountered during model training, +state-of-the-art adversarially trained models have difficulty maintaining +robustness and fairness when confronted with diverse adversarial threats or +common corruptions. In this work, we address the above concerns by introducing +a novel approach called Fair Targeted Adversarial Training (FAIR-TAT). We show +that using targeted adversarial attacks for adversarial training (instead of +untargeted attacks) can allow for more favorable trade-offs with respect to +adversarial fairness. Empirical results validate the efficacy of our approach. + +
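The core recipe, swapping untargeted for targeted attacks inside the adversarial training loop, can be sketched as follows. The target-class choice (most confident wrong class), the PGD hyperparameters, and the plain cross-entropy outer loss are illustrative assumptions rather than the paper's exact procedure.

```python
# Targeted PGD inside a standard adversarial training step (illustrative sketch).
import torch
import torch.nn.functional as F

def targeted_pgd(model, x, y_target, eps=8 / 255, alpha=2 / 255, steps=10):
    """Targeted L_inf PGD: minimize the loss w.r.t. the chosen target class."""
    x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1).detach()
    for _ in range(steps):
        x_adv.requires_grad_(True)
        grad = torch.autograd.grad(F.cross_entropy(model(x_adv), y_target), x_adv)[0]
        x_adv = (x_adv - alpha * grad.sign()).detach()       # descend toward the target class
        x_adv = torch.min(torch.max(x_adv, x - eps), x + eps).clamp(0, 1)
    return x_adv

def targeted_at_step(model, optimizer, x, y):
    with torch.no_grad():                                    # pick the most confident wrong class
        logits = model(x)
        logits.scatter_(1, y.unsqueeze(1), float("-inf"))
        y_target = logits.argmax(dim=1)
    x_adv = targeted_pgd(model, x, y_target)
    optimizer.zero_grad()
    loss = F.cross_entropy(model(x_adv), y)                  # train on true labels
    loss.backward()
    optimizer.step()
    return loss.item()
```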
+
+
+
+
+ + ☆ Revisiting MAE pre-training for 3D medical image segmentation + + +
+ Self-Supervised Learning (SSL) presents an exciting opportunity to unlock the +potential of vast, untapped clinical datasets, for various downstream +applications that suffer from the scarcity of labeled data. While SSL has +revolutionized fields like natural language processing and computer vision, +their adoption in 3D medical image computing has been limited by three key +pitfalls: Small pre-training dataset sizes, architectures inadequate for 3D +medical image analysis, and insufficient evaluation practices. We address these +issues by i) leveraging a large-scale dataset of 44k 3D brain MRI volumes and +ii) using a Residual Encoder U-Net architecture within the state-of-the-art +nnU-Net framework. iii) A robust development framework, incorporating 5 +development and 8 testing brain MRI segmentation datasets, allowed +performance-driven design decisions to optimize the simple concept of Masked +Auto Encoders (MAEs) for 3D CNNs. The resulting model not only surpasses +previous SSL methods but also outperforms the strong nnU-Net baseline by an +average of approximately 3 Dice points. Furthermore, our model demonstrates +exceptional stability, achieving the highest average rank of 2 out of 7 +methods, compared to the second-best method's mean rank of 3. + +
+
+ comment: Arxiv Preprint. Currently under Review +
+
+
+
+
+ + ☆ Compositional Segmentation of Cardiac Images Leveraging Metadata + + +
+ Cardiac image segmentation is essential for automated cardiac function +assessment and monitoring of changes in cardiac structures over time. Inspired +by coarse-to-fine approaches in image analysis, we propose a novel multitask +compositional segmentation approach that can simultaneously localize the heart +in a cardiac image and perform part-based segmentation of different regions of +interest. We demonstrate that this compositional approach achieves better +results than direct segmentation of the anatomies. Further, we propose a novel +Cross-Modal Feature Integration (CMFI) module to leverage the metadata related +to cardiac imaging collected during image acquisition. We perform experiments +on two different modalities, MRI and ultrasound, using public datasets, +Multi-disease, Multi-View, and Multi-Centre (M&Ms-2) and Multi-structure +Ultrasound Segmentation (CAMUS) data, to showcase the efficiency of the +proposed compositional segmentation method and Cross-Modal Feature Integration +module incorporating metadata within the proposed compositional segmentation +network. The source code is available: +https://github.com/kabbas570/CompSeg-MetaData. + +
+
+ comment: IEEE/CVF Winter Conference on Applications of Computer Vision (WACV) + 2025 +
+
+
+
+
+ + ☆ Why Fine-grained Labels in Pretraining Benefit Generalization? + + +
+ Recent studies show that pretraining a deep neural network with fine-grained +labeled data, followed by fine-tuning on coarse-labeled data for downstream +tasks, often yields better generalization than pretraining with coarse-labeled +data. While there is ample empirical evidence supporting this, the theoretical +justification remains an open problem. This paper addresses this gap by +introducing a "hierarchical multi-view" structure to confine the input data +distribution. Under this framework, we prove that: 1) coarse-grained +pretraining only allows a neural network to learn the common features well, +while 2) fine-grained pretraining helps the network learn the rare features in +addition to the common ones, leading to improved accuracy on hard downstream +test samples. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.16887 +
+
+
+
+
+ + ☆ Unified Triplet-Level Hallucination Evaluation for Large Vision-Language + Models + + +
+ Despite the outstanding performance in vision-language reasoning, Large +Vision-Language Models (LVLMs) might generate hallucinated contents that do not +exist in the given image. Most existing LVLM hallucination benchmarks are +constrained to evaluate the object-related hallucinations. However, the +potential hallucination on the relations between two objects, i.e., relation +hallucination, still lacks investigation. To remedy that, in this paper we +design a unified framework to measure object and relation hallucination in +LVLMs simultaneously. The core idea of our framework is to conduct +hallucination evaluation on (object, relation, object) triplets extracted from +LVLMs' responses, and thus, could be easily generalized to different +vision-language tasks. Based on our framework, we further introduce Tri-HE, a +novel Triplet-level Hallucination Evaluation benchmark which can be used to +study both object and relation hallucination at the same time. We conduct +comprehensive evaluations on Tri-HE and observe that the relation hallucination +issue is even more serious than object hallucination among existing LVLMs, +highlighting a previously neglected problem towards reliable LVLMs. Moreover, +based on our findings, we design a simple yet effective training-free approach +to mitigate hallucinations for LVLMs, with which, we exceed all open-sourced +counterparts on Tri-HE, achieving comparable performance with the powerful +GPT-4V. Our dataset and code for the reproduction of our experiments are +available publicly at https://github.com/wujunjie1998/Tri-HE. + +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ☆ NASM: Neural Anisotropic Surface Meshing SIGGRAPH + + +
+ This paper introduces a new learning-based method, NASM, for anisotropic +surface meshing. Our key idea is to propose a graph neural network to embed an +input mesh into a high-dimensional (high-d) Euclidean embedding space to +preserve curvature-based anisotropic metric by using a dot product loss between +high-d edge vectors. This can dramatically reduce the computational time and +increase the scalability. Then, we propose a novel feature-sensitive remeshing +on the generated high-d embedding to automatically capture sharp geometric +features. We define a high-d normal metric, and then derive an automatic +differentiation on a high-d centroidal Voronoi tessellation (CVT) optimization +with the normal metric to simultaneously preserve geometric features and +curvature anisotropy that exhibit in the original 3D shapes. To our knowledge, +this is the first time that a deep learning framework and a large dataset are +proposed to construct a high-d Euclidean embedding space for 3D anisotropic +surface meshing. Experimental results are evaluated and compared with the +state-of-the-art in anisotropic surface meshing on a large number of surface +models from Thingi10K dataset as well as tested on extensive unseen 3D shapes +from Multi-Garment Network dataset and FAUST human dataset. + +
+
+ comment: SIGGRAPH Asia 2024 (Conference Track) +
+
+
+
+
+ + ☆ Decoupling Semantic Similarity from Spatial Alignment for Neural + Networks NeurIPS2024 + + +
+ What representation do deep neural networks learn? How similar are images to
+each other for neural networks? Despite the overwhelming success of deep
+learning methods, key questions about their internal workings still remain
+largely unanswered, due to their internal high dimensionality and complexity.
+To address this, one approach is to measure the similarity of activation
+responses to various inputs. Representational Similarity Matrices (RSMs)
+distill this similarity into scalar values for each input pair. These matrices
+encapsulate the entire similarity structure of a system, indicating which
+inputs lead to similar responses. While the similarity between images is
+ambiguous, we argue that the spatial location of semantic objects influences
+neither human perception nor deep learning classifiers. This should therefore
+be reflected in the definition of similarity between image responses for
+computer vision systems. Revisiting the established similarity calculations
+for RSMs, we expose their sensitivity to spatial alignment. In this paper, we
+propose to solve this through semantic RSMs, which are invariant to spatial
+permutation. We measure semantic similarity between input responses by
+formulating it as a set-matching problem. Further, we quantify the superiority
+of semantic RSMs over spatio-semantic RSMs through image retrieval and by
+comparing the similarity between representations to the similarity between
+predicted class probabilities.
+
+
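+ A minimal sketch of the set-matching formulation mentioned above, assuming
+per-patch feature vectors from two images; an optimal assignment over cosine
+similarities yields a score that is invariant to spatial permutation. Shapes
+and names are illustrative, not from the paper:
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def semantic_similarity(feats_a, feats_b):
+    # feats_*: (num_patches, channels) spatial feature vectors of two images
+    a = feats_a / np.linalg.norm(feats_a, axis=1, keepdims=True)
+    b = feats_b / np.linalg.norm(feats_b, axis=1, keepdims=True)
+    cos = a @ b.T                           # pairwise cosine similarities
+    row, col = linear_sum_assignment(-cos)  # matching that maximises similarity
+    return float(cos[row, col].mean())      # permutation-invariant score
+
+x = np.random.rand(49, 512)
+print(semantic_similarity(x, x[np.random.permutation(49)]))  # close to 1.0
+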
+
+ comment: Accepted at NeurIPS2024 +
+
+
+
+
+ + ☆ Automated Image-Based Identification and Consistent Classification of + Fire Patterns with Quantitative Shape Analysis and Spatial Location + Identification + + +
+ Fire patterns, consisting of fire effects that offer insights into fire +behavior and origin, are traditionally classified based on investigators' +visual observations, leading to subjective interpretations. This study proposes +a framework for quantitative fire pattern classification to support fire +investigators, aiming for consistency and accuracy. The framework integrates +four components. First, it leverages human-computer interaction to extract fire +patterns from surfaces, combining investigator expertise with computational +analysis. Second, it employs an aspect ratio-based random forest model to +classify fire pattern shapes. Third, fire scene point cloud segmentation +enables precise identification of fire-affected areas and the mapping of 2D +fire patterns to 3D scenes. Lastly, spatial relationships between fire patterns +and indoor elements support an interpretation of the fire scene. These +components provide a method for fire pattern analysis that synthesizes +qualitative and quantitative data. The framework's classification results +achieve 93% precision on synthetic data and 83% on real fire patterns. + +
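+ A toy sketch of the aspect-ratio-based random forest component described above,
+assuming a few simple shape descriptors per extracted fire pattern; the feature
+set and class labels below are illustrative assumptions, not the study's actual
+taxonomy:
+
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+
+# Hypothetical shape descriptors per extracted fire pattern:
+# [aspect_ratio, fill_ratio, normalised_height]
+X = np.array([[0.6, 0.4, 0.8],
+              [2.3, 0.7, 0.3],
+              [1.1, 0.5, 0.5]])
+y = ["V-pattern", "pool-shaped", "irregular"]  # illustrative labels only
+
+clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
+print(clf.predict([[0.7, 0.45, 0.75]]))
+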
+
+
+
+
+ + ☆ First Place Solution to the ECCV 2024 ROAD++ Challenge @ ROAD++ Atomic + Activity Recognition 2024 + + +
+ This report presents our team's technical solution for participating in Track +3 of the 2024 ECCV ROAD++ Challenge. The task of Track 3 is atomic activity +recognition, which aims to identify 64 types of atomic activities in road +scenes based on video content. Our approach primarily addresses the challenges +of small objects, discriminating between single object and a group of objects, +as well as model overfitting in this task. Firstly, we construct a multi-branch +activity recognition framework that not only separates different object +categories but also the tasks of single object and object group recognition, +thereby enhancing recognition accuracy. Subsequently, we develop various model +ensembling strategies, including integrations of multiple frame sampling +sequences, different frame sampling sequence lengths, multiple training epochs, +and different backbone networks. Furthermore, we propose an atomic activity +recognition data augmentation method, which greatly expands the sample space by +flipping video frames and road topology, effectively mitigating model +overfitting. Our methods rank first in the test set of Track 3 for the ROAD++ +Challenge 2024, and achieve 69% mAP. + +
+
+
+
+
+ + ☆ CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for + Adversarial Defense NeurIPS 2024 + + +
+ Despite ongoing efforts to defend neural classifiers from adversarial
+attacks, they remain vulnerable, especially to unseen attacks. In contrast,
+humans are hard to fool with subtle manipulations, since we make judgments
+based only on essential factors. Inspired by this observation, we attempt to
+model label generation with essential label-causative factors and incorporate
+label-non-causative factors to assist data generation. For an adversarial
+example, we aim to discriminate the perturbations as non-causative factors and
+make predictions only based on the label-causative factors. Concretely, we
+propose a causal diffusion model (CausalDiff) that adapts diffusion models for
+conditional data generation and disentangles the two types of causal factors
+by learning towards a novel causal information bottleneck objective.
+Empirically, CausalDiff has significantly outperformed state-of-the-art
+defense methods on various unseen attacks, achieving an average robustness of
+86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on CIFAR-100, and 82.62% (+4.93%)
+on GTSRB (German Traffic Sign Recognition Benchmark).
+
+
+
+ comment: accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ PIP-MM: Pre-Integrating Prompt Information into Visual Encoding via + Existing MLLM Structures + + +
+ The Multimodal Large Language Models (MLLMs) have activated the capabilities
+of Large Language Models (LLMs) in solving visual-language tasks by
+integrating visual information. The prevailing approach in existing MLLMs
+involves employing an image encoder to extract visual features, converting
+these features into visual tokens via an adapter, and then integrating them
+with the prompt into the LLM. However, because the process of image encoding
+is prompt-agnostic, the extracted visual features only provide a coarse
+description of the image, making it impossible to focus on the requirements of
+the prompt. On one hand, it is easy for image features to lack information
+about the prompt-specified objects, resulting in unsatisfactory responses. On
+the other hand, the visual features contain a large amount of irrelevant
+information, which not only increases the burden on memory but also worsens
+the generation effectiveness. To address the aforementioned issues, we propose
+\textbf{PIP-MM}, a framework that \textbf{P}re-\textbf{I}ntegrates
+\textbf{P}rompt information into the visual encoding process using existing
+modules of MLLMs. Specifically, we utilize the frozen LLM in the MLLM to
+vectorize the input prompt, which summarizes the requirements of the prompt.
+Then, we input the prompt vector into our trained Multi-Layer Perceptron (MLP)
+to align with the visual input requirements, and subsequently replace the
+class embedding in the image encoder. Since our model only requires adding a
+trainable MLP, it can be applied to any MLLM. To validate the effectiveness of
+PIP-MM, we conducted experiments on multiple benchmarks. Automated evaluation
+metrics and manual assessments demonstrate the strong performance of PIP-MM.
+Particularly noteworthy is that our model maintains excellent generation
+results even when the number of visual tokens is reduced by half.
+
+
+
+
+
+
+ + ☆ S3PT: Scene Semantics and Structure Guided Clustering to Boost + Self-Supervised Pre-Training for Autonomous Driving + + +
+ Recent self-supervised clustering-based pre-training techniques like DINO and +Cribo have shown impressive results for downstream detection and segmentation +tasks. However, real-world applications such as autonomous driving face +challenges with imbalanced object class and size distributions and complex +scene geometries. In this paper, we propose S3PT a novel scene semantics and +structure guided clustering to provide more scene-consistent objectives for +self-supervised training. Specifically, our contributions are threefold: First, +we incorporate semantic distribution consistent clustering to encourage better +representation of rare classes such as motorcycles or animals. Second, we +introduce object diversity consistent spatial clustering, to handle imbalanced +and diverse object sizes, ranging from large background areas to small objects +such as pedestrians and traffic signs. Third, we propose a depth-guided spatial +clustering to regularize learning based on geometric information of the scene, +thus further refining region separation on the feature level. Our learned +representations significantly improve performance in downstream semantic +segmentation and 3D object detection tasks on the nuScenes, nuImages, and +Cityscapes datasets and show promising domain translation properties. + +
+
+ comment: Accepted for WACV 2025 +
+
+
+
+
+ + ☆ AI-assisted prostate cancer detection and localisation on biparametric + MR by classifying radiologist-positives + + +
+ Prostate cancer diagnosis through MR imaging has so far relied on
+radiologists' interpretation, whilst modern AI-based methods have been
+developed to detect clinically significant cancers independent of
+radiologists. In this study, we propose to develop deep learning models that
+improve the overall cancer diagnostic accuracy by classifying
+radiologist-identified patients or lesions (i.e. radiologist-positives), as
+opposed to the existing models that are trained to discriminate over all
+patients. We develop a single voxel-level classification model, with a simple
+percentage threshold to determine positive cases, at the levels of lesions,
+Barzell-zones and patients. Based on the presented experiments from two
+clinical data sets, consisting of histopathology-labelled MR images from more
+than 800 and 500 patients in the respective UCLA and UCL PROMIS studies, we
+show that the proposed strategy can improve the diagnostic accuracy by
+augmenting the radiologist reading of the MR imaging. Among varying
+definitions of clinical significance, the proposed strategy, for example,
+achieved a specificity of 44.1% (with AI assistance) from 36.3% (by
+radiologists alone), at a controlled sensitivity of 80.0% on the publicly
+available UCLA data set. This provides measurable clinical value in a range of
+applications such as reducing unnecessary biopsies, lowering cost in cancer
+screening and quantifying risk in therapies.
+
+
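+ A minimal sketch of the voxel-level classification with a percentage threshold
+described above; the threshold values, array shape, and function name are
+illustrative assumptions, not the study's actual settings:
+
+import numpy as np
+
+def classify_case(voxel_probs, voxel_threshold=0.5, positive_fraction=0.05):
+    """Label a lesion/zone/patient positive if enough voxels exceed the
+    voxel-level probability threshold (both thresholds are illustrative)."""
+    frac_positive = float((voxel_probs >= voxel_threshold).mean())
+    return frac_positive >= positive_fraction
+
+case = np.random.rand(64, 64, 32)  # stand-in for voxel-level model outputs
+print(classify_case(case))
+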
+
+
+
+
+ + ☆ First Place Solution to the ECCV 2024 ROAD++ Challenge @ ROAD++ + Spatiotemporal Agent Detection 2024 + + +
+ This report presents our team's solutions for the Track 1 of the 2024 ECCV +ROAD++ Challenge. The task of Track 1 is spatiotemporal agent detection, which +aims to construct an "agent tube" for road agents in consecutive video frames. +Our solutions focus on the challenges in this task, including extreme-size +objects, low-light scenarios, class imbalance, and fine-grained classification. +Firstly, the extreme-size object detection heads are introduced to improve the +detection performance of large and small objects. Secondly, we design a +dual-stream detection model with a low-light enhancement stream to improve the +performance of spatiotemporal agent detection in low-light scenes, and the +feature fusion module to integrate features from different branches. +Subsequently, we develop a multi-branch detection framework to mitigate the +issues of class imbalance and fine-grained classification, and we design a +pre-training and fine-tuning approach to optimize the above multi-branch +framework. Besides, we employ some common data augmentation techniques, and +improve the loss function and upsampling operation. We rank first in the test +set of Track 1 for the ROAD++ Challenge 2024, and achieve 30.82% average +video-mAP. + +
+
+
+
+
+ + ☆ RSNet: A Light Framework for The Detection of Multi-scale Remote Sensing + Targets + + +
+ Recent developments in synthetic aperture radar (SAR) ship detection have
+seen deep learning techniques achieve remarkable progress in accuracy and
+speed. However, the detection of small targets against complex backgrounds
+remains a significant challenge. To tackle these difficulties, this letter
+presents RSNet, a lightweight framework aimed at enhancing ship detection
+capabilities in SAR imagery. RSNet features the Waveletpool-ContextGuided
+(WCG) backbone for enhanced accuracy with fewer parameters, and the
+Waveletpool-StarFusion (WSF) head for efficient parameter reduction.
+Additionally, a Lightweight-Shared (LS) module minimizes the detection head's
+parameter load. Experiments on the SAR Ship Detection Dataset (SSDD) and
+High-Resolution SAR Image Dataset (HRSID) demonstrate that RSNet achieves a
+strong balance between lightweight design and detection performance,
+surpassing many state-of-the-art detectors and reaching 72.5\% and 67.6\% in
+\textbf{\(\mathbf{mAP_{.50:95}}\)} respectively, with 1.49M parameters. Our
+code will be released soon.
+
+
+
+
+
+
+ + ☆ CNN Explainability with Multivector Tucker Saliency Maps for + Self-Supervised Models + + +
+ Interpreting the decisions of Convolutional Neural Networks (CNNs) is +essential for understanding their behavior, yet explainability remains a +significant challenge, particularly for self-supervised models. Most existing +methods for generating saliency maps rely on ground truth labels, restricting +their use to supervised tasks. EigenCAM is the only notable label-independent +alternative, leveraging Singular Value Decomposition to generate saliency maps +applicable across CNN models, but it does not fully exploit the tensorial +structure of feature maps. In this work, we introduce the Tucker Saliency Map +(TSM) method, which applies Tucker tensor decomposition to better capture the +inherent structure of feature maps, producing more accurate singular vectors +and values. These are used to generate high-fidelity saliency maps, effectively +highlighting objects of interest in the input. We further extend EigenCAM and +TSM into multivector variants -Multivec-EigenCAM and Multivector Tucker +Saliency Maps (MTSM)- which utilize all singular vectors and values, further +improving saliency map quality. Quantitative evaluations on supervised +classification models demonstrate that TSM, Multivec-EigenCAM, and MTSM achieve +competitive performance with label-dependent methods. Moreover, TSM enhances +explainability by approximately 50% over EigenCAM for both supervised and +self-supervised models. Multivec-EigenCAM and MTSM further advance +state-of-the-art explainability performance on self-supervised models, with +MTSM achieving the best results. + +
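+ A rough sketch of the label-free, SVD-based saliency idea (EigenCAM-style)
+that the Tucker variants above build on; this is not the proposed TSM/MTSM
+method, and the absolute-value and min-max normalisation steps are assumptions:
+
+import numpy as np
+
+def svd_saliency(feature_map):
+    """feature_map: (C, H, W) activations from a CNN layer. Projects the
+    activations onto their first singular direction, EigenCAM-style."""
+    c, h, w = feature_map.shape
+    m = feature_map.reshape(c, h * w)
+    m = m - m.mean(axis=1, keepdims=True)
+    _, _, vt = np.linalg.svd(m, full_matrices=False)
+    saliency = np.abs(vt[0]).reshape(h, w)
+    return (saliency - saliency.min()) / (np.ptp(saliency) + 1e-8)
+
+print(svd_saliency(np.random.rand(64, 14, 14)).shape)  # (14, 14)
+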
+
+ comment: 29 pages, 20 figures +
+
+
+
+
+ + ☆ Controlling Language and Diffusion Models by Transporting Activations + + +
+ The increasing capabilities of large generative models and their ever more +widespread deployment have raised concerns about their reliability, safety, and +potential misuse. To address these issues, recent works have proposed to +control model generation by steering model activations in order to effectively +induce or prevent the emergence of concepts or behaviors in the generated +output. In this paper we introduce Activation Transport (AcT), a general +framework to steer activations guided by optimal transport theory that +generalizes many previous activation-steering works. AcT is modality-agnostic +and provides fine-grained control over the model behavior with negligible +computational overhead, while minimally impacting model abilities. We +experimentally show the effectiveness and versatility of our approach by +addressing key challenges in large language models (LLMs) and text-to-image +diffusion models (T2Is). For LLMs, we show that AcT can effectively mitigate +toxicity, induce arbitrary concepts, and increase their truthfulness. In T2Is, +we show how AcT enables fine-grained style control and concept negation. + +
+
+
+
+
+ + ☆ Neural Attention Field: Emerging Point Relevance in 3D Scenes for + One-Shot Dexterous Grasping + + +
+ One-shot transfer of dexterous grasps to novel scenes with object and context +variations has been a challenging problem. While distilled feature fields from +large vision models have enabled semantic correspondences across 3D scenes, +their features are point-based and restricted to object surfaces, limiting +their capability of modeling complex semantic feature distributions for +hand-object interactions. In this work, we propose the \textit{neural attention +field} for representing semantic-aware dense feature fields in the 3D space by +modeling inter-point relevance instead of individual point features. Core to it +is a transformer decoder that computes the cross-attention between any 3D query +point with all the scene points, and provides the query point feature with an +attention-based aggregation. We further propose a self-supervised framework for +training the transformer decoder from only a few 3D pointclouds without hand +demonstrations. Post-training, the attention field can be applied to novel +scenes for semantics-aware dexterous grasping from one-shot demonstration. +Experiments show that our method provides better optimization landscapes by +encouraging the end-effector to focus on task-relevant scene regions, resulting +in significant improvements in success rates on real robots compared with the +feature-field-based methods. + +
+
+
+
+
+ + ☆ DexGraspNet 2.0: Learning Generative Dexterous Grasping in Large-scale + Synthetic Cluttered Scenes + + +
+ Grasping in cluttered scenes remains highly challenging for dexterous hands +due to the scarcity of data. To address this problem, we present a large-scale +synthetic benchmark, encompassing 1319 objects, 8270 scenes, and 427 million +grasps. Beyond benchmarking, we also propose a novel two-stage grasping method +that learns efficiently from data by using a diffusion model that conditions on +local geometry. Our proposed generative method outperforms all baselines in +simulation experiments. Furthermore, with the aid of test-time-depth +restoration, our method demonstrates zero-shot sim-to-real transfer, attaining +90.7% real-world dexterous grasping success rate in cluttered scenes. + +
+
+
+
+
+ + ☆ VisAidMath: Benchmarking Visual-Aided Mathematical Reasoning + + +
+ Although previous research on large language models (LLMs) and large +multi-modal models (LMMs) has systematically explored mathematical +problem-solving (MPS) within visual contexts, the analysis of how these models +process visual information during problem-solving remains insufficient. To +address this gap, we present VisAidMath, a benchmark for evaluating the MPS +process related to visual information. We follow a rigorous data curation +pipeline involving both automated processes and manual annotations to ensure +data quality and reliability. Consequently, this benchmark includes 1,200 +challenging problems from various mathematical branches, vision-aid +formulations, and difficulty levels, collected from diverse sources such as +textbooks, examination papers, and Olympiad problems. Based on the proposed +benchmark, we conduct comprehensive evaluations on ten mainstream LLMs and +LMMs, highlighting deficiencies in the visual-aided reasoning process. For +example, GPT-4V only achieves 45.33% accuracy in the visual-aided reasoning +task, even with a drop of 2 points when provided with golden visual aids. +In-depth analysis reveals that the main cause of deficiencies lies in +hallucination regarding the implicit visual reasoning process, shedding light +on future research directions in the visual-aided MPS process. + +
+
+ comment: 58 pages, 28 figures +
+
+
+
+
+ + ☆ LumiSculpt: A Consistency Lighting Control Network for Video Generation + + +
+ Lighting plays a pivotal role in ensuring the naturalness of video
+generation, significantly influencing the aesthetic quality of the generated
+content. However, due to the deep coupling between lighting and the temporal
+features of videos, it remains challenging to disentangle and model
+independent and coherent lighting attributes, limiting the ability to control
+lighting in video generation. In this paper, inspired by the established
+controllable T2I models, we propose LumiSculpt, which, for the first time,
+enables precise and consistent lighting control in T2V generation models.
+LumiSculpt equips the video generation with strong interactive capabilities,
+allowing the input of custom lighting reference image sequences. Furthermore,
+the core learnable plug-and-play module of LumiSculpt facilitates remarkable
+control over lighting intensity, position, and trajectory in latent video
+diffusion models based on the advanced DiT backbone. Additionally, to
+effectively train LumiSculpt and address the issue of insufficient lighting
+data, we construct LumiHuman, a new lightweight and flexible dataset for
+portrait lighting of images and videos. Experimental results demonstrate that
+LumiSculpt achieves precise and high-quality lighting control in video
+generation.
+
+
+
+
+
+
+ + ☆ EnsIR: An Ensemble Algorithm for Image Restoration via Gaussian Mixture + Models + + +
+ Image restoration has experienced significant advancements due to the +development of deep learning. Nevertheless, it encounters challenges related to +ill-posed problems, resulting in deviations between single model predictions +and ground-truths. Ensemble learning, as a powerful machine learning technique, +aims to address these deviations by combining the predictions of multiple base +models. Most existing works adopt ensemble learning during the design of +restoration models, while only limited research focuses on the inference-stage +ensemble of pre-trained restoration models. Regression-based methods fail to +enable efficient inference, leading researchers in academia and industry to +prefer averaging as their choice for post-training ensemble. To address this, +we reformulate the ensemble problem of image restoration into Gaussian mixture +models (GMMs) and employ an expectation maximization (EM)-based algorithm to +estimate ensemble weights for aggregating prediction candidates. We estimate +the range-wise ensemble weights on a reference set and store them in a lookup +table (LUT) for efficient ensemble inference on the test set. Our algorithm is +model-agnostic and training-free, allowing seamless integration and enhancement +of various pre-trained image restoration models. It consistently outperforms +regression based methods and averaging ensemble approaches on 14 benchmarks +across 3 image restoration tasks, including super-resolution, deblurring and +deraining. The codes and all estimated weights have been released in Github. + +
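+ A toy sketch of estimating ensemble weights with EM under a Gaussian mixture
+view of the candidate predictions, assuming a fixed shared variance and a flat
+reference set; the paper's range-wise lookup-table weights are omitted and all
+names are illustrative:
+
+import numpy as np
+
+def em_ensemble_weights(preds, target, sigma=0.05, iters=50):
+    """preds: (K, N) predictions of K restoration models on N reference pixels;
+    target: (N,) ground truth. Toy EM for mixture weights with a fixed shared
+    variance."""
+    k, _ = preds.shape
+    w = np.full(k, 1.0 / k)
+    for _ in range(iters):
+        lik = np.exp(-0.5 * ((preds - target) / sigma) ** 2)  # (K, N)
+        resp = w[:, None] * lik
+        resp /= resp.sum(axis=0, keepdims=True) + 1e-12       # E-step
+        w = resp.mean(axis=1)                                 # M-step
+    return w
+
+preds = np.random.rand(3, 1000)
+target = preds[0] + 0.01 * np.random.randn(1000)
+w = em_ensemble_weights(preds, target)
+fused = (w[:, None] * preds).sum(axis=0)   # weighted ensemble prediction
+print(np.round(w, 3), fused.shape)
+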
+
+ comment: 10 pages for main manuscript, additional 17 pages for appendix, 18 + figures, 17MB +
+
+
+
+
+ + ☆ Efficient Adaptation of Pre-trained Vision Transformer via Householder + Transformation + + +
+ A common strategy for Parameter-Efficient Fine-Tuning (PEFT) of pre-trained +Vision Transformers (ViTs) involves adapting the model to downstream tasks by +learning a low-rank adaptation matrix. This matrix is decomposed into a product +of down-projection and up-projection matrices, with the bottleneck +dimensionality being crucial for reducing the number of learnable parameters, +as exemplified by prevalent methods like LoRA and Adapter. However, these +low-rank strategies typically employ a fixed bottleneck dimensionality, which +limits their flexibility in handling layer-wise variations. To address this +limitation, we propose a novel PEFT approach inspired by Singular Value +Decomposition (SVD) for representing the adaptation matrix. SVD decomposes a +matrix into the product of a left unitary matrix, a diagonal matrix of scaling +values, and a right unitary matrix. We utilize Householder transformations to +construct orthogonal matrices that efficiently mimic the unitary matrices, +requiring only a vector. The diagonal values are learned in a layer-wise +manner, allowing them to flexibly capture the unique properties of each layer. +This approach enables the generation of adaptation matrices with varying ranks +across different layers, providing greater flexibility in adapting pre-trained +models. Experiments on standard downstream vision tasks demonstrate that our +method achieves promising fine-tuning performance. + +
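+ A minimal numeric sketch of the SVD-inspired idea above: two Householder
+reflections, each parameterised by a single vector, stand in for the unitary
+factors, combined with learnable per-layer diagonal scales. The dimensions and
+the exact factorisation are assumptions, not the paper's implementation:
+
+import numpy as np
+
+def householder(v):
+    """Orthogonal reflection I - 2 v v^T / ||v||^2, parameterised by one vector."""
+    v = v / np.linalg.norm(v)
+    return np.eye(v.size) - 2.0 * np.outer(v, v)
+
+d = 16                                                  # illustrative layer width
+u_vec, v_vec = np.random.randn(d), np.random.randn(d)   # learnable vectors
+scales = np.random.randn(d)                             # learnable per-layer diagonal
+delta_w = householder(u_vec) @ np.diag(scales) @ householder(v_vec)
+
+# Sanity check: each Householder factor is orthogonal, as in an SVD-like form.
+print(np.allclose(householder(u_vec) @ householder(u_vec).T, np.eye(d)))
+print(delta_w.shape)  # (16, 16) adaptation matrix
+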
+
+
+
+
+ + ☆ AdaptiveISP: Learning an Adaptive Image Signal Processor for Object + Detection NeurIPS2024 + + +
+ Image Signal Processors (ISPs) convert raw sensor signals into digital +images, which significantly influence the image quality and the performance of +downstream computer vision tasks. Designing ISP pipeline and tuning ISP +parameters are two key steps for building an imaging and vision system. To find +optimal ISP configurations, recent works use deep neural networks as a proxy to +search for ISP parameters or ISP pipelines. However, these methods are +primarily designed to maximize the image quality, which are sub-optimal in the +performance of high-level computer vision tasks such as detection, recognition, +and tracking. Moreover, after training, the learned ISP pipelines are mostly +fixed at the inference time, whose performance degrades in dynamic scenes. To +jointly optimize ISP structures and parameters, we propose AdaptiveISP, a +task-driven and scene-adaptive ISP. One key observation is that for the +majority of input images, only a few processing modules are needed to improve +the performance of downstream recognition tasks, and only a few inputs require +more processing. Based on this, AdaptiveISP utilizes deep reinforcement +learning to automatically generate an optimal ISP pipeline and the associated +ISP parameters to maximize the detection performance. Experimental results show +that AdaptiveISP not only surpasses the prior state-of-the-art methods for +object detection but also dynamically manages the trade-off between detection +performance and computational cost, especially suitable for scenes with large +dynamic range variations. Project website: +https://openimaginglab.github.io/AdaptiveISP/. + +
+
+ comment: Accepted at NeurIPS2024 +
+
+
+
+
+ + ☆ Bringing NeRFs to the Latent Space: Inverse Graphics Autoencoder + + +
+ While pre-trained image autoencoders are increasingly utilized in computer +vision, the application of inverse graphics in 2D latent spaces has been +under-explored. Yet, besides reducing the training and rendering complexity, +applying inverse graphics in the latent space enables a valuable +interoperability with other latent-based 2D methods. The major challenge is +that inverse graphics cannot be directly applied to such image latent spaces +because they lack an underlying 3D geometry. In this paper, we propose an +Inverse Graphics Autoencoder (IG-AE) that specifically addresses this issue. To +this end, we regularize an image autoencoder with 3D-geometry by aligning its +latent space with jointly trained latent 3D scenes. We utilize the trained +IG-AE to bring NeRFs to the latent space with a latent NeRF training pipeline, +which we implement in an open-source extension of the Nerfstudio framework, +thereby unlocking latent scene learning for its supported methods. We +experimentally confirm that Latent NeRFs trained with IG-AE present an improved +quality compared to a standard autoencoder, all while exhibiting training and +rendering accelerations with respect to NeRFs trained in the image space. Our +project page can be found at https://ig-ae.github.io . + +
+
+
+
+
+ + ☆ An Individual Identity-Driven Framework for Animal Re-Identification + + +
+ Reliable re-identification of individuals within large wildlife populations +is crucial for biological studies, ecological research, and wildlife +conservation. Classic computer vision techniques offer a promising direction +for Animal Re-identification (Animal ReID), but their backbones' close-set +nature limits their applicability and generalizability. Despite the +demonstrated effectiveness of vision-language models like CLIP in +re-identifying persons and vehicles, their application to Animal ReID remains +limited due to unique challenges, such as the various visual representations of +animals, including variations in poses and forms. To address these limitations, +we leverage CLIP's cross-modal capabilities to introduce a two-stage framework, +the \textbf{Indiv}idual \textbf{A}nimal \textbf{ID}entity-Driven (IndivAID) +framework, specifically designed for Animal ReID. In the first stage, IndivAID +trains a text description generator by extracting individual semantic +information from each image, generating both image-specific and +individual-specific textual descriptions that fully capture the diverse visual +concepts of each individual across animal images. In the second stage, IndivAID +refines its learning of visual concepts by dynamically incorporating +individual-specific textual descriptions with an integrated attention module to +further highlight discriminative features of individuals for Animal ReID. +Evaluation against state-of-the-art methods across eight benchmark datasets and +a real-world Stoat dataset demonstrates IndivAID's effectiveness and +applicability. Code is available at \url{https://github.com/ywu840/IndivAID}. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ High-Fidelity Document Stain Removal via A Large-Scale Real-World + Dataset and A Memory-Augmented Transformer + + +
+ Document images are often degraded by various stains, significantly impacting +their readability and hindering downstream applications such as document +digitization and analysis. The absence of a comprehensive stained document +dataset has limited the effectiveness of existing document enhancement methods +in removing stains while preserving fine-grained details. To address this +challenge, we construct StainDoc, the first large-scale, high-resolution +($2145\times2245$) dataset specifically designed for document stain removal. +StainDoc comprises over 5,000 pairs of stained and clean document images across +multiple scenes. This dataset encompasses a diverse range of stain types, +severities, and document backgrounds, facilitating robust training and +evaluation of document stain removal algorithms. Furthermore, we propose +StainRestorer, a Transformer-based document stain removal approach. +StainRestorer employs a memory-augmented Transformer architecture that captures +hierarchical stain representations at part, instance, and semantic levels via +the DocMemory module. The Stain Removal Transformer (SRTransformer) leverages +these feature representations through a dual attention mechanism: an enhanced +spatial attention with an expanded receptive field, and a channel attention +captures channel-wise feature importance. This combination enables precise +stain removal while preserving document content integrity. Extensive +experiments demonstrate StainRestorer's superior performance over +state-of-the-art methods on the StainDoc dataset and its variants +StainDoc\_Mark and StainDoc\_Seal, establishing a new benchmark for document +stain removal. Our work highlights the potential of memory-augmented +Transformers for this task and contributes a valuable dataset to advance future +research. + +
+
+ comment: Accepted by WACV2025 +
+
+
+
+
+ + ☆ UniRiT: Towards Few-Shot Non-Rigid Point Cloud Registration + + +
+ Non-rigid point cloud registration is a critical challenge in 3D scene +understanding, particularly in surgical navigation. Although existing methods +achieve excellent performance when trained on large-scale, high-quality +datasets, these datasets are prohibitively expensive to collect and annotate, +e.g., organ data in authentic medical scenarios. With insufficient training +samples and data noise, existing methods degrade significantly since non-rigid +patterns are more flexible and complicated than rigid ones, and the +distributions across samples are more distinct, leading to higher difficulty in +representation learning with few data. In this work, we aim to deal with this +challenging few-shot non-rigid point cloud registration problem. Based on the +observation that complex non-rigid transformation patterns can be decomposed +into rigid and small non-rigid transformations, we propose a novel and +effective framework, UniRiT. UniRiT adopts a two-step registration strategy +that first aligns the centroids of the source and target point clouds and then +refines the registration with non-rigid transformations, thereby significantly +reducing the problem complexity. To validate the performance of UniRiT on +real-world datasets, we introduce a new dataset, MedMatch3D, which consists of +real human organs and exhibits high variability in sample distribution. We +further establish a new challenging benchmark for few-shot non-rigid +registration. Extensive empirical results demonstrate that UniRiT achieves +state-of-the-art performance on MedMatch3D, improving the existing best +approach by 94.22%. + +
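+ A minimal sketch of the first step of the two-step strategy above, aligning
+the centroids of the source and target clouds; the learned non-rigid
+refinement stage is omitted and the shapes are illustrative:
+
+import numpy as np
+
+def centroid_align(source, target):
+    """Step 1: rigidly translate the source cloud so its centroid matches the
+    target's; a non-rigid refinement would follow in step 2."""
+    return source + (target.mean(axis=0) - source.mean(axis=0))
+
+src = np.random.rand(2048, 3)
+tgt = src + np.array([0.5, -0.2, 0.1])  # toy rigid offset
+aligned = centroid_align(src, tgt)
+print(np.allclose(aligned.mean(axis=0), tgt.mean(axis=0)))  # True
+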
+
+ comment: 21 pages, 14 figures, under review +
+
+
+
+
+ + ☆ HelloMeme: Integrating Spatial Knitting Attentions to Embed High-Level + and Fidelity-Rich Conditions in Diffusion Models + + +
+ We propose an effective method for inserting adapters into text-to-image +foundation models, which enables the execution of complex downstream tasks +while preserving the generalization ability of the base model. The core idea of +this method is to optimize the attention mechanism related to 2D feature maps, +which enhances the performance of the adapter. This approach was validated on +the task of meme video generation and achieved significant results. We hope +this work can provide insights for post-training tasks of large text-to-image +models. Additionally, as this method demonstrates good compatibility with SD1.5 +derivative models, it holds certain value for the open-source community. +Therefore, we will release the related code +(\url{https://songkey.github.io/hellomeme}). + +
+
+ comment: 11 pages, 7 figures, 2 tables +
+
+
+
+
+ + ☆ Wormhole Loss for Partial Shape Matching NeurIPS + + +
+ When matching parts of a surface to its whole, a fundamental question arises: +Which points should be included in the matching process? The issue is +intensified when using isometry to measure similarity, as it requires the +validation of whether distances measured between pairs of surface points should +influence the matching process. The approach we propose treats surfaces as +manifolds equipped with geodesic distances, and addresses the partial shape +matching challenge by introducing a novel criterion to meticulously search for +consistent distances between pairs of points. The new criterion explores the +relation between intrinsic geodesic distances between the points, geodesic +distances between the points and surface boundaries, and extrinsic distances +between boundary points measured in the embedding space. It is shown to be less +restrictive compared to previous measures and achieves state-of-the-art results +when used as a loss function in training networks for partial shape matching. + +
+
+ comment: Accepted for publication at the conference on Neural Information + Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ YOLOv11 for Vehicle Detection: Advancements, Performance, and + Applications in Intelligent Transportation Systems + + +
+ Accurate vehicle detection is essential for the development of intelligent
+transportation systems, autonomous driving, and traffic monitoring. This paper
+presents a detailed analysis of YOLO11, the latest advancement in the YOLO
+series of deep learning models, focusing exclusively on vehicle detection
+tasks. Building upon the success of its predecessors, YOLO11 introduces
+architectural improvements designed to enhance detection speed, accuracy, and
+robustness in complex environments. Using a comprehensive dataset comprising
+multiple vehicle types (cars, trucks, buses, motorcycles, and bicycles), we
+evaluate YOLO11's performance using metrics such as precision, recall, F1
+score, and mean average precision (mAP). Our findings demonstrate that YOLO11
+surpasses previous versions (YOLOv8 and YOLOv10) in detecting smaller and more
+occluded vehicles while maintaining a competitive inference time, making it
+well-suited for real-time applications. Comparative analysis shows significant
+improvements in the detection of complex vehicle geometries, further
+contributing to the development of efficient and scalable vehicle detection
+systems. This research highlights YOLO11's potential to enhance autonomous
+vehicle performance and traffic monitoring systems, offering insights for
+future developments in the field.
+
+
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Effective and Efficient Adversarial Detection for Vision-Language Models + via A Single Vector + + +
+ Visual Language Models (VLMs) are vulnerable to adversarial attacks,
+especially those from adversarial images, which is, however, under-explored in
+the literature. To facilitate research on this critical safety problem, we
+first construct a new laRge-scale Adversarial images dataset with Diverse
+hArmful Responses (RADAR), given that existing datasets are either small-scale
+or only contain limited types of harmful responses. With the new RADAR
+dataset, we further develop a novel and effective iN-time Embedding-based
+AdveRSarial Image DEtection (NEARSIDE) method, which exploits a single vector,
+distilled from the hidden states of VLMs, which we call the attacking
+direction, to detect adversarial images against benign ones in the input.
+Extensive experiments with two victim VLMs, LLaVA and MiniGPT-4, well
+demonstrate the effectiveness, efficiency, and cross-model transferability of
+our proposed method. Our code is available at
+https://github.com/mob-scu/RADAR-NEARSIDE
+
+
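+ A toy sketch of single-vector detection in the spirit of the method above:
+project a hidden state onto a precomputed "attacking direction" and threshold
+the score. The direction construction, dimensionality, and threshold below are
+illustrative assumptions, not the paper's procedure:
+
+import numpy as np
+
+def detect_adversarial(hidden_state, attacking_direction, threshold):
+    """Flag an input as adversarial when its hidden state projects strongly
+    onto the precomputed attacking direction."""
+    score = float(np.dot(hidden_state, attacking_direction))
+    return score > threshold, score
+
+# Toy setup: the direction could be the mean difference between hidden states
+# of adversarial and benign inputs collected offline.
+benign = np.random.randn(100, 4096)
+adv = benign + 0.3
+direction = adv.mean(axis=0) - benign.mean(axis=0)
+threshold = float(np.median(np.concatenate([benign, adv]) @ direction))
+print(detect_adversarial(adv[0], direction, threshold))
+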
+
+
+
+
+ + ☆ Adaptive Paradigm Synergy: Can a Cross-Paradigm Objective Enhance + Long-Tailed Learning? + + +
+ Self-supervised learning (SSL) has achieved impressive results across several
+computer vision tasks, even rivaling supervised methods. However, its
+performance degrades on real-world datasets with long-tailed distributions due
+to difficulties in capturing inherent class imbalances. Although supervised
+long-tailed learning offers significant insights, the absence of labels in SSL
+prevents direct transfer of these strategies. To bridge this gap, we introduce
+Adaptive Paradigm Synergy (APS), a cross-paradigm objective that seeks to
+unify the strengths of both paradigms. Our approach reexamines contrastive
+learning from a spatial structure perspective, dynamically adjusting the
+uniformity of latent space structure through adaptive temperature tuning.
+Furthermore, we draw on a re-weighting strategy from supervised learning to
+compensate for the shortcomings of temperature adjustment in explicit quantity
+perception. Extensive experiments on commonly used long-tailed datasets
+demonstrate that APS improves performance effectively and efficiently. Our
+findings reveal the potential for deeper integration between supervised and
+self-supervised learning, paving the way for robust models that handle
+real-world class imbalance.
+
+
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ SFA-UNet: More Attention to Multi-Scale Contrast and Contextual + Information in Infrared Small Object Segmentation + + +
+ Computer vision researchers have extensively worked on fundamental infrared
+visual recognition for the past few decades. Among various approaches, deep
+learning has emerged as the most promising candidate. However, Infrared Small
+Object Segmentation (ISOS) remains a major focus due to several challenges,
+including: 1) the lack of effective utilization of local contrast and global
+contextual information; 2) the potential loss of small objects in deep models;
+and 3) the struggle to capture fine-grained details while ignoring noise. To
+address these challenges, we propose a modified U-Net architecture, named
+SFA-UNet, by combining Scharr Convolution (SC) and Fast Fourier Convolution
+(FFC) in addition to vertical and horizontal Attention gates (AG) into UNet.
+SFA-UNet utilizes double convolution layers with the addition of SC and FFC in
+its encoder and decoder layers. SC helps to learn the foreground-to-background
+contrast information, whereas FFC provides multi-scale contextual information
+while mitigating the small-object vanishing problem. Additionally, the
+introduction of vertical AGs in encoder layers enhances the model's focus on
+the targeted object by ignoring irrelevant regions. We evaluated the proposed
+approach on the publicly available SIRST and IRSTD datasets, and achieved
+superior performance, outperforming existing state-of-the-art methods by an
+average of 0.75% (variance 0.025) across all combined metrics over multiple
+runs.
+
+
+
+ comment: Accepted and Presented at PRIP 2023 +
+
+
+
+
+ + ☆ Towards Population Scale Testis Volume Segmentation in DIXON MRI + + +
+ Testis size is known to be one of the main predictors of male fertility,
+usually assessed in clinical workup via palpation or imaging. Despite its
+potential, population-level evaluation of testicular volume using imaging
+remains underexplored. Previous studies, limited by small and biased datasets,
+have demonstrated the feasibility of machine learning for testis volume
+segmentation. This paper presents an evaluation of segmentation methods for
+testicular volume using Magnetic Resonance Imaging data from the UKBiobank.
+The best model achieves a median dice score of $0.87$, compared to a median
+dice score of $0.83$ for human interrater reliability on the same dataset,
+enabling large-scale annotation on a population scale for the first time. Our
+overall aim is to provide a trained model, comparative baseline methods, and
+annotated training data to enhance accessibility and reproducibility in testis
+MRI segmentation research.
+
+
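+ For reference, the Dice score reported above measures the overlap between a
+predicted and a ground-truth segmentation mask; a minimal sketch with binary
+masks and illustrative shapes:
+
+import numpy as np
+
+def dice_score(pred, gt, eps=1e-8):
+    """Dice overlap between two binary segmentation masks."""
+    pred, gt = pred.astype(bool), gt.astype(bool)
+    return 2.0 * np.logical_and(pred, gt).sum() / (pred.sum() + gt.sum() + eps)
+
+a = np.zeros((64, 64), dtype=bool); a[16:48, 16:48] = True
+b = np.zeros_like(a); b[20:52, 16:48] = True
+print(round(dice_score(a, b), 3))  # partial overlap -> dice below 1.0
+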
+
+
+
+
+ + ☆ Prune and Repaint: Content-Aware Image Retargeting for any Ratio NeurIPS24 + + +
+ Image retargeting is the task of adjusting the aspect ratio of images to suit +different display devices or presentation environments. However, existing +retargeting methods often struggle to balance the preservation of key semantics +and image quality, resulting in either deformation or loss of important +objects, or the introduction of local artifacts such as discontinuous pixels +and inconsistent regenerated content. To address these issues, we propose a +content-aware retargeting method called PruneRepaint. It incorporates semantic +importance for each pixel to guide the identification of regions that need to +be pruned or preserved in order to maintain key semantics. Additionally, we +introduce an adaptive repainting module that selects image regions for +repainting based on the distribution of pruned pixels and the proportion +between foreground size and target aspect ratio, thus achieving local +smoothness after pruning. By focusing on the content and structure of the +foreground, our PruneRepaint approach adaptively avoids key content loss and +deformation, while effectively mitigating artifacts with local repainting. We +conduct experiments on the public RetargetMe benchmark and demonstrate through +objective experimental results and subjective user studies that our method +outperforms previous approaches in terms of preserving semantics and +aesthetics, as well as better generalization across diverse aspect ratios. +Codes will be available at https://github.com/fhshen2022/PruneRepaint. + +
+
+ comment: NeurIPS24 +
+
+
+
+
+ + ☆ AtGCN: A Graph Convolutional Network For Ataxic Gait Detection + + +
+ Video-based gait analysis can be defined as the task of diagnosing +pathologies, such as ataxia, using videos of patients walking in front of a +camera. This paper presents a graph convolution network called AtGCN for +detecting ataxic gait and identifying its severity using 2D videos. The problem +is especially challenging as the deviation of an ataxic gait from a healthy +gait is very subtle. The datasets for ataxic gait detection are also quite +small, with the largest dataset having only 149 videos. The paper addresses the +first problem using special spatiotemporal graph convolution that successfully +captures important gait-related features. To handle the small dataset size, a +deep spatiotemporal graph convolution network pre-trained on an action +recognition dataset is systematically truncated and then fine-tuned on the +ataxia dataset to obtain the AtGCN model. The paper also presents an +augmentation strategy that segments a video sequence into multiple gait cycles. +The proposed AtGCN model then operates on a graph of body part locations +belonging to a single gait cycle. The evaluation results support the strength +of the proposed AtGCN model, as it outperforms the state-of-the-art in +detection and severity prediction with an accuracy of 93.46% and a MAE of +0.4169, respectively. + +
+
+
+
+
+ + ☆ DAVINCI: A Single-Stage Architecture for Constrained CAD Sketch + Inference + + +
+ This work presents DAVINCI, a unified architecture for single-stage +Computer-Aided Design (CAD) sketch parameterization and constraint inference +directly from raster sketch images. By jointly learning both outputs, DAVINCI +minimizes error accumulation and enhances the performance of constrained CAD +sketch inference. Notably, DAVINCI achieves state-of-the-art results on the +large-scale SketchGraphs dataset, demonstrating effectiveness on both precise +and hand-drawn raster CAD sketches. To reduce DAVINCI's reliance on large-scale +annotated datasets, we explore the efficacy of CAD sketch augmentations. We +introduce Constraint-Preserving Transformations (CPTs), i.e. random +permutations of the parametric primitives of a CAD sketch that preserve its +constraints. This data augmentation strategy allows DAVINCI to achieve +reasonable performance when trained with only 0.1% of the SketchGraphs dataset. +Furthermore, this work contributes a new version of SketchGraphs, augmented +with CPTs. The newly introduced CPTSketchGraphs dataset includes 80 million +CPT-augmented sketches, thus providing a rich resource for future research in +the CAD sketch domain. + +
+
+ comment: Accepted at BMVC 2024 +
+
+
+
+
+ + ☆ SFDFusion: An Efficient Spatial-Frequency Domain Fusion Network for + Infrared and Visible Image Fusion + + +
+ Infrared and visible image fusion aims to utilize the complementary +information from two modalities to generate fused images with prominent targets +and rich texture details. Most existing algorithms only perform pixel-level or +feature-level fusion from different modalities in the spatial domain. They +usually overlook the information in the frequency domain, and some of them +suffer from inefficiency due to excessively complex structures. To tackle these +challenges, this paper proposes an efficient Spatial-Frequency Domain Fusion +(SFDFusion) network for infrared and visible image fusion. First, we propose a +Dual-Modality Refinement Module (DMRM) to extract complementary information. +This module extracts useful information from both the infrared and visible +modalities in the spatial domain and enhances fine-grained spatial details. +Next, to introduce frequency domain information, we construct a Frequency +Domain Fusion Module (FDFM) that transforms the spatial domain to the frequency +domain through Fast Fourier Transform (FFT) and then integrates frequency +domain information. Additionally, we design a frequency domain fusion loss to +provide guidance for the fusion process. Extensive experiments on public +datasets demonstrate that our method produces fused images with significant +advantages in various fusion metrics and visual effects. Furthermore, our +method demonstrates high efficiency in image fusion and good performance on +downstream detection tasks, thereby satisfying the real-time demands of +advanced visual tasks. + +
+
+ comment: accept in ECAI 2024 +
+
+
+
+
+ + ☆ Latent Diffusion, Implicit Amplification: Efficient Continuous-Scale + Super-Resolution for Remote Sensing Images + + +
+ Recent advancements in diffusion models have significantly improved +performance in super-resolution (SR) tasks. However, previous research often +overlooks the fundamental differences between SR and general image generation. +General image generation involves creating images from scratch, while SR +focuses specifically on enhancing existing low-resolution (LR) images by adding +typically missing high-frequency details. This oversight not only increases the +training difficulty but also limits their inference efficiency. Furthermore, +previous diffusion-based SR methods are typically trained and inferred at fixed +integer scale factors, lacking flexibility to meet the needs of up-sampling +with non-integer scale factors. To address these issues, this paper proposes an +efficient and elastic diffusion-based SR model (E$^2$DiffSR), specially +designed for continuous-scale SR in remote sensing imagery. E$^2$DiffSR employs +a two-stage latent diffusion paradigm. During the first stage, an autoencoder +is trained to capture the differential priors between high-resolution (HR) and +LR images. The encoder intentionally ignores the existing LR content to +alleviate the encoding burden, while the decoder introduces an SR branch +equipped with a continuous scale upsampling module to accomplish the +reconstruction under the guidance of the differential prior. In the second +stage, a conditional diffusion model is learned within the latent space to +predict the true differential prior encoding. Experimental results demonstrate +that E$^2$DiffSR achieves superior objective metrics and visual quality +compared to the state-of-the-art SR methods. Additionally, it reduces the +inference time of diffusion-based SR methods to a level comparable to that of +non-diffusion methods. + +
+
+
+
+
+ + ☆ Situational Scene Graph for Structured Human-centric Situation + Understanding + + +
+ Graph-based representation has been widely used in modelling spatio-temporal
+relationships in video understanding. Although effective, existing graph-based
+approaches focus on capturing the human-object relationships while ignoring
+fine-grained semantic properties of the action components. These semantic
+properties are crucial for understanding the current situation, such as where
+the action takes place, what tools are used, and the functional properties of
+the objects. In this work, we propose a graph-based representation called
+Situational Scene Graph (SSG) to encode both human-object relationships and
+the corresponding semantic properties. The semantic details are represented as
+predefined roles and values inspired by the situation frame, which was
+originally designed to represent a single action. Based on our proposed
+representation, we introduce the task of situational scene graph generation
+and propose a multi-stage pipeline Interactive and Complementary Network
+(InComNet) to address the task. Given that the existing datasets are not
+applicable to the task, we further introduce an SSG dataset whose annotations
+consist of semantic role-value frames for humans, objects and verb predicates
+of human-object relations. Finally, we demonstrate the effectiveness of our
+proposed SSG representation by testing on different downstream tasks.
+Experimental results show that the unified representation can not only benefit
+predicate classification and semantic role-value classification, but also
+benefit reasoning tasks on human-centric situation understanding. We will
+release the code and the dataset soon.
+
+
+
+ comment: Accepted for WACV 2025 +
+
+
+
+
+ + ☆ Epipolar-Free 3D Gaussian Splatting for Generalizable Novel View + Synthesis NeurIPS 2024 + + +
+ Generalizable 3D Gaussian splatting (3DGS) can reconstruct new scenes from
+sparse-view observations in a feed-forward inference manner, eliminating the
+need for scene-specific retraining required in conventional 3DGS. However,
+existing methods rely heavily on epipolar priors, which can be unreliable in
+complex real-world scenes, particularly in non-overlapping and occluded
+regions. In this paper, we propose eFreeSplat, an efficient feed-forward
+3DGS-based model for generalizable novel view synthesis that operates
+independently of epipolar line constraints. To enhance multi-view feature
+extraction with 3D perception, we employ a self-supervised Vision Transformer
+(ViT) with cross-view completion pre-training on large-scale datasets.
+Additionally, we introduce an Iterative Cross-view Gaussians Alignment method
+to ensure consistent depth scales across different views. Our eFreeSplat
+represents an innovative approach for generalizable novel view synthesis.
+Different from the existing pure geometry-free methods, eFreeSplat focuses
+more on achieving epipolar-free feature matching and encoding by providing 3D
+priors through cross-view pretraining. We evaluate eFreeSplat on wide-baseline
+novel view synthesis tasks using the RealEstate10K and ACID datasets.
+Extensive experiments demonstrate that eFreeSplat surpasses state-of-the-art
+baselines that rely on epipolar priors, achieving superior geometry
+reconstruction and novel view synthesis quality. Project page:
+https://tatakai1.github.io/efreesplat/.
+
+
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Adaptive Multi Scale Document Binarisation Using Vision Mamba + + +
+ Enhancing and preserving the readability of document images, particularly +historical ones, is crucial for effective document image analysis. Numerous +models have been proposed for this task, including convolutional-based, +transformer-based, and hybrid convolutional-transformer architectures. While +hybrid models address the limitations of purely convolutional or +transformer-based methods, they often suffer from issues like quadratic time +complexity. In this work, we propose a Mamba-based architecture for document +binarisation, which efficiently handles long sequences by scaling linearly and +optimizing memory usage. Additionally, we introduce novel modifications to the +skip connections by incorporating Difference of Gaussians (DoG) features, +inspired by conventional signal processing techniques. These multiscale +high-frequency features enable the model to produce high-quality, detailed +outputs. + +
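+ A minimal sketch of the Difference-of-Gaussians features injected into the
+skip connections, as described above; the sigma pairs and shapes are
+illustrative assumptions, not the paper's settings:
+
+import numpy as np
+from scipy.ndimage import gaussian_filter
+
+def dog_features(image, sigma_pairs=((1.0, 2.0), (2.0, 4.0))):
+    """Difference-of-Gaussians maps at a few scales (sigma pairs illustrative)."""
+    return np.stack([gaussian_filter(image, s1) - gaussian_filter(image, s2)
+                     for s1, s2 in sigma_pairs])
+
+page = np.random.rand(256, 256)     # stand-in for a document image
+print(dog_features(page).shape)     # (num_scales, H, W)
+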
+
+
+
+
+ + ☆ Wavelet Burst Accumulation for turbulence mitigation + + +
+ In this paper, we investigate the extension of the recently proposed weighted +Fourier burst accumulation (FBA) method into the wavelet domain. The purpose of +FBA is to reconstruct a clean and sharp image from a sequence of blurred +frames. This concept lies in the construction of weights to amplify dominant +frequencies in the Fourier spectrum of each frame. The reconstructed image is +then obtained by taking the inverse Fourier transform of the average of all +processed spectra. In this paper, we first suggest to replace the rigid +registration step used in the original algorithm by a non-rigid registration in +order to be able to process sequences acquired through atmospheric turbulence. +Second, we propose to work in a wavelet domain instead of the Fourier one. This +leads us to the construction of two types of algorithms. Finally, we propose an +alternative approach to replace the weighting idea by an approach promoting the +sparsity in the used space. Several experiments are provided to illustrate the +efficiency of the proposed methods. + +
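+ For context, a minimal sketch of the classic Fourier burst accumulation
+weighting that the paper extends to the wavelet domain; the exponent p and
+array shapes are illustrative, and the registration and wavelet-domain steps
+are omitted:
+
+import numpy as np
+
+def fourier_burst_accumulation(frames, p=11):
+    """Classic FBA: weight each frame's spectrum by its magnitude raised to the
+    power p, average the weighted spectra, and invert (p value illustrative)."""
+    specs = np.fft.fft2(frames, axes=(-2, -1))
+    mags = np.abs(specs) ** p
+    weights = mags / (mags.sum(axis=0, keepdims=True) + 1e-12)
+    fused = (weights * specs).sum(axis=0)
+    return np.real(np.fft.ifft2(fused))
+
+burst = np.random.rand(8, 128, 128)  # toy burst of registered blurred frames
+print(fourier_burst_accumulation(burst).shape)  # (128, 128)
+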
+
+
+
+
+ + ☆ Open Turbulent Image Set (OTIS) + + +
+ Long distance imaging is subject to the impact of the turbulent atmosphere. +This results into geometric distortions and some blur effect in the observed +frames. Despite the existence of several turbulence mitigation algorithms in +the literature, no common dataset exists to objectively evaluate their +efficiency. In this paper, we describe a new dataset called OTIS (Open +Turbulent Images Set) which contains several sequences (either static or +dynamic) acquired through the turbulent atmosphere. For almost all sequences, +we provide the corresponding groundtruth in order to make the comparison +between algorithms easier. We also discuss possible metrics to perform such +comparisons. + +
+
+
+
+
+ + ☆ Contrastive Learning and Adversarial Disentanglement for + Privacy-Preserving Task-Oriented Semantic Communications + + +
+ Task-oriented semantic communication systems have emerged as a promising +approach to achieving efficient and intelligent data transmission, where only +information relevant to a specific task is communicated. However, existing +methods struggle to fully disentangle task-relevant and task-irrelevant +information, leading to privacy concerns and subpar performance. To address +this, we propose an information-bottleneck method, named CLAD (contrastive +learning and adversarial disentanglement). CLAD leverages contrastive learning +to effectively capture task-relevant features while employing adversarial +disentanglement to discard task-irrelevant information. Additionally, due to +the lack of reliable and reproducible methods to gain insight into the +informativeness and minimality of the encoded feature vectors, we introduce a +new technique to compute the information retention index (IRI), a comparative +metric used as a proxy for the mutual information between the encoded features +and the input, reflecting the minimality of the encoded features. The IRI +quantifies the minimality and informativeness of the encoded feature vectors +across different task-oriented communication techniques. Our extensive +experiments demonstrate that CLAD outperforms state-of-the-art baselines in +terms of task performance, privacy preservation, and IRI. CLAD achieves a +predictive performance improvement of around 2.5-3%, along with a 77-90% +reduction in IRI and a 57-76% decrease in adversarial accuracy. + +
+
+ comment: Submitted to IEEE Journal on Selected Areas in Communications (JSAC):
+ Intelligent Communications for Real-Time Computer Vision (Comm4CV)
+
+
+
+
+
+ + ☆ Bregman implementation of Meyer's $G-$norm for cartoon + textures + decomposition + + +
+ In this paper, we design a very simple algorithm based on Split Bregman +iterations to numerically solve the cartoon + textures decomposition model of +Meyer. This results in a significant gain in speed compared to Chambolle's +nonlinear projectors. + +
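+
+ For context, Meyer's cartoon + textures model that such Split Bregman schemes
+target is commonly written as below; this is the standard formulation from the
+literature, and the exact discretization solved by the authors may differ.
+
+ ```latex
+ \min_{u}\; \|u\|_{TV} + \lambda\,\|f-u\|_{G},
+ \qquad
+ \|v\|_{G} \;=\; \inf_{\,v=\operatorname{div}\vec g}\;
+ \bigl\| \sqrt{g_1^2+g_2^2} \bigr\|_{\infty},
+ ```
+
+ where f is the observed image, u is the cartoon component, and v = f - u is the
+texture component measured in Meyer's G-norm.
+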
+
+
+
+
+ + ☆ Diffusion Beats Autoregressive: An Evaluation of Compositional + Generation in Text-to-Image Models + + +
+ Text-to-image (T2I) generative models, such as Stable Diffusion and DALL-E, +have shown remarkable proficiency in producing high-quality, realistic, and +natural images from textual descriptions. However, these models sometimes fail +to accurately capture all the details specified in the input prompts, +particularly concerning entities, attributes, and spatial relationships. This +issue becomes more pronounced when the prompt contains novel or complex +compositions, leading to what are known as compositional generation failure +modes. Recently, a new open-source diffusion-based T2I model, FLUX, has been +introduced, demonstrating strong performance in high-quality image generation. +Additionally, autoregressive T2I models like LlamaGen have claimed competitive +visual quality performance compared to diffusion-based models. In this study, +we evaluate the compositional generation capabilities of these newly introduced +models against established models using the T2I-CompBench benchmark. Our +findings reveal that LlamaGen, as a vanilla autoregressive model, is not yet on +par with state-of-the-art diffusion models for compositional generation tasks +under the same criteria, such as model size and inference time. On the other +hand, the open-source diffusion-based model FLUX exhibits compositional +generation capabilities comparable to the state-of-the-art closed-source model +DALL-E3. + +
+
+
+
+
+ + ☆ FuseAnyPart: Diffusion-Driven Facial Parts Swapping via Multiple + Reference Images NeurIPS 2024 + + +
+ Facial parts swapping aims to selectively transfer regions of interest from
+the source image onto the target image while maintaining the rest of the target
+image unchanged. Most face swapping studies, designed specifically for
+full-face swapping, are either unable to swap individual facial parts or are
+significantly limited in doing so, which hinders fine-grained and customized
+character designs. However, designing an approach specifically for facial parts
+swapping is challenging because it requires fusing features from multiple
+reference images in a way that is both efficient and effective. To overcome
+this challenge, FuseAnyPart is proposed to facilitate the seamless
+"fuse-any-part" customization of the face. In FuseAnyPart, facial parts from
+different people are assembled into a complete face in latent space within the
+Mask-based Fusion Module. Subsequently, the consolidated feature is dispatched
+to the Addition-based Injection Module for fusion within the UNet of the
+diffusion model to create novel characters. Extensive experiments qualitatively
+and quantitatively validate the superiority and robustness of FuseAnyPart.
+Source code is available at https://github.com/Thomas-wyh/FuseAnyPart.
+
+
+ comment: Accepted by the NeurIPS 2024 (Spotlight). Homepage: + https://thomas-wyh.github.io/ +
+
+
+
+
+ + ☆ Analysis of Classifier Training on Synthetic Data for Cross-Domain + Datasets + + +
+ A major challenge of deep learning (DL) is the necessity to collect huge
+amounts of training data. Often, the lack of a sufficiently large dataset
+discourages the use of DL in certain applications. Typically, acquiring the
+required amounts of data costs considerable time, material and effort. To
+mitigate this problem, the use of synthetic images combined with real data is a
+popular approach, widely adopted in the scientific community to effectively
+train various detectors. In this study, we examine the potential of synthetic
+data-based training in the field of intelligent transportation systems. Our
+focus is on camera-based traffic sign recognition applications for advanced
+driver assistance systems and autonomous driving. The proposed augmentation
+pipeline of synthetic datasets includes novel augmentation processes such as
+structured shadows and Gaussian specular highlights. A well-known DL model was
+trained with different datasets to compare the performance of synthetic and
+real image-based trained models. Additionally, a new, detailed method to
+objectively compare these models is proposed. Synthetic images are generated
+using a semi-supervised error-guided method, which is also described. Our
+experiments show that a synthetic image-based approach outperforms real
+image-based training in most cases when applied to cross-domain test datasets
+(+10% precision on the GTSRB dataset); consequently, the generalization of the
+model is improved while the cost of acquiring images is reduced.
+
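+
+ The augmentation operations are only named, not specified, so the following is
+merely a guess at what a "Gaussian specular highlight" augmentation could look
+like; the function name, parameters, and additive blending rule are hypothetical
+and not taken from the paper.
+
+ ```python
+ import numpy as np
+
+ def add_gaussian_specular_highlight(img, center, sigma=15.0, strength=0.8):
+     """Overlay a soft specular highlight (hypothetical re-implementation sketch).
+     img: float image in [0, 1] with shape (H, W, C); center: (row, col) of the spot.
+     """
+     h, w = img.shape[:2]
+     yy, xx = np.mgrid[0:h, 0:w]
+     blob = np.exp(-((yy - center[0]) ** 2 + (xx - center[1]) ** 2) / (2 * sigma ** 2))
+     return np.clip(img + strength * blob[..., None], 0.0, 1.0)  # brighten around the spot
+ ```
+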
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ ETO:Efficient Transformer-based Local Feature Matching by Organizing + Multiple Homography Hypotheses + + +
+ We tackle the efficiency problem of learning local feature matching. Recent
+advancements have given rise to purely CNN-based and transformer-based
+approaches, each augmented with deep learning techniques. While CNN-based
+methods often excel in matching speed, transformer-based methods tend to
+provide more accurate matches. We propose an efficient transformer-based
+network architecture for local feature matching. This technique is built on
+constructing multiple homography hypotheses to approximate the continuous
+correspondence in the real world and uni-directional cross-attention to
+accelerate the refinement. On the YFCC100M dataset, our matching accuracy is
+competitive with LoFTR, a state-of-the-art transformer-based architecture,
+while the inference speed is boosted by a factor of four, even outperforming
+CNN-based methods. Comprehensive evaluations on other open datasets such as
+Megadepth, ScanNet, and HPatches demonstrate our method's efficacy,
+highlighting its potential to significantly enhance a wide array of downstream
+applications.
+
+
+
+
+
+ + ☆ st-DTPM: Spatial-Temporal Guided Diffusion Transformer Probabilistic + Model for Delayed Scan PET Image Prediction + + +
+ PET imaging is widely employed for observing biological metabolic activities
+within the human body. However, numerous benign conditions can cause increased
+uptake of radiopharmaceuticals, confounding differentiation from malignant
+tumors. Several studies have indicated that dual-time PET imaging holds promise
+in distinguishing between malignant and benign tumor processes. Nevertheless,
+the hour-long distribution period of radiopharmaceuticals post-injection
+complicates the determination of optimal timing for the second scan, presenting
+challenges in both practical applications and research. Notably, we have
+identified that delay time PET imaging can be framed as an image-to-image
+conversion problem. Motivated by this insight, we propose a novel
+spatial-temporal guided diffusion transformer probabilistic model (st-DTPM) to
+solve the dual-time PET imaging prediction problem. Specifically, this
+architecture leverages a U-Net framework that integrates the patch-wise
+features of a CNN and the pixel-wise relevance of a Transformer to obtain local
+and global information, and then employs a conditional DDPM for image
+synthesis. Furthermore, for the spatial condition, we concatenate early-scan
+PET images and noisy PET images at every denoising step to guide the spatial
+distribution of the denoising sampling. For the temporal condition, we convert
+diffusion time steps and delay time into a universal time vector, then embed it
+into each layer of the model architecture to further improve prediction
+accuracy. Experimental results demonstrate the superiority of our method over
+alternative approaches in preserving image quality and structural information,
+thereby affirming its efficacy in the prediction task.
+
+
+
+
+
+ + ☆ One Prompt to Verify Your Models: Black-Box Text-to-Image Models + Verification via Non-Transferable Adversarial Attacks + + +
+ Recently, the success of Text-to-Image (T2I) models has led to the rise of +numerous third-party platforms, which claim to provide cheaper API services and +more flexibility in model options. However, this also raises a new security +concern: Are these third-party services truly offering the models they claim? +To address this problem, we propose the first T2I model verification method +named Text-to-Image Model Verification via Non-Transferable Adversarial Attacks +(TVN). The non-transferability of adversarial examples means that these +examples are only effective on a target model and ineffective on other models, +thereby allowing for the verification of the target model. TVN utilizes the +Non-dominated Sorting Genetic Algorithm II (NSGA-II) to optimize the cosine +similarity of a prompt's text encoding, generating non-transferable adversarial +prompts. By calculating the CLIP-text scores between the non-transferable +adversarial prompts without perturbations and the images, we can verify if the +model matches the claimed target model, based on a 3-sigma threshold. The +experiments showed that TVN performed well in both closed-set and open-set +scenarios, achieving a verification accuracy of over 90\%. Moreover, the +adversarial prompts generated by TVN significantly reduced the CLIP-text scores +of the target model, while having little effect on other models. + +
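+
+ One possible reading of the 3-sigma verification rule is sketched below; the
+decision direction and the choice of reference statistics are our interpretation
+of the abstract, not the authors' protocol, and the inputs are assumed to be
+precomputed CLIP-text scores.
+
+ ```python
+ import numpy as np
+
+ def verify_target_model(score, reference_scores, k=3.0):
+     """3-sigma style decision rule (one possible reading of the abstract).
+     score: CLIP-text score of the image produced from the adversarial prompt.
+     reference_scores: scores expected when the service runs the claimed model.
+     Returns True if the observed score is consistent with the claimed model.
+     """
+     mu, sigma = np.mean(reference_scores), np.std(reference_scores)
+     return abs(score - mu) <= k * sigma   # within k standard deviations of the reference
+ ```
+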
+
+
+
+
+ + ☆ SCRREAM : SCan, Register, REnder And Map:A Framework for Annotating + Accurate and Dense 3D Indoor Scenes with a Benchmark + + +
+ Traditionally, 3D indoor datasets have generally prioritized scale over
+ground-truth accuracy in order to obtain improved generalization. However,
+using these datasets to evaluate dense geometry tasks, such as depth rendering,
+can be problematic as the meshes of the dataset are often incomplete and may
+produce incorrect ground truth for evaluating fine details. In this paper, we
+propose SCRREAM, a dataset annotation framework that allows annotation of fully
+dense meshes of objects in the scene and registers camera poses on the real
+image sequence, which can produce accurate ground truth for both sparse 3D as
+well as dense 3D tasks. We show the details of the dataset annotation pipeline
+and showcase four possible variants of datasets that can be obtained from our
+framework with example scenes, such as indoor reconstruction and SLAM, scene
+editing & object removal, human reconstruction and 6D pose estimation. Recent
+pipelines for indoor reconstruction and SLAM serve as new benchmarks. In
+contrast to previous indoor datasets, our design allows evaluating dense
+geometry tasks on eleven sample scenes against accurately rendered ground truth
+depth maps.
+
+
+
+
+
+ + ☆ LoFLAT: Local Feature Matching using Focused Linear Attention + Transformer + + +
+ Local feature matching is an essential technique in image matching and plays
+a critical role in a wide range of vision-based applications. Transformer-based
+detector-free local feature matching methods encounter challenges due to the
+quadratic computational complexity of attention mechanisms, especially at high
+resolutions. While variants that adopt linear attention mechanisms reduce these
+computational costs, they still struggle to capture detailed local
+interactions, which affects the accuracy and robustness of precise local
+correspondences. In order to enhance the representations of attention
+mechanisms while preserving low computational complexity, in this paper we
+propose LoFLAT, a novel Local Feature matching method using a Focused Linear
+Attention Transformer. LoFLAT consists of three main modules: the Feature
+Extraction Module, the Feature Transformer Module, and the Matching Module.
+Specifically, the Feature Extraction Module first uses ResNet and a Feature
+Pyramid Network to extract hierarchical features. The Feature Transformer
+Module then employs Focused Linear Attention to refine the attention
+distribution with a focused mapping function and to enhance feature diversity
+with a depth-wise convolution. Finally, the Matching Module predicts accurate
+and robust matches through a coarse-to-fine strategy. Extensive experimental
+evaluations demonstrate that the proposed LoFLAT outperforms LoFTR in terms of
+both efficiency and accuracy.
+
+
+
+
+
+ + ☆ FilterViT and DropoutViT: Lightweight Vision Transformer Models for + Efficient Attention Mechanisms + + +
+ In this study, we introduce FilterViT, an enhanced version of MobileViT, +which leverages an attention-based mechanism for early-stage downsampling. +Traditional QKV operations on high-resolution feature maps are computationally +intensive due to the abundance of tokens. To address this, we propose a filter +attention mechanism using a convolutional neural network (CNN) to generate an +importance mask, focusing attention on key image regions. The method +significantly reduces computational complexity while maintaining +interpretability, as it highlights essential image areas. Experimental results +show that FilterViT achieves substantial gains in both efficiency and accuracy +compared to other models. We also introduce DropoutViT, a variant that uses a +stochastic approach for pixel selection, further enhancing robustness. + +
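+
+ The description suggests attention restricted to positions selected by a
+CNN-generated importance mask. The sketch below illustrates that kind of masked
+token selection in PyTorch; the top-k selection, the omission of Q/K/V
+projections, and the scatter-back step are simplifications we assume for
+illustration, not the FilterViT block itself.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def filter_attention(feat, mask_conv, top_k=256):
+     """Attend only over the most salient positions of a feature map (rough sketch).
+     feat: (B, C, H, W); mask_conv: a small CNN mapping C channels -> 1 importance channel.
+     """
+     B, C, H, W = feat.shape
+     scores = mask_conv(feat).flatten(1)                  # (B, H*W) importance mask
+     idx = scores.topk(top_k, dim=1).indices              # keep the top-k positions
+     tokens = feat.flatten(2).transpose(1, 2)             # (B, H*W, C)
+     selected = torch.gather(tokens, 1, idx.unsqueeze(-1).expand(-1, -1, C))
+     attended = F.scaled_dot_product_attention(selected, selected, selected)
+     # write attended tokens back; unselected positions keep their original features
+     return tokens.scatter(1, idx.unsqueeze(-1).expand(-1, -1, C), attended)
+ ```
+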
+
+
+
+
+ + ☆ Robotic State Recognition with Image-to-Text Retrieval Task of + Pre-Trained Vision-Language Model and Black-Box Optimization + + +
+ State recognition of the environment and objects, such as the open/closed
+state of doors and the on/off state of lights, is indispensable for robots that
+perform daily life support and security tasks. Until now, state recognition
+methods have been based on training neural networks from manual annotations,
+preparing special sensors for the recognition, or manually programmed feature
+extraction from point clouds or raw images. In contrast, we propose a robotic
+state recognition method using a pre-trained vision-language model, which is
+capable of Image-to-Text Retrieval (ITR) tasks. We prepare several kinds of
+language prompts in advance, calculate the similarity between these prompts and
+the current image by ITR, and perform state recognition. By applying the
+optimal weighting to each prompt using black-box optimization, state
+recognition can be performed with higher accuracy. Experiments show that this
+method enables a variety of state recognition tasks simply by preparing
+multiple prompts, without retraining neural networks or manual programming. In
+addition, since only prompts and their weights need to be prepared for each
+recognizer, there is no need to prepare multiple models, which facilitates
+resource management. Through language, it is possible to recognize the
+open/closed state of transparent doors, whether water is running from a faucet,
+and even the qualitative state of whether a kitchen is clean or not, all of
+which have been challenging so far.
+
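+
+ A weighted image-to-text retrieval step of the kind described above can be
+sketched as follows; the embedding interface and the linear weighting are
+assumptions on our part, and the black-box optimisation that finds the weights
+is left out.
+
+ ```python
+ import numpy as np
+
+ def recognise_state(image_emb, prompt_embs, weights):
+     """Weighted image-to-text retrieval for state recognition (sketch).
+     image_emb: unit-norm image embedding from a pre-trained VLM (e.g. a CLIP-style model).
+     prompt_embs: dict state -> list of unit-norm prompt embeddings.
+     weights: dict state -> per-prompt weights, e.g. found by black-box optimisation.
+     """
+     scores = {}
+     for state, embs in prompt_embs.items():
+         sims = np.array([image_emb @ e for e in embs])    # cosine similarities
+         scores[state] = float(np.dot(weights[state], sims))
+     return max(scores, key=scores.get)                    # highest weighted score wins
+ ```
+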
+
+ comment: Accepted at Humanoids2024 +
+
+
+
+
+ + ☆ Geometry Cloak: Preventing TGS-based 3D Reconstruction from Copyrighted + Images NeurIPS 2024 + + +
+ Single-view 3D reconstruction methods like Triplane Gaussian Splatting (TGS) +have enabled high-quality 3D model generation from just a single image input +within seconds. However, this capability raises concerns about potential +misuse, where malicious users could exploit TGS to create unauthorized 3D +models from copyrighted images. To prevent such infringement, we propose a +novel image protection approach that embeds invisible geometry perturbations, +termed "geometry cloaks", into images before supplying them to TGS. These +carefully crafted perturbations encode a customized message that is revealed +when TGS attempts 3D reconstructions of the cloaked image. Unlike conventional +adversarial attacks that simply degrade output quality, our method forces TGS +to fail the 3D reconstruction in a specific way - by generating an identifiable +customized pattern that acts as a watermark. This watermark allows copyright +holders to assert ownership over any attempted 3D reconstructions made from +their protected images. Extensive experiments have verified the effectiveness +of our geometry cloak. Our project is available at +https://qsong2001.github.io/geometry_cloak. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Persistent Homology for MCI Classification: A Comparative Analysis + between Graph and Vietoris-Rips Filtrations + + +
+ Mild cognitive impairment (MCI), often linked to early neurodegeneration, is
+characterized by subtle cognitive declines and disruptions in brain
+connectivity. The present study offers a detailed analysis of topological
+changes associated with MCI, focusing on two subtypes: Early MCI and Late MCI.
+This analysis utilizes fMRI time series data from two distinct populations: the
+publicly available ADNI dataset (Western cohort) and the in-house TLSA dataset
+(Indian Urban cohort). Persistent Homology, a topological data analysis method,
+is employed with two distinct filtration techniques, Vietoris-Rips and graph
+filtration, for classifying MCI subtypes. For Vietoris-Rips filtration,
+inter-ROI Wasserstein distance matrices between persistent diagrams are used
+for classification, while graph filtration relies on the top ten most
+persistent homology features. Comparative analysis shows that the Vietoris-Rips
+filtration significantly outperforms graph filtration, capturing subtle
+variations in brain connectivity with greater accuracy. The Vietoris-Rips
+filtration method achieved the highest classification accuracy of 85.7\% for
+distinguishing between age- and gender-matched healthy controls and MCI,
+whereas graph filtration reached a maximum accuracy of 71.4\% for the same
+task. This superior performance highlights the sensitivity of Vietoris-Rips
+filtration in detecting intricate topological features associated with
+neurodegeneration. The findings underscore the potential of persistent
+homology, particularly when combined with the Wasserstein distance, as a
+powerful tool for early diagnosis and precise classification of cognitive
+impairments, offering valuable insights into brain connectivity changes in MCI.
+
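+
+ The abstract does not state which classifier consumes the inter-ROI Wasserstein
+distance matrices, so the sketch below shows one common recipe, a Gaussian kernel
+over precomputed distances fed to an SVM; it is a plausible stand-in rather than
+the paper's actual pipeline.
+
+ ```python
+ import numpy as np
+ from sklearn.svm import SVC
+
+ def classify_from_wasserstein(dist_matrix, labels, train_idx, test_idx, gamma=1.0):
+     """Classify subjects from a pairwise Wasserstein distance matrix (sketch).
+     dist_matrix: (N, N) distances between persistence diagrams; labels: (N,) array.
+     """
+     kernel = np.exp(-gamma * dist_matrix ** 2)            # turn distances into a kernel
+     clf = SVC(kernel="precomputed")
+     clf.fit(kernel[np.ix_(train_idx, train_idx)], labels[train_idx])
+     return clf.predict(kernel[np.ix_(test_idx, train_idx)])
+ ```
+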
+
+ comment: 17 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Practical and Accurate Reconstruction of an Illuminant's Spectral Power + Distribution for Inverse Rendering Pipelines + + +
+ Inverse rendering pipelines are gaining prominence in realizing
+photo-realistic reconstruction of real-world objects for emulating them in
+virtual reality scenes. Apart from material reflectances, spectral rendering
+and in-scene illuminants' spectral power distributions (SPDs) play important
+roles in producing photo-realistic images. We present a simple, low-cost
+technique to capture and reconstruct the SPD of uniform illuminants. Instead of
+requiring a costly spectrometer for such measurements, our method uses a
+diffractive compact disk (CD-ROM) and a machine learning approach for accurate
+estimation. We show that our method works well with spotlights in simulations
+and in a few real-world examples. The presented results clearly demonstrate the
+reliability of our approach through quantitative and qualitative evaluations,
+especially in spectral rendering of iridescent materials.
+
+
+ comment: 3 pages, 3 Figures, Submitted as a Tiny Paper at ICVGIP'24, + Bangalore, India +
+
+
+
+
+ + ☆ Backdoor Attack Against Vision Transformers via Attention Gradient-Based + Image Erosion + + +
+ Vision Transformers (ViTs) have outperformed traditional Convolutional Neural +Networks (CNN) across various computer vision tasks. However, akin to CNN, ViTs +are vulnerable to backdoor attacks, where the adversary embeds the backdoor +into the victim model, causing it to make wrong predictions about testing +samples containing a specific trigger. Existing backdoor attacks against ViTs +have the limitation of failing to strike an optimal balance between attack +stealthiness and attack effectiveness. + In this work, we propose an Attention Gradient-based Erosion Backdoor (AGEB) +targeted at ViTs. Considering the attention mechanism of ViTs, AGEB selectively +erodes pixels in areas of maximal attention gradient, embedding a covert +backdoor trigger. Unlike previous backdoor attacks against ViTs, AGEB achieves +an optimal balance between attack stealthiness and attack effectiveness, +ensuring the trigger remains invisible to human detection while preserving the +model's accuracy on clean samples. Extensive experimental evaluations across +various ViT architectures and datasets confirm the effectiveness of AGEB, +achieving a remarkable Attack Success Rate (ASR) without diminishing Clean Data +Accuracy (CDA). Furthermore, the stealthiness of AGEB is rigorously validated, +demonstrating minimal visual discrepancies between the clean and the triggered +images. + +
+
+ comment: Accepted by IEEE GLOBECOM 2024 +
+
+
+
+
+ + ☆ FlowDCN: Exploring DCN-like Architectures for Fast Image Generation with + Arbitrary Resolution NeurIPS24 + + +
+ Arbitrary-resolution image generation still remains a challenging task in +AIGC, as it requires handling varying resolutions and aspect ratios while +maintaining high visual quality. Existing transformer-based diffusion methods +suffer from quadratic computation cost and limited resolution extrapolation +capabilities, making them less effective for this task. In this paper, we +propose FlowDCN, a purely convolution-based generative model with linear time +and memory complexity, that can efficiently generate high-quality images at +arbitrary resolutions. Equipped with a new design of learnable group-wise +deformable convolution block, our FlowDCN yields higher flexibility and +capability to handle different resolutions with a single model. FlowDCN +achieves the state-of-the-art 4.30 sFID on $256\times256$ ImageNet Benchmark +and comparable resolution extrapolation results, surpassing transformer-based +counterparts in terms of convergence speed (only $\frac{1}{5}$ images), visual +quality, parameters ($8\%$ reduction) and FLOPs ($20\%$ reduction). We believe +FlowDCN offers a promising solution to scalable and flexible image synthesis. + +
+
+ comment: Accepted on NeurIPS24 +
+
+
+
+
+ + ☆ SimpsonsVQA: Enhancing Inquiry-Based Learning with a Tailored Dataset + + +
+ Visual Question Answering (VQA) has emerged as a promising area of research +to develop AI-based systems for enabling interactive and immersive learning. +Numerous VQA datasets have been introduced to facilitate various tasks, such as +answering questions or identifying unanswerable ones. However, most of these +datasets are constructed using real-world images, leaving the performance of +existing models on cartoon images largely unexplored. Hence, in this paper, we +present "SimpsonsVQA", a novel dataset for VQA derived from The Simpsons TV +show, designed to promote inquiry-based learning. Our dataset is specifically +designed to address not only the traditional VQA task but also to identify +irrelevant questions related to images, as well as the reverse scenario where a +user provides an answer to a question that the system must evaluate (e.g., as +correct, incorrect, or ambiguous). It aims to cater to various visual +applications, harnessing the visual content of "The Simpsons" to create +engaging and informative interactive systems. SimpsonsVQA contains +approximately 23K images, 166K QA pairs, and 500K judgments +(https://simpsonsvqa.org). Our experiments show that current large +vision-language models like ChatGPT4o underperform in zero-shot settings across +all three tasks, highlighting the dataset's value for improving model +performance on cartoon images. We anticipate that SimpsonsVQA will inspire +further research, innovation, and advancements in inquiry-based learning VQA. + +
+
+
+
+
+ + ☆ Unbiased Regression Loss for DETRs + + +
+ In this paper, we introduce a novel unbiased regression loss for DETR-based +detectors. The conventional $L_{1}$ regression loss tends to bias towards +larger boxes, as they disproportionately contribute more towards the overall +loss compared to smaller boxes. Consequently, the detection performance for +small objects suffers. To alleviate this bias, the proposed new unbiased loss, +termed Sized $L_{1}$ loss, normalizes the size of all boxes based on their +individual width and height. Our experiments demonstrate consistent +improvements in both fully-supervised and semi-supervised settings using the +MS-COCO benchmark dataset. + +
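+
+ The abstract says the Sized L1 loss normalizes boxes by their individual width
+and height; the sketch below shows one plausible form of such a loss, with the
+coordinate layout and the per-coordinate scaling being our assumptions rather
+than the paper's exact definition.
+
+ ```python
+ import torch
+
+ def sized_l1_loss(pred, target, eps=1e-6):
+     """Size-normalised L1 regression loss (sketch of one plausible form).
+     pred, target: (N, 4) boxes in (cx, cy, w, h) format.
+     Each coordinate error is divided by the target box's width/height so that
+     small boxes contribute on the same scale as large ones.
+     """
+     scale = torch.stack([target[:, 2], target[:, 3],
+                          target[:, 2], target[:, 3]], dim=1).clamp(min=eps)
+     return ((pred - target).abs() / scale).sum(dim=1).mean()
+ ```
+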
+
+
+
+
+ + ☆ Consistency Diffusion Bridge Models NeurIPS 2024 + + +
+ Diffusion models (DMs) have become the dominant paradigm of generative
+modeling in a variety of domains by learning stochastic processes from noise to
+data. Recently, diffusion denoising bridge models (DDBMs), a new formulation of
+generative modeling that builds stochastic processes between fixed data
+endpoints based on a reference diffusion process, have achieved empirical
+success across tasks with coupled data distribution, such as image-to-image
+translation. However, DDBM's sampling process typically requires hundreds of
+network evaluations to achieve decent performance, which may impede their
+practical deployment due to high computational demands. In this work, inspired
+by the recent advance of consistency models in DMs, we tackle this problem by
+learning the consistency function of the probability-flow ordinary differential
+equation (PF-ODE) of DDBMs, which directly predicts the solution at a starting
+step given any point on the ODE trajectory. Based on a dedicated general-form
+ODE solver, we propose two paradigms: consistency bridge distillation and
+consistency bridge training, which are flexible to apply to DDBMs with broad
+design choices. Experimental results show that our proposed method could sample
+$4\times$ to $50\times$ faster than the base DDBM and produce better visual
+quality given the same number of steps in various tasks with pixel resolution
+ranging from $64 \times 64$ to $256 \times 256$, as well as supporting
+downstream tasks such as semantic interpolation in the data space.
+
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Towards Unifying Understanding and Generation in the Era of Vision + Foundation Models: A Survey from the Autoregression Perspective + + +
+ Autoregression in large language models (LLMs) has shown impressive
+scalability by unifying all language tasks into the next token prediction
+paradigm. Recently, there has been growing interest in extending this success
+to vision foundation models. In this survey, we review the recent advances and
+discuss future directions for autoregressive vision foundation models. First,
+we present the trend for the next generation of vision foundation models, i.e.,
+unifying both understanding and generation in vision tasks. We then analyze the
+limitations of existing vision foundation models, and present a formal
+definition of autoregression with its advantages. Later, we categorize
+autoregressive vision foundation models from their vision tokenizers and
+autoregression backbones. Finally, we discuss several promising research
+challenges and directions. To the best of our knowledge, this is the first
+survey to comprehensively summarize autoregressive vision foundation models
+under the trend of unifying understanding and generation. A collection of
+related resources is available at https://github.com/EmmaSRH/ARVFM.
+
+
+ comment: 17 pages, 1 table, 2 figures +
+
+
+
+
+ + ♻ ☆ Certified Robustness to Data Poisoning in Gradient-Based Training + + +
+ Modern machine learning pipelines leverage large amounts of public data, +making it infeasible to guarantee data quality and leaving models open to +poisoning and backdoor attacks. Provably bounding model behavior under such +attacks remains an open problem. In this work, we address this challenge by +developing the first framework providing provable guarantees on the behavior of +models trained with potentially manipulated data without modifying the model or +learning algorithm. In particular, our framework certifies robustness against +untargeted and targeted poisoning, as well as backdoor attacks, for bounded and +unbounded manipulations of the training inputs and labels. Our method leverages +convex relaxations to over-approximate the set of all possible parameter +updates for a given poisoning threat model, allowing us to bound the set of all +reachable parameters for any gradient-based learning algorithm. Given this set +of parameters, we provide bounds on worst-case behavior, including model +performance and backdoor success rate. We demonstrate our approach on multiple +real-world datasets from applications including energy consumption, medical +imaging, and autonomous driving. + +
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ DisC-GS: Discontinuity-aware Gaussian Splatting NeurIPS 2024 + + +
+ Recently, Gaussian Splatting, a method that represents a 3D scene as a +collection of Gaussian distributions, has gained significant attention in +addressing the task of novel view synthesis. In this paper, we highlight a +fundamental limitation of Gaussian Splatting: its inability to accurately +render discontinuities and boundaries in images due to the continuous nature of +Gaussian distributions. To address this issue, we propose a novel framework +enabling Gaussian Splatting to perform discontinuity-aware image rendering. +Additionally, we introduce a B\'ezier-boundary gradient approximation strategy +within our framework to keep the "differentiability" of the proposed +discontinuity-aware rendering process. Extensive experiments demonstrate the +efficacy of our framework. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Is Your LiDAR Placement Optimized for 3D Scene Understanding? NeurIPS 2024 + + +
+ The reliability of driving perception systems under unprecedented conditions +is crucial for practical usage. Latest advancements have prompted increasing +interest in multi-LiDAR perception. However, prevailing driving datasets +predominantly utilize single-LiDAR systems and collect data devoid of adverse +conditions, failing to capture the complexities of real-world environments +accurately. Addressing these gaps, we proposed Place3D, a full-cycle pipeline +that encompasses LiDAR placement optimization, data generation, and downstream +evaluations. Our framework makes three appealing contributions. 1) To identify +the most effective configurations for multi-LiDAR systems, we introduce the +Surrogate Metric of the Semantic Occupancy Grids (M-SOG) to evaluate LiDAR +placement quality. 2) Leveraging the M-SOG metric, we propose a novel +optimization strategy to refine multi-LiDAR placements. 3) Centered around the +theme of multi-condition multi-LiDAR perception, we collect a 280,000-frame +dataset from both clean and adverse conditions. Extensive experiments +demonstrate that LiDAR placements optimized using our approach outperform +various baselines. We showcase exceptional results in both LiDAR semantic +segmentation and 3D object detection tasks, under diverse weather and sensor +failure conditions. + +
+
+ comment: NeurIPS 2024 (Spotlight); 36 pages, 16 figures, 14 tables; Code at + https://github.com/ywyeli/Place3D +
+
+
+
+
+ + ♻ ☆ Super-resolution in disordered media using neural networks + + +
+ We propose a methodology that exploits large and diverse data sets to +accurately estimate the ambient medium's Green's functions in strongly +scattering media. Given these estimates, obtained with and without the use of +neural networks, excellent imaging results are achieved, with a resolution that +is better than that of a homogeneous medium. This phenomenon, also known as +super-resolution, occurs because the ambient scattering medium effectively +enhances the physical imaging aperture. + +
+
+
+
+
+ + ♻ ☆ CARES: A Comprehensive Benchmark of Trustworthiness in Medical Vision + Language Models NeurIPS 2024 + + +
+ Artificial intelligence has significantly impacted medical applications, +particularly with the advent of Medical Large Vision Language Models +(Med-LVLMs), sparking optimism for the future of automated and personalized +healthcare. However, the trustworthiness of Med-LVLMs remains unverified, +posing significant risks for future model deployment. In this paper, we +introduce CARES and aim to comprehensively evaluate the Trustworthiness of +Med-LVLMs across the medical domain. We assess the trustworthiness of Med-LVLMs +across five dimensions, including trustfulness, fairness, safety, privacy, and +robustness. CARES comprises about 41K question-answer pairs in both closed and +open-ended formats, covering 16 medical image modalities and 27 anatomical +regions. Our analysis reveals that the models consistently exhibit concerns +regarding trustworthiness, often displaying factual inaccuracies and failing to +maintain fairness across different demographic groups. Furthermore, they are +vulnerable to attacks and demonstrate a lack of privacy awareness. We publicly +release our benchmark and code in https://cares-ai.github.io/. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ StyleAdapter: A Unified Stylized Image Generation Model + + +
+ This work focuses on generating high-quality images with specific style of +reference images and content of provided textual descriptions. Current leading +algorithms, i.e., DreamBooth and LoRA, require fine-tuning for each style, +leading to time-consuming and computationally expensive processes. In this +work, we propose StyleAdapter, a unified stylized image generation model +capable of producing a variety of stylized images that match both the content +of a given prompt and the style of reference images, without the need for +per-style fine-tuning. It introduces a two-path cross-attention (TPCA) module +to separately process style information and textual prompt, which cooperate +with a semantic suppressing vision model (SSVM) to suppress the semantic +content of style images. In this way, it can ensure that the prompt maintains +control over the content of the generated images, while also mitigating the +negative impact of semantic information in style references. This results in +the content of the generated image adhering to the prompt, and its style +aligning with the style references. Besides, our StyleAdapter can be integrated +with existing controllable synthesis methods, such as T2I-adapter and +ControlNet, to attain a more controllable and stable generation process. +Extensive experiments demonstrate the superiority of our method over previous +works. + +
+
+ comment: Accepted by IJCV24 +
+
+
+
+
+ + ♻ ☆ Flow Snapshot Neurons in Action: Deep Neural Networks Generalize to + Biological Motion Perception + + +
+ Biological motion perception (BMP) refers to humans' ability to perceive and +recognize the actions of living beings solely from their motion patterns, +sometimes as minimal as those depicted on point-light displays. While humans +excel at these tasks without any prior training, current AI models struggle +with poor generalization performance. To close this research gap, we propose +the Motion Perceiver (MP). MP solely relies on patch-level optical flows from +video clips as inputs. During training, it learns prototypical flow snapshots +through a competitive binding mechanism and integrates invariant motion +representations to predict action labels for the given video. During inference, +we evaluate the generalization ability of all AI models and humans on 62,656 +video stimuli spanning 24 BMP conditions using point-light displays in +neuroscience. Remarkably, MP outperforms all existing AI models with a maximum +improvement of 29% in top-1 action recognition accuracy on these conditions. +Moreover, we benchmark all AI models in point-light displays of two standard +video datasets in computer vision. MP also demonstrates superior performance in +these cases. More interestingly, via psychophysics experiments, we found that +MP recognizes biological movements in a way that aligns with human behaviors. +Our data and code are available at +https://github.com/ZhangLab-DeepNeuroCogLab/MotionPerceiver. + +
+
+
+
+
+ + ♻ ☆ A Hitchhikers Guide to Fine-Grained Face Forgery Detection Using Common + Sense Reasoning NeurIPS'2024 + + +
+ Explainability in artificial intelligence is crucial for restoring trust, +particularly in areas like face forgery detection, where viewers often struggle +to distinguish between real and fabricated content. Vision and Large Language +Models (VLLM) bridge computer vision and natural language, offering numerous +applications driven by strong common-sense reasoning. Despite their success in +various tasks, the potential of vision and language remains underexplored in +face forgery detection, where they hold promise for enhancing explainability by +leveraging the intrinsic reasoning capabilities of language to analyse +fine-grained manipulation areas. As such, there is a need for a methodology +that converts face forgery detection to a Visual Question Answering (VQA) task +to systematically and fairly evaluate these capabilities. Previous efforts for +unified benchmarks in deepfake detection have focused on the simpler binary +task, overlooking evaluation protocols for fine-grained detection and +text-generative models. We propose a multi-staged approach that diverges from +the traditional binary decision paradigm to address this gap. In the first +stage, we assess the models' performance on the binary task and their +sensitivity to given instructions using several prompts. In the second stage, +we delve deeper into fine-grained detection by identifying areas of +manipulation in a multiple-choice VQA setting. In the third stage, we convert +the fine-grained detection to an open-ended question and compare several +matching strategies for the multi-label classification task. Finally, we +qualitatively evaluate the fine-grained responses of the VLLMs included in the +benchmark. We apply our benchmark to several popular models, providing a +detailed comparison of binary, multiple-choice, and open-ended VQA evaluation +across seven datasets. +\url{https://nickyfot.github.io/hitchhickersguide.github.io/} + +
+
+ comment: Accepted at NeurIPS'2024 (D&B) +
+
+
+
+
+ + ♻ ☆ Depth Anywhere: Enhancing 360 Monocular Depth Estimation via Perspective + Distillation and Unlabeled Data Augmentation NeurIPS 2024 + + +
+ Accurately estimating depth in 360-degree imagery is crucial for virtual
+reality, autonomous navigation, and immersive media applications. Existing
+depth estimation methods designed for perspective-view imagery fail when
+applied to 360-degree images due to different camera projections and
+distortions, whereas 360-degree methods perform worse due to the lack of
+labeled data pairs. We propose a new depth estimation framework that utilizes
+unlabeled 360-degree data effectively. Our approach uses state-of-the-art
+perspective depth estimation models as teacher models to generate pseudo labels
+through a six-face cube projection technique, enabling efficient labeling of
+depth in 360-degree images. This method leverages the increasing availability
+of large datasets. Our approach includes two main stages: offline mask
+generation for invalid regions and an online semi-supervised joint training
+regime. We tested our approach on benchmark datasets such as Matterport3D and
+Stanford2D3D, showing significant improvements in depth estimation accuracy,
+particularly in zero-shot scenarios. Our proposed training pipeline can enhance
+any 360 monocular depth estimator and demonstrates effective knowledge transfer
+across different camera projections and data types. See our project page for
+results: https://albert100121.github.io/Depth-Anywhere/
+
+
+ comment: NeurIPS 2024. Project page: + https://albert100121.github.io/Depth-Anywhere/ +
+
+
+
+
+ + ♻ ☆ Weight Copy and Low-Rank Adaptation for Few-Shot Distillation of Vision + Transformers + + +
+ Few-shot knowledge distillation recently emerged as a viable approach to +harness the knowledge of large-scale pre-trained models, using limited data and +computational resources. In this paper, we propose a novel few-shot feature +distillation approach for vision transformers. Our approach is based on two key +steps. Leveraging the fact that vision transformers have a consistent +depth-wise structure, we first copy the weights from intermittent layers of +existing pre-trained vision transformers (teachers) into shallower +architectures (students), where the intermittence factor controls the +complexity of the student transformer with respect to its teacher. Next, we +employ an enhanced version of Low-Rank Adaptation (LoRA) to distill knowledge +into the student in a few-shot scenario, aiming to recover the information +processing carried out by the skipped teacher layers. We present comprehensive +experiments with supervised and self-supervised transformers as teachers, on +six data sets from various domains (natural, medical and satellite images) and +tasks (classification and segmentation). The empirical results confirm the +superiority of our approach over state-of-the-art competitors. Moreover, the +ablation results demonstrate the usefulness of each component of the proposed +pipeline. We release our code at https://github.com/dianagrigore/WeCoLoRA. + +
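+
+ The intermittence-based weight copying described above might look roughly like
+the sketch below; the stride-based block selection and the use of deepcopy are
+our assumptions, and the subsequent LoRA fine-tuning step is not shown.
+
+ ```python
+ import copy
+
+ def copy_intermittent_layers(teacher_blocks, intermittence=2):
+     """Initialise a shallower student from every k-th teacher block (sketch).
+     teacher_blocks: list of transformer blocks (e.g. torch.nn.Module instances).
+     intermittence: keep one block out of every `intermittence` teacher blocks,
+     so the student depth is roughly teacher depth / intermittence.
+     """
+     return [copy.deepcopy(block) for i, block in enumerate(teacher_blocks)
+             if i % intermittence == 0]
+ ```
+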
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ U-DiTs: Downsample Tokens in U-Shaped Diffusion Transformers + + +
+ Diffusion Transformers (DiTs) introduce the transformer architecture to
+diffusion tasks for latent-space image generation. With an isotropic
+architecture that chains a series of transformer blocks, DiTs demonstrate
+competitive performance and good scalability; but meanwhile, the abandonment of
+the U-Net backbone by DiTs and their subsequent improvements is worth
+rethinking. To this end, we conduct a simple toy experiment by comparing a
+U-Net architectured DiT with an isotropic one. It turns out that the U-Net
+architecture gains only a slight advantage from the U-Net inductive bias,
+indicating potential redundancies within the U-Net-style DiT. Inspired by the
+discovery that U-Net backbone features are low-frequency-dominated, we perform
+token downsampling on the query-key-value tuple for self-attention, which
+brings further improvements despite a considerable reduction in computation.
+Based on self-attention with downsampled tokens, we propose a series of
+U-shaped DiTs (U-DiTs) in the paper and conduct extensive experiments to
+demonstrate the extraordinary performance of U-DiT models. The proposed U-DiT
+could outperform DiT-XL/2 with only 1/6 of its computation cost. Codes are
+available at https://github.com/YuchuanTian/U-DiT.
+
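+
+ To make the idea of downsampled-token self-attention concrete, here is a rough
+PyTorch sketch; the pooling operator, the interpolation back to the full token
+grid, and the layer shapes are all assumptions for illustration, not the U-DiT
+implementation.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def downsampled_self_attention(x, qkv_proj, out_proj, h, w, num_heads=8):
+     """Self-attention computed on 2x-downsampled tokens (rough sketch, not U-DiT code).
+     x: (B, h*w, C) tokens laid out on an h x w grid.
+     qkv_proj: torch.nn.Linear(C, 3*C); out_proj: torch.nn.Linear(C, C).
+     """
+     B, N, C = x.shape
+     grid = x.transpose(1, 2).reshape(B, C, h, w)
+     small = F.avg_pool2d(grid, 2).flatten(2).transpose(1, 2)       # (B, N/4, C)
+     q, k, v = qkv_proj(small).chunk(3, dim=-1)
+     def split(t):  # (B, L, C) -> (B, heads, L, C/heads)
+         return t.reshape(B, -1, num_heads, C // num_heads).transpose(1, 2)
+     attn = F.scaled_dot_product_attention(split(q), split(k), split(v))
+     attn = attn.transpose(1, 2).reshape(B, -1, C)
+     out = out_proj(attn).transpose(1, 2).reshape(B, C, h // 2, w // 2)
+     out = F.interpolate(out, size=(h, w), mode="nearest")          # back to full grid
+     return out.flatten(2).transpose(1, 2)                          # (B, h*w, C)
+ ```
+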
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Unbounded: A Generative Infinite Game of Character Life Simulation + + +
+ We introduce the concept of a generative infinite game, a video game that +transcends the traditional boundaries of finite, hard-coded systems by using +generative models. Inspired by James P. Carse's distinction between finite and +infinite games, we leverage recent advances in generative AI to create +Unbounded: a game of character life simulation that is fully encapsulated in +generative models. Specifically, Unbounded draws inspiration from sandbox life +simulations and allows you to interact with your autonomous virtual character +in a virtual world by feeding, playing with and guiding it - with open-ended +mechanics generated by an LLM, some of which can be emergent. In order to +develop Unbounded, we propose technical innovations in both the LLM and visual +generation domains. Specifically, we present: (1) a specialized, distilled +large language model (LLM) that dynamically generates game mechanics, +narratives, and character interactions in real-time, and (2) a new dynamic +regional image prompt Adapter (IP-Adapter) for vision models that ensures +consistent yet flexible visual generation of a character across multiple +environments. We evaluate our system through both qualitative and quantitative +analysis, showing significant improvements in character life simulation, user +instruction following, narrative coherence, and visual consistency for both +characters and the environments compared to traditional related approaches. + +
+
+ comment: Project page: https://generative-infinite-game.github.io/ +
+
+
+
+
+ + ♻ ☆ DenoiseRep: Denoising Model for Representation Learning + + +
+ The denoising model has been proven a powerful generative model but has
+little exploration of discriminative tasks. Representation learning is
+important in discriminative tasks, which is defined as "learning
+representations (or features) of the data that make it easier to extract useful
+information when building classifiers or other predictors". In this paper, we
+propose a novel Denoising Model for Representation Learning (DenoiseRep) to
+improve feature discrimination with joint feature extraction and denoising.
+DenoiseRep views each embedding layer in a backbone as a denoising layer,
+processing the cascaded embedding layers as if they were recursively denoising
+features step-by-step. This unifies the frameworks of feature extraction and
+denoising, where the former progressively embeds features from low-level to
+high-level, and the latter recursively denoises features step-by-step. After
+that, DenoiseRep fuses the parameters of feature extraction and denoising
+layers, and theoretically demonstrates its equivalence before and after the
+fusion, thus making feature denoising computation-free. DenoiseRep is a
+label-free algorithm that incrementally improves features, and it is also
+complementary to labels when available. Experimental results on various
+discriminative vision tasks, including re-identification (Market-1501,
+DukeMTMC-reID, MSMT17, CUHK-03, vehicleID), image classification (ImageNet,
+CUB200, Oxford-Pet, Flowers), object detection (COCO), image segmentation
+(ADE20K) show stability and impressive improvements. We also validate its
+effectiveness on the CNN (ResNet) and Transformer (ViT, Swin, VMamba)
+architectures.
+
+
+
+
+
+ + ♻ ☆ Copycats: the many lives of a publicly available medical imaging dataset NeurIPS 2024 + + +
+ Medical Imaging (MI) datasets are fundamental to artificial intelligence in +healthcare. The accuracy, robustness, and fairness of diagnostic algorithms +depend on the data (and its quality) used to train and evaluate the models. MI +datasets used to be proprietary, but have become increasingly available to the +public, including on community-contributed platforms (CCPs) like Kaggle or +HuggingFace. While open data is important to enhance the redistribution of +data's public value, we find that the current CCP governance model fails to +uphold the quality needed and recommended practices for sharing, documenting, +and evaluating datasets. In this paper, we conduct an analysis of publicly +available machine learning datasets on CCPs, discussing datasets' context, and +identifying limitations and gaps in the current CCP landscape. We highlight +differences between MI and computer vision datasets, particularly in the +potentially harmful downstream effects from poor adoption of recommended +dataset management practices. We compare the analyzed datasets across several +dimensions, including data sharing, data documentation, and maintenance. We +find vague licenses, lack of persistent identifiers and storage, duplicates, +and missing metadata, with differences between the platforms. Our research +contributes to efforts in responsible data curation and AI algorithms for +healthcare. + +
+
+ comment: NeurIPS 2024 Track on Datasets and Benchmarks. Please note that v1 + has a different title +
+
+
+
+
+ + ♻ ☆ Detection of Micromobility Vehicles in Urban Traffic Videos + + +
+ Urban traffic environments present unique challenges for object detection, +particularly with the increasing presence of micromobility vehicles like +e-scooters and bikes. To address this object detection problem, this work +introduces an adapted detection model that combines the accuracy and speed of +single-frame object detection with the richer features offered by video object +detection frameworks. This is done by applying aggregated feature maps from +consecutive frames processed through motion flow to the YOLOX architecture. +This fusion brings a temporal perspective to YOLOX detection abilities, +allowing for a better understanding of urban mobility patterns and +substantially improving detection reliability. Tested on a custom dataset +curated for urban micromobility scenarios, our model showcases substantial +improvement over existing state-of-the-art methods, demonstrating the need to +consider spatio-temporal information for detecting such small and thin objects. +Our approach enhances detection in challenging conditions, including +occlusions, ensuring temporal consistency, and effectively mitigating motion +blur. + +
+
+ comment: Accepted at the 21st Conference on Robots and Vision (CRV), 2024 +
+
+
+
+
+ + ♻ ☆ WaveMixSR-V2: Enhancing Super-resolution with Higher Efficiency + + +
+ Recent advancements in single image super-resolution have been predominantly
+driven by token mixers and transformer architectures. WaveMixSR utilized the
+WaveMix architecture, employing a two-dimensional discrete wavelet transform
+for spatial token mixing, achieving superior performance in super-resolution
+tasks with remarkable resource efficiency. In this work, we present an enhanced
+version of the WaveMixSR architecture by (1) replacing the traditional
+transpose convolution layer with a pixel shuffle operation and (2) implementing
+a multistage design for higher resolution tasks ($4\times$). Our experiments
+demonstrate that our enhanced model -- WaveMixSR-V2 -- outperforms other
+architectures in multiple super-resolution tasks, achieving state-of-the-art
+results on the BSD100 dataset while consuming fewer resources and exhibiting
+higher parameter efficiency, lower latency, and higher throughput. Our code is
+available at https://github.com/pranavphoenix/WaveMixSR.
+
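+
+ The pixel-shuffle replacement mentioned above is a standard PyTorch building
+block; the sketch below shows the generic conv + nn.PixelShuffle pattern with
+placeholder channel counts, not the actual WaveMixSR-V2 configuration.
+
+ ```python
+ import torch
+ from torch import nn
+
+ class PixelShuffleUpsample(nn.Module):
+     """Upsampling via conv + pixel shuffle instead of a transposed convolution (sketch)."""
+     def __init__(self, channels=64, scale=2):
+         super().__init__()
+         self.conv = nn.Conv2d(channels, channels * scale ** 2, kernel_size=3, padding=1)
+         self.shuffle = nn.PixelShuffle(scale)   # rearranges channels into spatial positions
+
+     def forward(self, x):                       # x: (B, C, H, W) -> (B, C, H*scale, W*scale)
+         return self.shuffle(self.conv(x))
+ ```
+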
+
+ comment: 10 pages. Accepted in AAAI 2025. arXiv admin note: text overlap with + arXiv:2307.00430 +
+
+
+
+
+ + ♻ ☆ IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training + + +
+ In the field of medical Vision-Language Pre-training (VLP), significant +efforts have been devoted to deriving text and image features from both +clinical reports and associated medical images. However, most existing methods +may have overlooked the opportunity in leveraging the inherent hierarchical +structure of clinical reports, which are generally split into `findings' for +descriptive content and `impressions' for conclusive observation. Instead of +utilizing this rich, structured format, current medical VLP approaches often +simplify the report into either a unified entity or fragmented tokens. In this +work, we propose a novel clinical prior guided VLP framework named IMITATE to +learn the structure information from medical reports with hierarchical +vision-language alignment. The framework derives multi-level visual features +from the chest X-ray (CXR) images and separately aligns these features with the +descriptive and the conclusive text encoded in the hierarchical medical report. +Furthermore, a new clinical-informed contrastive loss is introduced for +cross-modal learning, which accounts for clinical prior knowledge in +formulating sample correlations in contrastive learning. The proposed model, +IMITATE, outperforms baseline VLP methods across six different datasets, +spanning five medical imaging downstream tasks. Comprehensive experimental +results highlight the advantages of integrating the hierarchical structure of +medical reports for vision-language alignment. The code related to this paper +is available at https://github.com/cheliu-computation/IMITATE-TMI2024. + +
+
+ comment: Accepted by TMI2024 +
+
+
+
+
+ + ♻ ☆ MemControl: Mitigating Memorization in Diffusion Models via Automated + Parameter Selection + + +
+ Diffusion models excel in generating images that closely resemble their +training data but are also susceptible to data memorization, raising privacy, +ethical, and legal concerns, particularly in sensitive domains such as medical +imaging. We hypothesize that this memorization stems from the +overparameterization of deep models and propose that regularizing model +capacity during fine-tuning can mitigate this issue. Firstly, we empirically +show that regulating the model capacity via Parameter-efficient fine-tuning +(PEFT) mitigates memorization to some extent, however, it further requires the +identification of the exact parameter subsets to be fine-tuned for high-quality +generation. To identify these subsets, we introduce a bi-level optimization +framework, MemControl, that automates parameter selection using memorization +and generation quality metrics as rewards during fine-tuning. The parameter +subsets discovered through MemControl achieve a superior tradeoff between +generation quality and memorization. For the task of medical image generation, +our approach outperforms existing state-of-the-art memorization mitigation +strategies by fine-tuning as few as 0.019% of model parameters. Moreover, we +demonstrate that the discovered parameter subsets are transferable to +non-medical domains. Our framework is scalable to large datasets, agnostic to +reward functions, and can be integrated with existing approaches for further +memorization mitigation. To the best of our knowledge, this is the first study +to empirically evaluate memorization in medical images and propose a targeted +yet universal mitigation strategy. The code is available at +https://github.com/Raman1121/Diffusion_Memorization_HPO + +
+
+ comment: Accepted at WACV'25 (Applications Track) +
+
+
+
+
+ + ♻ ☆ Differentially Private Representation Learning via Image Captioning ICML 2024 + + +
+ Differentially private (DP) machine learning is considered the gold-standard
+solution for training a model from sensitive data while still preserving
+privacy. However, a major barrier to achieving this ideal is its sub-optimal
+privacy-accuracy trade-off, which is particularly visible in DP representation
+learning. Specifically, it has been shown that under modest privacy budgets,
+most models learn representations that are not significantly better than
+hand-crafted features. In this work, we show that effective DP representation
+learning can be done via image captioning and scaling up to internet-scale
+multimodal datasets. Through a series of engineering tricks, we successfully
+train a DP image captioner (DP-Cap) on a 233M subset of LAION-2B from scratch
+using a reasonable amount of computation, and obtain unprecedentedly
+high-quality image features that can be used in a variety of downstream vision
+and vision-language tasks. For example, under a privacy budget of
+$\varepsilon=8$ for the LAION dataset, a linear classifier trained on top of
+learned DP-Cap features attains $65.8\%$ accuracy on ImageNet-1K, considerably
+improving the previous SOTA of $56.5\%$.
+
+
+ comment: Accepted and presented at ICML 2024 +
+
+
+
+
+ + ♻ ☆ Utilizing Large Language Models in an iterative paradigm with Domain + feedback for Zero-shot Molecule optimization + + +
+ Molecule optimization is a critical task in drug discovery to optimize
+desired properties of a given molecule through chemical modification. Although
+Large Language Models (LLMs) hold the potential to efficiently simulate this
+task by using natural language to direct the optimization, straightforwardly
+utilizing them shows limited performance. In this work, we facilitate utilizing
+LLMs in an iterative paradigm by proposing a simple yet highly effective domain
+feedback provider, namely $\text{Re}^3$DF. In detail, $\text{Re}^3$DF harnesses
+an external toolkit, RDKit, to handle molecule hallucination when the modified
+molecule is chemically invalid. Otherwise, its desired properties are computed
+and compared to the original one, establishing reliable domain feedback with
+correct direction and distance towards the objective, followed by a retrieved
+example, to explicitly guide the LLM to refine the modified molecule. We
+conduct experiments across both single- and multi-property objectives with 2
+thresholds, where $\text{Re}^3$DF shows significant improvements. Particularly,
+for 20 single-property objectives, $\text{Re}^3$DF enhances Hit ratio by 16.95%
+and 20.76% under loose and strict thresholds, respectively. For 32
+multi-property objectives, $\text{Re}^3$DF enhances Hit ratio by 6.04% and
+5.25%.
+
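+
+ The validity check that drives this feedback loop can be reproduced with
+RDKit's standard SMILES parser, as sketched below; the feedback strings are
+illustrative placeholders, not the prompts used by $\text{Re}^3$DF, and the
+property-comparison branch is omitted.
+
+ ```python
+ from rdkit import Chem
+
+ def domain_feedback(smiles):
+     """Minimal validity check in the spirit of the RDKit-based feedback (sketch).
+     Returns a feedback string that could be appended to the next LLM prompt.
+     """
+     mol = Chem.MolFromSmiles(smiles)
+     if mol is None:                  # parser returns None for chemically invalid SMILES
+         return "The proposed SMILES is not a valid molecule; please revise it."
+     return "Valid molecule; next, compare its computed properties against the objective."
+ ```
+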
+
+
+
+
+ + ♻ ☆ Diffusion for World Modeling: Visual Details Matter in Atari NeurIPS 2024 + + +
+ World models constitute a promising approach for training reinforcement +learning agents in a safe and sample-efficient manner. Recent world models +predominantly operate on sequences of discrete latent variables to model +environment dynamics. However, this compression into a compact discrete +representation may ignore visual details that are important for reinforcement +learning. Concurrently, diffusion models have become a dominant approach for +image generation, challenging well-established methods modeling discrete +latents. Motivated by this paradigm shift, we introduce DIAMOND (DIffusion As a +Model Of eNvironment Dreams), a reinforcement learning agent trained in a +diffusion world model. We analyze the key design choices that are required to +make diffusion suitable for world modeling, and demonstrate how improved visual +details can lead to improved agent performance. DIAMOND achieves a mean human +normalized score of 1.46 on the competitive Atari 100k benchmark; a new best +for agents trained entirely within a world model. We further demonstrate that +DIAMOND's diffusion world model can stand alone as an interactive neural game +engine by training on static Counter-Strike: Global Offensive gameplay. To +foster future research on diffusion for world modeling, we release our code, +agents, videos and playable world models at https://diamond-wm.github.io. + +
+
+ comment: NeurIPS 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Imprecise Label Learning: A Unified Framework for Learning with Various + Imprecise Label Configurations NeurIPS 2024 + + +
+ Learning with reduced labeling standards, such as noisy labels, partial
+labels, and multiple label candidates, which we generically refer to as
+\textit{imprecise} labels, is a commonplace challenge in machine learning
+tasks. Previous methods tend to propose specific designs for every emerging
+imprecise label configuration, which is usually unsustainable when multiple
+configurations of imprecision coexist. In this paper, we introduce imprecise
+label learning (ILL), a framework for the unification of learning with various
+imprecise label configurations. ILL leverages expectation-maximization (EM)
+for modeling the imprecise label information, treating the precise labels as
+latent variables. Instead of approximating the correct labels for training, it
+considers the entire distribution of all possible labelings entailed by the
+imprecise information. We demonstrate that ILL can seamlessly adapt to partial
+label learning, semi-supervised learning, noisy label learning, and, more
+importantly, a mixture of these settings. Notably, ILL surpasses the existing
+specified techniques for handling imprecise labels, marking the first unified
+framework with robust and effective performance across various challenging
+settings. We hope our work will inspire further research on this topic,
+unleashing the full potential of ILL in wider scenarios where precise labels
+are expensive and complicated to obtain.
+
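+
+ A toy version of the EM view for partial labels (each sample carries a set of
+candidate labels) is sketched below: the E-step restricts the model's
+posterior to the candidate set, and the M-step trains against that soft
+posterior. This is a simplified NumPy illustration of the idea, not the ILL
+framework itself.
+
+```python
+import numpy as np
+
+def softmax(z):
+    z = z - z.max(axis=1, keepdims=True)
+    e = np.exp(z)
+    return e / e.sum(axis=1, keepdims=True)
+
+def em_partial_labels(X, candidate_masks, n_classes, epochs=200, lr=0.5):
+    """candidate_masks[i, c] = 1 if class c is in sample i's candidate set."""
+    W = np.zeros((X.shape[1], n_classes))
+    for _ in range(epochs):
+        probs = softmax(X @ W)
+        # E-step: posterior over latent true labels, restricted to candidates.
+        post = probs * candidate_masks
+        post /= post.sum(axis=1, keepdims=True) + 1e-12
+        # M-step: gradient step on cross-entropy against the soft posterior.
+        W += lr * X.T @ (post - probs) / len(X)
+    return W
+
+# Toy data: two Gaussian blobs; some samples precisely labeled, the rest ambiguous.
+rng = np.random.default_rng(0)
+X = np.vstack([rng.normal(-1, 1, (50, 2)), rng.normal(1, 1, (50, 2))])
+masks = np.ones((100, 2))
+masks[:25, 1] = 0    # first 25 samples precisely labeled as class 0
+masks[50:75, 0] = 0  # these 25 precisely labeled as class 1
+W = em_partial_labels(X, masks, n_classes=2)
+pred = softmax(X @ W).argmax(1)
+print("acc:", ((pred[:50] == 0).mean() + (pred[50:] == 1).mean()) / 2)
+```
+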
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ On Unsupervised Partial Shape Correspondence + + +
+ When matching shapes to their parts, we often apply a tool known as
+functional maps. The idea is to translate the shape matching problem into
+"convenient" spaces in which matching is performed algebraically by solving a
+least squares problem. Here, we argue that such formulations, though popular
+in this field, introduce errors in the estimated match when partiality is
+invoked. Such errors are unavoidable even for advanced feature extraction
+networks, and they can be shown to escalate with increasing degrees of shape
+partiality, adversely affecting the learning capability of such systems. To
+circumvent these limitations, we propose a novel approach for partial shape
+matching. Our study of functional maps led us to a novel method that
+establishes direct correspondence between partial and full shapes through
+feature matching, bypassing the need for functional map intermediate spaces.
+The Gromov distance between metric spaces leads to the construction of the
+first part of our loss functions. For regularization, we use two options: a
+term based on the area-preserving property of the mapping, and a relaxed
+version that avoids the need to resort to functional maps. The proposed
+approach shows superior performance on the SHREC'16 dataset, outperforming
+existing unsupervised methods for partial shape matching. Notably, it achieves
+state-of-the-art results on the SHREC'16 HOLES benchmark, surpassing even
+supervised methods. We demonstrate the benefits of the proposed unsupervised
+method when applied to PFAUST, a new dataset for part-to-full shape
+correspondence.
+
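+
+ The core objective can be sketched as a soft correspondence matrix built from
+feature similarity, penalized by a Gromov-style distortion between the
+pairwise distance matrices of the partial and full shapes. The NumPy snippet
+below is an illustrative simplification of that loss, not the paper's code.
+
+```python
+import numpy as np
+
+def soft_correspondence(feat_part, feat_full, tau=0.1):
+    """Row-stochastic soft map P from partial-shape points to full-shape points."""
+    sim = feat_part @ feat_full.T / tau
+    sim -= sim.max(axis=1, keepdims=True)
+    P = np.exp(sim)
+    return P / P.sum(axis=1, keepdims=True)
+
+def gromov_distortion(D_part, D_full, P):
+    """Pairwise distances on the partial shape should match the distances
+    between the corresponding (softly mapped) points on the full shape."""
+    mapped = P @ D_full @ P.T
+    return np.mean((D_part - mapped) ** 2)
+
+rng = np.random.default_rng(0)
+X_full = rng.normal(size=(60, 3))
+X_part = X_full[:25] + 0.01 * rng.normal(size=(25, 3))  # a noisy partial view
+D_full = np.linalg.norm(X_full[:, None] - X_full[None], axis=-1)
+D_part = np.linalg.norm(X_part[:, None] - X_part[None], axis=-1)
+F_full = X_full / np.linalg.norm(X_full, axis=1, keepdims=True)  # toy features
+F_part = X_part / np.linalg.norm(X_part, axis=1, keepdims=True)
+P = soft_correspondence(F_part, F_full)
+print("distortion:", gromov_distortion(D_part, D_full, P))
+```
+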
+
+ comment: Updated version, accepted for publication at the Asian Conference on + Computer Vision (ACCV) 2024 +
+
+
+
+
+ + ♻ ☆ Slight Corruption in Pre-training Data Makes Better Diffusion Models NeurIPS 2024 + + +
+ Diffusion models (DMs) have shown remarkable capabilities in generating +realistic high-quality images, audios, and videos. They benefit significantly +from extensive pre-training on large-scale datasets, including web-crawled data +with paired data and conditions, such as image-text and image-class pairs. +Despite rigorous filtering, these pre-training datasets often inevitably +contain corrupted pairs where conditions do not accurately describe the data. +This paper presents the first comprehensive study on the impact of such +corruption in pre-training data of DMs. We synthetically corrupt ImageNet-1K +and CC3M to pre-train and evaluate over 50 conditional DMs. Our empirical +findings reveal that various types of slight corruption in pre-training can +significantly enhance the quality, diversity, and fidelity of the generated +images across different DMs, both during pre-training and downstream adaptation +stages. Theoretically, we consider a Gaussian mixture model and prove that +slight corruption in the condition leads to higher entropy and a reduced +2-Wasserstein distance to the ground truth of the data distribution generated +by the corruptly trained DMs. Inspired by our analysis, we propose a simple +method to improve the training of DMs on practical datasets by adding condition +embedding perturbations (CEP). CEP significantly improves the performance of +various DMs in both pre-training and downstream tasks. We hope that our study +provides new insights into understanding the data and pre-training processes of +DMs and all models are released at https://huggingface.co/DiffusionNoise. + +
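+
+ The proposed CEP mechanism itself is simple to picture: perturb the condition
+embedding with a small amount of Gaussian noise during training. A
+one-function sketch follows; the noise scale is an assumed hyperparameter, not
+the paper's tuned value.
+
+```python
+import numpy as np
+
+def perturb_condition(cond_emb, gamma=0.05, rng=np.random.default_rng(0)):
+    """Condition embedding perturbation (CEP): add isotropic Gaussian noise to
+    the condition embedding before it conditions the diffusion model.
+    `gamma` is an assumed noise scale, not the paper's tuned value."""
+    return cond_emb + gamma * rng.normal(size=cond_emb.shape)
+
+cond = np.ones((4, 768))  # e.g. a batch of text/class condition embeddings
+print(perturb_condition(cond).std())
+```
+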
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ♻ ☆ Scalable Ranked Preference Optimization for Text-to-Image Generation + + +
+ Direct Preference Optimization (DPO) has emerged as a powerful approach to +align text-to-image (T2I) models with human feedback. Unfortunately, successful +application of DPO to T2I models requires a huge amount of resources to collect +and label large-scale datasets, e.g., millions of generated paired images +annotated with human preferences. In addition, these human preference datasets +can get outdated quickly as the rapid improvements of T2I models lead to higher +quality images. In this work, we investigate a scalable approach for collecting +large-scale and fully synthetic datasets for DPO training. Specifically, the +preferences for paired images are generated using a pre-trained reward +function, eliminating the need for involving humans in the annotation process, +greatly improving the dataset collection efficiency. Moreover, we demonstrate +that such datasets allow averaging predictions across multiple models and +collecting ranked preferences as opposed to pairwise preferences. Furthermore, +we introduce RankDPO to enhance DPO-based methods using the ranking feedback. +Applying RankDPO on SDXL and SD3-Medium models with our synthetically generated +preference dataset "Syn-Pic" improves both prompt-following (on benchmarks like +T2I-Compbench, GenEval, and DPG-Bench) and visual quality (through user +studies). This pipeline presents a practical and scalable solution to develop +better preference datasets to enhance the performance of text-to-image models. + +
+
+ comment: Project Page: https://snap-research.github.io/RankDPO/ +
+
+
+
+
+ + ♻ ☆ MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video + Understanding NeurIPS 2024 + + +
+ The advent of large vision-language models (LVLMs) has spurred research into
+their applications in multi-modal contexts, particularly in video
+understanding. Traditional VideoQA benchmarks, despite providing quantitative
+metrics, often fail to encompass the full spectrum of video content and
+inadequately assess models' temporal comprehension. To address these
+limitations, we introduce MMBench-Video, a quantitative benchmark designed to
+rigorously evaluate LVLMs' proficiency in video understanding. MMBench-Video
+incorporates lengthy videos from YouTube and employs free-form questions,
+mirroring practical use cases. The benchmark is meticulously crafted to probe
+the models' temporal reasoning skills, with all questions human-annotated
+according to a carefully constructed ability taxonomy. We employ GPT-4 for
+automated assessment, demonstrating superior accuracy and robustness over
+earlier LLM-based evaluations. Utilizing MMBench-Video, we have conducted
+comprehensive evaluations that include both proprietary and open-source LVLMs
+for images and videos. MMBench-Video stands as a valuable resource for the
+research community, facilitating improved evaluation of LVLMs and catalyzing
+progress in the field of video understanding. The evaluation code of
+MMBench-Video will be integrated into VLMEvalKit:
+https://github.com/open-compass/VLMEvalKit.
+
+
+ comment: Accepted in NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ High-fidelity Endoscopic Image Synthesis by Utilizing Depth-guided + Neural Surfaces + + +
+ In surgical oncology, screening colonoscopy plays a pivotal role in providing
+diagnostic assistance, such as biopsy, and facilitating surgical navigation,
+particularly in polyp detection. Computer-assisted endoscopic surgery has
+recently gained attention and amalgamated various 3D computer vision
+techniques, including camera localization, depth estimation, surface
+reconstruction, etc. Neural Radiance Fields (NeRFs) and Neural Implicit
+Surfaces (NeuS) have emerged as promising methodologies for deriving accurate
+3D surface models from sets of registered images, addressing the limitations
+of existing colon reconstruction approaches stemming from constrained camera
+movement.
+ However, inadequate tissue texture representation and scale ambiguity in
+monocular colonoscopic image reconstruction still impede the quality of the
+final rendering results. In this paper, we introduce a novel method for colon
+section reconstruction by leveraging NeuS applied to endoscopic images,
+supplemented by a single frame of depth map. Notably, we pioneer the use of a
+single-frame depth map in photorealistic reconstruction and neural rendering
+applications; this depth map can be easily obtained from existing monocular
+depth estimation networks together with an object scale. Through rigorous
+experimentation and validation on phantom imagery, our approach demonstrates
+exceptional accuracy in completely rendering colon sections, even capturing
+unseen portions of the surface. This breakthrough opens avenues for achieving
+stable and consistently scaled reconstructions, promising enhanced quality in
+cancer screening procedures and treatment interventions.
+
+
+
+
+
+ + ♻ ☆ CT-AGRG: Automated Abnormality-Guided Report Generation from 3D Chest CT + Volumes + + +
+ The rapid increase in computed tomography (CT) scans and their
+time-consuming manual analysis have created an urgent need for robust
+automated analysis techniques in clinical settings. These aim to assist
+radiologists and help them manage their growing workload. Existing methods
+typically generate entire reports directly from 3D CT images, without
+explicitly focusing on observed abnormalities. This unguided approach often
+results in repetitive content or incomplete reports, failing to prioritize
+anomaly-specific descriptions. We propose a new anomaly-guided report
+generation model, which first predicts abnormalities and then generates
+targeted descriptions for each. Evaluation on a public dataset demonstrates
+significant improvements in report quality and clinical relevance. We extend
+our work by conducting an ablation study to demonstrate its effectiveness.
+
+
+ comment: 15 pages, 9 figures, submitted to ISBI 2025 +
+
+
+
+
+ + ♻ ☆ HumanSplat: Generalizable Single-Image Human Gaussian Splatting with + Structure Priors + + +
+ Despite recent advancements in high-fidelity human reconstruction techniques, +the requirements for densely captured images or time-consuming per-instance +optimization significantly hinder their applications in broader scenarios. To +tackle these issues, we present HumanSplat which predicts the 3D Gaussian +Splatting properties of any human from a single input image in a generalizable +manner. In particular, HumanSplat comprises a 2D multi-view diffusion model and +a latent reconstruction transformer with human structure priors that adeptly +integrate geometric priors and semantic features within a unified framework. A +hierarchical loss that incorporates human semantic information is further +designed to achieve high-fidelity texture modeling and better constrain the +estimated multiple views. Comprehensive experiments on standard benchmarks and +in-the-wild images demonstrate that HumanSplat surpasses existing +state-of-the-art methods in achieving photorealistic novel-view synthesis. + +
+
+
+
+
+ + ♻ ☆ einspace: Searching for Neural Architectures from Fundamental Operations NeurIPS 2024 + + +
+ Neural architecture search (NAS) finds high performing networks for a given +task. Yet the results of NAS are fairly prosaic; they did not e.g. create a +shift from convolutional structures to transformers. This is not least because +the search spaces in NAS often aren't diverse enough to include such +transformations a priori. Instead, for NAS to provide greater potential for +fundamental design shifts, we need a novel expressive search space design which +is built from more fundamental operations. To this end, we introduce einspace, +a search space based on a parameterised probabilistic context-free grammar. Our +space is versatile, supporting architectures of various sizes and complexities, +while also containing diverse network operations which allow it to model +convolutions, attention components and more. It contains many existing +competitive architectures, and provides flexibility for discovering new ones. +Using this search space, we perform experiments to find novel architectures as +well as improvements on existing ones on the diverse Unseen NAS datasets. We +show that competitive architectures can be obtained by searching from scratch, +and we consistently find large improvements when initialising the search with +strong baselines. We believe that this work is an important advancement towards +a transformative NAS paradigm where search space expressivity and strategic +search initialisation play key roles. + +
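+
+ Sampling an architecture from a probabilistic context-free grammar amounts to
+recursively expanding non-terminals according to production probabilities.
+The toy grammar below illustrates the mechanism only; it is not einspace's
+actual rule set.
+
+```python
+import random
+
+# A toy probabilistic context-free grammar over network modules.
+# Each non-terminal maps to a list of (expansion, probability) productions.
+GRAMMAR = {
+    "NET":   [(["BLOCK", "NET"], 0.6), (["BLOCK"], 0.4)],
+    "BLOCK": [(["conv3x3", "relu"], 0.4), (["attention"], 0.3),
+              (["BLOCK", "BLOCK"], 0.3)],
+}
+
+def sample(symbol="NET", rng=random.Random(0), depth=0, max_depth=8):
+    """Recursively expand `symbol`; cap depth so sampling always terminates."""
+    if symbol not in GRAMMAR:
+        return [symbol]                      # terminal operation
+    if depth >= max_depth:
+        return ["conv3x3"]                   # fallback at the depth limit
+    expansions, weights = zip(*GRAMMAR[symbol])
+    chosen = rng.choices(expansions, weights=weights, k=1)[0]
+    out = []
+    for s in chosen:
+        out.extend(sample(s, rng, depth + 1, max_depth))
+    return out
+
+print(sample())  # a flat list of primitive operations
+```
+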
+
+ comment: NeurIPS 2024. Project page at + https://linusericsson.github.io/einspace/ +
+
+
+
+
+ + ♻ ☆ Improving Hateful Meme Detection through Retrieval-Guided Contrastive + Learning + + +
+ Hateful memes have emerged as a significant concern on the Internet. +Detecting hateful memes requires the system to jointly understand the visual +and textual modalities. Our investigation reveals that the embedding space of +existing CLIP-based systems lacks sensitivity to subtle differences in memes +that are vital for correct hatefulness classification. We propose constructing +a hatefulness-aware embedding space through retrieval-guided contrastive +training. Our approach achieves state-of-the-art performance on the +HatefulMemes dataset with an AUROC of 87.0, outperforming much larger +fine-tuned large multimodal models. We demonstrate a retrieval-based hateful +memes detection system, which is capable of identifying hatefulness based on +data unseen in training. This allows developers to update the hateful memes +detection system by simply adding new examples without retraining, a desirable +feature for real services in the constantly evolving landscape of hateful memes +on the Internet. + +
+
+ comment: ACL 2024 Main. The code is available from: + https://github.com/JingbiaoMei/RGCL +
+
+
+
+
+ + ♻ ☆ A Strong Baseline for Semi-Supervised Incremental Few-Shot Learning + + +
+ Few-shot learning (FSL) aims to learn models that generalize to novel classes +with limited training samples. Recent works advance FSL towards a scenario +where unlabeled examples are also available and propose semi-supervised FSL +methods. Another line of methods also cares about the performance of base +classes in addition to the novel ones and thus establishes the incremental FSL +scenario. In this paper, we generalize the above two under a more realistic yet +complex setting, named by Semi-Supervised Incremental Few-Shot Learning (S2 +I-FSL). To tackle the task, we propose a novel paradigm containing two parts: +(1) a well-designed meta-training algorithm for mitigating ambiguity between +base and novel classes caused by unreliable pseudo labels and (2) a model +adaptation mechanism to learn discriminative features for novel classes while +preserving base knowledge using few labeled and all the unlabeled data. +Extensive experiments on standard FSL, semi-supervised FSL, incremental FSL, +and the firstly built S2 I-FSL benchmarks demonstrate the effectiveness of our +proposed method. + +
+
+ comment: Accepted by BMVC2021 +
+
+
+
+
+ + ♻ ☆ Implicit-ARAP: Efficient Handle-Guided Deformation of High-Resolution + Meshes and Neural Fields via Local Patch Meshing + + +
+ In this work, we present the local patch mesh representation for neural
+signed distance fields. This technique allows us to discretize local regions
+of the level sets of an input SDF by projecting and deforming flat patch
+meshes onto the level set surface, using exclusively the SDF information and
+its gradient. Our analysis reveals this method to be more accurate than the
+standard marching cubes algorithm for approximating the implicit surface.
+Then, we apply this representation in the setting of handle-guided
+deformation: we introduce two distinct pipelines, which make use of 3D neural
+fields to compute As-Rigid-As-Possible deformations of both high-resolution
+meshes and neural fields under a given set of constraints. We run a
+comprehensive evaluation of our method and various baselines for neural field
+and mesh deformation, which shows that both pipelines achieve impressive
+efficiency and notable improvements in terms of result quality and
+robustness. With our novel pipeline, we introduce a scalable approach to solve
+a well-established geometry processing problem on high-resolution meshes, and
+pave the way for extending other geometric tasks to the domain of implicit
+surfaces via local patch meshing.
+
+
+ comment: 12 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ MMSummary: Multimodal Summary Generation for Fetal Ultrasound Video + + +
+ We present the first automated multimodal summary generation system, +MMSummary, for medical imaging video, particularly with a focus on fetal +ultrasound analysis. Imitating the examination process performed by a human +sonographer, MMSummary is designed as a three-stage pipeline, progressing from +keyframe detection to keyframe captioning and finally anatomy segmentation and +measurement. In the keyframe detection stage, an innovative automated workflow +is proposed to progressively select a concise set of keyframes, preserving +sufficient video information without redundancy. Subsequently, we adapt a large +language model to generate meaningful captions for fetal ultrasound keyframes +in the keyframe captioning stage. If a keyframe is captioned as fetal biometry, +the segmentation and measurement stage estimates biometric parameters by +segmenting the region of interest according to the textual prior. The MMSummary +system provides comprehensive summaries for fetal ultrasound examinations and +based on reported experiments is estimated to reduce scanning time by +approximately 31.5%, thereby suggesting the potential to enhance clinical +workflow efficiency. + +
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Image captioning in different languages + + +
+ This short position paper provides a manually curated list of non-English +image captioning datasets (as of May 2024). Through this list, we can observe +the dearth of datasets in different languages: only 23 different languages are +represented. With the addition of the Crossmodal-3600 dataset (Thapliyal et +al., 2022, 36 languages) this number increases somewhat, but still this number +is small compared to the +/-500 institutional languages that are out there. +This paper closes with some open questions for the field of Vision & Language. + +
+
+
+
+
+ + ♻ ☆ IntLoRA: Integral Low-rank Adaptation of Quantized Diffusion Models + + +
+ Fine-tuning large-scale text-to-image diffusion models for various
+downstream tasks has yielded impressive results. However, the heavy
+computational burdens of tuning large models prevent personal customization.
+Recent advances have attempted to employ parameter-efficient fine-tuning
+(PEFT) techniques to adapt the floating-point (FP) or quantized pre-trained
+weights. Nonetheless, the adaptation parameters in existing works are still
+restricted to FP arithmetic, hindering hardware-friendly acceleration. In this
+work, we propose IntLoRA to further push the efficiency limits by using
+integer-type (INT) low-rank parameters to adapt the quantized diffusion
+models. By working in integer arithmetic, our IntLoRA offers three key
+advantages: (i) for fine-tuning, the pre-trained weights are quantized,
+reducing memory usage; (ii) for storage, both pre-trained and low-rank weights
+are in INT, which consumes less disk space; (iii) for inference, IntLoRA
+weights can be naturally merged into quantized pre-trained weights through
+efficient integer multiplication or bit-shifting, eliminating additional
+post-training quantization. Extensive experiments demonstrate that IntLoRA
+can achieve performance on par with or even superior to the vanilla LoRA,
+accompanied by significant efficiency improvements. Code is available at
+\url{https://github.com/csguoh/IntLoRA}.
+
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Beyond Strong labels: Weakly-supervised Learning Based on Gaussian + Pseudo Labels for The Segmentation of Ellipse-like Vascular Structures in + Non-contrast CTs + + +
+ Deep-learning-based automated segmentation of vascular structures in +preoperative CT scans contributes to computer-assisted diagnosis and +intervention procedure in vascular diseases. While CT angiography (CTA) is the +common standard, non-contrast CT imaging is significant as a contrast-risk-free +alternative, avoiding complications associated with contrast agents. However, +the challenges of labor-intensive labeling and high labeling variability due to +the ambiguity of vascular boundaries hinder conventional strong-label-based, +fully-supervised learning in non-contrast CTs. This paper introduces a +weakly-supervised framework using ellipses' topology in slices, including 1) an +efficient annotation process based on predefined standards, 2) ellipse-fitting +processing, 3) the generation of 2D Gaussian heatmaps serving as pseudo labels, +4) a training process through a combination of voxel reconstruction loss and +distribution loss with the pseudo labels. We assess the effectiveness of the +proposed method on one local and two public datasets comprising non-contrast CT +scans, particularly focusing on the abdominal aorta. On the local dataset, our +weakly-supervised learning approach based on pseudo labels outperforms +strong-label-based fully-supervised learning (1.54\% of Dice score on average), +reducing labeling time by around 82.0\%. The efficiency in generating pseudo +labels allows the inclusion of label-agnostic external data in the training +set, leading to an additional improvement in performance (2.74\% of Dice score +on average) with a reduction of 66.3\% labeling time, where the labeling time +remains considerably less than that of strong labels. On the public dataset, +the pseudo labels achieve an overall improvement of 1.95\% in Dice score for 2D +models while a reduction of 11.65 voxel spacing in Hausdorff distance for 3D +model. + +
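+
+ The pseudo-label construction (step 3 above, turning a fitted ellipse into a
+2D Gaussian heatmap) can be sketched in a few lines. The function below is an
+illustrative rendering with assumed parameters, not the paper's
+implementation.
+
+```python
+import numpy as np
+
+def ellipse_gaussian_heatmap(h, w, cx, cy, ax_x, ax_y, theta=0.0):
+    """2D Gaussian pseudo label aligned with an ellipse given by its center
+    (cx, cy), semi-axes (ax_x, ax_y), and rotation angle theta (radians)."""
+    ys, xs = np.mgrid[0:h, 0:w]
+    x, y = xs - cx, ys - cy
+    xr = x * np.cos(theta) + y * np.sin(theta)
+    yr = -x * np.sin(theta) + y * np.cos(theta)
+    return np.exp(-0.5 * ((xr / ax_x) ** 2 + (yr / ax_y) ** 2))
+
+heat = ellipse_gaussian_heatmap(64, 64, cx=30, cy=32, ax_x=8, ax_y=5, theta=0.3)
+print(heat.shape, round(float(heat.max()), 3))
+```
+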
+
+ comment: Accepted by journal of Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ ES-Gaussian: Gaussian Splatting Mapping via Error Space-Based Gaussian + Completion + + +
+ Accurate and affordable indoor 3D reconstruction is critical for effective +robot navigation and interaction. Traditional LiDAR-based mapping provides high +precision but is costly, heavy, and power-intensive, with limited ability for +novel view rendering. Vision-based mapping, while cost-effective and capable of +capturing visual data, often struggles with high-quality 3D reconstruction due +to sparse point clouds. We propose ES-Gaussian, an end-to-end system using a +low-altitude camera and single-line LiDAR for high-quality 3D indoor +reconstruction. Our system features Visual Error Construction (VEC) to enhance +sparse point clouds by identifying and correcting areas with insufficient +geometric detail from 2D error maps. Additionally, we introduce a novel 3DGS +initialization method guided by single-line LiDAR, overcoming the limitations +of traditional multi-view setups and enabling effective reconstruction in +resource-constrained environments. Extensive experimental results on our new +Dreame-SR dataset and a publicly available dataset demonstrate that ES-Gaussian +outperforms existing methods, particularly in challenging scenarios. The +project page is available at https://chenlu-china.github.io/ES-Gaussian/. + +
+
+ comment: This preprint has been withdrawn due to concerns regarding the + originality of certain technical elements, as well as its basis in a company + project report that was intended solely for internal discussions. To avoid + any potential misunderstandings, we have decided to withdraw this submission + from public access. We apologize for any confusion this may have caused +
+
+
+
+
+ + ♻ ☆ VISAGE: Video Synthesis using Action Graphs for Surgery + + +
+ Surgical data science (SDS) is a field that analyzes patient data before, +during, and after surgery to improve surgical outcomes and skills. However, +surgical data is scarce, heterogeneous, and complex, which limits the +applicability of existing machine learning methods. In this work, we introduce +the novel task of future video generation in laparoscopic surgery. This task +can augment and enrich the existing surgical data and enable various +applications, such as simulation, analysis, and robot-aided surgery. +Ultimately, it involves not only understanding the current state of the +operation but also accurately predicting the dynamic and often unpredictable +nature of surgical procedures. Our proposed method, VISAGE (VIdeo Synthesis +using Action Graphs for Surgery), leverages the power of action scene graphs to +capture the sequential nature of laparoscopic procedures and utilizes diffusion +models to synthesize temporally coherent video sequences. VISAGE predicts the +future frames given only a single initial frame, and the action graph triplets. +By incorporating domain-specific knowledge through the action graph, VISAGE +ensures the generated videos adhere to the expected visual and motion patterns +observed in real laparoscopic procedures. The results of our experiments +demonstrate high-fidelity video generation for laparoscopy procedures, which +enables various applications in SDS. + +
+
+ comment: Accepted at MICCAI 2024 Embodied AI and Robotics for HealTHcare + (EARTH) Workshop +
+
+
+
+
+ + ♻ ☆ PARE-Net: Position-Aware Rotation-Equivariant Networks for Robust Point + Cloud Registration ECCV 2025 + + +
+ Learning rotation-invariant distinctive features is a fundamental
+requirement for point cloud registration. Existing methods often use
+rotation-sensitive networks to extract features, while employing rotation
+augmentation to crudely learn an approximately invariant mapping. This makes
+networks fragile to rotations and unnecessarily heavy, and hinders the
+distinctiveness of features. To tackle these problems, we propose a novel
+position-aware rotation-equivariant network for efficient, lightweight, and
+robust registration. The network can provide a strong model inductive bias to
+learn rotation-equivariant/invariant features, thus addressing the
+aforementioned limitations. To further improve the distinctiveness of
+descriptors, we propose a position-aware convolution, which can better learn
+spatial information of local structures. Moreover, we also propose a
+feature-based hypothesis proposer. It leverages rotation-equivariant features
+that encode fine-grained structure orientations to generate reliable model
+hypotheses. Each correspondence can generate a hypothesis, so it is more
+efficient than classic estimators that require multiple reliable
+correspondences. Accordingly, a contrastive rotation loss is presented to
+enhance the robustness of rotation-equivariant features against data
+degradation. Extensive experiments on indoor and outdoor datasets demonstrate
+that our method significantly outperforms the SOTA methods in terms of
+registration recall while being lightweight and keeping a fast speed.
+Moreover, experiments on rotated datasets demonstrate its robustness against
+rotation variations. Code is available at https://github.com/yaorz97/PARENet.
+
+
+ comment: Accepted by ECCV 2025 +
+
+
+
+
+ + ♻ ☆ VCR-GauS: View Consistent Depth-Normal Regularizer for Gaussian Surface + Reconstruction + + +
+ Although 3D Gaussian Splatting has been widely studied because of its +realistic and efficient novel-view synthesis, it is still challenging to +extract a high-quality surface from the point-based representation. Previous +works improve the surface by incorporating geometric priors from the +off-the-shelf normal estimator. However, there are two main limitations: 1) +Supervising normals rendered from 3D Gaussians effectively updates the rotation +parameter but is less effective for other geometric parameters; 2) The +inconsistency of predicted normal maps across multiple views may lead to severe +reconstruction artifacts. In this paper, we propose a Depth-Normal regularizer +that directly couples normal with other geometric parameters, leading to full +updates of the geometric parameters from normal regularization. We further +propose a confidence term to mitigate inconsistencies of normal predictions +across multiple views. Moreover, we also introduce a densification and +splitting strategy to regularize the size and distribution of 3D Gaussians for +more accurate surface modeling. Compared with Gaussian-based baselines, +experiments show that our approach obtains better reconstruction quality and +maintains competitive appearance quality at faster training speed and 100+ FPS +rendering. + +
+
+ comment: Project page: https://hlinchen.github.io/projects/VCR-GauS/ +
+
+
+
+
+ + ♻ ☆ Transformer-Based Tooth Alignment Prediction With Occlusion And + Collision Constraints + + +
+ The planning of digital orthodontic treatment requires providing tooth
+alignment, which not only consumes a lot of time and labor to determine
+manually but also relies heavily on clinical experience. In this work, we
+propose a lightweight tooth alignment neural network based on the Swin
+Transformer. We first re-organize 3D point clouds based on virtual arch lines
+and convert them into order-sorted multi-channel textures, which improves
+accuracy and efficiency simultaneously. We then design two new occlusal loss
+functions that quantitatively evaluate the occlusal relationship between the
+upper and lower jaws. They are important clinical constraints, introduced for
+the first time to the best of our knowledge, and lead to cutting-edge
+prediction accuracy. To train our network, we collected a large digital
+orthodontic dataset that has 591 clinical cases, including various complex
+clinical cases. This dataset will benefit the community after its release,
+since no open dataset is available so far. Furthermore, we also propose two
+new orthodontic dataset augmentation methods considering tooth spatial
+distribution and occlusion. We evaluate our method on this dataset with
+extensive experiments, including comparisons with STAT methods and ablation
+studies, and demonstrate the high prediction accuracy of our method.
+
+
+ comment: add key words and email information +
+
+
+
+
+ + ♻ ☆ Continual Learning in the Frequency Domain NeurIPS 2024 + + +
+ Continual learning (CL) is designed to learn new tasks while preserving +existing knowledge. Replaying samples from earlier tasks has proven to be an +effective method to mitigate the forgetting of previously acquired knowledge. +However, the current research on the training efficiency of rehearsal-based +methods is insufficient, which limits the practical application of CL systems +in resource-limited scenarios. The human visual system (HVS) exhibits varying +sensitivities to different frequency components, enabling the efficient +elimination of visually redundant information. Inspired by HVS, we propose a +novel framework called Continual Learning in the Frequency Domain (CLFD). To +our knowledge, this is the first study to utilize frequency domain features to +enhance the performance and efficiency of CL training on edge devices. For the +input features of the feature extractor, CLFD employs wavelet transform to map +the original input image into the frequency domain, thereby effectively +reducing the size of input feature maps. Regarding the output features of the +feature extractor, CLFD selectively utilizes output features for distinct +classes for classification, thereby balancing the reusability and interference +of output features based on the frequency domain similarity of the classes +across various tasks. Optimizing only the input and output features of the +feature extractor allows for seamless integration of CLFD with various +rehearsal-based methods. Extensive experiments conducted in both cloud and edge +environments demonstrate that CLFD consistently improves the performance of +state-of-the-art (SOTA) methods in both precision and training efficiency. +Specifically, CLFD can increase the accuracy of the SOTA CL method by up to +6.83% and reduce the training time by 2.6$\times$. + +
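+
+ The input-side idea, mapping the image into the frequency domain with a
+wavelet transform so that the feature extractor sees smaller feature maps, can
+be sketched with PyWavelets. Keeping only the low-frequency approximation
+subband here is an illustrative simplification of CLFD, not its exact
+frequency-selection scheme.
+
+```python
+# Requires PyWavelets (pip install PyWavelets).
+import numpy as np
+import pywt
+
+def to_frequency_input(image):
+    """One-level 2D DWT per channel; returns the low-frequency subbands,
+    halving the spatial size of the maps fed to the feature extractor."""
+    bands = []
+    for c in range(image.shape[0]):               # image: (C, H, W)
+        cA, _details = pywt.dwt2(image[c], "haar")
+        bands.append(cA)
+    return np.stack(bands)                        # (C, H/2, W/2)
+
+img = np.random.rand(3, 32, 32)
+print(to_frequency_input(img).shape)              # -> (3, 16, 16)
+```
+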
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Degradation Oriented and Regularized Network for Real-World Depth + Super-Resolution + + +
+ Recent RGB-guided depth super-resolution methods have achieved impressive
+performance under the assumption of fixed and known degradation (e.g., bicubic
+downsampling). However, in real-world scenarios, captured depth data often
+suffer from unconventional and unknown degradation due to sensor limitations
+and complex imaging environments (e.g., low reflective surfaces, varying
+illumination). Consequently, the performance of these methods significantly
+declines when the real-world degradation deviates from their assumptions. In
+this paper, we propose the Degradation Oriented and Regularized Network
+(DORNet), a novel framework designed to adaptively address unknown degradation
+in real-world scenes through implicit degradation representations. Our
+approach begins with the development of a self-supervised degradation learning
+strategy, which models the degradation representations of low-resolution depth
+data using routing selection-based degradation regularization. To facilitate
+effective RGB-D fusion, we further introduce a degradation-oriented feature
+transformation module that selectively propagates RGB content into the depth
+data based on the learned degradation priors. Extensive experimental results
+on both real and synthetic datasets demonstrate the superiority of our DORNet.
+The code is available at https://github.com/yanzq95/DORNet.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ MC-MKE: A Fine-Grained Multimodal Knowledge Editing Benchmark + Emphasizing Modality Consistency + + +
+ Multimodal large language models (MLLMs) are prone to non-factual or outdated +knowledge issues, which can manifest as misreading and misrecognition errors +due to the complexity of multimodal knowledge. Previous benchmarks have not +systematically analyzed the performance of editing methods in correcting these +two error types. To better represent and correct these errors, we decompose +multimodal knowledge into its visual and textual components. Different error +types correspond to different editing formats, which edit distinct parts of the +multimodal knowledge. We present MC-MKE, a fine-grained Multimodal Knowledge +Editing benchmark emphasizing Modality Consistency. Our benchmark facilitates +independent correction of misreading and misrecognition errors by editing the +corresponding knowledge component. We evaluate four multimodal knowledge +editing methods on MC-MKE, revealing their limitations, particularly in terms +of modality consistency. Our work highlights the challenges posed by multimodal +knowledge editing and motivates further research in developing effective +techniques for this task. + +
+
+
+
+
+ + ♻ ☆ Causal Deciphering and Inpainting in Spatio-Temporal Dynamics via + Diffusion Model + + +
+ Spatio-temporal (ST) prediction has garnered considerable attention in the
+earth sciences, for tasks such as meteorological prediction and human mobility
+perception. However, the scarcity of data coupled with the high expenses
+involved in sensor deployment results in notable data imbalances. Furthermore,
+models that are excessively customized and devoid of causal connections
+further undermine generalizability and interpretability. To this end, we
+establish a causal framework for ST predictions, termed CaPaint, which aims to
+identify causal regions in the data and endow the model with causal reasoning
+ability in a two-stage process. Going beyond this process, we utilize the
+back-door adjustment to specifically address the sub-regions identified as
+non-causal in the upstream phase. Specifically, we employ a novel image
+inpainting technique. By using a fine-tuned unconditional Diffusion
+Probabilistic Model (DDPM) as the generative prior, we in-fill the masks
+defined as environmental parts, offering the possibility of reliable
+extrapolation for potential data distributions. CaPaint overcomes the high
+complexity dilemma of optimal ST causal discovery models by reducing the data
+generation complexity from exponential to quasi-linear levels. Extensive
+experiments conducted on five real-world ST benchmarks demonstrate that
+integrating the CaPaint concept allows models to achieve improvements ranging
+from 4.3% to 77.3%. Moreover, compared to traditional mainstream ST
+augmenters, CaPaint underscores the potential of diffusion models in ST
+enhancement, offering a novel paradigm for this field. Our project is
+available at https://anonymous.4open.science/r/12345-DFCC.
+
+
+
+
+
+ + ♻ ☆ Incorporating Test-Time Optimization into Training with Dual Networks + for Human Mesh Recovery + + +
+ Human Mesh Recovery (HMR) is the task of estimating a parameterized 3D human
+mesh from an image. One class of methods first trains a regression model for
+this problem and then further optimizes the pretrained regression model for
+each specific sample individually at test time. However, the pretrained model
+may not provide an ideal optimization starting point for the test-time
+optimization. Inspired by meta-learning, we incorporate the test-time
+optimization into training, performing a step of test-time optimization for
+each sample in the training batch before actually conducting the training
+optimization over all the training samples. In this way, we obtain a
+meta-model, the meta-parameter of which is friendly to the test-time
+optimization. At test time, after several test-time optimization steps
+starting from the meta-parameter, we obtain much higher HMR accuracy than the
+test-time optimization starting from the simply pretrained regression model.
+Furthermore, we find that test-time HMR objectives are different from
+training-time objectives, which reduces the effectiveness of the learning of
+the meta-model. To solve this problem, we propose a dual-network architecture
+that unifies the training-time and test-time objectives. Our method, armed
+with meta-learning and the dual networks, outperforms state-of-the-art
+regression-based and optimization-based HMR approaches, as validated by
+extensive experiments. The codes are available at
+https://github.com/fmx789/Meta-HMR.
+
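+
+ The training recipe resembles first-order meta-learning: take an inner
+"test-time" step for each sample, then apply the supervised update from the
+adapted parameters. The toy one-dimensional NumPy example below uses stand-in
+objectives to show the loop structure; it is not the HMR pipeline or the
+dual-network variant.
+
+```python
+import numpy as np
+
+def tt_loss_grad(theta, x):          # stand-in self-supervised test-time objective
+    return 2 * (theta - x)           # d/dtheta of (theta - x)^2
+
+def train_loss_grad(theta, x, y):    # stand-in supervised training objective
+    return 2 * (theta * x - y) * x   # d/dtheta of (theta * x - y)^2
+
+def meta_step(theta, batch, inner_lr=0.05, outer_lr=0.1):
+    """First-order variant: inner test-time step per sample, then an outer
+    update using the training gradient evaluated at the adapted parameter."""
+    grads = []
+    for x, y in batch:
+        adapted = theta - inner_lr * tt_loss_grad(theta, x)  # inner (test-time) step
+        grads.append(train_loss_grad(adapted, x, y))         # outer gradient
+    return theta - outer_lr * np.mean(grads)
+
+theta = 0.0
+rng = np.random.default_rng(0)
+for _ in range(200):
+    xs = rng.normal(1.0, 0.2, size=8)
+    batch = [(x, 2.0 * x) for x in xs]  # ground truth slope = 2
+    theta = meta_step(theta, batch)
+print("learned meta-parameter:", round(theta, 3))
+```
+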
+
+
+
+
+ + ♻ ☆ VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio + Understanding in Video-LLMs + + +
+ In this paper, we present the VideoLLaMA 2, a set of Video Large Language +Models (Video-LLMs) designed to enhance spatial-temporal modeling and audio +understanding in video and audio-oriented tasks. Building upon its predecessor, +VideoLLaMA 2 incorporates a tailor-made Spatial-Temporal Convolution (STC) +connector, which effectively captures the intricate spatial and temporal +dynamics of video data. Additionally, we integrate an Audio Branch into the +model through joint training, thereby enriching the multimodal understanding +capabilities of the model by seamlessly incorporating audio cues. Comprehensive +evaluations on multiple-choice video question answering (MC-VQA), open-ended +video question answering (OE-VQA), and video captioning (VC) tasks demonstrate +that VideoLLaMA 2 consistently achieves competitive results among open-source +models and even gets close to some proprietary models on several benchmarks. +Furthermore, VideoLLaMA 2 exhibits reasonable improvements in audio-only and +audio-video question-answering (AQA & OE-AVQA) benchmarks over existing models. +These advancements underline VideoLLaMA 2's superior performance in multimodal +comprehension, setting a new standard for intelligent video analysis systems. +All models are public to facilitate further research. + +
+
+ comment: ZC, SL, HZ, YX, and XL contributed equally to this project. Code: + https://github.com/DAMO-NLP-SG/VideoLLaMA2 +
+
+
+
+
+ + ♻ ☆ TLCM: Training-efficient Latent Consistency Model for Image Generation + with 2-8 Steps + + +
+ Distilling latent diffusion models (LDMs) into ones that are fast to sample +from is attracting growing research interest. However, the majority of existing +methods face two critical challenges: (1) They hinge on long training using a +huge volume of real data. (2) They routinely lead to quality degradation for +generation, especially in text-image alignment. This paper proposes a novel +training-efficient Latent Consistency Model (TLCM) to overcome these +challenges. Our method first accelerates LDMs via data-free multistep latent +consistency distillation (MLCD), and then data-free latent consistency +distillation is proposed to efficiently guarantee the inter-segment consistency +in MLCD. Furthermore, we introduce bags of techniques, e.g., distribution +matching, adversarial learning, and preference learning, to enhance TLCM's +performance at few-step inference without any real data. TLCM demonstrates a +high level of flexibility by enabling adjustment of sampling steps within the +range of 2 to 8 while still producing competitive outputs compared to full-step +approaches. Notably, TLCM enjoys the data-free merit by employing synthetic +data from the teacher for distillation. With just 70 training hours on an A100 +GPU, a 3-step TLCM distilled from SDXL achieves an impressive CLIP Score of +33.68 and an Aesthetic Score of 5.97 on the MSCOCO-2017 5K benchmark, +surpassing various accelerated models and even outperforming the teacher model +in human preference metrics. We also demonstrate the versatility of TLCMs in +applications including image style transfer, controllable generation, and +Chinese-to-image generation. + +
+
+
+
+
+ + ♻ ☆ Analyzing Noise Models and Advanced Filtering Algorithms for Image + Enhancement + + +
+ Noise, an unwanted component in an image, can degrade the image during
+transmission or capture. Removing noise from images is still a challenging
+task. Digital image processing is a component of digital signal processing. A
+wide variety of algorithms can be used in image processing to apply to an
+image or an input dataset and obtain important outcomes. In image processing
+research, removing noise from images before further analysis is essential.
+Removing noise improves clarity, enabling better interpretation and analysis
+across medical imaging, satellite imagery, and radar applications. While
+numerous algorithms exist, each comes with its own assumptions, strengths, and
+limitations. This paper aims to evaluate the effectiveness of different
+filtering techniques on images with eight types of noise. It evaluates
+methodologies such as Wiener, median, Gaussian, mean, low-pass, high-pass,
+Laplacian, and bilateral filtering, using peak signal-to-noise ratio (PSNR) as
+the performance metric. It shows the impact of different filters on noise
+models by applying a variety of filters to various kinds of noise, and it
+helps determine which filtering strategy is most appropriate for a given noise
+model under the circumstances.
+
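+
+ The evaluation protocol can be reproduced in miniature: corrupt a clean image
+with a noise model, apply candidate filters, and compare PSNR. The sketch
+below uses SciPy's mean, median, and Gaussian filters on synthetic data; the
+noise level and filter sizes are arbitrary choices, not the paper's settings.
+
+```python
+import numpy as np
+from scipy.ndimage import gaussian_filter, median_filter, uniform_filter
+
+def psnr(clean, test, peak=1.0):
+    mse = np.mean((clean - test) ** 2)
+    return 10 * np.log10(peak ** 2 / (mse + 1e-12))
+
+rng = np.random.default_rng(0)
+clean = gaussian_filter(rng.random((128, 128)), sigma=4)         # smooth synthetic image
+noisy = np.clip(clean + rng.normal(0, 0.05, clean.shape), 0, 1)  # Gaussian noise model
+
+print("noisy    %.2f dB" % psnr(clean, noisy))
+for name, filt in [("mean    ", lambda x: uniform_filter(x, size=3)),
+                   ("median  ", lambda x: median_filter(x, size=3)),
+                   ("gaussian", lambda x: gaussian_filter(x, sigma=1))]:
+    print("%s %.2f dB" % (name, psnr(clean, filt(noisy))))
+```
+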
+
+
+
+
+ + ♻ ☆ GuardT2I: Defending Text-to-Image Models from Adversarial Prompts NeurIPS2024 + + +
+ Recent advancements in Text-to-Image (T2I) models have raised significant +safety concerns about their potential misuse for generating inappropriate or +Not-Safe-For-Work (NSFW) contents, despite existing countermeasures such as +NSFW classifiers or model fine-tuning for inappropriate concept removal. +Addressing this challenge, our study unveils GuardT2I, a novel moderation +framework that adopts a generative approach to enhance T2I models' robustness +against adversarial prompts. Instead of making a binary classification, +GuardT2I utilizes a Large Language Model (LLM) to conditionally transform text +guidance embeddings within the T2I models into natural language for effective +adversarial prompt detection, without compromising the models' inherent +performance. Our extensive experiments reveal that GuardT2I outperforms leading +commercial solutions like OpenAI-Moderation and Microsoft Azure Moderator by a +significant margin across diverse adversarial scenarios. Our framework is +available at https://github.com/cure-lab/GuardT2I. + +
+
+ comment: NeurIPS2024 Poster +
+
+
+
+
+ + ♻ ☆ YourSkatingCoach: A Figure Skating Video Benchmark for Fine-Grained + Element Analysis + + +
+ Combining sports and machine learning involves leveraging ML algorithms and
+techniques to extract insight from sports-related data such as player
+statistics, game footage, and other relevant information. However, datasets
+related to figure skating in the literature focus primarily on element
+classification and are currently unavailable or exhibit only limited access,
+which greatly raises the entry barrier to developing visual sports technology
+for it. Moreover, when using such data to help athletes improve their skills,
+we find they are very coarse-grained: they work for learning what an element
+is, but they are poorly suited to learning whether the element is good or bad.
+Here we propose air time detection, a novel motion analysis task, the goal of
+which is to accurately detect the duration of the air time of a jump. We
+present YourSkatingCoach, a large, novel figure skating dataset which contains
+454 videos of jump elements, the detected skater skeletons in each video,
+along with the gold labels of the start and ending frames of each jump,
+together as a video benchmark for figure skating. In addition, although this
+type of task is often viewed as classification, we cast it as a sequential
+labeling problem and propose a Transformer-based model to calculate the
+duration. Experimental results show that the proposed model yields favorable
+results against a strong baseline. To further verify the generalizability of
+the fine-grained labels, we apply the same process to other sports as
+cross-sport tasks, but for the coarse-grained task of action classification.
+Here we fine-tune the classifier to demonstrate that figure skating, as it
+contains the essential body movements, constitutes a strong foundation for
+adaptation to other sports.
+
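+
+ Casting air time detection as sequence labeling means predicting a per-frame
+"airborne" tag and converting the longest contiguous run of such tags into a
+duration. The helper below shows that conversion under an assumed frame rate;
+it is independent of the paper's Transformer model.
+
+```python
+def air_time_seconds(frame_labels, fps=30.0):
+    """Longest contiguous run of 'airborne' (1) frame labels, in seconds.
+    `frame_labels` is the per-frame 0/1 output of a sequence-labeling model."""
+    best = run = 0
+    for lab in frame_labels:
+        run = run + 1 if lab == 1 else 0
+        best = max(best, run)
+    return best / fps
+
+labels = [0] * 12 + [1] * 18 + [0] * 10  # a jump lasting 18 frames
+print(air_time_seconds(labels))          # 0.6 s at 30 fps
+```
+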
+
+
+
+
+ + ♻ ☆ Enhancing CNN Classification with Lamarckian Memetic Algorithms and + Local Search + + +
+ Optimization is critical for optimal performance in deep neural networks +(DNNs). Traditional gradient-based methods often face challenges like local +minima entrapment. This paper explores population-based metaheuristic +optimization algorithms for image classification networks. We propose a novel +approach integrating a two-stage training technique with population-based +optimization algorithms incorporating local search capabilities. Our +experiments demonstrate that the proposed method outperforms state-of-the-art +gradient-based techniques, such as ADAM, in accuracy and computational +efficiency, particularly with high computational complexity and numerous +trainable parameters. The results suggest that our approach offers a robust +alternative to traditional methods for weight optimization in convolutional +neural networks (CNNs). Future work will explore integrating adaptive +mechanisms for parameter tuning and applying the proposed method to other types +of neural networks and real-time applications. + +
+
+ comment: Accepted in IEEE SPARC 2024 +
+
+
+
+
+ + ♻ ☆ Feature distribution Adaptation Network for Speech Emotion Recognition + + +
+ In this paper, we propose a novel deep inductive transfer learning
+framework, named feature distribution adaptation network, to tackle the
+challenging multi-modal speech emotion recognition problem. Our method aims to
+use deep transfer learning strategies to align visual and audio feature
+distributions to obtain consistent representations of emotion, thereby
+improving the performance of speech emotion recognition. In our model, a
+pre-trained ResNet-34 is utilized to extract features from facial expression
+images and acoustic Mel spectrograms, respectively. Then, a cross-attention
+mechanism is introduced to model the intrinsic similarity relationships of the
+multi-modal features. Finally, the multi-modal feature distribution adaptation
+is performed efficiently with a feed-forward network, which is extended using
+the local maximum mean discrepancy loss. Experiments are carried out on two
+benchmark datasets, and the results demonstrate that our model can achieve
+excellent performance compared with existing ones.
+
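+
+ The distribution-alignment term is a maximum mean discrepancy (MMD) between
+the visual and audio feature batches; the paper uses a local variant, so the
+plain RBF-kernel MMD below is a simplified sketch rather than the exact loss.
+
+```python
+import numpy as np
+
+def rbf_mmd2(X, Y, sigma=1.0):
+    """Squared MMD between feature batches X and Y under an RBF kernel."""
+    def k(A, B):
+        d2 = ((A[:, None] - B[None]) ** 2).sum(-1)
+        return np.exp(-d2 / (2 * sigma ** 2))
+    return k(X, X).mean() + k(Y, Y).mean() - 2 * k(X, Y).mean()
+
+rng = np.random.default_rng(0)
+visual = rng.normal(0.0, 1.0, (64, 16))  # e.g. features from face images
+audio = rng.normal(0.5, 1.0, (64, 16))   # e.g. features from Mel spectrograms
+print("MMD^2, shifted distributions:", round(float(rbf_mmd2(visual, audio)), 4))
+print("MMD^2, same distribution:   ",
+      round(float(rbf_mmd2(visual, rng.normal(0.0, 1.0, (64, 16)))), 4))
+```
+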
+
+
+
+
+ + ♻ ☆ CycleCrash: A Dataset of Bicycle Collision Videos for Collision + Prediction and Analysis + + +
+ Self-driving research often underrepresents cyclist collisions and safety. To +address this, we present CycleCrash, a novel dataset consisting of 3,000 +dashcam videos with 436,347 frames that capture cyclists in a range of critical +situations, from collisions to safe interactions. This dataset enables 9 +different cyclist collision prediction and classification tasks focusing on +potentially hazardous conditions for cyclists and is annotated with +collision-related, cyclist-related, and scene-related labels. Next, we propose +VidNeXt, a novel method that leverages a ConvNeXt spatial encoder and a +non-stationary transformer to capture the temporal dynamics of videos for the +tasks defined in our dataset. To demonstrate the effectiveness of our method +and create additional baselines on CycleCrash, we apply and compare 7 models +along with a detailed ablation. We release the dataset and code at +https://github.com/DeSinister/CycleCrash/ . + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ On filter design in deep convolutional neural network + + +
+ The deep convolutional neural network (DCNN) has given promising results in
+computer vision. It is widely applied in many areas, including medicine,
+agriculture, self-driving cars, biometric systems, and almost all computer
+vision-based applications. Filters or weights are the critical elements
+responsible for learning in a DCNN. Backpropagation has been the primary
+learning algorithm for DCNNs and provides promising results, but the size and
+number of filters remain hyper-parameters. Various studies have been done in
+the last decade on semi-supervised, self-supervised, and unsupervised methods
+and their properties. The effects of filter initialization, size-shape
+selection, and the number of filters on learning and optimization have not
+been investigated in a separate publication that collates all the options.
+Such attributes are often treated as hyper-parameters and lack mathematical
+understanding. Computer vision algorithms have many limitations in real-life
+applications, and understanding the learning process is essential to achieve
+significant improvements. To the best of our knowledge, no separate
+investigation has been published discussing the filters; this is our primary
+motivation. This study focuses on arguments for choosing specific physical
+parameters of filters, initialization, and learning techniques over scattered
+methods. Promising unsupervised approaches have been evaluated. Additionally,
+the limitations, current challenges, and future scope are discussed in this
+paper.
+
+
+
+
+
+ + ♻ ☆ Hybrid SD: Edge-Cloud Collaborative Inference for Stable Diffusion + Models + + +
+ Stable Diffusion Models (SDMs) have shown remarkable proficiency in image +synthesis. However, their broad application is impeded by their large model +sizes and intensive computational requirements, which typically require +expensive cloud servers for deployment. On the flip side, while there are many +compact models tailored for edge devices that can reduce these demands, they +often compromise on semantic integrity and visual quality when compared to +full-sized SDMs. To bridge this gap, we introduce Hybrid SD, an innovative, +training-free SDMs inference framework designed for edge-cloud collaborative +inference. Hybrid SD distributes the early steps of the diffusion process to +the large models deployed on cloud servers, enhancing semantic planning. +Furthermore, small efficient models deployed on edge devices can be integrated +for refining visual details in the later stages. Acknowledging the diversity of +edge devices with differing computational and storage capacities, we employ +structural pruning to the SDMs U-Net and train a lightweight VAE. Empirical +evaluations demonstrate that our compressed models achieve state-of-the-art +parameter efficiency (225.8M) on edge devices with competitive image quality. +Additionally, Hybrid SD reduces the cloud cost by 66% with edge-cloud +collaborative inference. + +
+
+
+
+
+ + ♻ ☆ DiffGS: Functional Gaussian Splatting Diffusion NeurIPS 2024 + + +
+ 3D Gaussian Splatting (3DGS) has shown convincing performance in rendering +speed and fidelity, yet the generation of Gaussian Splatting remains a +challenge due to its discreteness and unstructured nature. In this work, we +propose DiffGS, a general Gaussian generator based on latent diffusion models. +DiffGS is a powerful and efficient 3D generative model which is capable of +generating Gaussian primitives at arbitrary numbers for high-fidelity rendering +with rasterization. The key insight is to represent Gaussian Splatting in a +disentangled manner via three novel functions to model Gaussian probabilities, +colors and transforms. Through the novel disentanglement of 3DGS, we represent +the discrete and unstructured 3DGS with continuous Gaussian Splatting +functions, where we then train a latent diffusion model with the target of +generating these Gaussian Splatting functions both unconditionally and +conditionally. Meanwhile, we introduce a discretization algorithm to extract +Gaussians at arbitrary numbers from the generated functions via octree-guided +sampling and optimization. We explore DiffGS for various tasks, including +unconditional generation, conditional generation from text, image, and partial +3DGS, as well as Point-to-Gaussian generation. We believe that DiffGS provides +a new direction for flexibly modeling and generating Gaussian Splatting. + +
+
+ comment: Accepted by NeurIPS 2024. Project page: + https://junshengzhou.github.io/DiffGS +
+
+
+
+
+ + ♻ ☆ Robots Pre-train Robots: Manipulation-Centric Robotic Representation + from Large-Scale Robot Datasets + + +
+ The pre-training of visual representations has enhanced the efficiency of robot learning. Due to the lack of large-scale in-domain robotic datasets, prior works utilize in-the-wild human videos to pre-train robotic visual representations. Despite their promising results, representations from human videos are inevitably subject to distribution shifts and lack the dynamics information crucial for task completion. We first evaluate various pre-trained representations in terms of their correlation with downstream robotic manipulation tasks (i.e., manipulation centricity). Interestingly, we find that manipulation centricity is a strong indicator of success rates when applied to downstream tasks. Drawing from these findings, we propose Manipulation Centric Representation (MCR), a foundation representation learning framework that captures both the visual features and the dynamics information, such as actions and proprioceptive states, of manipulation tasks to improve manipulation centricity. Specifically, we pre-train a visual encoder on the DROID robotic dataset and leverage motion-relevant data such as robot proprioceptive states and actions. We introduce a novel contrastive loss that aligns visual observations with the robot's proprioceptive state-action dynamics, combined with a behavior-cloning (BC)-like actor loss that predicts actions during pre-training, along with a time-contrastive loss. Empirical results across 4 simulation domains with 20 tasks verify that MCR outperforms the strongest baseline method by 14.8%. Moreover, MCR boosts the performance of data-efficient learning with a UR5e arm on 3 real-world tasks by 76.9%. Project website: https://robots-pretrain-robots.github.io/.
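In the spirit of the contrastive alignment described above (a hedged sketch, not the authors' loss), visual embeddings can be matched to the embeddings of their corresponding proprioceptive state-action pairs with an InfoNCE-style objective:

import torch
import torch.nn.functional as F

def alignment_loss(visual_emb, dyn_emb, temperature=0.1):
    # visual_emb: (B, D) image features; dyn_emb: (B, D) state-action features
    v = F.normalize(visual_emb, dim=-1)
    d = F.normalize(dyn_emb, dim=-1)
    logits = v @ d.t() / temperature                    # similarity of every pair in the batch
    targets = torch.arange(v.size(0), device=v.device)  # the matching pair is the positive
    return F.cross_entropy(logits, targets)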
+
+
+
+
+ + ♻ ☆ A Benchmark for AI-based Weather Data Assimilation + + +
+ Recent advancements in Artificial Intelligence (AI) have led to the development of several Large Weather Models (LWMs) that rival State-Of-The-Art (SOTA) Numerical Weather Prediction (NWP) systems. Until now, these models have still relied on traditional NWP-generated analysis fields as input and are far from autonomous. Currently, scientists are increasingly focusing on developing data-driven data assimilation (DA) models for LWMs. To expedite advancements in this field and facilitate the operationalization of data-driven end-to-end weather forecasting systems, we propose DABench, a benchmark constructed from simulated observations, real-world observations, and ERA5 reanalysis. DABench provides four standard features: (1) sparse and noisy observations for both simulated and real-world experiments; (2) a skillful pre-trained Transformer-based weather prediction model, Sformer, designed to generate background fields while rigorously assessing the impact of assimilation outcomes on predictions; (3) standardized evaluation metrics for model comparison; and (4) a strong DA baseline, 4DVarFormerV2. Our experimental results demonstrate that the end-to-end weather forecasting system, integrating 4DVarFormerV2 and Sformer, can assimilate real-world observations, thereby facilitating a stable DA cycle lasting one year and achieving a skillful forecasting lead time of up to 7 days. The proposed DABench will significantly advance research in AI-based DA, AI-based weather forecasting, and related domains.
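Schematically, the end-to-end system alternates between forecasting and assimilation; in the hedged sketch below, forecast_model and da_model merely stand in for Sformer and 4DVarFormerV2 and do not reflect the benchmark's actual interfaces:

def da_cycle(analysis, forecast_model, da_model, observation_stream, n_cycles):
    for _ in range(n_cycles):
        background = forecast_model(analysis)   # short-range forecast used as background
        obs = next(observation_stream)          # sparse, noisy observations
        analysis = da_model(background, obs)    # assimilation yields the next analysis
    return analysis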
+
+ comment: 38 pages, 21 figures, 4 tables
+
+
+
+
+ + ♻ ☆ Unleashing the Potential of Open-set Noisy Samples Against Label Noise + for Medical Image Classification + + +
+ Addressing mixed closed-set and open-set label noise in medical image +classification remains a largely unexplored challenge. Unlike natural image +classification, which often separates and processes closed-set and open-set +noisy samples from clean ones, medical image classification contends with high +inter-class similarity, complicating the identification of open-set noisy +samples. Additionally, existing methods often fail to fully utilize open-set +noisy samples for label noise mitigation, leading to their exclusion or the +application of uniform soft labels. To address these challenges, we propose the +Extended Noise-robust Contrastive and Open-set Feature Augmentation framework +for medical image classification tasks. This framework incorporates the +Extended Noise-robust Supervised Contrastive Loss, which helps differentiate +features among both in-distribution and out-of-distribution classes. This loss +treats open-set noisy samples as an extended class, improving label noise +mitigation by weighting contrastive pairs according to label reliability. +Additionally, we develop the Open-set Feature Augmentation module that enriches +open-set samples at the feature level and then assigns them dynamic class +labels, thereby leveraging the model's capacity and reducing overfitting to +noisy data. We evaluated the proposed framework on both a synthetic noisy +dataset and a real-world noisy dataset. The results indicate the superiority of +our framework over four existing methods and the effectiveness of leveraging +open-set noisy samples to combat label noise. + +
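A simplified sketch of the extended-class idea follows; the exact loss form and reliability weighting are assumptions, but the gist is that open-set noisy samples are relabeled into one extra class and contrastive pairs are weighted by how reliable both labels are:

import torch
import torch.nn.functional as F

def extended_supcon(features, labels, reliability, open_set_mask, num_classes, temperature=0.1):
    labels = labels.clone()
    labels[open_set_mask] = num_classes                  # open-set samples form one extra class
    z = F.normalize(features, dim=-1)
    sim = z @ z.t() / temperature
    not_self = ~torch.eye(len(z), dtype=torch.bool, device=z.device)
    pos_mask = (labels.unsqueeze(0) == labels.unsqueeze(1)) & not_self
    log_prob = sim - torch.logsumexp(sim.masked_fill(~not_self, float("-inf")), dim=1, keepdim=True)
    pair_w = reliability.unsqueeze(0) * reliability.unsqueeze(1)  # down-weight unreliable pairs
    per_anchor = (pair_w * pos_mask * log_prob).sum(1) / (pair_w * pos_mask).sum(1).clamp(min=1e-8)
    return -per_anchor.mean()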
+
+ comment: 14 pages, 6 figures
+
+
+
+
+ + ♻ ☆ LLaVA-VSD: Large Language-and-Vision Assistant for Visual Spatial + Description + + +
+ Visual Spatial Description (VSD) aims to generate texts that describe the +spatial relationships between objects within images. Traditional visual spatial +relationship classification (VSRC) methods typically output the spatial +relationship between two objects in an image, often neglecting world knowledge +and lacking general language capabilities. In this paper, we propose a Large +Language-and-Vision Assistant for Visual Spatial Description, named LLaVA-VSD, +which is designed for the classification, description, and open-ended +description of visual spatial relationships. Specifically, the model first +constructs a VSD instruction-following dataset using given figure-caption pairs +for the three tasks. It then employs LoRA to fine-tune a Large Language and +Vision Assistant for VSD, which has 13 billion parameters and supports +high-resolution images. Finally, a large language model (Qwen-2) is used to +refine the generated sentences, enhancing their diversity and accuracy. +LLaVA-VSD demonstrates excellent multimodal conversational capabilities and can +follow open-ended instructions to assist with inquiries about object +relationships in images. + +
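For orientation, LoRA fine-tuning of a 13B vision-language assistant typically looks like the hedged sketch below; the model id and target modules are illustrative assumptions rather than the paper's recipe:

from transformers import LlavaForConditionalGeneration
from peft import LoraConfig, get_peft_model

model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-13b-hf")
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # adapt only the attention projections
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)   # only the low-rank adapters remain trainable
model.print_trainable_parameters()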
+
+ comment: We have discovered a significant error in the paper that affects the + main conclusions. To ensure the accuracy of our research, we have decided to + withdraw this paper and will resubmit it after making the necessary + corrections +
+
+
+
+
+ + ♻ ☆ Improving Apple Object Detection with Occlusion-Enhanced Distillation + + +
+ Apples growing in natural environments often face severe visual obstruction from leaves and branches, which significantly increases the risk of false detections in object detection tasks. To address this issue, we introduce a technique called "Occlusion-Enhanced Distillation" (OED). This approach utilizes occlusion information to regularize the learning of semantically aligned features on occluded datasets and employs an Exponential Moving Average (EMA) to enhance training stability. Specifically, we first design an occlusion-enhanced dataset that integrates Grounding DINO and SAM to extract occluding elements such as leaves and branches from each sample, creating occlusion examples that reflect the natural growth state of fruits. Additionally, we propose a multi-scale knowledge distillation strategy in which the student network takes images with increased occlusion as input, while the teacher network uses images without natural occlusion. This setup guides the student network to align semantic and local features with the teacher across scales, effectively narrowing the feature distance between occluded and non-occluded targets and enhancing the robustness of object detection. Lastly, to improve the stability of the student network, we introduce an EMA strategy that helps the student learn more generalized feature representations that are less affected by the noise of individual image occlusions. Extensive comparative experiments show that our method significantly outperforms current state-of-the-art techniques.
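A hedged sketch of the setup described above: the student receives occlusion-augmented images, the frozen teacher receives the corresponding clean images, their multi-scale features are aligned, and an EMA copy of the student is kept for stability (loss choices and feature scales are assumptions):

import torch
import torch.nn.functional as F

def distill_loss(student, teacher, occluded_imgs, clean_imgs):
    with torch.no_grad():
        t_feats = teacher(clean_imgs)     # list of per-scale feature maps from clean images
    s_feats = student(occluded_imgs)      # same scales, but computed on occluded images
    return sum(F.mse_loss(s, t) for s, t in zip(s_feats, t_feats))

@torch.no_grad()
def ema_update(ema_model, model, decay=0.999):
    # exponential moving average of the student weights gives a smoother, noise-resistant copy
    for p_ema, p in zip(ema_model.parameters(), model.parameters()):
        p_ema.mul_(decay).add_(p, alpha=1.0 - decay)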
+
+
+
+
+ + ♻ ☆ MamMIL: Multiple Instance Learning for Whole Slide Images with State + Space Models + + +
+ Recently, pathological diagnosis has achieved superior performance by combining deep learning models with the multiple instance learning (MIL) framework using whole slide images (WSIs). However, the giga-pixel nature of WSIs poses a great challenge for efficient MIL. Existing studies either do not consider global dependencies among instances, or use approximations such as linear attention to model pairwise instance interactions, which inevitably introduces performance bottlenecks. To tackle this challenge, we propose a framework named MamMIL for WSI analysis that integrates the selective structured state space model (i.e., Mamba) with MIL, enabling the modeling of global instance dependencies while maintaining linear complexity. Specifically, considering the irregularity of the tissue regions in WSIs, we represent each WSI as an undirected graph. To address the problem that Mamba can only process 1D sequences, we further propose a topology-aware scanning mechanism to serialize the WSI graphs while preserving the topological relationships among the instances. Finally, in order to further perceive the topological structures among the instances and incorporate short-range feature interactions, we propose an instance aggregation block based on graph neural networks. Experiments show that MamMIL achieves better performance than state-of-the-art frameworks. The code is available at https://github.com/Vison307/MamMIL.
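As a rough stand-in for the topology-aware scanning mechanism (the paper's scheme is more involved than this), a breadth-first ordering of the WSI graph keeps neighbouring patches adjacent in the 1D sequence handed to the state space model:

import networkx as nx

def serialize_wsi_graph(graph: nx.Graph, start_node):
    # visit order of a breadth-first traversal: topologically close patches
    # end up close together in the serialized instance sequence
    return [start_node] + [v for _, v in nx.bfs_edges(graph, start_node)]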
+
+ comment: 6 pages, 2 figures. Accepted by IEEE International Conference on + Bioinformatics and Biomedicine (BIBM) +
+
+
+
+
+
+
+ + + +
+
+ +
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..69f5da7
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch between light and dark themes */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // remember the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // remember the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`