# Catalog-level metadata: title, one-line summary, date of the last audit,
# inclusion scope, and the legend for the values used in each resource's
# `status` field.
meta:
  title: Awesome Egocentric Atlas
  description: "A curated collection of first-person AI resources for egocentric vision, embodied AI, robotics, VLA, world models, WMA, memory, AR/VR, and hand-object interaction."
  last_major_audit: "2026-06-19"
  scope: Egocentric, first-person, wearable-camera, AR/VR headset, body/wrist camera, and ego-exo datasets, benchmarks, models, and tools.
  # Keys below are the values a resource's `status` field takes.
  status_legend:
    open: Public download, public annotations, public code, or application-based access is clearly documented.
    request: Public project exists, but dataset access requires license, form, approval, or institutional agreement.
    benchmark: Mainly a benchmark, challenge, labels, or evaluation over existing videos.
    partial: Some assets are public, but full raw data, license, or long-term availability is unclear.
    watch: Recent or important resource whose release state still needs verification.

resources:
  - name: Xperience-10M
    kind: dataset
    released: "2026"
    venue: "Hugging Face"
    year: 2026
    status: request
    url: https://huggingface.co/datasets/ropedia-ai/xperience-10m
    scale: "10M experiences, 10K hours, six video streams, audio, stereo depth, camera pose, hand/body mocap, IMU, hierarchical language, about 1 PB total"
    modalities: [fisheye-video, stereo-video, audio, stereo-depth, camera-pose, hand-mocap, full-body-mocap, imu, hierarchical-language]
    tasks: [embodied-ai, world-modeling, robot-learning, sensor-fusion, 3d-4d-understanding, imitation-learning]
    created_by: Ropedia
    released_by: ropedia-ai on Hugging Face
    publisher: Hugging Face
    access: controlled non-commercial access; external agreement-signing step may be required
    license: other

  - name: Xperience-10M Sample
    kind: dataset
    released: "2026"
    venue: "Hugging Face"
    year: 2026
    status: open
    url: https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample
    scale: "Public sample episode; Hugging Face viewer reports 6 rows"
    modalities: [video, hdf5-annotations]
    tasks: [sample-data, loader-testing, visualization, task-suite-prototyping]
    created_by: Ropedia
    released_by: ropedia-ai on Hugging Face
    license: cc-by-nc-4.0

  - name: Ego4D
    milestone: 2021
    milestone_note: "The 3,670-hour massive-scale benchmark suite that catalyzed the modern egocentric era."
    kind: dataset
    released: "2021-10"
    venue: "CVPR 2022"
    year: 2021
    status: request
    url: https://ego4d-data.org/
    paper: https://arxiv.org/abs/2110.07058
    scale: "3,670+ hours, 900+ camera wearers, 74 locations, 9 countries"
    modalities: [video, audio, gaze, stereo, 3d, narrations]
    tasks: [episodic-memory, nlq, moment-query, hand-object, forecasting, social, audio-visual]
    code: https://github.com/facebookresearch/Ego4d
    license: Ego4D License Agreement
    verified_at: "2026-06-15"

  - name: Ego-Exo4D
    milestone: 2023
    milestone_note: "Synchronized ego and exo skilled-activity capture at scale; the reference for cross-view egocentric learning."
    kind: dataset
    released: "2023-11"
    venue: "CVPR 2024"
    year: 2023
    status: request
    url: https://ego-exo4d-data.org/
    paper: https://arxiv.org/abs/2311.18259
    scale: "1,286 hours, 740 participants, synchronized ego/exo views"
    modalities: [video, audio, gaze, imu, 3d-point-clouds, camera-poses, language]
    tasks: [ego-exo, skilled-activity, cross-view, proficiency, pose]
    code: https://github.com/facebookresearch/projectaria_tools
    license: Ego-Exo4D License Agreement
    verified_at: "2026-06-15"

  - name: EPIC-KITCHENS-100
    milestone: 2020
    milestone_note: "The defining large-scale egocentric action-recognition benchmark and annual challenge suite."
    kind: dataset
    released: "2020-06"
    venue: "IJCV 2022"
    year: 2020
    status: open
    url: https://epic-kitchens.github.io/
    paper: https://arxiv.org/abs/2006.13256
    scale: "100 hours, 20M frames, 90K action segments, 45 kitchens"
    modalities: [video, audio, narrations, object-boxes]
    tasks: [action-recognition, action-detection, anticipation, retrieval, domain-adaptation]
    code: https://github.com/epic-kitchens/epic-kitchens-100-annotations
    license: "CC BY-NC 4.0"
    license_url: https://creativecommons.org/licenses/by-nc/4.0/
    verified_at: "2026-06-15"

  - name: HD-EPIC
    kind: dataset
    released: "2025-02"
    venue: "CVPR 2025"
    year: 2025
    status: open
    url: https://arxiv.org/abs/2502.04144
    paper: https://arxiv.org/abs/2502.04144
    scale: "41 hours, 9 kitchens, 59K fine-grained actions, 26K VQA questions"
    modalities: [video, audio, gaze, object-masks, digital-twins, 3d]
    tasks: [vqa, action-recognition, audio-events, object-motion, gaze-reasoning]

  - name: HoloAssist
    kind: dataset
    released: "2023-09"
    venue: "ICCV 2023"
    year: 2023
    status: open
    url: https://holoassist.github.io/
    paper: https://arxiv.org/abs/2309.17024
    scale: "166 hours, 350 instructor-performer pairs"
    modalities: [rgb, depth, gaze, hand, head, imu, audio]
    tasks: [interactive-assistance, mistake-detection, intervention-prediction, hand-forecasting]

  - name: Assembly101
    kind: dataset
    released: "2022-03"
    venue: "CVPR 2022"
    year: 2022
    status: open
    url: https://assembly-101.github.io/
    paper: https://arxiv.org/abs/2203.14712
    scale: "4,321 videos, 8 static plus 4 egocentric views, 1M fine-grained action segments"
    modalities: [multiview-video, egocentric-video, 3d-hand-pose]
    tasks: [procedural-action, anticipation, temporal-segmentation, mistake-detection]

  - name: Ego-1K
    kind: dataset
    released: "2026-03"
    venue: "CVPR 2026"
    year: 2026
    status: open
    url: https://huggingface.co/datasets/facebook/ego-1k
    paper: https://arxiv.org/abs/2603.13741
    scale: "Nearly 1,000 synchronized multiview egocentric videos"
    modalities: [multiview-video, calibration]
    tasks: [dynamic-scene-understanding, novel-view-synthesis, 3d-reconstruction]

  - name: EgoObjects
    kind: dataset
    released: "2023-09"
    venue: "ICCV 2023"
    year: 2023
    status: open
    url: https://github.com/facebookresearch/EgoObjects
    paper: https://arxiv.org/abs/2309.08816
    scale: "9K+ videos, 650K annotations, 368 categories in pilot release"
    modalities: [video, object-boxes, instance-labels]
    tasks: [object-detection, instance-understanding, continual-learning]

  - name: EgoBlind
    kind: dataset
    released: "2025-03"
    venue: "NeurIPS 2025 D&B"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2503.08221
    paper: https://arxiv.org/abs/2503.08221
    scale: "1,392 first-person videos from blind and visually impaired users; 5,311 questions"
    modalities: [first-person-video, qa]
    tasks: [assistive-videoqa, blind-assistance, egocentric-vqa]
    created_by: Junbin Xiao, Nanxin Huang, Hao Qiu, Zhulin Tao, Xun Yang, Richang Hong, Meng Wang, Angela Yao
    release_note: "NeurIPS 2025 D&B paper; verify stable dataset download and terms before benchmark use"

  - name: EgoDex
    kind: dataset
    released: "2025-05"
    venue: "ICLR 2026"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2505.11709
    paper: https://arxiv.org/abs/2505.11709
    scale: "829 hours, 194 tabletop tasks, Apple Vision Pro capture"
    modalities: [egocentric-video, 3d-hand-pose, head-pose]
    tasks: [dexterous-manipulation, imitation-learning, vla]

  - name: EgoVerse
    kind: dataset
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://egoverse.ai/
    paper: https://arxiv.org/abs/2604.07607
    scale: "1,362 hours, 80K episodes, 1,965 tasks, 2,087 demonstrators"
    modalities: [egocentric-video, annotations, tooling]
    tasks: [robot-learning, vla, human-demonstrations]

  - name: OpenEgo
    kind: dataset
    released: "2025-09"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2509.05513
    paper: https://arxiv.org/abs/2509.05513
    scale: "1,107 hours across six public datasets, 290 manipulation tasks"
    modalities: [egocentric-video, hand-pose, action-primitives]
    tasks: [dexterous-manipulation, standardized-pretraining, vla]

  - name: Dobb-E / HoNY
    kind: dataset
    released: "2023-11"
    venue: "arXiv"
    year: 2023
    status: partial
    url: https://dobb-e.com/
    paper: https://arxiv.org/abs/2311.16098
    code: https://github.com/notmahi/dobb-e
    scale: "Homes of New York dataset: 13 hours, 22 homes, 216 environments, 5,620 trajectories, 1.5M RGB-D frames collected with an iPhone-based Stick"
    modalities: [rgb, depth, robot-actions, wrist-camera-video]
    tasks: [robot-learning, imitation-learning, manipulation, data-collection]
    verified_at: "2026-06-17"
    release_note: "Public dataset, code, model, and hardware pages exist; dataset terms should be rechecked before benchmark use."
    citation_key: dobbe_hony_2023

  - name: Universal Manipulation Interface / UMI
    kind: toolkit
    released: "2024-02"
    venue: "RSS 2024"
    year: 2024
    status: open
    url: https://umi-gripper.github.io/
    paper: https://arxiv.org/abs/2402.10329
    code: https://github.com/real-stanford/universal_manipulation_interface
    scale: "Hand-held GoPro gripper interface and policy stack for in-the-wild robot teaching, including example and cup-arrangement data workflows"
    modalities: [wrist-camera-video, imu, robot-actions]
    tasks: [robot-learning, imitation-learning, cross-embodiment-transfer, bimanual-manipulation, data-collection]
    license: MIT
    license_url: https://github.com/real-stanford/universal_manipulation_interface/blob/main/LICENSE
    verified_at: "2026-06-17"
    citation_key: umi_2024

  - name: FastUMI
    kind: dataset
    released: "2024-09"
    venue: "arXiv"
    year: 2024
    status: watch
    url: https://arxiv.org/abs/2409.19499
    paper: https://arxiv.org/abs/2409.19499
    scale: "UMI redesign with a reported 10,000+ real-world demonstration trajectories across 22 everyday manipulation tasks"
    modalities: [wrist-camera-video, robot-actions]
    tasks: [robot-learning, imitation-learning, manipulation, data-collection]
    verified_at: "2026-06-17"
    release_note: "Paper reports an open-sourced dataset; verify stable repository, download path, and license."
    citation_key: fastumi_2024

  - name: FastUMI-100K
    kind: dataset
    released: "2025-10"
    venue: "arXiv"
    year: 2025
    status: open
    url: https://huggingface.co/datasets/IPEC-COMMUNITY/FastUMI_100k_lerobot
    paper: https://arxiv.org/abs/2510.08022
    code: https://github.com/MrKeee/FastUMI-100K
    scale: "100K+ UMI-style demonstration trajectories across 54 household manipulation tasks, standardized in LeRobot v2.1 with multi-view wrist-mounted fisheye images"
    modalities: [wrist-camera-video, robot-actions, language-instructions]
    tasks: [robot-learning, imitation-learning, cross-embodiment-transfer, bimanual-manipulation, data-collection]
    license: apache-2.0
    license_url: https://github.com/MrKeee/FastUMI-100K/blob/main/LICENSE
    verified_at: "2026-06-17"
    citation_key: fastumi_100k_2025

  - name: MV-UMI
    kind: toolkit
    released: "2025-09"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2509.18757
    paper: https://arxiv.org/abs/2509.18757
    scale: "Multi-view Universal Manipulation Interface adding third-person context to the egocentric wrist camera, evaluated across three manipulation tasks"
    modalities: [wrist-camera-video, third-person-video, robot-actions]
    tasks: [robot-learning, imitation-learning, cross-embodiment-transfer, manipulation, data-collection]
    verified_at: "2026-06-17"
    release_note: "arXiv paper found; verify public code, data, and license before treating as a reusable package."
    citation_key: mv_umi_2025

  - name: UMIGen
    kind: model
    released: "2025-11"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2511.09302
    paper: https://arxiv.org/abs/2511.09302
    scale: "Cloud-UMI handheld collection and visibility-aware point-cloud generation for egocentric 3D observations and cross-embodiment imitation learning"
    modalities: [wrist-camera-video, point-clouds, robot-actions]
    tasks: [robot-learning, imitation-learning, cross-embodiment-transfer, 3d-reconstruction, data-collection]
    verified_at: "2026-06-17"
    release_note: "arXiv paper found; verify public Cloud-UMI code, generated data, and license."
    citation_key: umigen_2025

  - name: Hoi!
    kind: dataset
    released: "2025-12"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2512.04884
    paper: https://arxiv.org/abs/2512.04884
    scale: "3,048 force-grounded articulated-manipulation sequences over 381 objects and 38 environments, spanning human hand, wrist camera, UMI gripper, and Hoi gripper embodiments"
    modalities: [wrist-camera-video, robot-actions, contact, articulated-objects]
    tasks: [robot-learning, imitation-learning, cross-embodiment-transfer, contact-understanding, hand-object]
    verified_at: "2026-06-17"
    release_note: "arXiv paper found; verify stable dataset download path and license."
    citation_key: hoi_force_grounded_2025

  - name: YUBI
    kind: dataset
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.10244
    paper: https://arxiv.org/abs/2606.10244
    scale: "UMI-based bimanual dexterous manipulation stack with a reported 8,434 hours, 1.20M episodes, and 119 tasks"
    modalities: [wrist-camera-video, robot-actions, 6dof]
    tasks: [robot-learning, imitation-learning, cross-embodiment-transfer, bimanual-manipulation, dexterous-manipulation]
    verified_at: "2026-06-17"
    release_note: "Paper reports release of hardware, software, and dataset; verify stable public repository and license."
    citation_key: yubi_2026

  - name: Open X-Embodiment / RT-X
    kind: dataset
    released: "2023-10"
    venue: "ICRA 2024"
    year: 2023
    status: open
    url: https://robotics-transformer-x.github.io/
    paper: https://arxiv.org/abs/2310.08864
    mirror: https://huggingface.co/datasets/jxu124/OpenX-Embodiment
    scope: adjacent
    scale: "1M+ real robot trajectories, 22 robot embodiments, 60 pooled robot datasets (robot-mounted and wrist cameras, not human first-person)"
    modalities: [robot-camera-video, wrist-camera-video, language-instructions, robot-actions, rlds]
    tasks: [vla, robot-learning, cross-embodiment-transfer, imitation-learning, manipulation]
    created_by: Open X-Embodiment Collaboration
    released_by: Open X-Embodiment Collaboration; unofficial Hugging Face mirror by jxu124
    license: cc-by-4.0
    scope_note: Adjacent robot-learning dataset included for VLA and robot-camera policy learning; not a human wearable-camera dataset.

  - name: EgoLive
    kind: dataset
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.23570
    paper: https://arxiv.org/abs/2604.23570
    scale: "Large-scale real-world task-oriented egocentric routines"
    modalities: [egocentric-video, multimodal-annotations]
    tasks: [robot-manipulation, real-world-routines]

  - name: World In Your Hands
    kind: dataset
    released: "2025-12"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2512.24310
    paper: https://arxiv.org/abs/2512.24310
    scale: "Reported 1,000+ hours of multimodal human-centric manipulation data"
    modalities: [egocentric-video, multiview, depth, hand-pose, wrist-trajectory]
    tasks: [dexterous-manipulation, vla, human-centric-policy-learning]

  - name: InterVLA
    kind: dataset
    released: "2025-08"
    venue: "ICCV 2025"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2508.04681
    paper: https://arxiv.org/abs/2508.04681
    scale: "11.4 hours, 1.2M frames, 2 egocentric and 5 exocentric videos"
    modalities: [egocentric-video, exocentric-video, human-motion, object-motion, verbal-commands]
    tasks: [vla, human-object-human-interaction, motion-estimation, interaction-prediction]

  - name: EgoPAT3D / EgoPAT3Dv2
    kind: dataset
    released: "2024-03"
    venue: "ICRA 2024"
    year: 2024
    status: open
    url: https://arxiv.org/abs/2403.05046
    paper: https://arxiv.org/abs/2403.05046
    scale: "Original EgoPAT3D reports 1M+ RGB-D and IMU frames; v2 expands 3D action-target prediction"
    modalities: [rgb, depth, imu, 3d-target-labels]
    tasks: [3d-action-target-prediction, hri, anticipation, manipulation-safety]

  - name: HOT3D
    milestone: 2024
    milestone_note: "Reference benchmark for 3D hand-object tracking from AR glasses (Project Aria and Quest 3)."
    kind: dataset
    released: "2024-06"
    venue: "CVPR 2025"
    year: 2024
    status: open
    url: https://facebookresearch.github.io/hot3d/
    paper: https://arxiv.org/abs/2406.09598
    scale: "833+ minutes, 3.7M+ images, 19 subjects, 33 objects"
    modalities: [aria, quest3, gaze, point-clouds, 3d-hand-pose, object-pose, camera-pose]
    tasks: [3d-hand-object-tracking, ar-vr, pose]
    license: "CC BY-NC-SA 4.0"
    license_url: https://creativecommons.org/licenses/by-nc-sa/4.0/
    verified_at: "2026-06-15"

  - name: HOI4D
    kind: dataset
    released: "2022-03"
    venue: "CVPR 2022"
    year: 2022
    status: open
    url: https://hoi4d.github.io/
    paper: https://arxiv.org/abs/2203.01577
    scale: "2.4M RGB-D egocentric frames, 4,000+ sequences, 800 objects"
    modalities: [rgbd, point-clouds, 3d-hand-pose, object-pose, segmentation]
    tasks: [4d-hoi, pose-tracking, action-segmentation]
    license: "CC BY-NC 4.0"
    license_url: https://creativecommons.org/licenses/by-nc/4.0/

  - name: H2O
    kind: dataset
    released: "2021-04"
    venue: "ICCV 2021"
    year: 2021
    status: open
    url: https://arxiv.org/abs/2104.11181
    paper: https://arxiv.org/abs/2104.11181
    modalities: [multiview-rgbd, 3d-hand-pose, object-pose, camera-pose, object-meshes, scene-point-clouds]
    tasks: [first-person-interaction-recognition, two-hand-pose, hand-object-pose]

  - name: ARCTIC
    kind: dataset
    released: "2022-04"
    venue: "CVPR 2023"
    year: 2022
    status: request
    url: https://arctic.is.tue.mpg.de/
    paper: https://arxiv.org/abs/2204.13662
    scale: "2.1M frames"
    modalities: [video, hand-meshes, object-meshes, contact, articulated-objects]
    tasks: [bimanual-manipulation, reconstruction, interaction-field-estimation]

  - name: EgoHands
    kind: dataset
    released: "2015"
    venue: "ICCV 2015"
    year: 2015
    status: open
    url: http://vision.soic.indiana.edu/projects/egohands/
    scale: "48 Google Glass videos, 4,800 annotated images"
    modalities: [rgb, hand-masks]
    tasks: [hand-detection, hand-segmentation]

  - name: FPHA
    kind: dataset
    released: "2017"
    venue: "CVPR 2018"
    year: 2017
    status: open
    url: https://guiggh.github.io/publications/first-person-hands/
    scale: "100K+ RGB-D frames, 45 action classes, 26 objects"
    modalities: [rgbd, 3d-hand-pose, object-pose]
    tasks: [hand-action-recognition, 3d-hand-pose]

  - name: EgoHOS
    kind: dataset
    released: "2022-08"
    venue: "ECCV 2022"
    year: 2022
    status: open
    url: https://github.com/owenzlz/EgoHOS
    paper: https://arxiv.org/abs/2208.03826
    scale: "11,243 egocentric images"
    modalities: [rgb, hand-object-contact-masks]
    tasks: [hand-object-segmentation, contact-understanding]

  - name: Ego2Hands
    kind: dataset
    released: "2020-11"
    venue: "arXiv"
    year: 2020
    status: open
    url: https://arxiv.org/abs/2011.07252
    paper: https://arxiv.org/abs/2011.07252
    modalities: [rgb, synthetic-composited-masks]
    tasks: [two-hand-segmentation, hand-detection]

  - name: Ego2HandsPose
    kind: dataset
    released: "2022-06"
    venue: "arXiv"
    year: 2022
    status: open
    url: https://arxiv.org/abs/2206.04927
    paper: https://arxiv.org/abs/2206.04927
    modalities: [rgb, 3d-hand-pose]
    tasks: [two-hand-3d-pose]

  - name: EgoBody
    kind: dataset
    released: "2022"
    venue: "ECCV 2022"
    year: 2022
    status: open
    url: https://sanweiliti.github.io/egobody/egobody.html
    modalities: [hololens2, rgb, depth, gaze, head-pose, hand-pose, body-pose]
    tasks: [3d-human-pose, body-shape, motion]

  - name: GIMO
    kind: dataset
    released: "2022-04"
    venue: "ECCV 2022"
    year: 2022
    status: open
    url: https://github.com/y-zheng18/GIMO
    paper: https://arxiv.org/abs/2204.09443
    modalities: [egocentric-video, gaze, body-pose, scene-scans]
    tasks: [gaze-informed-motion-prediction, human-motion, scene-context]

  - name: EgoHumans
    kind: benchmark
    released: "2023-05"
    venue: "ICCV 2023"
    year: 2023
    status: partial
    url: https://arxiv.org/abs/2305.16487
    paper: https://arxiv.org/abs/2305.16487
    scale: "125K+ egocentric images"
    modalities: [egocentric-video, multiview-video, 2d-pose, 3d-pose, mesh]
    tasks: [multi-human-tracking, 3d-pose, mesh-recovery]

  - name: UnrealEgo
    kind: dataset
    released: "2022-08"
    venue: "ECCV 2022"
    year: 2022
    status: open
    url: https://4dqv.mpi-inf.mpg.de/UnrealEgo/
    paper: https://arxiv.org/abs/2208.01633
    modalities: [synthetic-stereo, 3d-human-pose]
    tasks: [egocentric-3d-pose]

  - name: xR-EgoPose
    kind: dataset
    released: "2019"
    venue: "GitHub"
    year: 2019
    status: open
    url: https://github.com/facebookresearch/xR-EgoPose
    modalities: [synthetic-egocentric, 3d-human-pose]
    tasks: [xr-pose-estimation]

  - name: EgoGTA / EgoPW-Scene
    kind: dataset
    released: "2022-12"
    venue: "arXiv"
    year: 2022
    status: partial
    url: https://arxiv.org/abs/2212.11684
    paper: https://arxiv.org/abs/2212.11684
    modalities: [synthetic-egocentric, scene-depth, human-pose]
    tasks: [scene-aware-3d-human-pose, egocentric-depth, human-scene-interaction]

  - name: EgoTracks
    kind: benchmark
    released: "2023-01"
    venue: "NeurIPS 2023"
    year: 2023
    status: open
    url: https://arxiv.org/abs/2301.03213
    paper: https://arxiv.org/abs/2301.03213
    derived_from: Ego4D
    tasks: [long-term-object-tracking, redetection]

  - name: TREK-150
    kind: benchmark
    released: "2021"
    venue: "project page"
    year: 2021
    status: open
    url: https://machinelearning.uniud.it/datasets/trek150/
    derived_from: EPIC-KITCHENS
    tasks: [single-object-tracking]

  - name: Project Aria Datasets
    milestone: 2022
    milestone_note: "Meta's research smart-glasses platform that opened the modern wave of AR and wearable egocentric data."
    kind: collection
    released: "2022"
    venue: "project page"
    # `year` mirrors `released` so this record carries the same keys as the others.
    year: 2022
    status: open
    url: https://www.projectaria.com/datasets/
    modalities: [aria, vrs, calibration, gaze, imu, mps]
    tasks: [ar-perception, scene-understanding, wearable-sensing]

  - name: Aria Digital Twin
    kind: dataset
    released: "2023-06"
    venue: "arXiv"
    year: 2023
    status: open
    url: https://www.projectaria.com/datasets/adt/
    paper: https://arxiv.org/abs/2306.06362
    scale: "200 sequences, 398 object instances"
    modalities: [aria, rgb, monochrome, imu, 6dof, depth, segmentation, synthetic-rendering]
    tasks: [3d-machine-perception, object-tracking, scene-reconstruction]
    license: "CC BY-NC-SA 4.0"
    license_url: https://creativecommons.org/licenses/by-nc-sa/4.0/

  - name: Aria Everyday Activities
    kind: dataset
    released: "2024-02"
    venue: "arXiv"
    year: 2024
    status: open
    url: https://www.projectaria.com/datasets/aea/
    paper: https://arxiv.org/abs/2402.13349
    scale: "143 daily activity sequences across five indoor locations"
    modalities: [aria, trajectory, point-cloud, gaze, speech]
    tasks: [everyday-activity, neural-scene-reconstruction, prompted-segmentation]
    license: "CC BY-NC-SA 4.0"
    license_url: https://creativecommons.org/licenses/by-nc-sa/4.0/

  - name: Nymeria
    kind: dataset
    released: "2024-06"
    venue: "ECCV 2024"
    year: 2024
    status: open
    url: https://www.projectaria.com/datasets/nymeria/
    paper: https://arxiv.org/abs/2406.09905
    scale: "300 hours, 264 participants, 50 locations"
    modalities: [aria, eye-tracking, imu, body-motion, language, observer-view]
    tasks: [motion-language, body-tracking, action-recognition]
    license: "CC BY-NC-SA 4.0"
    license_url: https://creativecommons.org/licenses/by-nc-sa/4.0/

  - name: Digital Twin Catalog
    kind: dataset
    released: "2025-04"
    venue: "CVPR 2025"
    year: 2025
    status: open
    url: https://www.projectaria.com/datasets/dtc/
    paper: https://arxiv.org/abs/2504.08541
    scale: "2,000 scanned digital-twin objects"
    modalities: [3d-objects, dslr-images, aria-images]
    tasks: [object-reconstruction, digital-twin-evaluation]

  - name: EgoLife
    kind: dataset
    released: "2025-03"
    venue: "CVPR 2025"
    year: 2025
    status: partial
    url: https://arxiv.org/abs/2503.03803
    paper: https://arxiv.org/abs/2503.03803
    scale: "300 hours, 6 participants, one week daily life"
    modalities: [meta-aria, video, audio, third-person-reference, qa]
    tasks: [life-assistant, long-context-qa, memory]

  - name: Ego-EXTRA
    kind: dataset
    released: "2025-12"
    venue: "arXiv"
    year: 2025
    status: open
    url: https://fpv-iplab.github.io/Ego-EXTRA/
    paper: https://arxiv.org/abs/2512.13238
    scale: "50 hours, 15K+ VQA pairs"
    modalities: [video, dialogue, qa]
    tasks: [expert-trainee-assistance, egocentric-vqa]

  - name: EgoSchema
    milestone: 2023
    milestone_note: "The benchmark that exposed how far models are from long-form egocentric video reasoning."
    kind: benchmark
    released: "2023-08"
    venue: "NeurIPS 2023"
    year: 2023
    status: open
    # HTTPS form of the project page, avoiding a plain-HTTP link.
    url: https://egoschema.github.io/
    paper: https://arxiv.org/abs/2308.09126
    derived_from: Ego4D
    scale: "5K+ multiple-choice QA pairs, 250+ hours"
    tasks: [long-video-qa, temporal-reasoning]
    code: https://github.com/egoschema/EgoSchema
    verified_at: "2026-06-15"

  - name: X-LeBench
    kind: benchmark
    released: "2025-01"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2501.06835
    paper: https://arxiv.org/abs/2501.06835
    scale: "432 simulated life logs, 23 minutes to 16.4 hours"
    derived_from: Ego4D
    tasks: [extremely-long-video-understanding, lifelogging, long-context-qa]

  - name: TeleEgo
    kind: benchmark
    released: "2025-10"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2510.23981
    paper: https://arxiv.org/abs/2510.23981
    scale: "3,291 human-verified QA items in streaming setting"
    modalities: [egocentric-video, audio, text, timeline]
    tasks: [streaming-assistant, memory, real-time-understanding, cross-memory-reasoning]

  - name: EgoMemReason
    kind: benchmark
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.09874
    paper: https://arxiv.org/abs/2605.09874
    derived_from: week-long egocentric video
    scale: "500 questions across entity, event, and behavior memory; average 25.9 hours of memory backtracking"
    tasks: [long-horizon-memory, entity-memory, event-memory, behavior-memory, egocentric-video-reasoning]
    created_by: Ziyang Wang, Yue Zhang, Shoubin Yu, Ce Zhang, Zengqi Zhao, Jaehong Yoon, Hyunji Lee, Gedas Bertasius, Mohit Bansal
    release_note: "arXiv paper; verify public data/code release before benchmark use"

  - name: EgoClip
    kind: dataset
    released: "2022-06"
    venue: "NeurIPS 2022"
    year: 2022
    status: open
    url: https://github.com/showlab/EgoVLP
    paper: https://arxiv.org/abs/2206.01670
    derived_from: Ego4D
    scale: "3.8M clip-text pairs"
    tasks: [video-language-pretraining, retrieval]

  - name: RefEgo
    kind: benchmark
    released: "2023-08"
    venue: "ICCV 2023"
    year: 2023
    status: open
    url: https://github.com/shuheikurita/RefEgo
    paper: https://arxiv.org/abs/2308.12035
    derived_from: Ego4D
    scale: "12K+ clips and 41 hours for video-based referring-expression comprehension"
    tasks: [referring-expression-comprehension, object-grounding, referred-object-tracking]
    created_by: Shuhei Kurita, Naoki Katsura, Eri Onami
    released_by: shuheikurita/RefEgo on GitHub

  - name: EgoBench
    kind: benchmark
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.27820
    paper: https://arxiv.org/abs/2605.27820
    scale: "1,045 egocentric-video-grounded interactive tasks"
    tasks: [tool-using-agents, multimodal-reasoning, interaction]

  - name: EgoIntrospect
    kind: dataset
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://ego-introspect.github.io/
    paper: https://arxiv.org/abs/2605.17262
    scale: "180 hours, 60 subjects"
    modalities: [video, audio, gaze, motion, physiological-signals, self-annotations]
    tasks: [internal-state-reasoning, affect, intent, memory]

  - name: Minerva-Ego
    kind: benchmark
    released: "2025-05"
    venue: "arXiv"
    year: 2025
    status: open
    scope: adjacent
    url: https://github.com/google-deepmind/neptune
    paper: https://arxiv.org/abs/2505.00681
    scale: "Part of the Neptune / MINERVA long-video reasoning collection over web (YouTube) videos with manually annotated reasoning traces"
    tasks: [multistep-reasoning, long-video-qa, reasoning-traces]

  - name: MA-EgoQA
    kind: benchmark
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: open
    url: https://ma-egoqa.github.io/
    paper: https://arxiv.org/abs/2603.09827
    scale: "1.7K questions over multiple egocentric streams"
    tasks: [multi-agent-memory, social-reasoning, temporal-reasoning]

  - name: EASG-Bench
    kind: benchmark
    released: "2025-06"
    venue: "ICCV 2025 Workshop"
    year: 2025
    status: open
    url: https://github.com/fpv-iplab/EASG-bench
    paper: https://arxiv.org/abs/2506.05787
    tasks: [scene-graph-qa, relation-reasoning, temporal-reasoning]

  - name: MyEgo
    kind: benchmark
    released: "2026-04"
    venue: "CVPR 2026"
    year: 2026
    status: open
    url: https://github.com/Ryougetsu3606/MyEgo
    paper: https://arxiv.org/abs/2604.01966
    scale: "541 long videos, 5K personalized questions"
    tasks: [personalized-qa, ego-grounding, long-range-memory]

  - name: EgoToM
    kind: benchmark
    released: "2025-03"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2503.22152
    paper: https://arxiv.org/abs/2503.22152
    derived_from: Ego4D
    tasks: [theory-of-mind, goal-reasoning, belief-reasoning, next-action]
    created_by: Yuxuan Li, Vijay Veerabadran, Michael L. Iuzzolino, Brett D. Roads, Asli Celikyilmaz, Karl Ridgeway
    release_note: "arXiv paper; verify public release route before benchmark use"

  - name: EgoSound
    kind: benchmark
    released: "2026-02"
    venue: "CVPR 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.14122
    paper: https://arxiv.org/abs/2602.14122
    # Two source datasets are named, so this value is a list.
    derived_from: [Ego4D, EgoBlind]
    scale: "7,315 validated QA pairs across 900 videos"
    tasks: [sound-understanding, audio-visual-reasoning, spatial-localization, causal-reasoning]
    created_by: Bingwen Zhu, Yuqian Fu, Qiaole Dong, Guolei Sun, Tianwen Qian, Yuzheng Wu, Danda Pani Paudel, Xiangyang Xue, Yanwei Fu
    release_note: "CVPR 2026 paper; verify public release route before benchmark use"

  - name: EgoEsportsQA
    kind: benchmark
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.12320
    paper: https://arxiv.org/abs/2604.12320
    scale: "1,745 QA pairs from professional matches across three first-person shooter games"
    modalities: [first-person-video, qa]
    tasks: [first-person-qa, virtual-egocentric-reasoning, esports-perception, tactical-reasoning]
    created_by: Jianzhe Ma, Zhonghao Cao, Shangkui Chen, Yichen Xu, Wenxuan Wang, Qin Jin
    release_note: "arXiv paper; virtual first-person setting, not wearable-camera capture"

  - name: Causal-Plan-1M
    kind: benchmark
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.01810
    paper: https://arxiv.org/abs/2606.01810
    scale: "Million-scale explicit reasoning traces over egocentric videos"
    tasks: [causal-reasoning, planning, next-state-estimation]

  - name: AssistQ
    kind: benchmark
    released: "2022-03"
    venue: "ECCV 2022"
    year: 2022
    status: open
    url: https://showlab.github.io/assistq/
    paper: https://arxiv.org/abs/2203.04203
    scale: "531 QA samples from 100 instructional videos"
    tasks: [task-assistance, affordance-centric-qa, action-planning]

  - name: EgoTaskQA
    kind: benchmark
    released: "2022-10"
    venue: "NeurIPS 2022"
    year: 2022
    status: benchmark
    url: https://arxiv.org/abs/2210.03929
    paper: https://arxiv.org/abs/2210.03929
    tasks: [task-understanding, causal-reasoning, counterfactual-qa, belief-reasoning]

  - name: HowToDIV
    kind: dataset
    released: "2025-08"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2508.11192
    paper: https://arxiv.org/abs/2508.11192
    scale: "507 conversations, 6,636 QA pairs, 24 hours"
    modalities: [egocentric-video, dialogue, qa]
    tasks: [procedural-task-assistance, instructional-dialogue, egocentric-qa]

  - name: EgoThink
    kind: benchmark
    released: "2023-11"
    venue: "CVPR 2024"
    year: 2023
    status: benchmark
    url: https://arxiv.org/abs/2311.15596
    paper: https://arxiv.org/abs/2311.15596
    tasks: [first-person-vqa, vlm-evaluation, embodied-reasoning]

  - name: VidEgoThink
    kind: benchmark
    released: "2024-10"
    venue: "ICLR 2025"
    year: 2024
    status: benchmark
    url: https://arxiv.org/abs/2410.11623
    paper: https://arxiv.org/abs/2410.11623
    derived_from: Ego4D
    tasks: [video-qa, hierarchy-planning, visual-grounding, reward-modeling]

  - name: EgoCoT-Bench
    kind: benchmark
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: open
    url: https://dstardust.github.io/EgoCoT/
    paper: https://arxiv.org/abs/2605.19559
    scale: "3,172 QA pairs over 351 egocentric videos"
    tasks: [grounded-cot, operation-centric-reasoning, evidence-consistency]

  - name: NoRA
    kind: benchmark
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.04806
    paper: https://arxiv.org/abs/2606.04806
    scale: "1,420 first-person video clips"
    tasks: [normative-action-reasoning, grounded-reasonableness, safety]

  - name: Ego2Web
    kind: benchmark
    released: "2026-03"
    venue: "CVPR 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.22529
    paper: https://arxiv.org/abs/2603.22529
    tasks: [web-agents, egocentric-grounding, physical-digital-assistance]

  - name: Pause and Think
    kind: benchmark
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.00616
    paper: https://arxiv.org/abs/2606.00616
    tasks: [assistive-action-suggestion, grounded-reasoning, temporal-consistency]

  - name: Ego-R1
    kind: benchmark
    released: "2025-06"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2506.13654
    paper: https://arxiv.org/abs/2506.13654
    scale: "Ego-CoTT-25K, Ego-QA-4.4K, and Ego-R1 Bench"
    tasks: [ultra-long-video-qa, chain-of-tool-thought, temporal-retrieval]

  - name: EgoTrigger / HME-QA
    kind: benchmark
    released: "2025-08"
    venue: "ISMAR 2025 / TVCG"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2508.01915
    paper: https://arxiv.org/abs/2508.01915
    derived_from: Ego4D
    scale: "340 human-annotated first-person QA pairs from full-length Ego4D videos curated for audio"
    modalities: [first-person-video, audio, qa]
    tasks: [memory-assistance, audio-triggered-capture, energy-efficient-smart-glasses, episodic-memory]
    created_by: Akshay Paruchuri, Sinan Hersek, Lavisha Aggarwal, Qiao Yang, Xin Liu, Achin Kulshrestha, Andrea Colaco, Henry Fuchs, Ishan Chatterjee
    release_note: Accepted to ISMAR 2025 as TVCG journal paper

  - name: GTEA / GTEA Gaze / EGTEA Gaze+
    milestone: 2011
    milestone_note: "Foundational hand-object and gaze egocentric activity datasets (GTEA, 2011) that seeded first-person action and attention research."
    kind: dataset
    released: "2018"
    venue: "project page"
    status: open
    url: https://cbs.ic.gatech.edu/fpv/
    modalities: [video, gaze, action-labels]
    tasks: [action-recognition, gaze, hand-object]

  - name: Charades-Ego
    kind: dataset
    released: "2018-04"
    venue: "CVPR 2018"
    year: 2018
    status: open
    url: https://prior.allenai.org/projects/charades
    paper: https://arxiv.org/abs/1804.09626
    scale: "68.8 hours, 68,536 activity instances"
    modalities: [first-person-video, third-person-video, descriptions]
    tasks: [action-recognition, localization, captioning, domain-transfer]

  - name: MECCANO
    kind: dataset
    released: "2020-10"
    venue: "WACV 2021"
    year: 2020
    status: open
    url: https://iplab.dmi.unict.it/MECCANO/
    paper: https://arxiv.org/abs/2010.05654
    modalities: [rgb, depth, gaze]
    tasks: [ehoi, active-object-detection, action-recognition, anticipation]

  - name: EgoProceL
    kind: dataset
    released: "2022-07"
    venue: "ECCV 2022"
    year: 2022
    status: open
    url: https://sid2697.github.io/egoprocel/
    paper: https://arxiv.org/abs/2207.10883
    scale: "62 hours, 130 subjects, 16 tasks"
    tasks: [procedure-learning, key-step-localization]

  - name: EGO-CH
    kind: dataset
    released: "2020-02"
    venue: "arXiv"
    year: 2020
    status: partial
    url: https://arxiv.org/abs/2002.00899
    paper: https://arxiv.org/abs/2002.00899
    scale: "27+ hours, 70 subjects, 2 cultural sites, 26 environments, 200+ points of interest"
    modalities: [egocentric-video, location-labels, poi-labels, surveys]
    tasks: [cultural-heritage, localization, object-recognition, retrieval, survey-prediction]

  - name: EgoExoLearn
    kind: dataset
    released: "2024-03"
    venue: "CVPR 2024"
    year: 2024
    status: open
    url: https://github.com/OpenGVLab/EgoExoLearn
    paper: https://arxiv.org/abs/2403.16182
    scale: "120 hours"
    modalities: [egocentric-video, demonstration-video, gaze, annotations]
    tasks: [cross-view-association, planning, skill-assessment]

  - name: DogCentric
    kind: dataset
    released: "2014"
    venue: "project page"
    status: open
    url: https://robotics.ait.kyushu-u.ac.jp/dog-centric-activity-dataset/
    modalities: [animal-egocentric-video]
    tasks: [animal-activity-recognition]

  - name: KrishnaCam / OAK
    kind: dataset
    released: "2021-08"
    venue: "ICCV 2021"
    year: 2021
    status: partial
    url: https://oakdata.github.io/
    paper: https://arxiv.org/abs/2108.11005
    scale: "OAK: 80 snippets, about 17.5 hours, 105 object categories from KrishnaCam"
    modalities: [egocentric-video, object-boxes]
    tasks: [continual-learning, object-detection, lifelogging]

  - name: EgoK360
    kind: dataset
    released: "2020-10"
    venue: "arXiv"
    year: 2020
    status: partial
    url: https://egok360.github.io/
    paper: https://arxiv.org/abs/2010.08055
    modalities: [first-person-360-video, action-labels]
    tasks: [360-video, activity-recognition]

  - name: EgoTraj
    kind: dataset
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: open
    url: https://github.com/yehiahmad/EgoTraj
    paper: https://arxiv.org/abs/2605.19004
    scale: "75 Meta Quest Pro navigation sequences"
    modalities: [rgb, head-pose, gaze, scene-labels]
    tasks: [trajectory-prediction, navigation, assistive-systems]

  - name: VISOR
    kind: benchmark
    released: "2022"
    venue: "NeurIPS 2022"
    year: 2022
    status: open
    url: https://epic-kitchens.github.io/VISOR/
    derived_from: EPIC-KITCHENS
    license: cc-by-nc-4.0
    verified_at: "2026-06-15"
    tasks: [hand-segmentation, active-object-segmentation, relations]

  # Audio benchmark derived from EPIC-KITCHENS.
  - name: EPIC-Sounds
    kind: benchmark
    released: "2023"
    venue: "ICASSP 2023"
    # `year` mirrors the `released` year, as in the sibling VISOR entry.
    year: 2023
    status: open
    url: https://epic-kitchens.github.io/epic-sounds/
    derived_from: EPIC-KITCHENS
    tasks: [audio-event-recognition]

  # 3D benchmark derived from EPIC-KITCHENS.
  - name: EPIC-Fields
    kind: benchmark
    released: "2023"
    venue: "NeurIPS 2023"
    # `year` mirrors the `released` year, as in the sibling VISOR entry.
    year: 2023
    status: open
    url: https://epic-kitchens.github.io/epic-fields/
    derived_from: EPIC-KITCHENS
    tasks: [3d-fields, spatial-reasoning]

  - name: EgoVLP
    milestone: 2022
    milestone_note: "First egocentric video-language pretraining (EgoClip, EgoNCE) and a basis for ego representation learning."
    kind: model
    released: "2022-06"
    venue: "NeurIPS 2022"
    year: 2022
    status: open
    url: https://github.com/showlab/EgoVLP
    paper: https://arxiv.org/abs/2206.01670
    tasks: [video-language-pretraining, retrieval, ego4d-transfer]

  - name: EgoVLPv2
    kind: model
    released: "2023-07"
    venue: "ICCV 2023"
    year: 2023
    status: open
    url: https://shramanpramanick.github.io/EgoVLPv2/
    paper: https://arxiv.org/abs/2307.05463
    tasks: [video-language-pretraining, cross-modal-fusion]

  # Video-language model; `url` points at the public code, `paper` at the arXiv record.
  - name: LaViLa
    kind: model
    released: "2022-12"
    venue: "CVPR 2023"
    year: 2022
    status: open
    # `status: open` entries link to the released code or project page rather than
    # repeating the paper link.
    url: https://github.com/facebookresearch/LaViLa
    paper: https://arxiv.org/abs/2212.04501
    tasks: [video-language-representation, narration-generation]

  - name: EgoNCE++
    kind: model
    released: "2024-05"
    venue: "ICLR 2025"
    year: 2024
    status: open
    url: https://github.com/xuboshen/EgoNCEpp
    paper: https://arxiv.org/abs/2405.17719
    tasks: [open-vocabulary-hoi, video-language-pretraining]

  - name: EgoDTM
    kind: model
    released: "2025-03"
    venue: "arXiv"
    year: 2025
    status: open
    url: https://github.com/xuboshen/EgoDTM
    paper: https://arxiv.org/abs/2503.15470
    tasks: [3d-aware-vlp, depth-text-pretraining]

  - name: EgoVLM
    kind: model
    released: "2025-06"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2506.03097
    paper: https://arxiv.org/abs/2506.03097
    tasks: [egocentric-video-reasoning, policy-optimization]

  - name: EgoGraph
    kind: model
    released: "2026-02"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.23709
    paper: https://arxiv.org/abs/2602.23709
    tasks: [temporal-knowledge-graph, ultra-long-video-qa]

  - name: Ropedia Xperience-10M Task Suite
    kind: benchmark
    released: "2026"
    venue: "Hugging Face"
    year: 2026
    status: open
    url: https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite
    derived_from: Xperience-10M Sample
    scale: "12 embodied-AI task contracts over one public Xperience-10M sample episode"
    modalities: [video, audio, depth, camera-pose, hand-mocap, full-body-mocap, imu, language]
    tasks: [task-design, sample-evaluation, multimodal-baselines, embodied-ai]
    created_by: cy0307
    released_by: cy0307 on Hugging Face Spaces
    access: public Space and related artifacts

  - name: Ropedia Xperience-10M Task Baselines
    kind: model
    released: "2026"
    venue: "Hugging Face"
    year: 2026
    status: open
    url: https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines
    derived_from: [Xperience-10M, Xperience-10M Sample]
    scale: "Public task definitions, baseline artifacts, metrics, and scale-up notes for Xperience-10M"
    modalities: [video, audio, depth, camera-pose, mocap, imu, language]
    tasks: [baseline-evaluation, task-suite, multimodal-representation, qwen3-omni, cosmos]
    created_by: cy0307
    released_by: cy0307 on Hugging Face
    license: mit

  - name: GroundVQA
    kind: model
    released: "2023-12"
    venue: "CVPR 2024"
    year: 2023
    status: open
    url: https://github.com/Becomebright/GroundVQA
    paper: https://arxiv.org/abs/2312.06505
    tasks: [long-video-qa, temporal-grounding, ego4d-nlq]

  - name: HiERO
    kind: model
    released: "2025-05"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2505.12911
    paper: https://arxiv.org/abs/2505.12911
    tasks: [hierarchical-activity-reasoning, egomcq, egonlq, procedure-learning]

  - name: EgoAgent
    kind: model
    released: "2025-02"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2502.05857
    paper: https://arxiv.org/abs/2502.05857
    tasks: [joint-predictive-agent, future-state-prediction, action-prediction]

  - name: StillFast
    kind: model
    released: "2023-04"
    venue: "arXiv"
    year: 2023
    status: open
    url: https://iplab.dmi.unict.it/stillfast/
    paper: https://arxiv.org/abs/2304.03959
    tasks: [short-term-object-interaction-anticipation, next-active-object, ego4d]

  - name: CONE
    kind: model
    released: "2022-11"
    venue: "ECCV 2022 Workshop"
    year: 2022
    status: watch
    url: https://arxiv.org/abs/2211.08776
    paper: https://arxiv.org/abs/2211.08776
    tasks: [natural-language-query, video-language-grounding, ego4d-nlq]

  - name: EgoHandTrajPred / USST
    kind: model
    released: "2023-07"
    venue: "ICCV 2023"
    year: 2023
    status: open
    url: https://actionlab-cv.github.io/EgoHandTrajPred
    paper: https://arxiv.org/abs/2307.08243
    tasks: [3d-hand-trajectory-forecasting, anticipation, h2o, egopat3d]

  - name: Project Aria Tools
    kind: toolkit
    released: "2022"
    venue: "GitHub"
    status: open
    url: https://github.com/facebookresearch/projectaria_tools
    tasks: [vrs-loading, calibration, mps, gaze, trajectory]

  - name: EPIC-KITCHENS action models
    kind: toolkit
    released: "2019"
    venue: "GitHub"
    status: open
    url: https://github.com/epic-kitchens/action-models
    tasks: [action-recognition-baselines]

  # Loader and visualization toolkit for Xperience-10M.
  - name: HOMIE-toolkit
    kind: toolkit
    released: "2026"
    venue: "GitHub"
    year: 2026
    status: open
    url: https://github.com/Ropedia/HOMIE-toolkit
    derived_from: Xperience-10M
    tasks: [xperience-10m-loading, hdf5-annotations, visualization, rerun, calibration, depth, mocap, imu]
    created_by: Ropedia
    released_by: Ropedia on GitHub
    # License identifiers are lowercase throughout the catalog (e.g. `mit`, `cc-by-nc-4.0`).
    license: mit

  # --- Catalog expansion (2026-06-14): README entries reconciled into the catalog ---

  # Egocentric conversation / audio-visual correspondence dataset.
  - name: EgoCom / Ego audio-visual correspondence
    kind: dataset
    released: "2020"
    venue: "project page"
    status: open
    # HTTPS scheme, matching the other vision.cs.utexas.edu link in this catalog (EgoEnv).
    url: https://vision.cs.utexas.edu/projects/ego_av_corr/
    scale: "Egocentric video with spatial audio for conversation and audio-visual correspondence tasks"
    tasks: [foundation-video, video-language]

  - name: EasyCom
    kind: dataset
    released: "2021-07"
    venue: "arXiv"
    year: 2021
    status: open
    url: https://arxiv.org/abs/2107.04174
    scale: "AR glasses egocentric multi-channel audio and wide-FOV RGB for noisy conversations"
    tasks: [foundation-video, video-language]

  - name: Look and Tell
    kind: dataset
    released: "2025-10"
    venue: "NeurIPS 2025 Workshop"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2510.22672
    scale: "25 participants, Project Aria plus stationary cameras, gaze/speech/video, 3D reconstructions"
    tasks: [foundation-video, video-language]

  - name: EgoVLA
    milestone: 2025
    milestone_note: "Showed vision-language-action policies can be learned from egocentric human video and transferred to robots."
    kind: model
    released: "2025"
    venue: "project page"
    status: watch
    url: https://rchalyang.github.io/EgoVLA/
    scale: "VLA training from egocentric human videos plus robot fine-tuning"
    tasks: [robot-learning, manipulation, vla]

  - name: EgoEngine
    kind: model
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://egoengine.github.io/
    paper: https://arxiv.org/abs/2606.12604
    scale: "Framework converting egocentric human manipulation videos into high-fidelity robot observation videos and executable robot action trajectories"
    tasks: [robot-learning, manipulation, vla]
    verified_at: "2026-06-16"

  - name: EgoAERO
    kind: model
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.08057
    scale: "Asset-free conversion from a single egocentric RGB-D demonstration; introduces EgoDex-R in the paper"
    tasks: [robot-learning, manipulation, vla]

  - name: HRDexDB
    kind: dataset
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.14944
    scale: "1.4K human/robot grasping trials, tactile, multiview video, egocentric video streams"
    tasks: [robot-learning, manipulation, vla]

  - name: UnrealEgo2 / UnrealEgo-RW
    kind: dataset
    released: "2024-01"
    venue: "arXiv"
    year: 2024
    status: watch
    url: https://arxiv.org/abs/2401.00889
    scale: "Expanded stereo egocentric pose datasets from synthetic and real-world capture"
    tasks: [hand-object, 3d, pose]

  - name: TouchMoment
    kind: dataset
    released: "2026-04"
    venue: "CVPR 2026 Findings"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.12343
    scale: "4,021 egocentric videos, 8,456 annotated hand-object contact moments"
    tasks: [hand-object, 3d, pose]

  - name: EgoFun3D
    kind: dataset
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.11038
    scale: "271 egocentric videos with 3D geometry, part segmentation, articulation and function-template annotations"
    tasks: [hand-object, 3d, pose]

  - name: EgoEMG
    kind: dataset
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.05712
    scale: "41 participants, bilateral EMG, IMU, RGB, external RGB-D, mocap hand labels"
    tasks: [hand-object, 3d, pose]

  - name: EgoEVHands
    kind: dataset
    released: "2024"
    venue: "GitHub"
    status: watch
    url: https://github.com/ZJUWang01/EgoEV-HandPose
    scale: "Stereo event-camera egocentric hand dataset with 5,419 sequences and 3D/2D keypoints"
    tasks: [hand-object, 3d, pose]

  - name: EventEgoHands
    kind: benchmark
    released: "2025-05"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2505.19169
    scale: "Event-based egocentric 3D hand mesh reconstruction benchmark over N-HOT3D"
    tasks: [hand-object, 3d, pose]

  - name: A multimodal RGB/events FPV hand dataset
    kind: dataset
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.10790
    scale: "Synthetic event-based first-person hand detection from EgoHands plus v2e"
    tasks: [hand-object, 3d, pose]

  - name: EgoExoMem
    kind: dataset
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.18734
    scale: "2.6K MCQs across synchronized ego-exo videos"
    tasks: [memory, qa, assistant]

  - name: EgoSelf
    kind: model
    released: "2025"
    venue: "project page"
    status: watch
    url: https://abie-e.github.io/egoself_project/
    scale: "Personalized egocentric assistant framework with graph memory"
    tasks: [memory, qa, assistant]

  - name: EgoCross
    kind: dataset
    released: "2025"
    venue: "GitHub"
    status: watch
    url: https://github.com/MyUniverse0726/EgoCross
    scale: "About 1,000 QA pairs over surgery, industry, extreme sports, and animal-perspective clips"
    tasks: [memory, qa, assistant]

  # Classic daily-activity dataset entry; availability marked `partial`.
  # NOTE(review): the URL path is `/cvap/actions/` on a KTH host — confirm it
  # actually serves this ADL dataset and not a different action dataset.
  - name: ADL Dataset
    kind: dataset
    released: "2012"
    venue: "project page"
    status: partial
    url: https://www.csc.kth.se/cvap/actions/
    scale: "Unscripted daily activity recordings with activity/object/hand annotations in classic literature"
    tasks: [action-recognition, procedure]

  - name: Wrist-mounted ADL
    kind: dataset
    released: "2015-11"
    venue: "CVPR 2016"
    year: 2015
    status: open
    url: https://arxiv.org/abs/1511.06783
    scale: "Synchronized head and wrist wearable-camera daily activities"
    tasks: [action-recognition, procedure]

  # Egocentric video with gaze/head tracking; availability marked `partial`.
  # NOTE(review): `released` is 2023 yet `scale` describes it as "in classic
  # literature" — confirm the description, and verify that the URL is the
  # dataset's official page.
  - name: Visual Experience Dataset / VEDB
    kind: dataset
    released: "2023"
    venue: "project page"
    status: partial
    url: http://tamaraberg.com/visualexperience/
    scale: "240+ hours egocentric video with gaze/head tracking in classic literature"
    tasks: [action-recognition, procedure]

  # Classic long-form egocentric summarization dataset; availability marked `partial`.
  - name: UT Ego
    kind: dataset
    released: "2012"
    venue: "project page"
    status: partial
    # HTTPS scheme, matching the other vision.cs.utexas.edu link in this catalog (EgoEnv).
    url: https://vision.cs.utexas.edu/projects/egocentric/
    scale: "Long daily egocentric videos in classic summarization work"
    tasks: [action-recognition, procedure]

  - name: HUJI EgoSeg
    kind: dataset
    released: "2014"
    venue: "project page"
    status: partial
    url: https://www.vision.huji.ac.il/egoseg/
    scale: "Long egocentric videos for temporal segmentation"
    tasks: [action-recognition, procedure]

  - name: JPL First-Person Interaction
    kind: dataset
    released: "2013"
    venue: "IEEE"
    status: partial
    url: https://ieeexplore.ieee.org/document/6909626
    scale: "First-person videos of people interacting with a humanoid observer"
    tasks: [action-recognition, procedure]

  - name: FT-HID
    kind: dataset
    released: "2022"
    venue: "GitHub"
    status: open
    url: https://github.com/ENDLICHERE/FT-HID
    scale: "90K+ RGB-D first- and third-person human interaction samples from 109 subjects"
    tasks: [action-recognition, procedure]

  - name: LSC-ADL
    kind: dataset
    released: "2025-04"
    venue: "arXiv"
    year: 2025
    status: open
    url: https://arxiv.org/abs/2504.02060
    scale: "ADL annotations over lifelogging data generated with clustering plus human review"
    tasks: [action-recognition, procedure]

  - name: SEED4D
    kind: dataset
    released: "2024-12"
    venue: "WACV 2025"
    year: 2024
    status: open
    scope: adjacent
    url: https://seed4d.github.io/
    paper: https://arxiv.org/abs/2412.00730
    scale: "Synthetic ego-exo dynamic 4D generator and autonomous-driving dataset (16.8M images, vehicle inward/outward cameras, LiDAR; WACV 2025)"
    tasks: [driving, 4d-reconstruction, multi-view]

  - name: Ego4D Benchmarks
    kind: benchmark
    released: "2021"
    venue: "project page"
    status: benchmark
    url: https://ego4d-data.org/
    scale: "Derived benchmark suite built on Ego4D; tasks: Natural Language Query, Moment Query, episodic memory, state change, long-term anticipation, social/audio, hand-object"
    tasks: [benchmark, evaluation]

  - name: Ego-Exo4D Benchmarks
    kind: benchmark
    released: "2023"
    venue: "project page"
    status: benchmark
    url: https://ego-exo4d-data.org/
    scale: "Derived benchmark suite built on Ego-Exo4D; tasks: Fine-grained activity, proficiency, cross-view translation, 3D pose, object correspondence"
    tasks: [benchmark, evaluation]

  - name: EPIC-KITCHENS Challenges
    kind: benchmark
    released: "2018"
    venue: "project page"
    status: benchmark
    url: https://epic-kitchens.github.io/
    scale: "Derived benchmark suite built on EPIC-KITCHENS / EPIC-KITCHENS-100; tasks: Recognition, detection, anticipation, retrieval, domain adaptation"
    tasks: [benchmark, evaluation]

  - name: HD-EPIC VQA Challenge
    kind: benchmark
    released: "2025-02"
    venue: "arXiv"
    year: 2025
    status: benchmark
    url: https://arxiv.org/abs/2502.04144
    scale: "Derived benchmark suite built on HD-EPIC; tasks: Recipe, ingredient, nutrition, fine-grained action, 3D perception, object motion, gaze"
    tasks: [benchmark, evaluation]

  - name: EgoEnv
    kind: model
    released: "2022"
    venue: "project page"
    status: open
    url: https://vision.cs.utexas.edu/projects/ego-env/
    scale: "Environment-aware representation learning from egocentric video"
    tasks: [video-language, representation-learning]

  # Shared-memory baseline model for multi-agent egocentric video QA.
  # NOTE(review): `url` is the ma-egoqa project page, which another catalog entry
  # also uses — presumably this baseline ships with that project; confirm, and
  # consider adding `paper:` and `year:` once verified.
  - name: EgoMAS
    kind: model
    released: "2026"
    venue: "project page"
    status: open
    url: https://ma-egoqa.github.io/
    scale: "Shared-memory baseline for multi-agent egocentric video QA"
    tasks: [video-language, representation-learning]

  - name: Ego-Exo representation transfer
    kind: model
    released: "2021-04"
    venue: "CVPR 2021"
    year: 2021
    status: open
    url: https://arxiv.org/abs/2104.07905
    scale: "Distillation from third-person video using ego-specific latent signals"
    tasks: [action, tracking, pose, hoi]

  - name: EgoPoseFormer
    kind: model
    released: "2024"
    venue: "GitHub"
    status: open
    url: https://github.com/ChenhongyiYang/egoposeformer
    scale: "Transformer baseline for stereo egocentric 3D human pose estimation"
    tasks: [action, tracking, pose, hoi]

  - name: EgoSTARK
    kind: model
    released: "2023-01"
    venue: "arXiv"
    year: 2023
    status: open
    url: https://arxiv.org/abs/2301.03213
    scale: "Adapted long-term tracker baseline for EgoTracks"
    tasks: [action, tracking, pose, hoi]

  - name: EgoHOS model
    kind: model
    released: "2022"
    venue: "GitHub"
    status: open
    url: https://github.com/owenzlz/EgoHOS
    scale: "Context-aware hand-object segmentation and augmentation pipeline"
    tasks: [action, tracking, pose, hoi]

  - name: AV-CONV
    kind: model
    released: "2023"
    venue: "project page"
    status: open
    url: https://vjwq.github.io/AV-CONV/
    scale: "Audio-visual conversational graph prediction from ego/exo conversation"
    tasks: [action, tracking, pose, hoi]

  - name: EgoAction
    kind: model
    released: "2026-05"
    venue: "CVPR 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.24496
    scale: "CVPR 2026 EPIC-KITCHENS action detection challenge pipeline"
    tasks: [action, tracking, pose, hoi]

  - name: EgoAdapt
    kind: model
    released: "2026-05"
    venue: "CVPR 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.24500
    scale: "CVPR 2026 HD-EPIC VQA challenge inference-time adaptation pipeline"
    tasks: [action, tracking, pose, hoi]

  - name: Ego4D CLI and docs
    kind: toolkit
    released: "2021"
    venue: "project page"
    status: open
    url: https://ego4d-data.org/
    scale: "Downloading and working with Ego4D data after license approval."
    tasks: [tooling]

  - name: Ego-Exo4D CLI and docs
    kind: toolkit
    released: "2023"
    venue: "project page"
    status: open
    url: https://ego-exo4d-data.org/
    scale: "Downloading synchronized ego-exo data and annotations."
    tasks: [tooling]

  - name: VISOR API
    kind: toolkit
    released: "2022"
    venue: "GitHub"
    status: open
    url: https://github.com/epic-kitchens/VISOR
    scale: "Loading dense EPIC-KITCHENS hand/object masks and relations."
    tasks: [tooling]

  - name: EgoObjects API
    kind: toolkit
    released: "2023"
    venue: "GitHub"
    status: open
    url: https://github.com/facebookresearch/EgoObjects
    scale: "Working with category and instance-level egocentric object labels."
    tasks: [tooling]

  - name: HOT3D tooling
    kind: toolkit
    released: "2024"
    venue: "project page"
    status: open
    url: https://facebookresearch.github.io/hot3d/
    scale: "Loading HOT3D hand/object/camera pose annotations and models."
    tasks: [tooling]

  - name: HOI4D tooling
    kind: toolkit
    released: "2022"
    venue: "project page"
    status: open
    url: https://hoi4d.github.io/
    scale: "Loading RGB-D frames, point clouds, object meshes, and pose/segmentation annotations."
    tasks: [tooling]

  # --- Catalog expansion (2026-06-14): added from web research ---

  - name: EgoVid-5M
    kind: dataset
    released: "2024-11"
    venue: "NeurIPS 2025"
    year: 2024
    status: open
    url: https://egovid.github.io/
    paper: https://arxiv.org/abs/2411.08380
    scale: "5M curated egocentric clips at 1080p with fine-grained kinematic and high-level textual action annotations (NeurIPS 2025)"
    tasks: [video-generation, world-modeling, action-conditioned-generation]

  - name: AoE
    kind: dataset
    released: "2026-02"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.23893
    scale: "Always-on egocentric human video collection pipeline and corpus for embodied AI data scaling"
    tasks: [robot-learning, data-collection, embodied-ai]

  - name: Seeing Across Views (MV-RoboBench)
    kind: benchmark
    released: "2025-10"
    venue: "ICLR 2026"
    year: 2025
    status: open
    scope: adjacent
    url: https://github.com/microsoft/MV-RoboBench
    paper: https://arxiv.org/abs/2510.19400
    scale: "MV-RoboBench: 1.7K curated QA items over eight subtasks testing multi-view spatial reasoning of VLMs in robotic manipulation scenes (ICLR 2026)"
    tasks: [multi-view, robot-manipulation, vlm-evaluation]

  - name: EgoGesture
    kind: dataset
    released: "2017"
    venue: "IEEE TMM 2018"
    year: 2017
    status: open
    url: https://ieeexplore.ieee.org/document/8299578/
    scale: "24K+ gesture samples, 3M RGB-D frames, 50 subjects, 83 static and dynamic gestures across six indoor/outdoor scenes"
    tasks: [gesture-recognition, hand, wearable-interaction]

  - name: EgoBrain
    kind: dataset
    released: "2025-06"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2506.01353
    scale: "Synchronized EEG and egocentric video for human action understanding from minds and eyes"
    tasks: [eeg, action-understanding, multimodal]

  - name: MM-Ego
    kind: model
    released: "2024-10"
    venue: "ICLR 2025"
    year: 2024
    status: open
    url: https://arxiv.org/abs/2410.07177
    scale: "Egocentric multimodal LLM with Memory Pointer Prompting; 7M-sample QA data engine and the EgoMemoria benchmark (629 videos, 7,026 questions)"
    tasks: [video-language, memory, qa]

  - name: EgoStream
    kind: benchmark
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.31557
    scale: "2,250 questions across seven memory dimensions with Answer Validity Windows, expanded to 8,528 recall-conditioned evaluations over streams up to 45.3 hours"
    tasks: [streaming-memory, episodic-memory, qa]

  - name: EgoMemory
    kind: benchmark
    released: "2026"
    venue: "OpenReview"
    year: 2026
    status: watch
    url: https://openreview.net/forum?id=T0em4hJCQb
    scale: "165,795 user-specific object annotations over 245 videos from 45 participants for memory-augmented personalized retrieval"
    tasks: [personalized-retrieval, episodic-memory, long-context]

  - name: EgoTextVQA
    kind: benchmark
    released: "2025"
    venue: "CVPR 2025"
    year: 2025
    status: open
    url: https://openaccess.thecvf.com/content/CVPR2025/papers/Zhou_EgoTextVQA_Towards_Egocentric_Scene-Text_Aware_Video_Question_Answering_CVPR_2025_paper.pdf
    scale: "Egocentric scene-text-aware video QA across indoor housekeeping and outdoor driving scenes (CVPR 2025)"
    tasks: [scene-text, qa, reading]

  - name: Gesture-Based Egocentric Video QA
    kind: benchmark
    released: "2026-03"
    venue: "CVPR 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.12533
    scale: "Egocentric video QA grounded in the camera wearer's pointing and deictic gestures"
    tasks: [gesture-grounding, qa, referential]

  - name: EgoVQA
    kind: benchmark
    released: "2019"
    # Workshop paper: the URL below is in the ICCV 2019 Workshops (ICCVW) proceedings, EPIC track.
    venue: "ICCVW 2019"
    year: 2019
    status: open
    url: https://openaccess.thecvf.com/content_ICCVW_2019/html/EPIC/Fan_EgoVQA_-_An_Egocentric_Video_Question_Answering_Benchmark_Dataset_ICCVW_2019_paper.html
    scale: "600+ QA pairs over egocentric videos; an early first-person video question answering benchmark"
    tasks: [qa, action, classic]

  - name: ExAct
    kind: benchmark
    released: "2025-06"
    venue: "arXiv"
    year: 2025
    status: open
    url: https://arxiv.org/abs/2506.06277
    scale: "Video-language benchmark for expert action analysis and feedback over skilled egocentric/exocentric activity"
    tasks: [expert-feedback, skill-assessment, video-language]

  - name: Home Action Genome / HOMAGE
    kind: dataset
    released: "2021-05"
    venue: "CVPR 2021"
    year: 2021
    status: open
    url: https://homeactiongenome.org/
    paper: https://arxiv.org/abs/2105.05226
    scale: "27 participants, multi-modal synchronized ego and third-person views with 12 sensor types and hierarchical activity/action labels in home settings"
    tasks: [action-recognition, multi-view, compositional]

  - name: EgoExo-Fitness
    kind: dataset
    released: "2024-06"
    venue: "ECCV 2024"
    year: 2024
    status: open
    url: https://github.com/iSEE-Laboratory/EgoExo-Fitness
    paper: https://arxiv.org/abs/2406.08877
    scale: "Synchronized ego and exo fitness videos with keypoint verification, execution comments, and action quality scores (ECCV 2024)"
    tasks: [ego-exo, action-quality, skill-assessment]

  - name: WEAR
    kind: dataset
    released: "2023-04"
    venue: "IMWUT 2024"
    year: 2023
    status: open
    url: https://mariusbock.github.io/wear/
    paper: https://arxiv.org/abs/2304.05088
    scale: "22 participants, 18 outdoor workout activities, synchronized egocentric video and 3D acceleration across 11 locations (IMWUT 2024)"
    tasks: [action-recognition, sensor-fusion, imu]

  - name: Egocentric HOI Detection
    kind: benchmark
    released: "2025-06"
    venue: "arXiv"
    year: 2025
    status: open
    url: https://arxiv.org/abs/2506.14189
    scale: "New benchmark and method for detecting hand-object interactions in egocentric video"
    tasks: [hand-object, detection, interaction]

  # --- Catalog expansion (2026-06-15): verified new resources ---

  - name: SHOW3D
    kind: dataset
    released: "2026-03"
    venue: "CVPR 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.28760
    paper: https://arxiv.org/abs/2603.28760
    scale: "In-the-wild 3D hand-object interaction dataset captured with a back-mounted multi-camera rig synchronized to a worn VR headset, with multi-view 3D shape/pose annotations and text descriptions"
    modalities: [multiview-video, 3d-hand-pose, object-pose, text]
    tasks: [hand-object, 3d-hand-object-tracking, 3d-reconstruction]
    raw_video_dependency: "self-captured in-the-wild video"
    scope_note: "Body-worn ego/ego-exo in-the-wild HOI capture (Meta Reality Labs, Yale)."
    verified_at: "2026-06-15"
    citation_key: show3d_2026

  - name: EgoTouch / TouchAnything
    kind: dataset
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://jianyi2004.github.io/TouchAnything-Website/
    paper: https://arxiv.org/abs/2605.13083
    scale: "EgoTouch: 302 manipulation tasks across 4,530 episodes with synchronized egocentric and dual wrist-camera video, bimanual 3D hand pose (42 joints), and dense tactile pressure maps; TouchAnything is the accompanying tactile-estimation framework"
    modalities: [egocentric-video, wrist-camera-video, 3d-hand-pose, contact]
    tasks: [hand-object, contact-understanding, bimanual-manipulation]
    scope_note: "Egocentric plus wrist cameras with tactile pressure; emerging sensing."
    verified_at: "2026-06-15"
    citation_key: egotouch_touchanything_2026

  - name: EgoEverything
    kind: benchmark
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.08342
    paper: https://arxiv.org/abs/2604.08342
    scale: "5,000+ MCQ QA pairs over 100+ hours of egocentric video, with questions generated using human attention/gaze signals for AR long-context understanding"
    modalities: [egocentric-video, gaze, qa]
    tasks: [long-context, video-qa, gaze-reasoning]
    verified_at: "2026-06-15"
    citation_key: egoeverything_2026

  - name: EgoNight
    kind: benchmark
    released: "2025-10"
    venue: "ICLR 2026"
    year: 2025
    status: open
    url: https://insait-institute.github.io/EgoNight/
    paper: https://arxiv.org/abs/2510.06218
    code: https://github.com/dehezhang2/EgoNight
    scale: "Nighttime egocentric benchmark: EgoNight-VQA (3,658 QA over 90 videos, 12 QA types) plus day-night correspondence retrieval and egocentric depth estimation, with day-night aligned synthetic and real video"
    modalities: [egocentric-video, qa, depth, synthetic-rendering]
    tasks: [egocentric-vqa, egocentric-depth, retrieval]
    scope_note: "Low-light / nighttime robustness benchmark (INSAIT)."
    verified_at: "2026-06-15"
    citation_key: egonight_2025

  - name: EgoIntent
    kind: benchmark
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.12147
    paper: https://arxiv.org/abs/2603.12147
    scale: "3,014 steps across 15 indoor/outdoor daily-life scenarios, evaluating local intent (What), global intent (Why), and next-step plan (Next), with future-frame leakage removed"
    modalities: [egocentric-video, qa]
    tasks: [intent, next-action, anticipation]
    verified_at: "2026-06-15"
    citation_key: egointent_2026

  - name: EgoMAGIC
    kind: dataset
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: open
    url: https://arxiv.org/abs/2604.22036
    paper: https://arxiv.org/abs/2604.22036
    data: https://doi.org/10.5281/zenodo.19239154
    scale: "3,355 egocentric field-medicine videos over 50 medical tasks from a head-mounted stereo camera with audio; 1.95M labels for 124 medical objects and an action-detection challenge"
    modalities: [egocentric-video, stereo, audio, object-boxes]
    tasks: [action-detection, object-detection, procedural-task-assistance]
    raw_video_dependency: "self-captured field-medicine video"
    verified_at: "2026-06-15"
    citation_key: egomagic_2026

  - name: Audio Hallucination in Egocentric Video
    kind: benchmark
    released: "2026-04"
    venue: "ICASSP 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.23860
    paper: https://arxiv.org/abs/2604.23860
    scale: "300 egocentric videos and 1,000 sound-focused questions probing audio hallucination in audio-visual LLMs that infer sounds from visual cues"
    modalities: [egocentric-video, audio, qa]
    tasks: [audio-visual-reasoning, sound-understanding, evaluation]
    verified_at: "2026-06-15"
    citation_key: audio_hallucination_ego_2026

  - name: Ego2World
    kind: benchmark
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.13335
    paper: https://arxiv.org/abs/2605.13335
    derived_from: HD-EPIC
    scale: "Compiles egocentric cooking videos (built on HD-EPIC) into executable symbolic worlds with graph-transition rules for belief-state planning under partial observability"
    modalities: [egocentric-video, annotations]
    tasks: [planning, belief-reasoning, embodied-reasoning]
    scope_note: "Egocentric-video-grounded executable simulator/benchmark, not a synthetic-only simulator."
    verified_at: "2026-06-15"
    citation_key: ego2world_2026

  - name: EgoCrowds / CrowdEraser
    kind: dataset
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.29036
    paper: https://arxiv.org/abs/2603.29036
    scale: "EgoCrowds: semi-synthetic paired crowded/empty clips from real egocentric walking-tour footage; CrowdEraser is a video-diffusion framework that removes crowds to produce humanless environment walkthroughs"
    modalities: [egocentric-video, first-person-video]
    tasks: [video-generation, scene-understanding, scene-reconstruction]
    scope_note: "First-person walking-tour video editing/generation."
    verified_at: "2026-06-15"
    citation_key: egocrowds_crowderaser_2026

  # --- Catalog expansion (2026-06-15): growth additions ---

  - name: EgoPressure
    kind: dataset
    released: "2024-09"
    venue: "CVPR 2025"
    # NOTE(review): `year` tracks the venue year here, while other records with a
    # later venue (e.g. MM-Ego, WEAR, EgoNight) track the `released` year — confirm
    # the intended convention before relying on `year` for sorting.
    year: 2025
    status: open
    url: https://yiming-zhao.github.io/EgoPressure/
    paper: https://arxiv.org/abs/2409.02224
    scale: "5.0 hours, 21 participants, a moving egocentric camera plus 7 stationary RGB-D cameras, with hand pose meshes and fine-grained per-contact touch pressure (CVPR 2025)"
    modalities: [egocentric-video, rgbd, 3d-hand-pose, contact]
    tasks: [hand-object, contact-understanding, 3d-hand-pose]
    verified_at: "2026-06-15"
    citation_key: egopressure_2025

  - name: EgoTL
    kind: benchmark
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.09535
    paper: https://arxiv.org/abs/2604.09535
    scale: "Think-aloud (say-before-act) egocentric capture with word-level spoken reasoning and metric-scale spatial annotations over 100+ household tasks; benchmarks VLMs and world models on six task dimensions"
    modalities: [egocentric-video, speech, annotations]
    tasks: [long-context, planning, embodied-reasoning]
    verified_at: "2026-06-15"
    citation_key: egotl_2026

  - name: SuperMemory-VQA
    kind: benchmark
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.00825
    paper: https://arxiv.org/abs/2606.00825
    scale: "52.9 hours of AI-glasses daily activities with RGB, audio transcription, gaze, IMU, and SLAM; 4,853 human-verified QA pairs across object/location memory, intent recall, and scene recall"
    modalities: [egocentric-video, audio, gaze, imu, point-clouds]
    tasks: [episodic-memory, long-context, qa]
    verified_at: "2026-06-15"
    citation_key: supermemory_vqa_2026

  - name: EgoAVU
    kind: benchmark
    released: "2026-02"
    venue: "CVPR 2026"
    year: 2026
    status: open
    url: https://github.com/facebookresearch/EgoAVU
    paper: https://arxiv.org/abs/2602.06139
    code: https://github.com/facebookresearch/EgoAVU
    data: https://huggingface.co/datasets/facebook/EgoAVU_data
    project: https://cs20s030.github.io/EgoAVU/
    scale: "Scalable audio-visual data engine producing EgoAVU-Instruct (3M QAs) and the human-verified EgoAVU-Bench (3K QAs) for egocentric audio-visual understanding (CVPR 2026 highlight)"
    modalities: [egocentric-video, audio, qa]
    tasks: [audio-visual-reasoning, sound-understanding, evaluation]
    verified_at: "2026-06-15"
    citation_key: egoavu_2026

  - name: EgoScale
    milestone: 2026
    milestone_note: "Revealed the log-linear data-scaling law for egocentric human-video VLA pretraining."
    kind: dataset
    released: "2026-02"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.16710
    paper: https://arxiv.org/abs/2602.16710
    scale: "20,854 hours of action-labeled egocentric human video for VLA pretraining with a human-to-robot two-stage transfer recipe; reveals a log-linear human-data scaling law"
    modalities: [egocentric-video, action-labels]
    tasks: [vla, robot-learning, dexterous-manipulation]
    scope_note: "Egocentric human-video corpus for dexterous-manipulation transfer."
    verified_at: "2026-06-15"
    citation_key: egoscale_2026

  - name: StableHand
    kind: model
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://huajian-zeng.github.io/projects/stablehand/
    paper: https://arxiv.org/abs/2605.18553
    scale: "Quality-aware flow-matching baseline for world-space dual-hand motion estimation from egocentric video, evaluated on HOT3D and ARCTIC"
    modalities: [egocentric-video, 3d-hand-pose]
    tasks: [3d-hand-pose, pose, hand-object]
    verified_at: "2026-06-15"
    citation_key: stablehand_2026

  # --- Catalog expansion (2026-06-16): egocentric models / methods ---

  - name: EgoVideo
    kind: model
    released: "2024-06"
    # Challenge report for the CVPR 2024 EgoVis workshop (see `scale`), not a main-conference paper.
    venue: "CVPR 2024 EgoVis Workshop"
    year: 2024
    status: open
    url: https://github.com/OpenGVLab/EgoVideo
    paper: https://arxiv.org/abs/2406.18070
    code: https://github.com/OpenGVLab/EgoVideo
    scale: "Egocentric video foundation model with a slow-fast adaptation recipe; winner of multiple Ego4D / EPIC-KITCHENS challenge tracks (CVPR 2024 EgoVis)"
    modalities: [egocentric-video, text]
    tasks: [foundation-video, video-language, representation-learning]
    verified_at: "2026-06-16"
    citation_key: egovideo_2024

  - name: EgoHOD
    kind: model
    released: "2025-03"
    venue: "ICLR 2025"
    year: 2025
    status: open
    url: https://github.com/InternRobotics/EgoHOD
    paper: https://arxiv.org/abs/2503.00986
    code: https://github.com/InternRobotics/EgoHOD
    scale: "Fine-grained hand-object-dynamics pretraining (HOD pipeline) for egocentric video representation learning; SOTA on EK-100 retrieval/classification and EGTEA (ICLR 2025)"
    modalities: [egocentric-video, hand, text]
    tasks: [video-language-pretraining, hand-object, representation-learning]
    verified_at: "2026-06-16"
    citation_key: egohod_2025

  - name: EgoM2P
    kind: model
    released: "2025-06"
    venue: "ICCV 2025"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2506.07886
    paper: https://arxiv.org/abs/2506.07886
    scale: "Egocentric multimodal, multitask pretraining framework over RGB, depth, gaze, and camera pose with a masked modeling objective"
    modalities: [egocentric-video, depth, gaze, camera-pose]
    tasks: [multimodal, representation-learning, standardized-pretraining]
    verified_at: "2026-06-16"
    citation_key: egom2p_2025

  - name: AlanaVLM
    kind: model
    released: "2024-06"
    venue: "arXiv"
    year: 2024
    status: watch
    url: https://arxiv.org/abs/2406.13807
    paper: https://arxiv.org/abs/2406.13807
    scale: "Multimodal embodied-AI foundation model trained for egocentric video understanding with an egocentric video instruction dataset"
    modalities: [egocentric-video, text]
    tasks: [video-language, embodied-ai, video-qa]
    verified_at: "2026-06-16"
    citation_key: alanavlm_2024

  - name: Vinci
    kind: model
    released: "2024-12"
    venue: "IMWUT 2025"
    # NOTE(review): `year` tracks the venue year here, while other records with a
    # later venue (e.g. MM-Ego, WEAR, EgoNight) track the `released` year — confirm
    # the intended convention before relying on `year` for sorting.
    year: 2025
    status: open
    url: https://github.com/OpenGVLab/vinci
    paper: https://arxiv.org/abs/2412.21080
    code: https://github.com/OpenGVLab/vinci
    scale: "Real-time, always-on embodied smart assistant built on an egocentric vision-language model for portable devices, with historical-context QA, task planning, and step-by-step visual demonstrations"
    modalities: [egocentric-video, audio, text]
    tasks: [real-time-understanding, streaming-assistant, task-assistance]
    verified_at: "2026-06-16"
    citation_key: vinci_2025

  - name: AMEGO
    kind: model
    released: "2024-09"
    venue: "ECCV 2024"
    year: 2024
    status: open
    url: https://gabrielegoletto.github.io/AMEGO/
    paper: https://arxiv.org/abs/2409.10917
    code: https://github.com/gabrielegoletto/AMEGO
    scale: "Active-memory representation from a single long egocentric video, capturing key locations and object interactions for fast multi-query answering; introduces the Active Memories Benchmark (20K+ queries over EPIC-KITCHENS, ECCV 2024)"
    modalities: [egocentric-video]
    tasks: [long-context, memory, episodic-memory]
    verified_at: "2026-06-16"
    citation_key: amego_2024

  - name: EgoInstructor
    kind: model
    released: "2024-01"
    venue: "CVPR 2024"
    year: 2024
    status: open
    url: https://jazzcharles.github.io/Egoinstructor/
    paper: https://arxiv.org/abs/2401.00789
    scale: "Retrieval-augmented egocentric video captioning that retrieves relevant third-person instructional clips, trained with an EgoExoNCE cross-view objective (CVPR 2024)"
    modalities: [egocentric-video, exocentric-video, text]
    tasks: [captioning, video-language, cross-view]
    verified_at: "2026-06-16"
    citation_key: egoinstructor_2024

  - name: GroundNLQ
    kind: model
    released: "2023-06"
    venue: "CVPR 2023"
    year: 2023
    status: open
    url: https://github.com/houzhijian/GroundNLQ
    paper: https://arxiv.org/abs/2306.15255
    code: https://github.com/houzhijian/GroundNLQ
    scale: "Two-stage pretrained multi-scale grounding model for Ego4D Natural Language Queries; champion of the CVPR 2023 Ego4D NLQ challenge"
    modalities: [egocentric-video, text]
    tasks: [natural-language-query, temporal-grounding, ego4d]
    verified_at: "2026-06-16"
    citation_key: groundnlq_2023

  - name: OSGNet
    kind: model
    released: "2025-05"
    venue: "CVPR 2025"
    year: 2025
    status: open
    url: https://github.com/Yisen-Feng/OSGNet
    paper: https://arxiv.org/abs/2505.04270
    code: https://github.com/Yisen-Feng/OSGNet
    scale: "Object-shot enhanced grounding network for egocentric video temporal grounding; top entry on Ego4D NLQ / Goal-Step moment queries (CVPR 2025)"
    modalities: [egocentric-video, text]
    tasks: [natural-language-query, temporal-grounding, object-grounding]
    verified_at: "2026-06-16"
    citation_key: osgnet_2025

  # --- Catalog expansion (2026-06-16): late-2025 / 2026 H1 egocentric models and benchmarks ---

  - name: EgoThinker
    kind: model
    released: "2025-10"
    venue: "NeurIPS 2025"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2510.23569
    paper: https://arxiv.org/abs/2510.23569
    scale: "Egocentric reasoning model with spatio-temporal chain-of-thought supervision and reinforcement fine-tuning for first-person video understanding"
    modalities: [egocentric-video, text]
    tasks: [egocentric-video-reasoning, grounded-cot, video-qa]
    verified_at: "2026-06-16"
    citation_key: egothinker_2025

  - name: Walk through Paintings
    kind: model
    released: "2026-01"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2601.15284
    paper: https://arxiv.org/abs/2601.15284
    scale: "Egocentric world models built from internet priors, generating first-person walkthroughs of static scenes such as paintings"
    modalities: [egocentric-video]
    tasks: [world-modeling, video-generation, scene-understanding]
    verified_at: "2026-06-16"
    citation_key: walk_through_paintings_2026

  - name: Temporal-Aware Ego VLM
    kind: model
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.27184
    paper: https://arxiv.org/abs/2603.27184
    scale: "Training scheme that incentivizes temporal awareness in egocentric video-understanding models, improving order- and timing-sensitive reasoning"
    modalities: [egocentric-video, text]
    tasks: [temporal-reasoning, video-language, representation-learning]
    verified_at: "2026-06-16"
    citation_key: temporal_aware_ego_vlm_2026

  - name: EgoMotion
    kind: model
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.19105
    paper: https://arxiv.org/abs/2604.19105
    scale: "Hierarchical reasoning plus diffusion framework for egocentric vision-language motion generation from first-person context"
    modalities: [egocentric-video, motion, text]
    tasks: [motion, motion-estimation, video-language]
    verified_at: "2026-06-16"
    citation_key: egomotion_2026

  - name: Gaze-SoM HOI Anticipation
    kind: model
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.03667
    paper: https://arxiv.org/abs/2604.03667
    scale: "Leverages gaze and set-of-mark prompting in vision-language LLMs for hand-object-interaction anticipation from egocentric video"
    modalities: [egocentric-video, gaze, text]
    tasks: [anticipation, hand-object, gaze-reasoning]
    verified_at: "2026-06-16"
    citation_key: gaze_som_hoi_anticipation_2026

  - name: Sanpo-D
    kind: benchmark
    released: "2026-01"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2601.18100
    paper: https://arxiv.org/abs/2601.18100
    scale: "Fine-grained re-annotation of the Sanpo egocentric navigation data with spatial signals, benchmarking spatial-conditioned reasoning of VLMs over long first-person videos"
    modalities: [egocentric-video, qa]
    tasks: [spatial-reasoning, navigation, video-qa]
    verified_at: "2026-06-16"
    citation_key: sanpo_d_2026

  # --- Catalog expansion (2026-06-16): exhaustive scan additions ---

  - name: EgoSurgery
    kind: dataset
    released: "2024"
    venue: "MICCAI 2024"
    year: 2024
    status: open
    url: https://github.com/Fujiry0/EgoSurgery
    paper: https://arxiv.org/abs/2503.18755
    code: https://github.com/Fujiry0/EgoSurgery
    scale: "Egocentric open-surgery video dataset family: EgoSurgery-Phase for surgical phase recognition and EgoSurgery-HTS for pixel-wise hand-tool segmentation of 14 surgical tools (MICCAI 2024)"
    modalities: [egocentric-video, segmentation, object-boxes]
    tasks: [action-recognition, hand-object-segmentation, procedural-task-assistance]
    verified_at: "2026-06-16"
    citation_key: egosurgery_2024

  - name: EgoEMS
    kind: dataset
    released: "2025-11"
    venue: "AAAI 2026"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2511.09894
    paper: https://arxiv.org/abs/2511.09894
    scale: "High-fidelity multimodal egocentric dataset for cognitive assistance in emergency medical services, capturing time-critical team actions in high-stakes scenarios"
    modalities: [egocentric-video, audio, multimodal-annotations]
    tasks: [procedural-task-assistance, action-recognition, real-time-understanding]
    verified_at: "2026-06-16"
    citation_key: egoems_2025

  - name: LEMMA
    kind: dataset
    released: "2020-07"
    venue: "ECCV 2020"
    year: 2020
    status: open
    url: https://arxiv.org/abs/2007.15781
    paper: https://arxiv.org/abs/2007.15781
    scale: "Multi-view multi-agent multi-task daily-activity dataset across 14 kitchens and living rooms, densely annotated with atomic actions and human-object interactions (ECCV 2020)"
    modalities: [egocentric-video, third-person-video, multimodal-annotations]
    tasks: [compositional, action-recognition, anticipation]
    verified_at: "2026-06-16"
    citation_key: lemma_2020

  - name: EgoMe
    kind: dataset
    released: "2025-01"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2501.19061
    paper: https://arxiv.org/abs/2501.19061
    scale: "Real-world dataset and challenge for following a demonstrator via egocentric view, pairing exocentric demonstrations with egocentric imitation across everyday tasks"
    modalities: [egocentric-video, exocentric-video]
    tasks: [imitation-learning, cross-view, action-understanding]
    verified_at: "2026-06-16"
    citation_key: egome_2025

  - name: ENIGMA-360
    kind: dataset
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.09741
    paper: https://arxiv.org/abs/2603.09741
    scale: "Ego-exo dataset for human behavior understanding in industrial scenarios with 360-degree exocentric and egocentric capture"
    modalities: [egocentric-video, exocentric-video, first-person-360-video]
    tasks: [action-understanding, ego-exo, procedure]
    verified_at: "2026-06-16"
    citation_key: enigma_360_2026

  - name: EgoIllusion
    kind: benchmark
    released: "2025-08"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2508.12687
    paper: https://arxiv.org/abs/2508.12687
    scale: "Benchmark for hallucinations in egocentric video understanding, probing where multimodal models fabricate objects, actions, and sounds"
    modalities: [egocentric-video, qa]
    tasks: [evaluation, multimodal-reasoning, audio-visual-reasoning]
    verified_at: "2026-06-16"
    citation_key: egoillusion_2025

  - name: EgoExo-Con
    kind: benchmark
    released: "2025-10"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2510.26113
    paper: https://arxiv.org/abs/2510.26113
    scale: "Synchronized ego-exo video pairs with human-refined queries for view-invariant temporal verification and grounding; introduces the View-GRPO training framework"
    modalities: [egocentric-video, exocentric-video, qa]
    tasks: [cross-view, temporal-grounding, evaluation]
    verified_at: "2026-06-16"
    citation_key: egoexo_con_2025

  - name: WearVQA
    kind: benchmark
    released: "2025-11"
    venue: "NeurIPS 2025"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2511.22154
    paper: https://arxiv.org/abs/2511.22154
    scale: "First wearable-device VQA benchmark for smart glasses: 2,520 image-question-answer triplets across 7 image domains and 10 cognitive task types under occluded, low-light, and blurry egocentric capture (NeurIPS 2025)"
    modalities: [egocentric-video, qa]
    tasks: [vqa, evaluation, wearable-sensing]
    verified_at: "2026-06-16"
    citation_key: wearvqa_2025

  - name: WearVox
    kind: benchmark
    released: "2026-01"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2601.02391
    paper: https://arxiv.org/abs/2601.02391
    scale: "Egocentric multichannel voice-assistant benchmark for wearables, evaluating spoken interaction grounded in first-person audio-visual context"
    modalities: [egocentric-video, audio, speech]
    tasks: [audio-visual-reasoning, streaming-assistant, evaluation]
    verified_at: "2026-06-16"
    citation_key: wearvox_2026

  - name: SuperGlasses
    kind: benchmark
    released: "2026-02"
    venue: "CVPR 2026 Findings"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.22683
    paper: https://arxiv.org/abs/2602.22683
    scale: "Benchmark evaluating vision-language models as intelligent agents for AI smart glasses across realistic wearable assistant tasks"
    modalities: [egocentric-video, qa]
    tasks: [tool-using-agents, evaluation, task-assistance]
    verified_at: "2026-06-16"
    citation_key: superglasses_2026

  - name: PhysBrain
    kind: model
    released: "2025-12"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2512.16793
    paper: https://arxiv.org/abs/2512.16793
    scale: "Uses human egocentric data as a bridge to adapt vision-language models toward physical intelligence and embodied control"
    modalities: [egocentric-video, text]
    tasks: [embodied-ai, representation-learning, robot-learning]
    verified_at: "2026-06-16"
    citation_key: physbrain_2025

  # --- Catalog expansion (2026-06-16): verified additions ---

  - name: EgoCampus
    kind: dataset
    released: "2025-12"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2512.07668
    paper: https://arxiv.org/abs/2512.07668
    scale: "Egocentric pedestrian eye-gaze dataset and model over campus walking routes with synchronized first-person video and gaze"
    modalities: [egocentric-video, gaze, scene-labels]
    tasks: [gaze, navigation, trajectory-prediction]
    verified_at: "2026-06-16"
    citation_key: egocampus_2025

  - name: Egocentric Clinical Intent
    kind: benchmark
    released: "2026-01"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2601.06750
    paper: https://arxiv.org/abs/2601.06750
    scale: "Benchmark for egocentric clinical intent understanding by medical multimodal LLMs over first-person clinical procedure video"
    modalities: [egocentric-video, qa]
    tasks: [intent, procedural-task-assistance, evaluation]
    verified_at: "2026-06-16"
    citation_key: egocentric_clinical_intent_2026

  # --- Catalog expansion (2026-06-16): mid-2026 scan additions ---

  - name: 1M-HUGs / HUG
    kind: dataset
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: open
    url: https://grasping.io/
    paper: https://arxiv.org/abs/2606.17054
    data: https://grasping.io/
    benchmark: https://grasping.io/
    scale: "Egocentric human-grasp dataset with 1M frames (27.8 hours), 6,707 object instances across 41 buildings, plus HUG-Bench with 90 unseen objects for zero-shot grasping"
    modalities: [egocentric-video, rgbd, hand-pose, object-meshes]
    tasks: [robot-learning, dexterous-manipulation, hand-object, manipulation]
    verified_at: "2026-06-16"
    citation_key: hug_1m_2026

  - name: EgoKit
    kind: toolkit
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: open
    url: https://egokit.chuange.org/
    paper: https://arxiv.org/abs/2605.16797
    scale: "Unified low-cost egocentric recording workflow across Android, iPhone, iPad, smart-glasses, and XR hosts, with synchronized ego and wrist views plus OpenXR hand tracking where available"
    modalities: [egocentric-video, wrist-camera-video, head-pose, hand-pose]
    tasks: [data-collection, robot-learning, wearable-sensing, tooling]
    verified_at: "2026-06-16"
    citation_key: egokit_2026

  - name: MobileEgo Anywhere
    kind: toolkit
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.05945
    paper: https://arxiv.org/abs/2605.05945
    scale: "Commodity-phone infrastructure for hour-plus egocentric trajectories; releases STERA processing, a mobile recording app, and a reported 200-hour / 584-session long-form dataset"
    modalities: [egocentric-video, trajectory, action-labels]
    tasks: [data-collection, robot-learning, vla, long-horizon-memory]
    verified_at: "2026-06-16"
    citation_key: mobileego_anywhere_2026

  - name: UCS-Bench / DirectMe
    kind: benchmark
    released: "2026-06"
    venue: "ICML 2026"
    year: 2026
    status: open
    url: https://github.com/cocowy1/UCS-Bench
    paper: https://arxiv.org/abs/2606.15200
    code: https://github.com/cocowy1/UCS-Bench
    data: https://github.com/cocowy1/UCS-Bench
    scale: "170+ hours of egocentric observations with 8.1K+ timestamped questions for user-centric continual spatial intelligence; DirectMe maintains streaming spatial memory"
    modalities: [egocentric-video, qa, trajectory]
    tasks: [spatial-reasoning, episodic-memory, long-context, streaming-memory, qa]
    verified_at: "2026-06-16"
    citation_key: ucs_bench_directme_2026

  - name: StreamMemBench
    kind: benchmark
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.14571
    paper: https://arxiv.org/abs/2606.14571
    scale: "Streaming benchmark over EgoLife egocentric streams, pairing each evidence anchor with an initial task and a future follow-up to test recall, feedback incorporation, and reuse"
    modalities: [egocentric-video, qa, dialogue]
    tasks: [streaming-memory, memory-assistance, evaluation, egocentric-vqa]
    verified_at: "2026-06-16"
    citation_key: streammembench_2026

  - name: Ego-METAS
    kind: benchmark
    # NOTE(review): the arXiv ID under `paper` is 2606.*, which suggests a June 2026
    # posting; confirm whether "2026-05" refers to an earlier project-site release.
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://maria-sanvil.github.io/Ego-METAS-website/
    paper: https://arxiv.org/abs/2606.02246
    scale: "Online multimodal energy-efficient temporal action segmentation benchmark over 100+ hours from EgoExo4D, CMU-MMAC, and CaptainCook4D with RGB, audio, gaze, IMU, and monochrome streams"
    modalities: [egocentric-video, audio, gaze, imu, monochrome]
    tasks: [temporal-segmentation, action-segmentation, sensor-fusion, evaluation]
    verified_at: "2026-06-16"
    citation_key: ego_metas_2026

  - name: Plan Watch Recover / EgoProactive
    kind: benchmark
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.04970
    paper: https://arxiv.org/abs/2606.04970
    scale: "Proactive procedural-assistance benchmark introducing EgoProactive with out-of-plan annotations and Pro2Bench over Ego4D, EPIC-KITCHENS, EgoExo4D, HoloAssist, and HowTo100M"
    modalities: [egocentric-video, qa, action-labels]
    tasks: [procedural-task-assistance, mistake-detection, planning, interactive-assistance]
    verified_at: "2026-06-16"
    citation_key: plan_watch_recover_2026

  - name: EgoExoBench
    kind: benchmark
    released: "2025-07"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2507.18342
    paper: https://arxiv.org/abs/2507.18342
    scale: "First-/third-person video reasoning benchmark with 7,300+ QA pairs across 11 subtasks covering semantic alignment, viewpoint association, and temporal reasoning"
    modalities: [egocentric-video, exocentric-video, qa]
    tasks: [ego-exo, cross-view, temporal-reasoning, video-qa, evaluation]
    verified_at: "2026-06-16"
    citation_key: egoexobench_2025

  - name: EgoProx
    kind: benchmark
    released: "2026-05"
    venue: "CVPR 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.24456
    paper: https://arxiv.org/abs/2605.24456
    scale: "Egocentric 3D proximity reasoning benchmark organized across intention, exploration, exploitation, and chain-of-actions reasoning for MLLMs"
    modalities: [egocentric-video, qa, 3d]
    tasks: [spatial-reasoning, 3d, vqa, evaluation]
    verified_at: "2026-06-16"
    citation_key: egoprox_2026

  - name: BARISTA
    kind: benchmark
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.12074
    paper: https://arxiv.org/abs/2605.12074
    scale: "185 egocentric coffee-preparation videos with per-frame scene graphs, persistent object IDs, masks, tracks, boxes, attributes, relations, hand-object interactions, activities, and process steps"
    modalities: [egocentric-video, object-masks, object-boxes, action-labels]
    tasks: [scene-graph-qa, procedural-task-assistance, hand-object, temporal-reasoning]
    verified_at: "2026-06-16"
    citation_key: barista_2026

  - name: TAVIS
    kind: benchmark
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.07943
    paper: https://arxiv.org/abs/2605.07943
    scale: "Active-vision imitation-learning benchmark with TAVIS-Head (5 tasks), TAVIS-Hands (3 tasks), GR1T2 and Reachy2 embodiments, and the GALT anticipatory-gaze metric"
    modalities: [egocentric-video, gaze, robot-actions]
    tasks: [imitation-learning, gaze, robot-learning, evaluation]
    verified_at: "2026-06-16"
    citation_key: tavis_2026

  - name: EgoPoint-Bench
    kind: benchmark
    released: "2026-04"
    venue: "ACL 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.21461
    paper: https://arxiv.org/abs/2604.21461
    scale: "11K+ simulated and real egocentric pointing samples spanning five evaluation dimensions and three levels of referential complexity"
    modalities: [egocentric-video, qa, hand-pose]
    tasks: [gesture-grounding, referential, spatial-reasoning, vqa]
    verified_at: "2026-06-16"
    citation_key: egopoint_bench_2026

  - name: ReFocus / EM-QnF
    kind: benchmark
    released: "2026-04"
    venue: "CVPR 2026"
    year: 2026
    status: watch
    url: https://nsubedi11.github.io/refocus
    paper: https://arxiv.org/abs/2604.24893
    scale: "Episodic Memory with Questions and Feedback task for ambiguous egocentric NLQ, with feedback datasets and a plug-and-play Feedback Alignment Module"
    modalities: [egocentric-video, qa, dialogue]
    tasks: [episodic-memory, natural-language-query, interactive-assistance, temporal-grounding]
    verified_at: "2026-06-16"
    citation_key: refocus_em_qnf_2026

  - name: HumanEgo
    kind: model
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://humanego-ai.github.io
    paper: https://arxiv.org/abs/2605.24934
    scale: "Zero-shot robot learning from minutes of human egocentric videos via entity-level hand-object representations and flow-matching policies"
    modalities: [egocentric-video, hand-pose, robot-actions]
    tasks: [robot-learning, imitation-learning, cross-embodiment-transfer, hand-object]
    verified_at: "2026-06-16"
    citation_key: humanego_2026

  - name: EgoGuide
    kind: model
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://silicx.github.io/EgoGuide
    paper: https://arxiv.org/abs/2606.14665
    scale: "Robot-free demonstration collection with synchronized wrist and head/egocentric observations, online visual-geometric data-quality guidance, and a gated egocentric residual policy"
    modalities: [egocentric-video, wrist-camera-video, robot-actions]
    tasks: [data-collection, robot-learning, imitation-learning, vla]
    verified_at: "2026-06-16"
    citation_key: egoguide_2026

  - name: Ego-Pi
    kind: model
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://egopipaper.github.io/
    paper: https://arxiv.org/abs/2606.08107
    scale: "VLA fine-tuning study over egocentric human and robot data using the pi0.5 foundation model and dexterous five-finger embodiments"
    modalities: [egocentric-video, robot-actions, hand-pose]
    tasks: [vla, robot-learning, cross-embodiment-transfer, dexterous-manipulation]
    verified_at: "2026-06-16"
    citation_key: ego_pi_2026

  - name: EgoTactile
    kind: model
    released: "2026-06"
    venue: "ICML 2026 spotlight"
    year: 2026
    status: watch
    url: https://egotactile.github.io/
    paper: https://arxiv.org/abs/2606.09243
    scale: "Benchmark and diffusion/baseline models pairing egocentric video with full-hand grasp-pressure supervision for everyday-object interactions"
    modalities: [egocentric-video, hand-pose, contact, depth]
    tasks: [contact-understanding, hand-object, robot-learning, evaluation]
    verified_at: "2026-06-16"
    citation_key: egotactile_2026

  - name: EgoPressDiff
    kind: model
    released: "2026-06"
    venue: "ICASSP 2026"
    year: 2026
    status: watch
    url: https://egopressdiff.github.io/
    paper: https://arxiv.org/abs/2606.06872
    scale: "Conditional video diffusion model for UV-domain egocentric hand-pressure maps, conditioned on hand pose, 3D mesh vertices, and depth"
    modalities: [egocentric-video, hand-pose, mesh, depth]
    tasks: [contact-understanding, hand-object, robot-learning]
    verified_at: "2026-06-16"
    citation_key: egopressdiff_2026

  - name: EgoForce Hand Pose
    kind: model
    released: "2026-05"
    venue: "SIGGRAPH 2026"
    year: 2026
    status: open
    url: https://dfki-av.github.io/EgoForce
    paper: https://arxiv.org/abs/2605.12498
    code: https://dfki-av.github.io/EgoForce
    data: https://dfki-av.github.io/EgoForce
    scale: "Monocular egocentric 3D hand-pose and shape reconstruction across fisheye, perspective, and wide-FOV camera models, with source code, data, and demo available"
    modalities: [egocentric-video, hand-pose, hand-meshes]
    tasks: [3d-hand-pose, hand-object, ar-vr, pose]
    verified_at: "2026-06-16"
    citation_key: egoforce_hand_pose_2026

  - name: PROSE
    kind: model
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://rckola.github.io/prose/
    paper: https://arxiv.org/abs/2606.16569
    scale: "Training-free RGB-only egocentric scene registration using VLM-derived object-level 3D scene graphs for persistent spatial memory in robots and AR"
    modalities: [egocentric-video, 3d, object-masks]
    tasks: [scene-reconstruction, spatial-reasoning, ar-perception, 3d]
    verified_at: "2026-06-16"
    citation_key: prose_2026

  - name: OSGNet + MLLM Reranking
    kind: model
    released: "2026-05"
    venue: "CVPR 2026 EgoVis"
    year: 2026
    status: open
    url: https://github.com/iLearn-Lab/CVPR25-OSGNet
    paper: https://arxiv.org/abs/2605.20818
    code: https://github.com/iLearn-Lab/CVPR25-OSGNet
    scale: "Champion solution for the Ego4D Episodic Memory Challenge 2026 Natural Language Queries and GoalStep tracks, reranking OSGNet candidates with an MLLM"
    modalities: [egocentric-video, text]
    tasks: [natural-language-query, temporal-grounding, ego4d, video-language]
    verified_at: "2026-06-16"
    citation_key: osgnet_mllm_reranking_2026

  - name: OmniEgo-R2
    kind: model
    released: "2026-05"
    venue: "CVPR 2026 EgoVis"
    year: 2026
    status: open
    url: https://github.com/Lee-zixu/OmniEgo-R2
    paper: https://arxiv.org/abs/2605.24481
    code: https://github.com/Lee-zixu/OmniEgo-R2
    scale: "Routed reasoning framework for EgoCross; second place in both Source-Limited and Open-Source tracks of the 1st Cross-Domain EgoCross Challenge"
    modalities: [egocentric-video, qa]
    tasks: [video-qa, egocentric-video-reasoning, domain-transfer, grounded-reasoning]
    verified_at: "2026-06-16"
    citation_key: omniego_r2_2026

  - name: CASTLE KG Retrieval
    kind: model
    released: "2026-06"
    venue: "CVPR 2026 EgoVis"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.01933
    paper: https://arxiv.org/abs/2606.01933
    scale: "Third-place CASTLE 2026 challenge framework using video knowledge graphs and adaptive hierarchical retrieval over 600+ hours of synchronized 15-camera ego/exo streams"
    modalities: [egocentric-video, exocentric-video, multimodal-annotations]
    tasks: [long-context, multi-view, video-qa, temporal-reasoning]
    verified_at: "2026-06-16"
    citation_key: castle_kg_retrieval_2026

  - name: Reflective Dialogue EgoCross
    kind: model
    released: "2026-05"
    venue: "CVPR 2026 EgoVis"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.27885
    paper: https://arxiv.org/abs/2605.27885
    scale: "Inference-time Teacher/Solver reflective dialogue method for EgoCross, achieving third place in the Open-Source track without fine-tuning"
    modalities: [egocentric-video, qa, dialogue]
    tasks: [video-qa, egocentric-video-reasoning, domain-transfer, grounded-reasoning]
    verified_at: "2026-06-16"
    citation_key: reflective_dialogue_egocross_2026

  - name: EARL
    kind: model
    released: "2026-05"
    venue: "ICML 2026"
    year: 2026
    status: open
    url: https://github.com/yuggiehk/EARL
    paper: https://arxiv.org/abs/2605.14742
    code: https://github.com/yuggiehk/EARL
    scale: "Analysis-guided RL framework for egocentric interaction reasoning and pixel grounding, with structured coarse-to-fine interaction parsing and GRPO optimization"
    modalities: [egocentric-video, object-masks, qa]
    tasks: [interaction, visual-grounding, segmentation, embodied-reasoning]
    verified_at: "2026-06-16"
    citation_key: earl_2026

  - name: EgoExo-WM
    kind: model
    released: "2026-05"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://vision.cs.utexas.edu/projects/EgoExo-WM/
    paper: https://arxiv.org/abs/2605.15477
    scale: "Transforms exocentric video into egocentric video using body-pose priors to train whole-body action-conditioned egocentric world models"
    modalities: [egocentric-video, exocentric-video, human-pose]
    tasks: [world-modeling, cross-view, action-conditioned-generation, human-motion]
    verified_at: "2026-06-16"
    citation_key: egoexo_wm_2026

  - name: MotionGRPO
    kind: model
    released: "2026-05"
    venue: "ICML 2026"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2605.05680
    paper: https://arxiv.org/abs/2605.05680
    scale: "GRPO-based reinforcement post-training for diffusion egocentric full-body 3D motion recovery from head-mounted device signals"
    modalities: [egocentric-video, body-pose, human-motion]
    tasks: [egocentric-3d-pose, motion-estimation, policy-optimization]
    verified_at: "2026-06-16"
    citation_key: motiongrpo_2026

  - name: Hand Trajectory Fusion for Ego NLQ
    kind: model
    released: "2026-06"
    venue: "CVPR 2026 EgoVis"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.02962
    paper: https://arxiv.org/abs/2606.02962
    scale: "Hand-trajectory encoder and cross-attention fusion for Ego4D Natural Language Query grounding, with gains on hand-object-interaction and quantity/state queries"
    modalities: [egocentric-video, hand-pose, text]
    tasks: [natural-language-query, temporal-grounding, hand-object, ego4d]
    verified_at: "2026-06-16"
    citation_key: hand_trajectory_fusion_ego_nlq_2026

  - name: ACE-Ego-0
    kind: model
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.17200
    paper: https://arxiv.org/abs/2606.17200
    scale: "Unifies 4.53K hours of robot/sim data with 1.48K hours of pseudo-action-labeled egocentric human video via reliability-aware weighting for VLA pretraining; evaluated on RoboCasa GR1 and RoboTwin 2.0"
    modalities: [egocentric-video, language, robot-actions]
    tasks: [vla, robot-learning, imitation-learning, manipulation]
    release_note: "arXiv preprint (June 2026); watch for code and checkpoint release."
    verified_at: "2026-06-18"
    citation_key: ace_ego0_2026

  - name: EgoXtreme
    kind: dataset
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.25135
    paper: https://arxiv.org/abs/2603.25135
    scale: "Smart-glasses egocentric 6D object-pose dataset spanning industrial, sports, and rescue scenes with extreme motion blur, dynamic lighting, and heavy occlusion where current pose estimators fail"
    modalities: [egocentric-video, rgb]
    tasks: [3d-4d-understanding, pose-and-body]
    release_note: "arXiv preprint (March 2026); watch for dataset and benchmark release."
    verified_at: "2026-06-18"
    citation_key: egoxtreme_2026

  - name: EgoForge
    kind: model
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.20169
    paper: https://arxiv.org/abs/2603.20169
    scale: "Goal-directed egocentric world simulator that rolls out coherent first-person video from a single egocentric image and a high-level instruction"
    modalities: [egocentric-video, text]
    tasks: [world-modeling, video-generation, hand-object-interaction]
    release_note: "arXiv preprint (March 2026); watch for code and weights."
    verified_at: "2026-06-18"
    citation_key: egoforge_2026

  - name: EgoHOI World Model
    kind: model
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.13615
    paper: https://arxiv.org/abs/2603.13615
    scale: "Physics-informed egocentric world model that synthesizes photorealistic, contact-consistent hand-object interactions from action signals alone, without future-state inputs"
    modalities: [egocentric-video]
    tasks: [world-modeling, hand-object-interaction]
    release_note: "arXiv preprint (March 2026); watch for code release."
    verified_at: "2026-06-18"
    citation_key: egohoi_wm_2026

  - name: EgoControl
    kind: model
    released: "2025-11"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2511.18173
    paper: https://arxiv.org/abs/2511.18173
    scale: "Pose-controllable egocentric video diffusion model that generates temporally coherent first-person frames conditioned on sequences of 3D full-body poses"
    modalities: [egocentric-video, body-pose]
    tasks: [video-generation, world-modeling, pose-and-body]
    release_note: "arXiv preprint (November 2025); watch for code release."
    verified_at: "2026-06-18"
    citation_key: egocontrol_2025

  - name: EgoFlow
    kind: model
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.01421
    paper: https://arxiv.org/abs/2604.01421
    scale: "Gradient-guided flow-matching framework that generates physically plausible 6DoF object trajectories from egocentric video using differentiable collision and smoothness constraints"
    modalities: [egocentric-video]
    tasks: [3d-4d-understanding, hand-object-interaction]
    release_note: "arXiv preprint (April 2026); watch for code release."
    verified_at: "2026-06-18"
    citation_key: egoflow_2026

  - name: WholeBodyVLA
    kind: model
    released: "2025-12"
    venue: "ICLR 2026"
    year: 2025
    status: open
    url: https://github.com/OpenDriveLab/WholebodyVLA
    paper: https://arxiv.org/abs/2512.11047
    code: https://github.com/OpenDriveLab/WholebodyVLA
    scale: "Unified latent VLA that learns latent actions from action-free egocentric videos via a Latent Action Model for whole-body humanoid loco-manipulation; +21.3% over GR00T on AgiBot X2"
    modalities: [egocentric-video, language, robot-actions]
    tasks: [vla, robot-learning, manipulation]
    verified_at: "2026-06-19"
    citation_key: wholebodyvla_2025

  - name: In-N-On
    kind: model
    released: "2025-11"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2511.15704
    paper: https://arxiv.org/abs/2511.15704
    scale: "Scales egocentric manipulation with 1,000+ hours of in-the-wild human egocentric video (PHSD) plus 20+ hours of on-task data to train the Human0 language-conditioned flow-matching policy"
    modalities: [egocentric-video, language]
    tasks: [robot-learning, manipulation, imitation-learning]
    release_note: "arXiv preprint (November 2025); watch for code and data release."
    verified_at: "2026-06-19"
    citation_key: in_n_on_2025

  - name: EgoZero
    kind: model
    released: "2025-05"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2505.20290
    paper: https://arxiv.org/abs/2505.20290
    scale: "Trains robot manipulation policies from Project Aria smart-glasses human demonstrations with zero robot training data; ~70% zero-shot success across 7 tasks on a Franka arm"
    modalities: [egocentric-video]
    tasks: [robot-learning, manipulation, imitation-learning]
    release_note: "arXiv preprint (May 2025); watch for code release."
    verified_at: "2026-06-19"
    citation_key: egozero_2025

  - name: Ego-PM
    kind: model
    released: "2025-08"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2508.19852
    paper: https://arxiv.org/abs/2508.19852
    scale: "Egocentric predictive world model that jointly forecasts future actions and video frames conditioned on hand trajectories via a latent diffusion model; tested on Ego4D, BridgeData, and RLBench"
    modalities: [egocentric-video, hand-pose, text]
    tasks: [world-modeling, video-generation, robot-learning]
    release_note: "arXiv preprint (August 2025); watch for code release."
    verified_at: "2026-06-19"
    citation_key: ego_pm_2025

  - name: PEDESTRIAN
    kind: dataset
    released: "2025-12"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2512.19190
    paper: https://arxiv.org/abs/2512.19190
    scale: "340 first-person pavement videos covering 29 urban sidewalk obstacle types, with deep-learning baselines for pedestrian-safety obstacle detection"
    modalities: [egocentric-video, rgb]
    tasks: [detection-segmentation, ar-sensing-navigation]
    release_note: "arXiv preprint (December 2025); watch for dataset release."
    verified_at: "2026-06-19"
    citation_key: pedestrian_2025

  - name: DeltaDorsal
    kind: model
    released: "2026-01"
    venue: "arXiv"
    year: 2026
    status: open
    url: https://github.com/hilab-open-source/deltadorsal
    paper: https://arxiv.org/abs/2601.15516
    code: https://github.com/hilab-open-source/deltadorsal
    scale: "Dual-stream delta encoder that uses dorsal hand-skin deformation features to improve egocentric hand pose under self-occlusion (~18% lower mean per-joint angle error)"
    modalities: [egocentric-video, hand-pose]
    tasks: [pose-and-body, hand-object-interaction]
    verified_at: "2026-06-19"
    citation_key: deltadorsal_2026

  - name: Edge Episodic Memory QA
    kind: model
    released: "2026-02"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.22455
    paper: https://arxiv.org/abs/2602.22455
    scale: "On-device dual-thread MLLM pipeline (continuous video-to-text summarization plus asynchronous querying) for real-time egocentric episodic-memory QA on QAEgo4D-Closed"
    modalities: [egocentric-video, text]
    tasks: [question-answering, memory-and-long-context]
    release_note: "arXiv preprint (February 2026); watch for code release."
    verified_at: "2026-06-19"
    citation_key: edge_episodic_memory_qa_2026

  - name: LOME
    kind: model
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.27449
    paper: https://arxiv.org/abs/2603.27449
    scale: "Action-conditioned egocentric world model that generates photorealistic human-object interactions from an image, text, and per-frame body/hand actions, with realistic physical consequences"
    modalities: [egocentric-video, body-pose, hand-pose, text]
    tasks: [world-modeling, video-generation, hand-object-interaction]
    release_note: "arXiv preprint (March 2026); watch for code release."
    verified_at: "2026-06-19"
    citation_key: lome_2026

  - name: Ego-Nav Co-training
    kind: model
    released: "2026-06"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2606.01951
    paper: https://arxiv.org/abs/2606.01951
    scale: "Converts egocentric human walking videos into robot-action datasets and co-trains a VLA with robot demonstrations for mobile navigation (fruit-search task)"
    modalities: [egocentric-video, robot-actions, language]
    tasks: [vla, robot-learning, ar-sensing-navigation]
    release_note: "arXiv preprint (June 2026); watch for code release."
    verified_at: "2026-06-19"
    citation_key: ego_nav_cotraining_2026

  - name: EXPLORE-Bench
    kind: benchmark
    released: "2026-03"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2603.09731
    paper: https://arxiv.org/abs/2603.09731
    scale: "Egocentric scene-prediction benchmark over real first-person videos: predict the final scene state after long atomic-action sequences, with object, attribute, and relation annotations; large human-model gap"
    modalities: [egocentric-video, text]
    tasks: [reasoning-intent-planning, question-answering, world-modeling]
    release_note: "arXiv preprint (March 2026); watch for benchmark release."
    verified_at: "2026-06-19"
    citation_key: explore_bench_2026

  - name: Hand2World
    kind: model
    released: "2026-02"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.09600
    paper: https://arxiv.org/abs/2602.09600
    scale: "Autoregressive egocentric world model that generates photorealistic first-person interaction video from free-space hand gestures using occlusion-invariant hand conditioning and Plücker-ray camera embeddings; best on ARCTIC, HOT3D, HOI4D"
    modalities: [egocentric-video, hand-pose]
    tasks: [world-modeling, video-generation, hand-object-interaction]
    release_note: "arXiv preprint (February 2026); watch for code release."
    verified_at: "2026-06-19"
    citation_key: hand2world_2026

  - name: EgoSim
    kind: model
    released: "2026-04"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2604.01001
    paper: https://arxiv.org/abs/2604.01001
    scale: "Closed-loop egocentric world simulator that generates spatially consistent first-person interaction videos with explicit 3D grounding and dynamic world-state updates for multi-stage simulation and cross-embodiment transfer"
    modalities: [egocentric-video, camera-pose]
    tasks: [world-modeling, video-generation, robot-learning]
    release_note: "arXiv preprint (April 2026); watch for code release."
    verified_at: "2026-06-19"
    citation_key: egosim_2026

  - name: EgoPush
    kind: model
    released: "2026-02"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.18071
    paper: https://arxiv.org/abs/2602.18071
    scale: "End-to-end egocentric multi-object non-prehensile rearrangement for mobile robots from a single first-person camera, using object-centric relative-pose latents and teacher-student RL with zero-shot sim-to-real transfer"
    modalities: [egocentric-video]
    tasks: [robot-learning, manipulation, ar-sensing-navigation]
    release_note: "arXiv preprint (February 2026); watch for code release."
    verified_at: "2026-06-19"
    citation_key: egopush_2026

  - name: MEgoHand
    kind: model
    released: "2025-05"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2505.16602
    paper: https://arxiv.org/abs/2505.16602
    scale: "Multimodal egocentric hand-object interaction motion generator pairing a VLM cerebrum with a flow-matching policy, trained on a unified dataset of 3.35M RGB-D frames, 24K interactions, and 1.2K objects"
    modalities: [egocentric-video, rgb-d, text, hand-pose]
    tasks: [hand-object-interaction, video-generation, pose-and-body]
    release_note: "arXiv preprint (May 2025); watch for code and data release."
    verified_at: "2026-06-19"
    citation_key: megohand_2025

  - name: EgoH4
    kind: model
    released: "2025-04"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2504.08654
    paper: https://arxiv.org/abs/2504.08654
    scale: "Diffusion transformer that forecasts both-hand 3D trajectories and poses from egocentric video and camera pose, including out-of-frame hands; trained on Ego-Exo4D (156K train / 34K eval sequences)"
    modalities: [egocentric-video, camera-pose, hand-pose]
    tasks: [pose-and-body, hand-object-interaction, world-modeling]
    release_note: "arXiv preprint (April 2025); watch for code release."
    verified_at: "2026-06-19"
    citation_key: egoh4_2025

  - name: Diff-IP2D
    kind: model
    released: "2024-05"
    venue: "arXiv"
    year: 2024
    status: open
    url: https://github.com/IRMVLab/Diff-IP2D
    paper: https://arxiv.org/abs/2405.04370
    code: https://github.com/IRMVLab/Diff-IP2D
    scale: "Non-autoregressive diffusion model that jointly forecasts 2D hand trajectories and object affordances on egocentric video with explicit camera-egomotion conditioning"
    modalities: [egocentric-video, hand-pose]
    tasks: [hand-object-interaction, pose-and-body]
    verified_at: "2026-06-19"
    citation_key: diff_ip2d_2024

  - name: IndEgo
    kind: dataset
    released: "2025-11"
    venue: "NeurIPS 2025"
    year: 2025
    status: open
    url: https://huggingface.co/datasets/FraunhoferIPK/IndEgo
    paper: https://arxiv.org/abs/2511.19684
    scale: "~197 hours of egocentric (plus ~97 hours exocentric) industrial collaborative-work video over assembly, logistics, inspection, and repair, with gaze, narration, sound, motion, hand pose, and point clouds"
    modalities: [egocentric-video, gaze, audio, motion, hand-pose, point-cloud]
    tasks: [action-and-procedure, assistance-and-agents, detection-segmentation]
    verified_at: "2026-06-19"
    citation_key: indego_2025

  - name: LaMAria
    kind: benchmark
    released: "2025-09"
    venue: "arXiv"
    year: 2025
    status: open
    url: https://lamaria.ethz.ch
    paper: https://arxiv.org/abs/2509.26639
    scale: "City-scale egocentric visual-inertial SLAM benchmark from glasses-like wearables, hours/kilometers of trajectories with centimeter-accurate surveyed ground truth and difficulty-tiered tracks (incl. night and vehicle motion)"
    modalities: [egocentric-video, imu, camera-pose]
    tasks: [three-d-and-scene, ar-sensing-navigation, tracking]
    verified_at: "2026-06-19"
    citation_key: lamaria_2025

  - name: EgoLifter
    kind: model
    released: "2024-03"
    venue: "ECCV 2024"
    year: 2024
    status: open
    url: https://github.com/facebookresearch/egolifter
    paper: https://arxiv.org/abs/2403.18118
    code: https://github.com/facebookresearch/egolifter
    scale: "Open-world 3D segmentation that decomposes natural egocentric video into individual 3D objects using 3D Gaussians and SAM weak supervision, with a transient module for dynamic objects (Aria Digital Twin benchmark)"
    modalities: [egocentric-video, camera-pose]
    tasks: [detection-segmentation, three-d-and-scene]
    verified_at: "2026-06-19"
    citation_key: egolifter_2024

  - name: EgoSplat
    kind: model
    released: "2025-03"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2503.11345
    paper: https://arxiv.org/abs/2503.11345
    scale: "Open-vocabulary egocentric scene understanding with language-embedded 3D Gaussian splatting, using SAM2-tracked multi-view instance features and an instance-aware spatio-temporal module (gains on Aria Digital Twin)"
    modalities: [egocentric-video, language]
    tasks: [detection-segmentation, three-d-and-scene]
    release_note: "arXiv preprint (March 2025); watch for code release."
    verified_at: "2026-06-19"
    citation_key: egosplat_2025

  - name: 3D-Aware Ego Instance Tracking
    kind: model
    released: "2024-08"
    venue: "arXiv"
    year: 2024
    status: watch
    url: https://arxiv.org/abs/2408.09860
    paper: https://arxiv.org/abs/2408.09860
    scale: "3D-aware instance segmentation and multi-object tracking in egocentric video that lifts 2D masks with scene geometry to survive rapid motion and occlusion (large ID-switch reductions on EPIC Fields)"
    modalities: [egocentric-video, camera-pose]
    tasks: [detection-segmentation, tracking, three-d-and-scene]
    release_note: "arXiv preprint (August 2024); watch for code release."
    verified_at: "2026-06-19"
    citation_key: ego_3d_instance_tracking_2024

  - name: EgoCogNav
    kind: model
    released: "2025-11"
    venue: "arXiv"
    year: 2025
    status: watch
    url: https://arxiv.org/abs/2511.17581
    paper: https://arxiv.org/abs/2511.17581
    scale: "Cognition-aware egocentric navigation framework that forecasts human trajectory and head motion from perceived path uncertainty, modeling scanning, hesitation, and backtracking, with the 6-hour CEN dataset"
    modalities: [egocentric-video, motion]
    tasks: [ar-sensing-navigation, reasoning-intent-planning, pose-and-body]
    release_note: "arXiv preprint (November 2025); watch for dataset and code release."
    verified_at: "2026-06-19"
    citation_key: egocognav_2025

  - name: Neck-Mounted Gaze (GLC)
    kind: model
    released: "2026-02"
    venue: "arXiv"
    year: 2026
    status: watch
    url: https://arxiv.org/abs/2602.11669
    paper: https://arxiv.org/abs/2602.11669
    scale: "Transformer gaze estimator (GLC) for a shoulder-level neck-mounted camera with an out-of-bound gaze classification task and head/neck multi-view co-learning, plus a ~4-hour 8-subject dataset"
    modalities: [egocentric-video, gaze]
    tasks: [ar-sensing-navigation, pose-and-body]
    release_note: "arXiv preprint (February 2026); watch for code and data release."
    verified_at: "2026-06-19"
    citation_key: neck_gaze_glc_2026

  - name: CMU-MMAC
    milestone: 2009
    milestone_note: "Among the earliest egocentric datasets; launched first-person activity recognition at the first IEEE Workshop on Egocentric Vision (CVPR 2009)."
    kind: dataset
    released: "2009"
    venue: "CMU tech report 2009"
    year: 2009
    status: open
    url: http://kitchen.cs.cmu.edu/
    paper: http://kitchen.cs.cmu.edu/
    scale: "Among the earliest egocentric datasets: a multimodal kitchen-activity database with head-mounted egocentric video plus body IMUs, motion capture, and audio for 43 subjects cooking 5 recipes"
    modalities: [egocentric-video, imu, motion, audio]
    tasks: [action-and-procedure, detection-segmentation]
    scope_note: "2009 CMU Multimodal Activity database; one of the first egocentric (ego+exo) activity datasets."
    verified_at: "2026-06-19"
    citation_key: cmu_mmac_2009

  - name: First-Person Social Interactions
    kind: dataset
    released: "2012"
    venue: "CVPR 2012"
    year: 2012
    status: open
    url: http://ai.stanford.edu/~alireza/publication/CVPR12.pdf
    paper: http://ai.stanford.edu/~alireza/publication/CVPR12.pdf
    scale: "The first egocentric social-interaction dataset: day-long head-mounted video of 8 subjects at a theme park, annotated for social interactions, roles, attention, and turn-taking"
    modalities: [egocentric-video]
    tasks: [audio-and-social, reasoning-intent-planning, detection-segmentation]
    verified_at: "2026-06-19"
    citation_key: fpsi_2012

  - name: BEOID
    kind: dataset
    released: "2014"
    venue: "BMVC 2014"
    year: 2014
    status: open
    url: https://dimadamen.github.io/BEOID/
    paper: https://dimadamen.github.io/BEOID/
    scale: "Gaze-tracked egocentric video of 8 users interacting with objects across 6 everyday locations (kitchen, workspace, laser printer, corridor with a locked door, cardiac gym, and weight-lifting machine) for discovering task-relevant objects and interaction modes"
    modalities: [egocentric-video, gaze]
    tasks: [hand-object-interaction, action-and-procedure, detection-segmentation]
    verified_at: "2026-06-19"
    citation_key: beoid_2014