{
  "title": "Xperience-10M Official Dataset Card Alignment",
  "checked_at_utc": "2026-06-01T11:14:51+00:00",
  "source_urls": {
    "official_hf_dataset": "https://huggingface.co/datasets/ropedia-ai/xperience-10m",
    "official_hf_api": "https://huggingface.co/api/datasets/ropedia-ai/xperience-10m",
    "official_sample": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample",
    "ropedia_dataset_site": "https://ropedia.com/dataset",
    "ropedia_release_page": "https://ropedia.com/blog/20260316_xperience_10m",
    "homie_toolkit": "https://github.com/Ropedia/HOMIE-toolkit"
  },
  "hf_repo_metadata_observed": {
    "repo_id": "ropedia-ai/xperience-10m",
    "pretty_name": "Xperience-10M",
    "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
    "last_modified": "2026-04-21T05:03:45.000Z",
    "gated": "manual",
    "task_categories": [
      "video-classification",
      "image-to-text",
      "depth-estimation",
      "robotics"
    ],
    "card_tags": [
      "egocentric",
      "first-person",
      "multimodal",
      "3d",
      "4d",
      "embodied-ai",
      "robotics",
      "human-motion",
      "mocap",
      "imu",
      "audio",
      "depth",
      "captions",
      "video"
    ],
    "modalities": [
      "3d",
      "audio",
      "video"
    ],
    "language": [
      "en"
    ],
    "size_categories": [
      "1M<n<10M"
    ],
    "license": "other",
    "access_note": "Reviewed gated access for approved non-commercial use; an external agreement-signing step may be required before approval.",
    "live_hf_page_observed": {
      "source": "Hugging Face dataset page/API public metadata",
      "total_file_size_display": "31.9 TB",
      "used_storage_bytes_observed": 31871115497224,
      "note": "This live HF-hosted file-size display is separate from the dataset card's about-1PB full-scale data statement."
    },
    "api_file_listing_observed": {
      "scope": "public Hugging Face API metadata, not local data possession",
      "sibling_count": 85258,
      "session_folder_count": 803,
      "episode_folder_count": 12103,
      "annotation_hdf5_count": 12103,
      "mp4_count": 72612,
      "visualization_rrd_count": 541,
      "canonical_episode_file_counts": {
        "annotation.hdf5": 12103,
        "fisheye_cam0.mp4": 12102,
        "fisheye_cam1.mp4": 12102,
        "fisheye_cam2.mp4": 12102,
        "fisheye_cam3.mp4": 12102,
        "stereo_left.mp4": 12102,
        "stereo_right.mp4": 12102,
        "visualization.rrd": 541
      }
    }
  },
  "public_sample_card_observed": {
    "repo_id": "ropedia-ai/xperience-10m-sample",
    "pretty_name": "Xperience-10M-Sample",
    "license": "cc-by-nc-4.0",
    "tags": [
      "sample",
      "xperience-10k"
    ],
    "size_categories": [
      "n<1K"
    ],
    "card_summary": "A sample episode for Xperience-10M. The card says videos and annotations can be downloaded and inspected with HOMIE Toolkit, and the RRD file can be visualized with Rerun 0.29.0.",
    "tooling": [
      "HOMIE Toolkit",
      "Rerun 0.29.0 for visualization.rrd"
    ]
  },
  "official_dataset_summary": {
    "description": "Large-scale egocentric multimodal human-experience data for embodied AI, robotics, world models, and spatial intelligence.",
    "experience_units": "about 10 million",
    "recording_hours": "about 10,000",
    "storage_described_by_card": "about 1 PB"
  },
  "official_scale_statistics": {
    "rgb_frames": "about 2.88 billion",
    "depth_frames": "about 720 million",
    "camera_pose_records": "about 576 million",
    "motion_capture_frames": "about 576 million",
    "imu_records": "about 7.2 billion",
    "caption_sentences": "about 16 million",
    "caption_words": "about 200 million",
    "vocabulary_words": "about 6,000",
    "object_annotations": "about 350,000",
    "trajectory_distance": "about 39,000 km"
  },
  "official_modalities": [
    "six RGB video streams: four fisheye views and two rectified stereo views",
    "audio embedded in the video streams",
    "stereo depth and confidence",
    "camera pose, SLAM trajectory, and point cloud",
    "two-hand motion capture",
    "full-body motion capture",
    "inertial accelerometer and gyroscope streams",
    "hierarchical language and caption annotations",
    "metadata and calibration records"
  ],
  "episode_layout": {
    "folder_pattern": "<session_uuid>/ep<episode_id>/",
    "required_for_valid_episode_in_this_repo": [
      "annotation.hdf5"
    ],
    "preferred_for_full_omni_in_this_repo": [
      "fisheye_cam0.mp4",
      "fisheye_cam1.mp4",
      "fisheye_cam2.mp4",
      "fisheye_cam3.mp4",
      "stereo_left.mp4",
      "stereo_right.mp4"
    ],
    "optional_or_excluded": [
      "visualization.rrd"
    ],
    "training_policy": "Use annotation.hdf5 plus available MP4 streams; keep visualization.rrd for optional human inspection only and exclude it from training/public bundles."
  },
  "annotation_hdf5_groups": [
    "calibration",
    "slam / camera pose",
    "depth",
    "hand_mocap",
    "full_body_mocap",
    "imu",
    "video timing",
    "metadata",
    "caption / language annotations"
  ],
  "official_intended_uses": [
    "egocentric video and action understanding",
    "task and subtask recognition",
    "temporal action localization",
    "action-language grounding and action captioning",
    "human-object interaction analysis",
    "object grounding and caption/language grounding",
    "audio-visual learning and multimodal pretraining",
    "embodied reasoning and world-model learning",
    "robotics imitation learning",
    "depth estimation, odometry, SLAM, and scene reconstruction",
    "hand/body pose and human motion understanding",
    "sensor fusion"
  ],
  "current_repo_alignment": {
    "validated_episode_count": 1,
    "validated_frames": 5821,
    "validated_windows": 1161,
    "current_feature_dim": 8378,
    "raw_data_redistributed": false,
    "audio_feature_status": "Audio is present in the sample MP4 streams and visualized, but not extracted into the current baseline feature vector.",
    "implemented_task_count": 12,
    "neural_head_count": 12,
    "covered_by_current_tasks": [
      "action/subtask recognition",
      "next-action prediction",
      "transition and temporal diagnostics",
      "hand trajectory forecasting",
      "contact prediction",
      "object relevance",
      "caption grounding",
      "cross-modal retrieval",
      "modality reconstruction",
      "misalignment detection"
    ],
    "not_yet_claimed": [
      "full audio-visual learning",
      "caption generation",
      "depth-pixel estimation",
      "SLAM estimation",
      "neural rendering",
      "policy learning",
      "cross-episode generalization",
      "real 32-episode Qwen3-Omni model quality"
    ]
  },
  "responsible_use_boundary": [
    "No raw MP4, raw annotation.hdf5, private gated data, raw visualization.rrd, or full Qwen weights are redistributed.",
    "The public sample card lists cc-by-nc-4.0; the full gated dataset uses the official Ropedia/Xperience-10M access terms and license field.",
    "The official card describes the open-source dataset as limited in diversity and showcase/production quality, so robust evaluation and downstream safeguards are still required.",
    "The project does not support identity recognition, re-identification, biometric profiling, surveillance, sensitive attribute inference, or safety-critical deployment.",
    "Dataset use remains governed by the official Ropedia/Xperience-10M terms."
  ]
}
